wafer-cli 0.2.14__py3-none-any.whl → 0.2.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/cli.py CHANGED
@@ -1,6 +1,8 @@
1
- # ruff: noqa: PLR0913
1
+ # ruff: noqa: PLR0913, E402
2
2
  # PLR0913 (too many arguments) is suppressed because Typer CLI commands
3
3
  # naturally have many parameters - each --flag becomes a function argument.
4
+ # E402 (module level import not at top) is suppressed because we intentionally
5
+ # load .env files before importing other modules that may read env vars.
4
6
  """Wafer CLI - GPU development toolkit for LLM coding agents.
5
7
 
6
8
  Core commands:
@@ -27,6 +29,12 @@ from pathlib import Path
27
29
 
28
30
  import trio
29
31
  import typer
32
+ from dotenv import load_dotenv
33
+
34
+ # Auto-load .env from current directory and ~/.wafer/.env
35
+ # This runs at import time so env vars are available before any config is accessed
36
+ load_dotenv() # cwd/.env
37
+ load_dotenv(Path.home() / ".wafer" / ".env") # ~/.wafer/.env
30
38
 
31
39
  from .config import WaferConfig, WaferEnvironment
32
40
  from .inference import infer_upload_files, resolve_environment
@@ -42,6 +50,7 @@ from .problems import (
42
50
  app = typer.Typer(
43
51
  help="GPU development toolkit for LLM coding agents",
44
52
  no_args_is_help=True,
53
+ pretty_exceptions_show_locals=False, # Don't dump local vars (makes tracebacks huge)
45
54
  )
46
55
 
47
56
  # =============================================================================
@@ -58,11 +67,11 @@ def _show_version() -> None:
58
67
  """Show CLI version and environment, then exit."""
59
68
  from .analytics import _get_cli_version
60
69
  from .global_config import load_global_config
61
-
70
+
62
71
  version = _get_cli_version()
63
72
  config = load_global_config()
64
73
  environment = config.environment
65
-
74
+
66
75
  typer.echo(f"wafer-cli {version} ({environment})")
67
76
  raise typer.Exit()
68
77
 
@@ -110,7 +119,7 @@ def main_callback(
110
119
  if version:
111
120
  _show_version()
112
121
  return
113
-
122
+
114
123
  global _command_start_time, _command_outcome
115
124
  _command_start_time = time.time()
116
125
  _command_outcome = "success" # Default to success, mark failure on exceptions
@@ -121,6 +130,7 @@ def main_callback(
121
130
  analytics.init_analytics()
122
131
 
123
132
  # Install exception hook to catch SystemExit and mark failures
133
+ # Also prints error message FIRST so it's visible even when traceback is truncated
124
134
  original_excepthook = sys.excepthook
125
135
 
126
136
  def custom_excepthook(
@@ -136,7 +146,11 @@ def main_callback(
136
146
  _command_outcome = "failure"
137
147
  else:
138
148
  _command_outcome = "failure"
139
- # Call original excepthook
149
+ # Print error summary FIRST (before traceback) so it's visible even if truncated
150
+ print(
151
+ f"\n\033[1;31m>>> ERROR: {exc_type.__name__}: {exc_value}\033[0m\n", file=sys.stderr
152
+ )
153
+ # Call original excepthook (prints the full traceback)
140
154
  original_excepthook(exc_type, exc_value, exc_traceback)
141
155
 
142
156
  sys.excepthook = custom_excepthook
@@ -180,11 +194,16 @@ def complete_target_name(incomplete: str) -> list[str]:
180
194
 
181
195
  # =============================================================================
182
196
  # Core subcommand groups (visible in --help)
197
+ #
198
+ # TODO: Further consolidate top-level commands to reduce --help surface area.
199
+ # Candidates:
200
+ # - compare → wafer nvidia compare or keep top-level (cross-platform)
201
+ # - guide/skill/demo → wafer onboard {guide,skill,demo}
183
202
  # =============================================================================
184
203
 
185
204
  # Config management (includes targets as nested subcommand)
186
205
  config_app = typer.Typer(help="Manage CLI configuration and local GPU targets")
187
- app.add_typer(config_app, name="config")
206
+ app.add_typer(config_app, name="config", rich_help_panel="Configuration")
188
207
 
189
208
  # Target management - nested under config
190
209
  targets_app = typer.Typer(
@@ -204,7 +223,7 @@ config_app.add_typer(targets_app, name="targets")
204
223
  workspaces_app = typer.Typer(
205
224
  help="""Manage cloud GPU workspaces for remote development.
206
225
 
207
- Workspaces are on-demand cloud GPU environments. Requires authentication (wafer login).
226
+ Workspaces are on-demand cloud GPU environments. Requires authentication (wafer auth login).
208
227
 
209
228
  Available GPUs:
210
229
  MI300X AMD Instinct MI300X (192GB HBM3, ROCm)
@@ -217,21 +236,21 @@ Commands:
217
236
  wafer workspaces sync dev ./project # Sync files
218
237
  wafer workspaces delete dev # Clean up"""
219
238
  )
220
- app.add_typer(workspaces_app, name="workspaces")
239
+ app.add_typer(workspaces_app, name="workspaces", rich_help_panel="Infrastructure")
221
240
 
222
- # SSH Key management (BYOK - Bring Your Own Key)
241
+ # SSH Key management (BYOK - Bring Your Own Key) - nested under config
223
242
  ssh_keys_app = typer.Typer(
224
243
  help="""Manage SSH public keys for workspace access.
225
244
 
226
245
  Register your SSH public keys here. These keys are installed in all workspaces
227
246
  you provision, enabling SSH access from any machine with your private key.
228
247
 
229
- wafer ssh-keys list # List registered keys
230
- wafer ssh-keys add # Add key (auto-detects ~/.ssh/id_ed25519.pub)
231
- wafer ssh-keys add ~/.ssh/id_rsa.pub --name laptop # Add specific key
232
- wafer ssh-keys remove <key-id> # Remove a key"""
248
+ wafer config ssh-keys list # List registered keys
249
+ wafer config ssh-keys add # Add key (auto-detects ~/.ssh/id_ed25519.pub)
250
+ wafer config ssh-keys add ~/.ssh/id_rsa.pub --name laptop # Add specific key
251
+ wafer config ssh-keys remove <key-id> # Remove a key"""
233
252
  )
234
- app.add_typer(ssh_keys_app, name="ssh-keys")
253
+ config_app.add_typer(ssh_keys_app, name="ssh-keys")
235
254
 
236
255
  # Target operations (exec/ssh/sync on configured targets)
237
256
  targets_ops_app = typer.Typer(
@@ -247,22 +266,48 @@ Useful for exploratory work, debugging, or custom scripts.
247
266
  Supports: RunPod, DigitalOcean (auto-provisions), SSH targets (baremetal/vm).
248
267
  Configure targets with: wafer config targets init ..."""
249
268
  )
250
- app.add_typer(targets_ops_app, name="targets")
269
+ app.add_typer(targets_ops_app, name="targets", rich_help_panel="Infrastructure")
270
+
271
+ # Specs management (new: local TOML configs)
272
+ from wafer.specs_cli import specs_app
273
+
274
+ app.add_typer(specs_app, name="specs", rich_help_panel="Configuration")
275
+
276
+ # Live resource management (new: API-backed commands on `wafer targets`)
277
+ # These become: wafer targets list, wafer targets terminate, etc.
278
+ from wafer.targets_cli import (
279
+ targets_list as _targets_list_cmd,
280
+ )
281
+ from wafer.targets_cli import (
282
+ targets_provision as _targets_provision_cmd,
283
+ )
284
+ from wafer.targets_cli import (
285
+ targets_reconcile as _targets_reconcile_cmd,
286
+ )
287
+ from wafer.targets_cli import (
288
+ targets_terminate as _targets_terminate_cmd,
289
+ )
290
+ from wafer.targets_cli import (
291
+ targets_pools as _targets_pools_cmd,
292
+ )
293
+ from wafer.targets_cli import (
294
+ targets_probe as _targets_probe_cmd,
295
+ )
251
296
 
252
- # Billing management
297
+ # Billing management - nested under config
253
298
  billing_app = typer.Typer(help="Manage billing, credits, and subscription")
254
- app.add_typer(billing_app, name="billing")
299
+ config_app.add_typer(billing_app, name="billing")
255
300
 
256
301
  # Corpus management
257
302
  corpus_app = typer.Typer(help="Download and manage GPU documentation")
258
- app.add_typer(corpus_app, name="corpus")
303
+ app.add_typer(corpus_app, name="corpus", rich_help_panel="Kernel Development")
259
304
 
260
305
  # Evaluate (supports multiple kernel formats)
261
306
  evaluate_app = typer.Typer(
262
307
  help="Test kernel correctness and performance",
263
308
  invoke_without_command=True,
264
309
  )
265
- app.add_typer(evaluate_app, name="evaluate")
310
+ app.add_typer(evaluate_app, name="evaluate", rich_help_panel="Kernel Development")
266
311
 
267
312
  # Nested subcommand for kernelbench format
268
313
  kernelbench_app = typer.Typer(
@@ -291,7 +336,7 @@ app.add_typer(dev_app, name="dev")
291
336
  # =============================================================================
292
337
 
293
338
  nvidia_app = typer.Typer(help="NVIDIA GPU profiling and analysis tools")
294
- app.add_typer(nvidia_app, name="nvidia")
339
+ app.add_typer(nvidia_app, name="nvidia", rich_help_panel="Profiling")
295
340
 
296
341
  # NCU analysis - under nvidia
297
342
  ncu_app = typer.Typer(help="Nsight Compute profile analysis")
@@ -314,18 +359,25 @@ nvidia_app.add_typer(tracelens_app, name="tracelens")
314
359
  # =============================================================================
315
360
 
316
361
  amd_app = typer.Typer(help="AMD GPU profiling and analysis tools")
317
- app.add_typer(amd_app, name="amd")
362
+ app.add_typer(amd_app, name="amd", rich_help_panel="Profiling")
318
363
 
319
364
  # Unified ISA Analyzer - supports both .co files and Triton artifacts
320
365
  isa_app = typer.Typer(help="ISA analysis for AMD GPU kernels (.co, .s, .ll, .ttgir files)")
321
366
  amd_app.add_typer(isa_app, name="isa")
322
367
 
368
+ # =============================================================================
369
+ # Trace comparison (wafer compare)
370
+ # =============================================================================
371
+
372
+ compare_app = typer.Typer(help="Compare GPU traces across platforms (AMD vs NVIDIA)")
373
+ app.add_typer(compare_app, name="compare", rich_help_panel="Profiling")
374
+
323
375
  # =============================================================================
324
376
  # Roofline analysis (wafer roofline)
325
377
  # =============================================================================
326
378
 
327
379
 
328
- @app.command("roofline")
380
+ @app.command("roofline", rich_help_panel="Kernel Development")
329
381
  def roofline_cmd(
330
382
  gpu: str | None = typer.Option(
331
383
  None, "--gpu", "-g", help="GPU name (e.g., H100, B200, MI300X, A100)"
@@ -416,7 +468,7 @@ def roofline_cmd(
416
468
  # =============================================================================
417
469
 
418
470
  skill_app = typer.Typer(help="Manage AI coding assistant skills (Claude Code, Codex)")
419
- app.add_typer(skill_app, name="skill")
471
+ app.add_typer(skill_app, name="skill", rich_help_panel="Onboarding")
420
472
 
421
473
 
422
474
  @skill_app.command("install")
@@ -580,18 +632,23 @@ def skill_status() -> None:
580
632
 
581
633
 
582
634
  # =============================================================================
583
- # Provider auth management (wafer auth ...)
635
+ # Authentication (wafer auth ...)
584
636
  # =============================================================================
585
637
 
586
- provider_auth_app = typer.Typer(help="Manage API keys for cloud GPU providers")
587
- app.add_typer(provider_auth_app, name="auth")
638
+ auth_app = typer.Typer(help="Authenticate with Wafer and cloud GPU providers")
639
+ app.add_typer(auth_app, name="auth", rich_help_panel="Configuration")
588
640
 
641
+ providers_app = typer.Typer(
642
+ help="Manage API keys for cloud GPU providers (RunPod, DigitalOcean, etc.)"
643
+ )
644
+ auth_app.add_typer(providers_app, name="providers")
589
645
 
590
- @provider_auth_app.command("login")
646
+
647
+ @providers_app.command("login")
591
648
  def provider_auth_login(
592
649
  provider: str = typer.Argument(
593
650
  ...,
594
- help="Provider name: runpod, digitalocean, or modal",
651
+ help="Provider name: runpod, digitalocean, modal, anthropic, or openai",
595
652
  ),
596
653
  api_key: str | None = typer.Option(
597
654
  None,
@@ -600,15 +657,16 @@ def provider_auth_login(
600
657
  help="API key (if not provided, reads from stdin)",
601
658
  ),
602
659
  ) -> None:
603
- """Save API key for a cloud GPU provider.
660
+ """Save API key for a provider.
604
661
 
605
662
  Stores the key in ~/.wafer/auth.json. Environment variables
606
- (e.g., WAFER_RUNPOD_API_KEY) take precedence over stored keys.
663
+ (e.g., ANTHROPIC_API_KEY) take precedence over stored keys.
607
664
 
608
665
  Examples:
609
- wafer auth login runpod --api-key rp_xxx
610
- wafer auth login digitalocean --api-key dop_v1_xxx
611
- echo $API_KEY | wafer auth login runpod
666
+ wafer auth providers login anthropic --api-key sk-ant-xxx
667
+ wafer auth providers login runpod --api-key rp_xxx
668
+ wafer auth providers login openai --api-key sk-xxx
669
+ echo $API_KEY | wafer auth providers login anthropic
612
670
  """
613
671
  import sys
614
672
 
@@ -638,18 +696,18 @@ def provider_auth_login(
638
696
  typer.echo("Stored in: ~/.wafer/auth.json")
639
697
 
640
698
 
641
- @provider_auth_app.command("logout")
699
+ @providers_app.command("logout")
642
700
  def provider_auth_logout(
643
701
  provider: str = typer.Argument(
644
702
  ...,
645
- help="Provider name: runpod, digitalocean, or modal",
703
+ help="Provider name: runpod, digitalocean, modal, anthropic, or openai",
646
704
  ),
647
705
  ) -> None:
648
706
  """Remove stored API key for a cloud GPU provider.
649
707
 
650
708
  Examples:
651
- wafer auth logout runpod
652
- wafer auth logout digitalocean
709
+ wafer auth providers logout runpod
710
+ wafer auth providers logout digitalocean
653
711
  """
654
712
  from wafer_core.auth import PROVIDERS, remove_api_key
655
713
 
@@ -665,7 +723,7 @@ def provider_auth_logout(
665
723
  typer.echo(f"No stored API key found for {PROVIDERS[provider]['display_name']}")
666
724
 
667
725
 
668
- @provider_auth_app.command("status")
726
+ @providers_app.command("status")
669
727
  def provider_auth_status() -> None:
670
728
  """Show authentication status for all cloud GPU providers.
671
729
 
@@ -673,7 +731,7 @@ def provider_auth_status() -> None:
673
731
  the keys are coming from (environment variable or auth.json).
674
732
 
675
733
  Example:
676
- wafer auth status
734
+ wafer auth providers status
677
735
  """
678
736
  from wafer_core.auth import get_all_auth_status
679
737
 
@@ -688,7 +746,7 @@ def provider_auth_status() -> None:
688
746
  typer.echo(f" {status.display_name}: ✓ {status.key_preview} {source_str}")
689
747
  else:
690
748
  typer.echo(f" {status.display_name}: ✗ Not configured")
691
- typer.echo(f" Run: wafer auth login {status.provider}")
749
+ typer.echo(f" Run: wafer auth providers login {status.provider}")
692
750
  typer.echo(f" Or set: {status.key_url}")
693
751
 
694
752
  typer.echo("")
@@ -1233,7 +1291,7 @@ def config_show_legacy() -> None:
1233
1291
  config_show_new()
1234
1292
 
1235
1293
 
1236
- @app.command()
1294
+ @app.command(rich_help_panel="Kernel Development")
1237
1295
  def agent( # noqa: PLR0913
1238
1296
  prompt: str | None = typer.Argument(
1239
1297
  None,
@@ -1303,7 +1361,7 @@ def agent( # noqa: PLR0913
1303
1361
  None,
1304
1362
  "--model",
1305
1363
  "-m",
1306
- help="Model override (default: claude-sonnet-4-5)",
1364
+ help="Model override (default: claude-opus-4-5)",
1307
1365
  ),
1308
1366
  json_output: bool = typer.Option(
1309
1367
  False,
@@ -1327,6 +1385,16 @@ def agent( # noqa: PLR0913
1327
1385
  "-c",
1328
1386
  help="Documentation corpus to use (cuda, cutlass, hip, amd). Must be downloaded first.",
1329
1387
  ),
1388
+ no_sandbox: bool = typer.Option(
1389
+ False,
1390
+ "--no-sandbox",
1391
+ help="Disable OS-level sandboxing (YOU accept liability for any damage caused by the agent)",
1392
+ ),
1393
+ no_proxy: bool = typer.Option(
1394
+ False,
1395
+ "--no-proxy",
1396
+ help="Skip wafer proxy, use ANTHROPIC_API_KEY directly",
1397
+ ),
1330
1398
  ) -> None:
1331
1399
  """AI assistant for GPU kernel development.
1332
1400
 
@@ -1408,6 +1476,13 @@ def agent( # noqa: PLR0913
1408
1476
  raise typer.Exit(1) from None
1409
1477
  corpus_path = str(path)
1410
1478
 
1479
+ # Warn user about sandbox disabled
1480
+ if no_sandbox:
1481
+ print(
1482
+ "Warning: Sandbox disabled. You accept liability for any damage caused by the agent.",
1483
+ file=sys.stderr,
1484
+ )
1485
+
1411
1486
  wevin_main(
1412
1487
  prompt=actual_prompt,
1413
1488
  interactive=use_tui,
@@ -1425,6 +1500,8 @@ def agent( # noqa: PLR0913
1425
1500
  template=template,
1426
1501
  template_args=parsed_template_args,
1427
1502
  corpus_path=corpus_path,
1503
+ no_sandbox=no_sandbox,
1504
+ no_proxy=no_proxy,
1428
1505
  )
1429
1506
 
1430
1507
 
@@ -1455,6 +1532,7 @@ def _make_agent_alias(name: str, doc: str) -> None:
1455
1532
  template: str | None = typer.Option(None, "--template", "-t"),
1456
1533
  template_args: list[str] | None = typer.Option(None, "--args"),
1457
1534
  corpus: str | None = typer.Option(None, "--corpus"),
1535
+ no_sandbox: bool = typer.Option(False, "--no-sandbox"),
1458
1536
  ) -> None:
1459
1537
  agent(
1460
1538
  prompt=prompt,
@@ -1474,6 +1552,7 @@ def _make_agent_alias(name: str, doc: str) -> None:
1474
1552
  template=template,
1475
1553
  template_args=template_args,
1476
1554
  corpus=corpus,
1555
+ no_sandbox=no_sandbox,
1477
1556
  )
1478
1557
 
1479
1558
  alias_cmd.__doc__ = doc
@@ -1497,7 +1576,11 @@ def evaluate( # noqa: PLR0913
1497
1576
  None, "--reference", help="Path to reference kernel file"
1498
1577
  ),
1499
1578
  test_cases: Path | None = typer.Option(
1500
- None, "--test-cases", help="Path to test cases JSON file"
1579
+ None,
1580
+ "--test-cases",
1581
+ help="Path to test cases JSON file. "
1582
+ 'Format: [{"name": "small", "n": 1024, "seed": 42}, ...]. '
1583
+ "Run 'wafer evaluate make-template' to generate an example.",
1501
1584
  ),
1502
1585
  target: str | None = typer.Option(
1503
1586
  None,
@@ -1527,20 +1610,20 @@ def evaluate( # noqa: PLR0913
1527
1610
 
1528
1611
  Examples:
1529
1612
  # Basic correctness check
1530
- wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json
1613
+ wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json
1531
1614
 
1532
1615
  # With benchmarking on a specific target
1533
- wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json \\
1616
+ wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json \\
1534
1617
  --target vultr-b200 --benchmark
1535
1618
 
1536
1619
  # Full evaluation with defensive timing (detects cheating)
1537
- wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json \\
1620
+ wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json \\
1538
1621
  --benchmark --defensive
1539
1622
 
1540
1623
  Subcommands:
1541
1624
  gpumode Use GPUMode format (functional) - RECOMMENDED
1542
1625
  kernelbench Use KernelBench format (ModelNew class)
1543
- make-template Generate template files for this format (deprecated)
1626
+ make-template Generate template files for this format
1544
1627
  """
1545
1628
  # If a subcommand is being invoked, skip the main evaluation logic
1546
1629
  if ctx.invoked_subcommand is not None:
@@ -1694,7 +1777,7 @@ def evaluate_make_template(
1694
1777
  typer.echo(f" 2. Edit {output_dir / 'reference.py'} with the ground truth + input generator")
1695
1778
  typer.echo(f" 3. Edit {output_dir / 'test_cases.json'} with your test parameters")
1696
1779
  typer.echo(" 4. Run:")
1697
- typer.echo(f" wafer evaluate --impl {output_dir / 'kernel.py'} \\")
1780
+ typer.echo(f" wafer evaluate gpumode --impl {output_dir / 'kernel.py'} \\")
1698
1781
  typer.echo(f" --reference {output_dir / 'reference.py'} \\")
1699
1782
  typer.echo(f" --test-cases {output_dir / 'test_cases.json'} --benchmark")
1700
1783
 
@@ -1758,6 +1841,93 @@ def kernelbench_list_problems() -> None:
1758
1841
  raise typer.Exit(1) from None
1759
1842
 
1760
1843
 
1844
+ def _resolve_pool_query(pool: str, collector) -> tuple[str, object]:
1845
+ """Resolve a PoolQuery pool to a target spec name + lock context.
1846
+
1847
+ Queries live providers, matches by pool query, locks one target,
1848
+ returns (spec_name, lock_context) for the evaluator.
1849
+ """
1850
+ import trio
1851
+ from wafer_core.targets.pool import resolve_pool
1852
+
1853
+ from .target_lock import acquire_from_pool
1854
+
1855
+ matched_targets = trio.run(resolve_pool, pool)
1856
+
1857
+ if not matched_targets:
1858
+ collector.set_error("pool", "NoMatchingTargets", pool=pool)
1859
+ collector.finalize()
1860
+ raise typer.Exit(1)
1861
+
1862
+ # Filter to targets with a spec (evaluator needs spec fields)
1863
+ spec_targets = [t for t in matched_targets if t.spec_name]
1864
+ if not spec_targets:
1865
+ collector.set_error(
1866
+ "pool", "NoSpecTargets", pool=pool,
1867
+ message="Matched targets have no spec binding — evaluator needs spec fields",
1868
+ )
1869
+ collector.finalize()
1870
+ raise typer.Exit(1)
1871
+
1872
+ # Lock one by resource_id
1873
+ resource_ids = [t.resource_id for t in spec_targets]
1874
+ collector.emit("pool_acquire", pool=pool, count=len(resource_ids))
1875
+
1876
+ lock_ctx = acquire_from_pool(resource_ids)
1877
+ acquired_id = lock_ctx.__enter__()
1878
+
1879
+ if acquired_id is None:
1880
+ lock_ctx.__exit__(None, None, None)
1881
+ collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=resource_ids)
1882
+ collector.finalize()
1883
+ raise typer.Exit(1)
1884
+
1885
+ # Map resource_id back to spec_name
1886
+ acquired_target = next(t for t in spec_targets if t.resource_id == acquired_id)
1887
+ spec_name = acquired_target.spec_name
1888
+
1889
+ collector.emit("pool_acquired", target=spec_name, resource_id=acquired_id)
1890
+ return spec_name, lock_ctx
1891
+
1892
+
1893
+ def _resolve_pool_legacy(pool: str, collector) -> tuple[str, object]:
1894
+ """Resolve an old-style pool (static target name list) to a target name + lock context.
1895
+
1896
+ Old format: [pools.name] targets = ["t1", "t2"]
1897
+ """
1898
+ from .target_lock import acquire_from_pool
1899
+ from .targets import filter_pool_by_auth, get_pool
1900
+
1901
+ try:
1902
+ pool_targets = get_pool(pool)
1903
+ except FileNotFoundError as e:
1904
+ collector.set_error("pool", "PoolNotFound", pool=pool, message=str(e))
1905
+ collector.finalize()
1906
+ raise typer.Exit(1) from None
1907
+
1908
+ usable_targets, skipped = filter_pool_by_auth(pool_targets)
1909
+ if skipped:
1910
+ collector.emit("pool_auth_skip", targets=skipped)
1911
+
1912
+ if not usable_targets:
1913
+ collector.set_error("pool", "NoUsableTargets", pool=pool)
1914
+ collector.finalize()
1915
+ raise typer.Exit(1) from None
1916
+
1917
+ collector.emit("pool_acquire", pool=pool, count=len(usable_targets))
1918
+ lock_ctx = acquire_from_pool(usable_targets)
1919
+ acquired_target = lock_ctx.__enter__()
1920
+
1921
+ if acquired_target is None:
1922
+ lock_ctx.__exit__(None, None, None)
1923
+ collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=usable_targets)
1924
+ collector.finalize()
1925
+ raise typer.Exit(1)
1926
+
1927
+ collector.emit("pool_acquired", target=acquired_target)
1928
+ return acquired_target, lock_ctx
1929
+
1930
+
1761
1931
  @kernelbench_app.callback(invoke_without_command=True)
1762
1932
  def kernelbench_evaluate( # noqa: PLR0913, PLR0915
1763
1933
  ctx: typer.Context,
@@ -1888,39 +2058,12 @@ def kernelbench_evaluate( # noqa: PLR0913, PLR0915
1888
2058
  pool_lock_context = None
1889
2059
 
1890
2060
  if pool:
1891
- from .target_lock import acquire_from_pool
1892
- from .targets import filter_pool_by_auth, get_pool
1893
-
1894
- try:
1895
- pool_targets = get_pool(pool)
1896
- except FileNotFoundError as e:
1897
- collector.set_error("pool", "PoolNotFound", pool=pool, message=str(e))
1898
- collector.finalize()
1899
- raise typer.Exit(1) from None
1900
-
1901
- # Filter to only targets with valid auth
1902
- usable_targets, skipped = filter_pool_by_auth(pool_targets)
1903
- if skipped:
1904
- collector.emit("pool_auth_skip", targets=skipped)
1905
-
1906
- if not usable_targets:
1907
- collector.set_error("pool", "NoUsableTargets", pool=pool)
1908
- collector.finalize()
1909
- raise typer.Exit(1) from None
2061
+ from wafer_core.targets.pool import is_query_pool
1910
2062
 
1911
- collector.emit("pool_acquire", pool=pool, count=len(usable_targets))
1912
- pool_lock_context = acquire_from_pool(usable_targets)
1913
- acquired_target = pool_lock_context.__enter__()
1914
-
1915
- if acquired_target is None:
1916
- # Exit context manager before raising to avoid resource leak
1917
- pool_lock_context.__exit__(None, None, None)
1918
- collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=usable_targets)
1919
- collector.finalize()
1920
- raise typer.Exit(1)
1921
-
1922
- collector.emit("pool_acquired", target=acquired_target)
1923
- resolved_target = acquired_target
2063
+ if is_query_pool(pool):
2064
+ resolved_target, pool_lock_context = _resolve_pool_query(pool, collector)
2065
+ else:
2066
+ resolved_target, pool_lock_context = _resolve_pool_legacy(pool, collector)
1924
2067
 
1925
2068
  collector.target = resolved_target
1926
2069
 
@@ -2245,7 +2388,11 @@ def gpumode_evaluate( # noqa: PLR0913, PLR0915
2245
2388
  None, "--reference", help="Path to reference kernel file"
2246
2389
  ),
2247
2390
  test_cases: Path | None = typer.Option(
2248
- None, "--test-cases", help="Path to test cases JSON file"
2391
+ None,
2392
+ "--test-cases",
2393
+ help="Path to test cases JSON file. "
2394
+ 'Format: [{"name": "small", "n": 1024, "seed": 42}, ...]. '
2395
+ "Run 'wafer evaluate make-template' to generate an example.",
2249
2396
  ),
2250
2397
  target: str | None = typer.Option(
2251
2398
  None,
@@ -2313,6 +2460,13 @@ def gpumode_evaluate( # noqa: PLR0913, PLR0915
2313
2460
  err=True,
2314
2461
  )
2315
2462
  typer.echo("", err=True)
2463
+ if "--test-cases" in missing_args:
2464
+ typer.echo(
2465
+ "Tip: Run 'wafer evaluate make-template' to generate template files "
2466
+ "including test_cases.json.",
2467
+ err=True,
2468
+ )
2469
+ typer.echo("", err=True)
2316
2470
  typer.echo("Run 'wafer evaluate gpumode --help' for full options.", err=True)
2317
2471
  typer.echo("Run 'wafer evaluate gpumode download' to download problem sets.", err=True)
2318
2472
  raise typer.Exit(1)
@@ -2719,7 +2873,7 @@ def remote_run( # noqa: PLR0913
2719
2873
  # =============================================================================
2720
2874
 
2721
2875
 
2722
- @app.command("login")
2876
+ @auth_app.command("login")
2723
2877
  def login(
2724
2878
  token: str | None = typer.Option(
2725
2879
  None, "--token", "-t", help="Access token (skip browser OAuth)"
@@ -2744,7 +2898,7 @@ def login(
2744
2898
  Uses the API environment from config (see 'wafer config show').
2745
2899
 
2746
2900
  SSH Users (Easiest):
2747
- - Just run: wafer login
2901
+ - Just run: wafer auth login
2748
2902
  - Visit the URL and enter the code shown
2749
2903
  - No port forwarding needed!
2750
2904
 
@@ -2754,17 +2908,17 @@ def login(
2754
2908
 
2755
2909
  Manual token option:
2756
2910
  - Visit auth.wafer.ai, authenticate, copy token from URL
2757
- - Run: wafer login --token <paste-token>
2911
+ - Run: wafer auth login --token <paste-token>
2758
2912
 
2759
2913
  Examples:
2760
- wafer login # device code on SSH, browser on local
2761
- wafer login --no-device-code # force browser (needs port forwarding on SSH)
2762
- wafer login --port 9000 # custom port for browser flow
2763
- wafer login --token xyz # manual token (no browser)
2914
+ wafer auth login # device code on SSH, browser on local
2915
+ wafer auth login --no-device-code # force browser (needs port forwarding on SSH)
2916
+ wafer auth login --port 9000 # custom port for browser flow
2917
+ wafer auth login --token xyz # manual token (no browser)
2764
2918
 
2765
2919
  # Change environment:
2766
2920
  wafer config set api.environment staging
2767
- wafer login
2921
+ wafer auth login
2768
2922
  """
2769
2923
  import httpx
2770
2924
 
@@ -2848,7 +3002,7 @@ def login(
2848
3002
  typer.echo("Token saved to ~/.wafer/credentials.json")
2849
3003
 
2850
3004
 
2851
- @app.command("logout")
3005
+ @auth_app.command("logout")
2852
3006
  def logout() -> None:
2853
3007
  """Remove stored credentials."""
2854
3008
  from . import analytics
@@ -2865,7 +3019,7 @@ def logout() -> None:
2865
3019
  typer.echo("Not logged in (no credentials found).")
2866
3020
 
2867
3021
 
2868
- @app.command("whoami")
3022
+ @auth_app.command("whoami")
2869
3023
  def whoami(
2870
3024
  verify: bool = typer.Option(False, "--verify", "-v", help="Verify token with API"),
2871
3025
  refresh: bool = typer.Option(False, "--refresh", "-r", help="Refresh token if expired"),
@@ -2879,7 +3033,7 @@ def whoami(
2879
3033
 
2880
3034
  creds = load_credentials()
2881
3035
  if creds is None:
2882
- typer.echo("Not logged in. Run: wafer login")
3036
+ typer.echo("Not logged in. Run: wafer auth login")
2883
3037
  raise typer.Exit(1)
2884
3038
 
2885
3039
  if verify or refresh:
@@ -2887,7 +3041,7 @@ def whoami(
2887
3041
  # Try to get valid token with auto-refresh
2888
3042
  token = get_valid_token()
2889
3043
  if token is None:
2890
- typer.echo("Token expired and refresh failed. Run: wafer login", err=True)
3044
+ typer.echo("Token expired and refresh failed. Run: wafer auth login", err=True)
2891
3045
  raise typer.Exit(1)
2892
3046
  if token != creds.access_token:
2893
3047
  typer.echo("Token refreshed successfully")
@@ -2900,10 +3054,10 @@ def whoami(
2900
3054
  except Exception as e:
2901
3055
  if creds.refresh_token and not refresh:
2902
3056
  typer.echo(f"Token expired: {e}", err=True)
2903
- typer.echo("Try: wafer whoami --refresh", err=True)
3057
+ typer.echo("Try: wafer auth whoami --refresh", err=True)
2904
3058
  else:
2905
3059
  typer.echo(f"Token invalid or expired: {e}", err=True)
2906
- typer.echo("Run: wafer login", err=True)
3060
+ typer.echo("Run: wafer auth login", err=True)
2907
3061
  raise typer.Exit(1) from None
2908
3062
  elif creds.email:
2909
3063
  typer.echo(creds.email)
@@ -2911,7 +3065,7 @@ def whoami(
2911
3065
  typer.echo("Logged in (email not available)")
2912
3066
 
2913
3067
 
2914
- @app.command("guide")
3068
+ @app.command("guide", rich_help_panel="Onboarding")
2915
3069
  def guide() -> None:
2916
3070
  """Show the Wafer CLI usage guide.
2917
3071
 
@@ -2942,7 +3096,7 @@ demo_app = typer.Typer(
2942
3096
  wafer demo trace Analyze a sample performance trace
2943
3097
  wafer demo eval Run kernel evaluation on cloud GPU (requires login)"""
2944
3098
  )
2945
- app.add_typer(demo_app, name="demo")
3099
+ app.add_typer(demo_app, name="demo", rich_help_panel="Onboarding")
2946
3100
 
2947
3101
  DEMO_TRACES_URL = "https://github.com/wafer-ai/wafer/raw/main/apps/wafer-cli/wafer/demo_data"
2948
3102
  DEMO_DIR = Path.home() / ".cache" / "wafer" / "demo"
@@ -3162,7 +3316,7 @@ def demo_eval(
3162
3316
  """Demo: Evaluate a kernel on a cloud GPU.
3163
3317
 
3164
3318
  Creates a workspace, runs a sample Triton kernel evaluation, and cleans up.
3165
- Requires authentication (wafer login).
3319
+ Requires authentication (wafer auth login).
3166
3320
 
3167
3321
  Example:
3168
3322
  wafer demo eval
@@ -3177,7 +3331,7 @@ def demo_eval(
3177
3331
  # Check auth first
3178
3332
  creds = load_credentials()
3179
3333
  if not creds:
3180
- typer.echo("Error: Not authenticated. Run: wafer login")
3334
+ typer.echo("Error: Not authenticated. Run: wafer auth login")
3181
3335
  raise typer.Exit(1)
3182
3336
 
3183
3337
  if not yes:
@@ -3458,7 +3612,7 @@ def init_runpod(
3458
3612
  gpu_configs = {
3459
3613
  "MI300X": {
3460
3614
  "gpu_type_id": "AMD Instinct MI300X OAM",
3461
- "image": "runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04",
3615
+ "image": "rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.7.1",
3462
3616
  "compute_capability": "9.4",
3463
3617
  },
3464
3618
  "H100": {
@@ -3554,7 +3708,7 @@ def init_digitalocean(
3554
3708
  "ssh_key": ssh_key,
3555
3709
  "region": region,
3556
3710
  "size_slug": "gpu-mi300x1-192gb-devcloud",
3557
- "image": "gpu-amd-base",
3711
+ "image": "amd-pytorchrocm7", # PyTorch (ROCm7) marketplace image
3558
3712
  "provision_timeout": 600,
3559
3713
  "eval_timeout": 600,
3560
3714
  "keep_alive": keep_alive,
@@ -3826,12 +3980,16 @@ def targets_add(
3826
3980
 
3827
3981
  @targets_app.command("list")
3828
3982
  def targets_list() -> None:
3829
- """List all configured targets.
3983
+ """List all configured targets with live provider status.
3830
3984
 
3831
3985
  Example:
3832
3986
  wafer config targets list
3833
3987
  """
3834
- from .targets import get_default_target, list_targets
3988
+ import socket
3989
+
3990
+ import trio
3991
+
3992
+ from .targets import get_default_target, list_targets, load_target, remove_target
3835
3993
 
3836
3994
  targets = list_targets()
3837
3995
  default = get_default_target()
@@ -3841,10 +3999,146 @@ def targets_list() -> None:
3841
3999
  typer.echo("Add one with: wafer config targets add <path/to/target.toml>")
3842
4000
  return
3843
4001
 
4002
+ def _parse_ssh_target(ssh_target: str) -> tuple[str, int]:
4003
+ """Extract (host, port) from user@host:port string."""
4004
+ parts = ssh_target.rsplit(":", 1)
4005
+ host_part = parts[0]
4006
+ port = int(parts[1]) if len(parts) > 1 else 22
4007
+ if "@" in host_part:
4008
+ host = host_part.split("@", 1)[1]
4009
+ else:
4010
+ host = host_part
4011
+ return (host, port)
4012
+
4013
+ async def _get_live_provider_endpoints() -> set[tuple[str, int]]:
4014
+ """Query RunPod + DO APIs. Returns set of live (ip, port) endpoints."""
4015
+ from wafer_core.targets.digitalocean import list_running_droplets
4016
+ from wafer_core.targets.runpod import sync_pods_from_api
4017
+
4018
+ live_endpoints: set[tuple[str, int]] = set()
4019
+
4020
+ async def _fetch_runpod() -> None:
4021
+ try:
4022
+ pods = await sync_pods_from_api()
4023
+ for p in pods:
4024
+ live_endpoints.add((p.public_ip, p.ssh_port))
4025
+ except Exception:
4026
+ pass
4027
+
4028
+ async def _fetch_do() -> None:
4029
+ try:
4030
+ droplets = await list_running_droplets()
4031
+ for d in droplets:
4032
+ live_endpoints.add((d.public_ip, d.ssh_port))
4033
+ except Exception:
4034
+ pass
4035
+
4036
+ async with trio.open_nursery() as nursery:
4037
+ nursery.start_soon(_fetch_runpod)
4038
+ nursery.start_soon(_fetch_do)
4039
+
4040
+ return live_endpoints
4041
+
4042
+ async def _get_target_status(
4043
+ name: str,
4044
+ live_endpoints: set[tuple[str, int]],
4045
+ ) -> tuple[str, str, str]:
4046
+ """Returns (name, status, ssh_info)."""
4047
+ from wafer_core.targets.digitalocean import (
4048
+ _remove_droplet_from_state,
4049
+ check_droplet_running,
4050
+ get_droplet_state,
4051
+ )
4052
+ from wafer_core.targets.runpod import (
4053
+ _remove_pod_from_state,
4054
+ check_pod_running,
4055
+ get_pod_state,
4056
+ )
4057
+ from wafer_core.utils.kernel_utils.targets.config import (
4058
+ BaremetalTarget,
4059
+ DigitalOceanTarget,
4060
+ ModalTarget,
4061
+ RunPodTarget,
4062
+ )
4063
+
4064
+ try:
4065
+ target = load_target(name)
4066
+ except (FileNotFoundError, ValueError, AssertionError, TypeError):
4067
+ return (name, "error", "")
4068
+
4069
+ if isinstance(target, RunPodTarget):
4070
+ pod = get_pod_state(name)
4071
+ if not pod:
4072
+ return (name, "no instance", "")
4073
+ if await check_pod_running(pod.pod_id):
4074
+ return (name, "running", f"{pod.ssh_username}@{pod.public_ip}:{pod.ssh_port}")
4075
+ _remove_pod_from_state(name)
4076
+ return (name, "stopped", "")
4077
+
4078
+ if isinstance(target, DigitalOceanTarget):
4079
+ droplet = get_droplet_state(name)
4080
+ if not droplet:
4081
+ return (name, "no instance", "")
4082
+ if await check_droplet_running(droplet.droplet_id):
4083
+ return (
4084
+ name,
4085
+ "running",
4086
+ f"{droplet.ssh_username}@{droplet.public_ip}:{droplet.ssh_port}",
4087
+ )
4088
+ _remove_droplet_from_state(name)
4089
+ return (name, "stopped", "")
4090
+
4091
+ if isinstance(target, BaremetalTarget):
4092
+ ssh_target = target.ssh_target
4093
+ host, port = _parse_ssh_target(ssh_target)
4094
+
4095
+ def _tcp_check() -> bool:
4096
+ try:
4097
+ sock = socket.create_connection((host, port), timeout=2)
4098
+ sock.close()
4099
+ return True
4100
+ except OSError:
4101
+ return False
4102
+
4103
+ reachable = await trio.to_thread.run_sync(_tcp_check)
4104
+ if reachable:
4105
+ return (name, "reachable", ssh_target)
4106
+
4107
+ # Unreachable + has a provider = backed by an ephemeral instance.
4108
+ # If not in the live provider listing, the instance is gone — remove config.
4109
+ if target.provider and (host, port) not in live_endpoints:
4110
+ remove_target(name)
4111
+ return (name, "removed (dead pod)", ssh_target)
4112
+
4113
+ return (name, "unreachable", ssh_target)
4114
+
4115
+ if isinstance(target, ModalTarget):
4116
+ return (name, "serverless", "")
4117
+
4118
+ # Unknown target type
4119
+ return (name, "unknown", "")
4120
+
4121
+ async def _gather_statuses() -> list[tuple[str, str, str]]:
4122
+ live_endpoints = await _get_live_provider_endpoints()
4123
+ results: list[tuple[str, str, str]] = [("", "", "")] * len(targets)
4124
+
4125
+ async def _check(i: int, name: str) -> None:
4126
+ results[i] = await _get_target_status(name, live_endpoints)
4127
+
4128
+ async with trio.open_nursery() as nursery:
4129
+ for i, name in enumerate(targets):
4130
+ nursery.start_soon(_check, i, name)
4131
+
4132
+ return results
4133
+
4134
+ statuses = trio.run(_gather_statuses)
4135
+
3844
4136
  typer.echo("Configured targets:")
3845
- for name in targets:
4137
+ for name, status, ssh_info in statuses:
3846
4138
  marker = " (default)" if name == default else ""
3847
- typer.echo(f" {name}{marker}")
4139
+ label = f" {name}{marker}"
4140
+ detail = f" {ssh_info}" if ssh_info else ""
4141
+ typer.echo(f"{label:<40}{status}{detail}")
3848
4142
 
3849
4143
 
3850
4144
  @targets_app.command("show")
@@ -4056,6 +4350,216 @@ def targets_cleanup(
4056
4350
  raise typer.Exit(1) from None
4057
4351
 
4058
4352
 
4353
# Known libraries that can be installed on targets
# TODO: Consider adding HipKittens to the default RunPod/DO Docker images
# so this install step isn't needed. For now, this command handles it.
# Architecture → branch mapping for libraries that ship per-arch branches.
# "default" is used when the detected arch has no explicit entry.
_ARCH_BRANCHES: dict[str, dict[str, str]] = {
    "hipkittens": {
        "gfx942": "cdna3",  # MI300X, MI325X
        "default": "main",  # MI350X, MI355X, and future CDNA4+
    },
}

# Registry consumed by `wafer config targets install` (targets_install below).
# Per-entry keys (values are typed `object`; consumers cast as needed):
#   description   - one-liner echoed to the user before installing
#   git_url       - clone source, used together with install_path
#   install_path  - destination directory on the remote target
#   custom_script - shell one-liner run instead of a git clone
#   requires_amd  - when True, targets_install refuses non-AMD targets
INSTALLABLE_LIBRARIES: dict[str, dict[str, object]] = {
    "hipkittens": {
        "description": "HipKittens - AMD port of ThunderKittens",
        "git_url": "https://github.com/HazyResearch/HipKittens.git",
        "install_path": "/opt/hipkittens",
        "requires_amd": True,
    },
    # CK is already installed with ROCm 7.0, no action needed
    "repair-headers": {
        "description": "Repair ROCm thrust headers (fixes hipify corruption)",
        "custom_script": "apt-get update -qq && apt-get install --reinstall -y rocthrust >/dev/null 2>&1 && echo REPAIRED",
        "requires_amd": True,
    },
}
4379
+
4380
+
4381
def _resolve_gfx_arch(target: object, ssh_cmd: list[str]) -> str | None:
    """Return the gfx architecture string for *target*.

    1. If the target config already carries a compute_capability, map it.
    2. Otherwise SSH in and probe with ``rocminfo``.
    Returns None only if detection fails entirely.

    Args:
        target: Loaded target config; only ``compute_capability`` is read.
        ssh_cmd: Full ssh argv prefix; the probe command is appended to it.
    """
    import subprocess

    from .evaluate import AMD_CC_TO_ARCH

    # Fast path: the config already declares a known AMD compute capability.
    cc = getattr(target, "compute_capability", None)
    if cc and cc in AMD_CC_TO_ARCH:
        return AMD_CC_TO_ARCH[cc]

    typer.echo(" Detecting GPU architecture via rocminfo...")
    probe_script = "rocminfo 2>/dev/null | grep -oP 'gfx\\d+' | head -1"
    try:
        result = subprocess.run(
            ssh_cmd + [probe_script],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (subprocess.TimeoutExpired, OSError):
        # A hung or unreachable SSH probe is a detection failure, not a
        # crash: the contract above promises None when detection fails.
        typer.echo(" Warning: could not detect GPU architecture", err=True)
        return None

    arch = result.stdout.strip()
    if result.returncode == 0 and arch.startswith("gfx"):
        typer.echo(f" Detected: {arch}")
        return arch

    typer.echo(" Warning: could not detect GPU architecture", err=True)
    return None
4412
+
4413
+ @targets_app.command("install")
4414
+ def targets_install(
4415
+ name: str = typer.Argument(..., help="Target name"),
4416
+ library: str = typer.Argument(..., help="Library to install (hipkittens, repair-headers)"),
4417
+ ) -> None:
4418
+ """Install a library or run maintenance on a target (idempotent).
4419
+
4420
+ Installs header-only libraries like HipKittens on remote targets.
4421
+ Safe to run multiple times - will skip if already installed.
4422
+
4423
+ For libraries with per-architecture branches (e.g. HipKittens), the
4424
+ correct branch is selected automatically based on the target's GPU.
4425
+
4426
+ Available libraries:
4427
+ hipkittens - HipKittens (AMD ThunderKittens port)
4428
+ repair-headers - Fix ROCm thrust headers (after hipify corruption)
4429
+
4430
+ Examples:
4431
+ wafer config targets install runpod-mi300x hipkittens
4432
+ wafer config targets install runpod-mi300x repair-headers
4433
+ wafer config targets install do-mi300x hipkittens
4434
+ """
4435
+ import subprocess
4436
+
4437
+ from .targets import load_target
4438
+ from .targets_ops import get_target_ssh_info
4439
+
4440
+ if library not in INSTALLABLE_LIBRARIES:
4441
+ available = ", ".join(INSTALLABLE_LIBRARIES.keys())
4442
+ typer.echo(f"Error: Unknown library '{library}'. Available: {available}", err=True)
4443
+ raise typer.Exit(1)
4444
+
4445
+ lib_info = INSTALLABLE_LIBRARIES[library]
4446
+
4447
+ try:
4448
+ target = load_target(name)
4449
+ except FileNotFoundError as e:
4450
+ typer.echo(f"Error: {e}", err=True)
4451
+ raise typer.Exit(1) from None
4452
+
4453
+ # Check if target is AMD (for AMD-only libraries)
4454
+ if lib_info.get("requires_amd"):
4455
+ from wafer_core.utils.kernel_utils.targets.config import (
4456
+ DigitalOceanTarget,
4457
+ RunPodTarget,
4458
+ )
4459
+
4460
+ is_amd = isinstance(target, (RunPodTarget, DigitalOceanTarget))
4461
+ if not is_amd and hasattr(target, "compute_capability"):
4462
+ # Check compute capability for MI300X (gfx942 = 9.4)
4463
+ is_amd = target.compute_capability.startswith("9.")
4464
+ if not is_amd:
4465
+ typer.echo(f"Error: {library} requires an AMD GPU target", err=True)
4466
+ raise typer.Exit(1)
4467
+
4468
+ typer.echo(f"Installing {library} on {name}...")
4469
+ typer.echo(f" {lib_info['description']}")
4470
+
4471
+ async def _install() -> bool:
4472
+ # get_target_ssh_info uses pure trio async (no asyncio bridging needed)
4473
+ # and we use subprocess for SSH, not AsyncSSHClient
4474
+ ssh_info = await get_target_ssh_info(target)
4475
+
4476
+ ssh_cmd = [
4477
+ "ssh",
4478
+ "-o",
4479
+ "StrictHostKeyChecking=no",
4480
+ "-o",
4481
+ "UserKnownHostsFile=/dev/null",
4482
+ "-o",
4483
+ "ConnectTimeout=30",
4484
+ "-i",
4485
+ str(ssh_info.key_path),
4486
+ "-p",
4487
+ str(ssh_info.port),
4488
+ f"{ssh_info.user}@{ssh_info.host}",
4489
+ ]
4490
+
4491
+ # Handle custom scripts (like repair-headers) vs git installs
4492
+ if "custom_script" in lib_info:
4493
+ install_script = str(lib_info["custom_script"])
4494
+ success_marker = "REPAIRED"
4495
+ else:
4496
+ install_path = lib_info["install_path"]
4497
+ git_url = lib_info["git_url"]
4498
+
4499
+ # Resolve the branch for arch-aware libraries
4500
+ branch = "main"
4501
+ arch_map = _ARCH_BRANCHES.get(library)
4502
+ if arch_map:
4503
+ gfx = await trio.to_thread.run_sync(lambda: _resolve_gfx_arch(target, ssh_cmd))
4504
+ branch = arch_map.get(gfx, arch_map["default"]) if gfx else arch_map["default"]
4505
+ typer.echo(f" Branch: {branch} (arch={gfx or 'unknown'})")
4506
+
4507
+ # Idempotent: if already cloned, ensure correct branch & pull
4508
+ install_script = f"""
4509
+ if [ -d "{install_path}" ]; then
4510
+ echo "ALREADY_INSTALLED: {install_path} exists"
4511
+ cd {install_path} && git fetch --quiet origin && git checkout {branch} --quiet && git pull --quiet origin {branch}
4512
+ else
4513
+ echo "INSTALLING: cloning to {install_path}"
4514
+ git clone --quiet --branch {branch} {git_url} {install_path}
4515
+ fi
4516
+ echo "DONE"
4517
+ """
4518
+ success_marker = "DONE"
4519
+
4520
+ def run_ssh() -> subprocess.CompletedProcess[str]:
4521
+ return subprocess.run(
4522
+ ssh_cmd + [install_script],
4523
+ capture_output=True,
4524
+ text=True,
4525
+ timeout=120,
4526
+ )
4527
+
4528
+ result = await trio.to_thread.run_sync(run_ssh)
4529
+
4530
+ if result.returncode != 0:
4531
+ typer.echo(f"Error: {result.stderr}", err=True)
4532
+ return False
4533
+
4534
+ output = result.stdout.strip()
4535
+ if "ALREADY_INSTALLED" in output:
4536
+ typer.echo(f" Already installed at {lib_info.get('install_path', 'N/A')}")
4537
+ elif "INSTALLING" in output:
4538
+ typer.echo(f" Installed to {lib_info.get('install_path', 'N/A')}")
4539
+ elif "REPAIRED" in output:
4540
+ typer.echo(" ROCm headers repaired")
4541
+
4542
+ return success_marker in output
4543
+
4544
+ try:
4545
+ success = trio.run(_install)
4546
+ except Exception as e:
4547
+ typer.echo(f"Error: {e}", err=True)
4548
+ raise typer.Exit(1) from None
4549
+
4550
+ if success:
4551
+ typer.echo(f"✓ {library} ready on {name}")
4552
+
4553
+ # Print usage hint
4554
+ if library == "hipkittens":
4555
+ typer.echo("")
4556
+ typer.echo("Usage in load_inline:")
4557
+ typer.echo(' extra_include_paths=["/opt/hipkittens/include", "/opt/rocm/include/hip"]')
4558
+ else:
4559
+ typer.echo(f"Failed to install {library}", err=True)
4560
+ raise typer.Exit(1)
4561
+
4562
+
4059
4563
  @targets_app.command("pods")
4060
4564
  def targets_pods() -> None:
4061
4565
  """List all running RunPod pods.
@@ -4185,8 +4689,8 @@ def billing_usage(
4185
4689
  """Show current billing usage and subscription info.
4186
4690
 
4187
4691
  Example:
4188
- wafer billing
4189
- wafer billing --json
4692
+ wafer config billing
4693
+ wafer config billing --json
4190
4694
  """
4191
4695
  # Only show usage if no subcommand was invoked
4192
4696
  if ctx.invoked_subcommand is not None:
@@ -4214,9 +4718,9 @@ def billing_topup(
4214
4718
  Opens a Stripe checkout page to add credits. Default amount is $25.
4215
4719
 
4216
4720
  Example:
4217
- wafer billing topup # Add $25
4218
- wafer billing topup 100 # Add $100
4219
- wafer billing topup --no-browser # Print URL instead
4721
+ wafer config billing topup # Add $25
4722
+ wafer config billing topup 100 # Add $100
4723
+ wafer config billing topup --no-browser # Print URL instead
4220
4724
  """
4221
4725
  import webbrowser
4222
4726
 
@@ -4262,8 +4766,8 @@ def billing_portal(
4262
4766
  Manage your subscription, update payment method, or view invoices.
4263
4767
 
4264
4768
  Example:
4265
- wafer billing portal
4266
- wafer billing portal --no-browser
4769
+ wafer config billing portal
4770
+ wafer config billing portal --no-browser
4267
4771
  """
4268
4772
  import webbrowser
4269
4773
 
@@ -4300,8 +4804,8 @@ def ssh_keys_list(
4300
4804
  """List all registered SSH public keys.
4301
4805
 
4302
4806
  Example:
4303
- wafer ssh-keys list
4304
- wafer ssh-keys list --json
4807
+ wafer config ssh-keys list
4808
+ wafer config ssh-keys list --json
4305
4809
  """
4306
4810
  from .ssh_keys import list_ssh_keys
4307
4811
 
@@ -4327,9 +4831,9 @@ def ssh_keys_add(
4327
4831
  id_ed25519.pub, id_rsa.pub, id_ecdsa.pub.
4328
4832
 
4329
4833
  Example:
4330
- wafer ssh-keys add # Auto-detect
4331
- wafer ssh-keys add ~/.ssh/id_rsa.pub # Specific file
4332
- wafer ssh-keys add ~/.ssh/id_ed25519.pub --name laptop
4834
+ wafer config ssh-keys add # Auto-detect
4835
+ wafer config ssh-keys add ~/.ssh/id_rsa.pub # Specific file
4836
+ wafer config ssh-keys add ~/.ssh/id_ed25519.pub --name laptop
4333
4837
  """
4334
4838
  from .ssh_keys import add_ssh_key
4335
4839
 
@@ -4348,10 +4852,10 @@ def ssh_keys_remove(
4348
4852
  ) -> None:
4349
4853
  """Remove an SSH public key.
4350
4854
 
4351
- Get the key ID from 'wafer ssh-keys list'.
4855
+ Get the key ID from 'wafer config ssh-keys list'.
4352
4856
 
4353
4857
  Example:
4354
- wafer ssh-keys remove abc123-def456-...
4858
+ wafer config ssh-keys remove abc123-def456-...
4355
4859
  """
4356
4860
  from .ssh_keys import remove_ssh_key
4357
4861
 
@@ -4391,9 +4895,13 @@ def workspaces_list(
4391
4895
  @workspaces_app.command("create")
4392
4896
  def workspaces_create(
4393
4897
  name: str = typer.Argument(..., help="Workspace name"),
4394
- gpu_type: str = typer.Option("B200", "--gpu", "-g", help="GPU type: MI300X (AMD) or B200 (NVIDIA, default)"),
4898
+ gpu_type: str = typer.Option(
4899
+ "B200", "--gpu", "-g", help="GPU type: MI300X (AMD) or B200 (NVIDIA, default)"
4900
+ ),
4395
4901
  image: str | None = typer.Option(None, "--image", "-i", help="Docker image (optional)"),
4396
- wait: bool = typer.Option(False, "--wait", "-w", help="Wait for provisioning and show SSH credentials"),
4902
+ wait: bool = typer.Option(
4903
+ False, "--wait", "-w", help="Wait for provisioning and show SSH credentials"
4904
+ ),
4397
4905
  json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
4398
4906
  ) -> None:
4399
4907
  """Create a new workspace.
@@ -4702,19 +5210,25 @@ def workspaces_ssh(
4702
5210
  ssh_host = ws.get("ssh_host")
4703
5211
  ssh_port = ws.get("ssh_port")
4704
5212
  ssh_user = ws.get("ssh_user")
4705
-
5213
+
4706
5214
  if not ssh_host or not ssh_port or not ssh_user:
4707
5215
  typer.echo("Error: Workspace not ready. Wait a few seconds and retry.", err=True)
4708
5216
  raise typer.Exit(1)
4709
5217
 
4710
5218
  # Connect via SSH
4711
- os.execvp("ssh", [
5219
+ os.execvp(
4712
5220
  "ssh",
4713
- "-p", str(ssh_port),
4714
- "-o", "StrictHostKeyChecking=no",
4715
- "-o", "UserKnownHostsFile=/dev/null",
4716
- f"{ssh_user}@{ssh_host}",
4717
- ])
5221
+ [
5222
+ "ssh",
5223
+ "-p",
5224
+ str(ssh_port),
5225
+ "-o",
5226
+ "StrictHostKeyChecking=no",
5227
+ "-o",
5228
+ "UserKnownHostsFile=/dev/null",
5229
+ f"{ssh_user}@{ssh_host}",
5230
+ ],
5231
+ )
4718
5232
 
4719
5233
 
4720
5234
  @workspaces_app.command("sync")
@@ -4777,6 +5291,69 @@ def workspaces_sync(
4777
5291
  raise typer.Exit(1) from None
4778
5292
 
4779
5293
 
5294
@workspaces_app.command("pull")
def workspaces_pull(
    workspace: str = typer.Argument(..., help="Workspace name or ID"),
    remote_path: str = typer.Argument(
        ..., help="Remote path in workspace (relative to /workspace or absolute)"
    ),
    local_path: Path = typer.Argument(
        Path("."), help="Local destination path (default: current directory)"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
) -> None:
    """Download files from a workspace to the local machine.

    Copies from the workspace's /workspace directory using rsync over SSH.

    Examples:
        wafer workspaces pull dev kernel.py ./                # Pull single file
        wafer workspaces pull dev kernel.py ./my_kernel.py    # Pull and rename
        wafer workspaces pull dev /workspace/results ./       # Pull directory
    """
    from .global_config import get_preferences
    from .workspaces import pull_files

    # Verbosity: --quiet beats --verbose; with neither flag, follow the
    # configured preference mode ("explicit" => chatty).
    prefs = get_preferences()
    show_status = False if quiet else (True if verbose else prefs.mode == "explicit")

    def _status(msg: str) -> None:
        # Progress callback handed to pull_files; silent unless enabled.
        if show_status:
            typer.echo(f"[wafer] {msg}", err=True)

    if show_status:
        typer.echo(f"[wafer] Pulling {remote_path} from workspace {workspace}...", err=True)

    try:
        file_count = pull_files(
            workspace, remote_path, local_path.resolve(), on_progress=_status
        )
    except RuntimeError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from None

    if show_status:
        typer.echo(f"[wafer] Pulled {file_count} files to {local_path}", err=True)
5343
+
5344
+
5345
# =============================================================================
# Live resource commands (list/terminate/reconcile/provision)
# =============================================================================

# Register pre-built Typer callbacks (imported or defined elsewhere in this
# file) on the targets-ops sub-app; each call wires one subcommand name to
# its implementation function.
targets_ops_app.command("list")(_targets_list_cmd)
targets_ops_app.command("terminate")(_targets_terminate_cmd)
targets_ops_app.command("reconcile")(_targets_reconcile_cmd)
targets_ops_app.command("provision")(_targets_provision_cmd)
targets_ops_app.command("pools")(_targets_pools_cmd)
targets_ops_app.command("probe")(_targets_probe_cmd)
5355
+
5356
+
4780
5357
  # =============================================================================
4781
5358
  # Target operations commands (exec/ssh/sync)
4782
5359
  # =============================================================================
@@ -5535,7 +6112,7 @@ def ncu_analyze(
5535
6112
  compute/memory throughput, and optimization recommendations.
5536
6113
 
5537
6114
  By default, uses local NCU if available, otherwise runs analysis
5538
- remotely via wafer-api (requires authentication: wafer login).
6115
+ remotely via wafer-api (requires authentication: wafer auth login).
5539
6116
 
5540
6117
  Use --target for direct SSH mode (like wafer remote-run --direct).
5541
6118
  Use --include-source to fetch SASS assembly with register/instruction data.
@@ -5630,7 +6207,7 @@ def nsys_analyze(
5630
6207
  Returns timeline events, kernel information, memory usage, and diagnostics.
5631
6208
 
5632
6209
  By default, uses local nsys if available, otherwise runs analysis
5633
- remotely via wafer-api (requires authentication: wafer login).
6210
+ remotely via wafer-api (requires authentication: wafer auth login).
5634
6211
 
5635
6212
  Supports multiple execution modes:
5636
6213
  - Local: Uses local nsys CLI (no GPU required for analysis)
@@ -6615,7 +7192,7 @@ def autotuner_results(
6615
7192
  raise typer.Exit(1) from None
6616
7193
 
6617
7194
 
6618
- @app.command("capture")
7195
+ @app.command("capture", rich_help_panel="Kernel Development")
6619
7196
  def capture_command( # noqa: PLR0915
6620
7197
  label: str = typer.Argument(
6621
7198
  ..., help="Label for this capture (e.g., 'baseline', 'optimized-v2')"
@@ -7280,6 +7857,203 @@ def isa_targets() -> None:
7280
7857
  typer.echo(output)
7281
7858
 
7282
7859
 
7860
+ # =============================================================================
7861
+ # Trace comparison commands
7862
+ # =============================================================================
7863
+
7864
+
7865
@compare_app.command("analyze")
def compare_analyze(
    trace1: Path = typer.Argument(..., help="First trace file (AMD or NVIDIA)", exists=True),
    trace2: Path = typer.Argument(..., help="Second trace file (AMD or NVIDIA)", exists=True),
    format: str = typer.Option(
        "text",
        "--format",
        "-f",
        help="Output format: text, text-layers, csv, csv-layers, json",
    ),
    output: Path | None = typer.Option(
        None, "--output", "-o", help="Output file (default: stdout)"
    ),
    phase: str = typer.Option(
        "all",
        "--phase",
        help="Filter by phase: all, prefill, decode",
    ),
    layers: bool = typer.Option(False, "--layers", help="Show layer-wise performance breakdown"),
    all: bool = typer.Option(
        False, "--all", help="Show all items (no truncation for layers, operations, kernels)"
    ),
    stack_traces: bool = typer.Option(
        False, "--stack-traces", help="Show Python stack traces for operations"
    ),
    recommendations: bool = typer.Option(
        False, "--recommendations", help="Generate prioritized recommendations for kernel team"
    ),
    json: bool = typer.Option(
        False, "--json", hidden=True, help="Ignored (for compatibility with cliExecutor)"
    ),
) -> None:
    """Compare two GPU traces (AMD vs NVIDIA) and report performance deltas.

    Identifies which operations run faster or slower on each platform and
    drills down to kernel-level detail. Output is truncated by default
    (--all disables truncation) and can optionally include layer-wise
    breakdowns, Python stack traces, and prioritized recommendations.

    Examples:
        wafer compare analyze amd_trace.json nvidia_trace.json
        wafer compare analyze amd_trace.json nvidia_trace.json --layers --all
        wafer compare analyze amd_trace.json nvidia_trace.json --stack-traces
        wafer compare analyze amd_trace.json nvidia_trace.json --phase prefill
        wafer compare analyze amd_trace.json nvidia_trace.json --format csv -o operations.csv
        wafer compare analyze amd_trace.json nvidia_trace.json --format csv-layers -o layers.csv
        wafer compare analyze amd_trace.json nvidia_trace.json --format json -o report.json
        wafer compare analyze amd_trace.json nvidia_trace.json -o report.txt
    """
    from .trace_compare import compare_traces

    # The hidden --json flag exists only for cliExecutor compatibility and
    # is deliberately not forwarded.
    report_options = {
        "trace1": trace1,
        "trace2": trace2,
        "output": output,
        "output_format": format,
        "phase": phase,
        "show_layers": layers,
        "show_all": all,
        "show_stack_traces": stack_traces,
        "recommendations": recommendations,
    }
    compare_traces(**report_options)
    _mark_command_success()
7948
+
7949
+
7950
@compare_app.command("fusion")
def compare_fusion_cmd(
    trace1: Path = typer.Argument(..., help="First trace file (AMD or NVIDIA)", exists=True),
    trace2: Path = typer.Argument(..., help="Second trace file (AMD or NVIDIA)", exists=True),
    format: str = typer.Option(
        "text",
        "--format",
        "-f",
        help="Output format: text, csv, json",
    ),
    output: Path | None = typer.Option(
        None, "--output", "-o", help="Output file (default: stdout)"
    ),
    min_group_size: int = typer.Option(
        50,
        "--min-group-size",
        help="Minimum correlation group size to analyze",
    ),
    json: bool = typer.Option(
        False, "--json", hidden=True, help="Ignored (for compatibility with cliExecutor)"
    ),
) -> None:
    """Analyze kernel fusion differences between AMD and NVIDIA traces.

    Detects which operations are fused differently on each platform by analyzing
    how many kernel launches each platform uses for the same logical operations.

    Examples:
        # Basic fusion analysis (stdout)
        wafer compare fusion amd_trace.json nvidia_trace.json

        # Save to file
        wafer compare fusion amd_trace.json nvidia_trace.json -o fusion_report.txt

        # JSON output to file
        wafer compare fusion amd_trace.json nvidia_trace.json --format json -o fusion.json

        # CSV output to file
        wafer compare fusion amd_trace.json nvidia_trace.json --format csv -o fusion.csv
    """
    from .trace_compare import compare_align

    # NOTE(review): --min-group-size is accepted but never forwarded to
    # compare_align, so it currently has no effect — confirm whether the
    # backend should receive it. The hidden --json flag is likewise ignored.
    # Fusion analysis is delegated to compare_align with phase fixed to "all".
    compare_align(
        trace1=trace1,
        trace2=trace2,
        output=output,
        output_format=format,
        phase="all",
    )
    _mark_command_success()
8000
+
8001
+
8002
@compare_app.command("align")
def compare_align_cmd(
    trace1: Path = typer.Argument(..., help="First trace file (AMD or NVIDIA)", exists=True),
    trace2: Path = typer.Argument(..., help="Second trace file (AMD or NVIDIA)", exists=True),
    format: str = typer.Option(
        "json",
        "--format",
        "-f",
        help="Output format: json",
    ),
    output: Path | None = typer.Option(
        None, "--output", "-o", help="Output file (default: stdout)"
    ),
    phase: str = typer.Option(
        "all",
        "--phase",
        help="Filter by phase: all, prefill, decode",
    ),
    layer: int | None = typer.Option(
        None,
        "--layer",
        help="Focus on specific layer number",
    ),
) -> None:
    """Produce a layer-level kernel-to-kernel alignment of two traces.

    Maps each kernel in one trace to its counterpart in the other at every
    layer position, enabling exact cross-platform (AMD vs NVIDIA)
    comparison. Scope can be narrowed to a phase or a single layer.

    Examples:
        wafer compare align amd_trace.json nvidia_trace.json
        wafer compare align amd_trace.json nvidia_trace.json -o alignment.json
        wafer compare align amd_trace.json nvidia_trace.json --phase decode
        wafer compare align amd_trace.json nvidia_trace.json --layer 5
    """
    from .trace_compare import compare_align

    # Collect the request once, then delegate to the backend aligner.
    alignment_request = dict(
        trace1=trace1,
        trace2=trace2,
        output=output,
        output_format=format,
        phase=phase,
        layer=layer,
    )
    compare_align(**alignment_request)
    _mark_command_success()
8055
+
8056
+
7283
8057
def main() -> None:
    """Entry point for wafer CLI.

    Hands control to the module-level Typer application, which parses
    argv and dispatches to the registered subcommands.
    """
    app()