PyPI - wafer-cli - Versions diffs - 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl - Mend

wafer-cli 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

wafer/GUIDE.md +18 -7
wafer/api_client.py +4 -0
wafer/auth.py +85 -0
wafer/cli.py +2339 -404
wafer/corpus.py +158 -32
wafer/evaluate.py +1232 -201
wafer/gpu_run.py +5 -1
wafer/kernel_scope.py +554 -0
wafer/nsys_analyze.py +903 -73
wafer/nsys_profile.py +511 -0
wafer/output.py +241 -0
wafer/problems.py +357 -0
wafer/skills/wafer-guide/SKILL.md +13 -0
wafer/ssh_keys.py +261 -0
wafer/target_lock.py +270 -0
wafer/targets.py +490 -0
wafer/targets_ops.py +718 -0
wafer/wevin_cli.py +129 -18
wafer/workspaces.py +282 -182
{wafer_cli-0.2.8.dist-info → wafer_cli-0.2.10.dist-info}/METADATA +1 -1
wafer_cli-0.2.10.dist-info/RECORD +40 -0
wafer_cli-0.2.8.dist-info/RECORD +0 -33
{wafer_cli-0.2.8.dist-info → wafer_cli-0.2.10.dist-info}/WHEEL +0 -0
{wafer_cli-0.2.8.dist-info → wafer_cli-0.2.10.dist-info}/entry_points.txt +0 -0
{wafer_cli-0.2.8.dist-info → wafer_cli-0.2.10.dist-info}/top_level.txt +0 -0

wafer/cli.py CHANGED

@@ -30,6 +30,14 @@ import typer
 from .config import WaferConfig, WaferEnvironment
 from .inference import infer_upload_files, resolve_environment
+from .problems import (
+    download_problems,
+    get_problem_path,
+    get_problems_path,
+)
+from .problems import (
+    list_problems as list_problems_fn,
+)
 app = typer.Typer(
     help="GPU development toolkit for LLM coding agents",
@@ -91,11 +99,15 @@ def main_callback(ctx: typer.Context) -> None:
     # Install exception hook to catch SystemExit and mark failures
     original_excepthook = sys.excepthook
-    def custom_excepthook(exc_type, exc_value, exc_traceback):
+    def custom_excepthook(
+        exc_type: type[BaseException],
+        exc_value: BaseException,
+        exc_traceback: object,
+    ) -> None:
         global _command_outcome
         # Mark as failure if SystemExit with non-zero code, or any other exception
         if exc_type is SystemExit:
-            exit_code = exc_value.code if hasattr(exc_value, 'code') else 1
+            exit_code = exc_value.code if hasattr(exc_value, "code") else 1
             if exit_code != 0 and exit_code is not None:
                 _command_outcome = "failure"
         else:
@@ -170,7 +182,12 @@ workspaces_app = typer.Typer(
 Workspaces are on-demand cloud GPU environments. Requires authentication (wafer login).
-  wafer workspaces create dev --gpu H100   # Create workspace
+Available GPUs:
+  MI300X  AMD Instinct MI300X (192GB HBM3, ROCm)
+  B200    NVIDIA Blackwell B200 (180GB HBM3e, CUDA)
+Commands:
+  wafer workspaces create dev --gpu B200   # Create workspace
   wafer workspaces exec dev -- python x.py # Run commands
   wafer workspaces ssh dev                 # Interactive SSH
   wafer workspaces sync dev ./project      # Sync files
@@ -178,6 +195,36 @@ Workspaces are on-demand cloud GPU environments. Requires authentication (wafer
 )
 app.add_typer(workspaces_app, name="workspaces")
+# SSH Key management (BYOK - Bring Your Own Key)
+ssh_keys_app = typer.Typer(
+    help="""Manage SSH public keys for workspace access.
+Register your SSH public keys here. These keys are installed in all workspaces
+you provision, enabling SSH access from any machine with your private key.
+  wafer ssh-keys list              # List registered keys
+  wafer ssh-keys add               # Add key (auto-detects ~/.ssh/id_ed25519.pub)
+  wafer ssh-keys add ~/.ssh/id_rsa.pub --name laptop  # Add specific key
+  wafer ssh-keys remove <key-id>   # Remove a key"""
+)
+app.add_typer(ssh_keys_app, name="ssh-keys")
+# Target operations (exec/ssh/sync on configured targets)
+targets_ops_app = typer.Typer(
+    help="""Execute commands on configured GPU targets.
+Run commands, SSH, or sync files to targets without going through evaluate.
+Useful for exploratory work, debugging, or custom scripts.
+  wafer targets exec my-target -- python test.py    # Run command
+  wafer targets ssh my-target                       # Interactive SSH
+  wafer targets sync my-target ./local_dir          # Sync files
+Supports: RunPod, DigitalOcean (auto-provisions), SSH targets (baremetal/vm).
+Configure targets with: wafer config targets init ..."""
+)
+app.add_typer(targets_ops_app, name="targets")
 # Billing management
 billing_app = typer.Typer(help="Manage billing, credits, and subscription")
 app.add_typer(billing_app, name="billing")
@@ -200,6 +247,13 @@ kernelbench_app = typer.Typer(
 )
 evaluate_app.add_typer(kernelbench_app, name="kernelbench")
+# Nested subcommand for gpumode format
+gpumode_app = typer.Typer(
+    help="Evaluate kernels in GPUMode format (custom_kernel/ref_kernel functions)",
+    invoke_without_command=True,
+)
+evaluate_app.add_typer(gpumode_app, name="gpumode")
 # =============================================================================
 # Dev commands (internal, used by web app proxy)
 # =============================================================================
@@ -238,10 +292,101 @@ nvidia_app.add_typer(tracelens_app, name="tracelens")
 amd_app = typer.Typer(help="AMD GPU profiling and analysis tools")
 app.add_typer(amd_app, name="amd")
-# ISA analysis - under amd
-isa_app = typer.Typer(help="ISA analysis for AMD GPU code objects (.co files)")
+# Unified ISA Analyzer - supports both .co files and Triton artifacts
+isa_app = typer.Typer(help="ISA analysis for AMD GPU kernels (.co, .s, .ll, .ttgir files)")
 amd_app.add_typer(isa_app, name="isa")
+# =============================================================================
+# Roofline analysis (wafer roofline)
+# =============================================================================
+@app.command("roofline")
+def roofline_cmd(
+    gpu: str | None = typer.Option(
+        None, "--gpu", "-g", help="GPU name (e.g., H100, B200, MI300X, A100)"
+    ),
+    bytes_moved: float | None = typer.Option(
+        None, "--bytes", "-b", help="Theoretical minimum bytes moved"
+    ),
+    flops: float | None = typer.Option(None, "--flops", "-f", help="Theoretical minimum FLOPs"),
+    time_ms: float | None = typer.Option(
+        None, "--time-ms", "-t", help="Actual kernel time in milliseconds"
+    ),
+    dtype: str = typer.Option(
+        "fp16", "--dtype", "-d", help="Data type for compute ceiling (fp16, fp32, bf16, fp8, int8)"
+    ),
+    list_gpus: bool = typer.Option(False, "--list-gpus", help="List available GPU specs and exit"),
+) -> None:
+    """Analyze kernel performance against roofline model.
+    The roofline model shows the theoretical speed-of-light (SOL) for your kernel
+    based on whether it's memory-bound or compute-bound.
+    You need to provide:
+    - The GPU you ran on
+    - Theoretical minimum bytes moved (not actual - what the algorithm requires)
+    - Theoretical minimum FLOPs
+    - Actual measured kernel time
+    Example:
+        # Analyze a matmul kernel (4096x4096x4096, FP16)
+        # Theoretical: 2*M*N*K FLOPs = 137.4 TFLOP
+        # Theoretical bytes: (M*K + K*N + M*N) * 2 = 100.7 MB
+        wafer roofline --gpu H100 --bytes 100.7e6 --flops 137.4e12 --time-ms 85
+        # Analyze a memory-bound elementwise add (1B elements FP32)
+        # Reads 2 tensors, writes 1 = 12 GB total
+        # 1B adds = 1 GFLOP
+        wafer roofline --gpu H100 --bytes 12e9 --flops 1e9 --time-ms 4 --dtype fp32
+        # List available GPUs
+        wafer roofline --list-gpus
+    """
+    from wafer_core.roofline import get_gpu_spec, roofline_analysis
+    from wafer_core.roofline import list_gpus as get_all_gpus
+    if list_gpus:
+        typer.echo("Available GPUs:")
+        for name in get_all_gpus():
+            spec = get_gpu_spec(name)
+            typer.echo(
+                f"  {name}: {spec.peak_bandwidth_gbps:.0f} GB/s, {spec.peak_tflops_fp16:.0f} TFLOPS FP16"
+            )
+        return
+    # Validate required args for analysis
+    missing = []
+    if gpu is None:
+        missing.append("--gpu")
+    if bytes_moved is None:
+        missing.append("--bytes")
+    if flops is None:
+        missing.append("--flops")
+    if time_ms is None:
+        missing.append("--time-ms")
+    if missing:
+        typer.echo(f"Error: Missing required options: {', '.join(missing)}", err=True)
+        typer.echo("", err=True)
+        typer.echo("Run 'wafer roofline --help' for usage.", err=True)
+        raise typer.Exit(1)
+    try:
+        result = roofline_analysis(
+            gpu=gpu,
+            dtype=dtype,
+            bytes_moved=bytes_moved,
+            flops=flops,
+            time_ms=time_ms,
+        )
+    except ValueError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    typer.echo(result.format_report())
 # =============================================================================
 # Skill management (wafer skill ...)
 # =============================================================================
@@ -256,21 +401,22 @@ def skill_install(
         "all",
         "--target",
         "-t",
-        help="Target tool: claude, codex, or all",
+        help="Target tool: claude, codex, cursor, or all",
     ),
     force: bool = typer.Option(False, "--force", "-f", help="Overwrite existing skill"),
 ) -> None:
     """Install the wafer-guide skill for AI coding assistants.
     Installs the bundled skill to make wafer commands discoverable by
-    Claude Code and/or OpenAI Codex CLI.
+    Claude Code, OpenAI Codex CLI, and/or Cursor.
     Skills follow the open agent skills specification (agentskills.io).
     Examples:
-        wafer skill install              # Install for both Claude and Codex
+        wafer skill install              # Install for all tools
         wafer skill install -t claude    # Install for Claude Code only
         wafer skill install -t codex     # Install for Codex CLI only
+        wafer skill install -t cursor    # Install for Cursor only
         wafer skill install --force      # Overwrite existing installation
     """
     # Locate bundled skill
@@ -288,9 +434,13 @@ def skill_install(
         ))
     if target in ("all", "codex"):
         targets_to_install.append(("Codex CLI", Path.home() / ".codex" / "skills" / "wafer-guide"))
+    if target in ("all", "cursor"):
+        targets_to_install.append(("Cursor", Path.home() / ".cursor" / "skills" / "wafer-guide"))
     if not targets_to_install:
-        typer.echo(f"Error: Unknown target '{target}'. Use: claude, codex, or all", err=True)
+        typer.echo(
+            f"Error: Unknown target '{target}'. Use: claude, codex, cursor, or all", err=True
+        )
         raise typer.Exit(1)
     for tool_name, dest_path in targets_to_install:
@@ -325,14 +475,15 @@ def skill_uninstall(
         "all",
         "--target",
         "-t",
-        help="Target tool: claude, codex, or all",
+        help="Target tool: claude, codex, cursor, or all",
     ),
 ) -> None:
     """Uninstall the wafer-guide skill.
     Examples:
-        wafer skill uninstall              # Uninstall from both
+        wafer skill uninstall              # Uninstall from all tools
         wafer skill uninstall -t claude    # Uninstall from Claude Code only
+        wafer skill uninstall -t cursor    # Uninstall from Cursor only
     """
     targets_to_uninstall: list[tuple[str, Path]] = []
@@ -346,9 +497,16 @@ def skill_uninstall(
             "Codex CLI",
             Path.home() / ".codex" / "skills" / "wafer-guide",
         ))
+    if target in ("all", "cursor"):
+        targets_to_uninstall.append((
+            "Cursor",
+            Path.home() / ".cursor" / "skills" / "wafer-guide",
+        ))
     if not targets_to_uninstall:
-        typer.echo(f"Error: Unknown target '{target}'. Use: claude, codex, or all", err=True)
+        typer.echo(
+            f"Error: Unknown target '{target}'. Use: claude, codex, cursor, or all", err=True
+        )
         raise typer.Exit(1)
     for tool_name, dest_path in targets_to_uninstall:
@@ -383,6 +541,7 @@ def skill_status() -> None:
     installations = [
         ("Claude Code", Path.home() / ".claude" / "skills" / "wafer-guide"),
         ("Codex CLI", Path.home() / ".codex" / "skills" / "wafer-guide"),
+        ("Cursor", Path.home() / ".cursor" / "skills" / "wafer-guide"),
     ]
     for tool_name, path in installations:
@@ -396,6 +555,122 @@ def skill_status() -> None:
             typer.echo(f"{tool_name}: Not installed")
+# =============================================================================
+# Provider auth management (wafer auth ...)
+# =============================================================================
+provider_auth_app = typer.Typer(help="Manage API keys for cloud GPU providers")
+app.add_typer(provider_auth_app, name="auth")
+@provider_auth_app.command("login")
+def provider_auth_login(
+    provider: str = typer.Argument(
+        ...,
+        help="Provider name: runpod, digitalocean, or modal",
+    ),
+    api_key: str | None = typer.Option(
+        None,
+        "--api-key",
+        "-k",
+        help="API key (if not provided, reads from stdin)",
+    ),
+) -> None:
+    """Save API key for a cloud GPU provider.
+    Stores the key in ~/.wafer/auth.json. Environment variables
+    (e.g., WAFER_RUNPOD_API_KEY) take precedence over stored keys.
+    Examples:
+        wafer auth login runpod --api-key rp_xxx
+        wafer auth login digitalocean --api-key dop_v1_xxx
+        echo $API_KEY | wafer auth login runpod
+    """
+    import sys
+    from wafer_core.auth import PROVIDERS, save_api_key
+    # Validate provider
+    if provider not in PROVIDERS:
+        typer.echo(f"Error: Unknown provider '{provider}'", err=True)
+        typer.echo(f"Valid providers: {', '.join(PROVIDERS.keys())}", err=True)
+        raise typer.Exit(1)
+    # Get API key from option or stdin
+    if api_key is None:
+        if sys.stdin.isatty():
+            typer.echo(f"Enter API key for {PROVIDERS[provider]['display_name']}:")
+            api_key = typer.prompt("API key", hide_input=True)
+        else:
+            api_key = sys.stdin.read().strip()
+    if not api_key:
+        typer.echo("Error: No API key provided", err=True)
+        raise typer.Exit(1)
+    # Save the key
+    save_api_key(provider, api_key)
+    typer.echo(f"API key saved for {PROVIDERS[provider]['display_name']}")
+    typer.echo("Stored in: ~/.wafer/auth.json")
+@provider_auth_app.command("logout")
+def provider_auth_logout(
+    provider: str = typer.Argument(
+        ...,
+        help="Provider name: runpod, digitalocean, or modal",
+    ),
+) -> None:
+    """Remove stored API key for a cloud GPU provider.
+    Examples:
+        wafer auth logout runpod
+        wafer auth logout digitalocean
+    """
+    from wafer_core.auth import PROVIDERS, remove_api_key
+    # Validate provider
+    if provider not in PROVIDERS:
+        typer.echo(f"Error: Unknown provider '{provider}'", err=True)
+        typer.echo(f"Valid providers: {', '.join(PROVIDERS.keys())}", err=True)
+        raise typer.Exit(1)
+    if remove_api_key(provider):
+        typer.echo(f"API key removed for {PROVIDERS[provider]['display_name']}")
+    else:
+        typer.echo(f"No stored API key found for {PROVIDERS[provider]['display_name']}")
+@provider_auth_app.command("status")
+def provider_auth_status() -> None:
+    """Show authentication status for all cloud GPU providers.
+    Displays which providers have API keys configured and where
+    the keys are coming from (environment variable or auth.json).
+    Example:
+        wafer auth status
+    """
+    from wafer_core.auth import get_all_auth_status
+    statuses = get_all_auth_status()
+    typer.echo("Cloud GPU Provider Authentication Status")
+    typer.echo("=" * 45)
+    for status in statuses:
+        if status.is_authenticated:
+            source_str = f"({status.source})" if status.source else ""
+            typer.echo(f"  {status.display_name}: ✓ {status.key_preview} {source_str}")
+        else:
+            typer.echo(f"  {status.display_name}: ✗ Not configured")
+            typer.echo(f"      Run: wafer auth login {status.provider}")
+            typer.echo(f"      Or set: {status.key_url}")
+    typer.echo("")
+    typer.echo("Note: Environment variables take precedence over stored keys.")
 @app.command(hidden=True)
 def run(
     command: str = typer.Argument(..., help="Command to run in Docker container"),
@@ -975,6 +1250,11 @@ def agent(  # noqa: PLR0913
         "--list-sessions",
         help="List recent sessions and exit",
     ),
+    get_session: str | None = typer.Option(
+        None,
+        "--get-session",
+        help="Get session by ID and print messages (use with --json)",
+    ),
     tools: str | None = typer.Option(
         None,
         "--tools",
@@ -1021,47 +1301,7 @@ def agent(  # noqa: PLR0913
         None,
         "--corpus",
         "-c",
-        help="Documentation corpus to use (cuda, cutlass, hip). Must be downloaded first.",
-    ),
-    # Legacy kernel optimization options (hidden, for backwards compat)
-    problem: Path | None = typer.Option(
-        None,
-        "--problem",
-        hidden=True,
-        help="[Legacy] Path to problem YAML config file",
-    ),
-    reference: Path | None = typer.Option(
-        None,
-        "--reference",
-        "--ref",
-        hidden=True,
-        help="[Legacy] Path to reference kernel file",
-    ),
-    description: str | None = typer.Option(
-        None,
-        "--description",
-        "--desc",
-        hidden=True,
-        help="[Legacy] Problem description",
-    ),
-    test: list[str] | None = typer.Option(
-        None,
-        "--test",
-        hidden=True,
-        help="[Legacy] Test case",
-    ),
-    benchmark: list[str] | None = typer.Option(
-        None,
-        "--benchmark",
-        "-b",
-        hidden=True,
-        help="[Legacy] Benchmark case",
-    ),
-    speedup_target: float | None = typer.Option(
-        None,
-        "--speedup",
-        hidden=True,
-        help="[Legacy] Speedup target",
+        help="Documentation corpus to use (cuda, cutlass, hip, amd). Must be downloaded first.",
     ),
 ) -> None:
     """AI assistant for GPU kernel development.
@@ -1148,20 +1388,15 @@ def agent(  # noqa: PLR0913
         prompt=actual_prompt,
         interactive=use_tui,
         single_turn=single_turn,
-        problem=str(problem) if problem else None,
-        reference=str(reference) if reference else None,
-        description=description,
-        tests=list(test) if test else None,
-        benchmarks=list(benchmark) if benchmark else None,
         model=model,
-        max_turns=max_turns,
-        speedup_target=speedup_target,
         resume=resume,
         from_turn=from_turn,
         list_sessions=list_sessions,
+        get_session=get_session,
         tools=tools.split(",") if tools else None,
         allow_spawn=allow_spawn,
         max_tool_fails=max_tool_fails,
+        max_turns=max_turns,
         json_output=json_output,
         template=template,
         template_args=parsed_template_args,
@@ -1171,7 +1406,7 @@ def agent(  # noqa: PLR0913
 # =============================================================================
 # Evaluate command
-# Hidden aliases for backwards compatibility
+# Hidden aliases for agent command
 def _make_agent_alias(name: str, doc: str) -> None:
     """Create a hidden alias that delegates to agent()."""
@@ -1186,6 +1421,7 @@ def _make_agent_alias(name: str, doc: str) -> None:
         resume: str | None = typer.Option(None, "--resume", "-r"),
         from_turn: int | None = typer.Option(None, "--from-turn"),
         list_sessions: bool = typer.Option(False, "--list-sessions"),
+        get_session: str | None = typer.Option(None, "--get-session"),
         tools: str | None = typer.Option(None, "--tools"),
         allow_spawn: bool = typer.Option(False, "--allow-spawn"),
         max_tool_fails: int | None = typer.Option(None, "--max-tool-fails"),
@@ -1195,12 +1431,6 @@ def _make_agent_alias(name: str, doc: str) -> None:
         template: str | None = typer.Option(None, "--template", "-t"),
         template_args: list[str] | None = typer.Option(None, "--args"),
         corpus: str | None = typer.Option(None, "--corpus"),
-        problem: Path | None = typer.Option(None, "--problem", hidden=True),
-        reference: Path | None = typer.Option(None, "--reference", hidden=True),
-        description: str | None = typer.Option(None, "--description", hidden=True),
-        test: list[Path] | None = typer.Option(None, "--test", hidden=True),
-        benchmark: list[Path] | None = typer.Option(None, "--benchmark", hidden=True),
-        speedup_target: float | None = typer.Option(None, "--speedup-target", hidden=True),
     ) -> None:
         agent(
             prompt=prompt,
@@ -1210,6 +1440,7 @@ def _make_agent_alias(name: str, doc: str) -> None:
             resume=resume,
             from_turn=from_turn,
             list_sessions=list_sessions,
+            get_session=get_session,
             tools=tools,
             allow_spawn=allow_spawn,
             max_tool_fails=max_tool_fails,
@@ -1219,12 +1450,6 @@ def _make_agent_alias(name: str, doc: str) -> None:
             template=template,
             template_args=template_args,
             corpus=corpus,
-            problem=problem,
-            reference=reference,
-            description=description,
-            test=test,
-            benchmark=benchmark,
-            speedup_target=speedup_target,
         )
     alias_cmd.__doc__ = doc
@@ -1289,86 +1514,37 @@ def evaluate(  # noqa: PLR0913
             --benchmark --defensive
     Subcommands:
-        make-template  Generate template files for this format
+        gpumode        Use GPUMode format (functional) - RECOMMENDED
         kernelbench    Use KernelBench format (ModelNew class)
+        make-template  Generate template files for this format (deprecated)
     """
     # If a subcommand is being invoked, skip the main evaluation logic
     if ctx.invoked_subcommand is not None:
         return
-    # Validate required args when running evaluation (not subcommands)
-    missing_args = []
-    if implementation is None:
-        missing_args.append("--impl/-i")
-    if reference is None:
-        missing_args.append("--reference")
-    if test_cases is None:
-        missing_args.append("--test-cases")
-    if missing_args:
-        typer.echo("Error: Missing required arguments", err=True)
-        typer.echo(f"  Required: {', '.join(missing_args)}", err=True)
-        typer.echo("", err=True)
-        typer.echo(
-            "Usage: wafer evaluate --impl KERNEL.py --reference REF.py --test-cases TESTS.json",
-            err=True,
-        )
-        typer.echo("", err=True)
-        typer.echo("Run 'wafer evaluate --help' for full options.", err=True)
-        typer.echo("Run 'wafer evaluate make-template DIR' to generate starter files.", err=True)
-        raise typer.Exit(1)
-    from .evaluate import EvaluateArgs, run_evaluate
-    args = EvaluateArgs(
-        implementation=implementation,
-        reference=reference,
-        test_cases=test_cases,
-        target_name=target or "",
-        benchmark=benchmark,
-        profile=profile,
-        defensive=defensive,
-        sync_artifacts=sync_artifacts,
-        gpu_id=gpu_id,
+    # Bare 'wafer evaluate' is no longer supported - must use subcommand
+    typer.echo("Error: 'wafer evaluate' requires a subcommand.", err=True)
+    typer.echo("", err=True)
+    typer.echo("Available subcommands:", err=True)
+    typer.echo(
+        "  gpumode      Evaluate GPUMode format (custom_kernel/ref_kernel functions)", err=True
     )
-    try:
-        # Use trio_asyncio to run async code that uses both trio and asyncio
-        # (AsyncSSHClient uses asyncssh which is asyncio-based, bridged via trio_asyncio)
-        import trio_asyncio
-        result = trio_asyncio.run(run_evaluate, args)
-    except KeyboardInterrupt:
-        typer.echo("\nInterrupted by user", err=True)
-        raise typer.Exit(130) from None
-    except Exception as e:
-        # Unwrap ExceptionGroup (from Trio nurseries) to show actual error
-        if hasattr(e, "exceptions") and e.exceptions:
-            for exc in e.exceptions:
-                typer.echo(f"Error: {type(exc).__name__}: {exc}", err=True)
-        else:
-            typer.echo(f"Error: {e}", err=True)
-        raise typer.Exit(1) from None
-    # Print results
-    if result.success:
-        typer.echo("")
-        typer.echo("=" * 60)
-        status = "PASS" if result.all_correct else "FAIL"
-        typer.echo(f"Result: {status}")
-        score_pct = f"{result.correctness_score:.1%}"
-        typer.echo(f"Correctness: {result.passed_tests}/{result.total_tests} ({score_pct})")
-        if result.geomean_speedup > 0:
-            typer.echo(f"Speedup: {result.geomean_speedup:.2f}x")
-        if result.artifact_path:
-            typer.echo(f"Artifacts: {result.artifact_path}")
-        typer.echo("=" * 60)
-        if not result.all_correct:
-            raise typer.Exit(1)
-    else:
-        typer.echo(f"Error: {result.error_message}", err=True)
-        raise typer.Exit(1)
+    typer.echo("  kernelbench  Evaluate KernelBench format (ModelNew class)", err=True)
+    typer.echo("", err=True)
+    typer.echo("Examples:", err=True)
+    typer.echo(
+        "  wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json",
+        err=True,
+    )
+    typer.echo(
+        "  wafer evaluate kernelbench --impl impl.py --reference ref.py --benchmark", err=True
+    )
+    typer.echo("", err=True)
+    typer.echo(
+        "Run 'wafer evaluate gpumode --help' or 'wafer evaluate kernelbench --help' for options.",
+        err=True,
+    )
+    raise typer.Exit(1)
 TEMPLATE_KERNEL = '''\
@@ -1503,12 +1679,63 @@ def evaluate_make_template(
 # KernelBench format evaluation
 # =============================================================================
-# Path to KernelBench problems (relative to wafer root)
-KERNELBENCH_ROOT = Path(__file__).parent.parent.parent.parent / "research" / "KernelBench"
+def _get_kernelbench_root() -> Path | None:
+    """Get KernelBench problems root, preferring downloaded location."""
+    # First check downloaded location
+    downloaded = get_problems_path("kernelbench")
+    if downloaded is not None:
+        kb_root = downloaded / "KernelBench"
+        if kb_root.exists():
+            return kb_root
+        return downloaded
+    # Fall back to legacy location (for development)
+    legacy = Path(__file__).parent.parent.parent.parent / "research" / "KernelBench" / "KernelBench"
+    if legacy.exists():
+        return legacy
+    return None
+@kernelbench_app.command("download")
+def kernelbench_download(
+    force: bool = typer.Option(False, "--force", "-f", help="Re-download even if exists"),
+) -> None:
+    """Download KernelBench problems from GitHub.
+    Downloads the problem set to ~/.cache/wafer/problems/kernelbench/
+    Examples:
+        wafer evaluate kernelbench download
+        wafer evaluate kernelbench download --force  # Re-download
+    """
+    try:
+        path = download_problems("kernelbench", force=force, verbose=True)
+        typer.echo("")
+        typer.echo(f"Problems available at: {path}")
+        typer.echo("Run 'wafer evaluate kernelbench list-problems' to see available problems.")
+    except Exception as e:
+        typer.echo(f"Error downloading problems: {e}", err=True)
+        raise typer.Exit(1) from None
+@kernelbench_app.command("list-problems")
+def kernelbench_list_problems() -> None:
+    """List available KernelBench problems.
+    Examples:
+        wafer evaluate kernelbench list-problems
+    """
+    try:
+        list_problems_fn("kernelbench", verbose=True)
+    except ValueError as e:
+        typer.echo(str(e), err=True)
+        raise typer.Exit(1) from None
 @kernelbench_app.callback(invoke_without_command=True)
-def kernelbench_evaluate(  # noqa: PLR0913
+def kernelbench_evaluate(  # noqa: PLR0913, PLR0915
     ctx: typer.Context,
     implementation: Path | None = typer.Option(
         None,
@@ -1528,17 +1755,38 @@ def kernelbench_evaluate(  # noqa: PLR0913
         help="GPU target name. See 'wafer config targets list' for available targets.",
         autocompletion=complete_target_name,
     ),
+    pool: str | None = typer.Option(
+        None,
+        "--pool",
+        "-p",
+        help="Target pool name. Acquires first available target from the pool. "
+        "Define pools in ~/.wafer/config.toml under [pools.<name>].",
+    ),
     benchmark: bool = typer.Option(False, "--benchmark", help="Run performance benchmarks"),
     profile: bool = typer.Option(False, "--profile", help="Enable profiling"),
-    inputs: Path | None = typer.Option(None, "--inputs", help="Custom inputs file to override get_inputs()"),
+    inputs: Path | None = typer.Option(
+        None, "--inputs", help="Custom inputs file to override get_inputs()"
+    ),
     seed: int = typer.Option(42, "--seed", help="Random seed for weight initialization"),
     defensive: bool = typer.Option(
         False, "--defensive", help="Enable defensive timing to detect evaluation hacking"
     ),
+    backend: str | None = typer.Option(
+        None,
+        "--backend",
+        help="Kernel backend for static validation (hip, cuda, triton, cute, tilelang, thunderkittens). "
+        "When specified, validates that the implementation uses the correct backend primitives.",
+    ),
     sync_artifacts: bool = typer.Option(
         True, "--sync-artifacts/--no-sync-artifacts", help="Download artifacts"
     ),
     gpu_id: int | None = typer.Option(None, "--gpu-id", help="Override GPU ID"),
+    json_output: bool = typer.Option(
+        False, "--json", help="Output as single JSON object (machine-readable)"
+    ),
+    jsonl_output: bool = typer.Option(
+        False, "--jsonl", help="Output as streaming JSON Lines (one object per event)"
+    ),
 ) -> None:
     """Run kernel evaluation in KernelBench format (ModelNew class).
@@ -1588,48 +1836,106 @@ def kernelbench_evaluate(  # noqa: PLR0913
         )
         raise typer.Exit(1)
+    # Validate --target and --pool are mutually exclusive
+    if target and pool:
+        typer.echo("Error: Cannot specify both --target and --pool", err=True)
+        raise typer.Exit(1)
     from .evaluate import KernelBenchEvaluateArgs, run_evaluate_kernelbench
+    from .output import OutputCollector, format_evaluate_result, get_output_format
+    output_format = get_output_format(json_output, jsonl_output)
+    collector = OutputCollector(format=output_format)
+    # If pool specified, acquire a target from the pool
+    resolved_target = target or ""
+    pool_lock_context = None
+    if pool:
+        from .target_lock import acquire_from_pool
+        from .targets import filter_pool_by_auth, get_pool
+        try:
+            pool_targets = get_pool(pool)
+        except FileNotFoundError as e:
+            collector.set_error("pool", "PoolNotFound", pool=pool, message=str(e))
+            collector.finalize()
+            raise typer.Exit(1) from None
+        # Filter to only targets with valid auth
+        usable_targets, skipped = filter_pool_by_auth(pool_targets)
+        if skipped:
+            collector.emit("pool_auth_skip", targets=skipped)
+        if not usable_targets:
+            collector.set_error("pool", "NoUsableTargets", pool=pool)
+            collector.finalize()
+            raise typer.Exit(1) from None
+        collector.emit("pool_acquire", pool=pool, count=len(usable_targets))
+        pool_lock_context = acquire_from_pool(usable_targets)
+        acquired_target = pool_lock_context.__enter__()
+        if acquired_target is None:
+            # Exit context manager before raising to avoid resource leak
+            pool_lock_context.__exit__(None, None, None)
+            collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=usable_targets)
+            collector.finalize()
+            raise typer.Exit(1)
+        collector.emit("pool_acquired", target=acquired_target)
+        resolved_target = acquired_target
+    collector.target = resolved_target
     args = KernelBenchEvaluateArgs(
         implementation=implementation,
         reference=reference,
-        target_name=target or "",
+        target_name=resolved_target,
         benchmark=benchmark,
         profile=profile,
         inputs=inputs,
         seed=seed,
         defensive=defensive,
+        backend=backend,
         sync_artifacts=sync_artifacts,
         gpu_id=gpu_id,
     )
+    collector.emit("started", target=resolved_target)
     try:
         import trio_asyncio
+        collector.emit("evaluation", status="running")
         result = trio_asyncio.run(run_evaluate_kernelbench, args)
     except KeyboardInterrupt:
-        typer.echo("\nInterrupted by user", err=True)
+        collector.set_error("evaluation", "Interrupted", message="Interrupted by user")
+        collector.finalize()
         raise typer.Exit(130) from None
     except Exception as e:
-        typer.echo(f"Error: {e}", err=True)
+        collector.set_error("evaluation", "Exception", message=str(e))
+        collector.finalize()
         raise typer.Exit(1) from None
+    finally:
+        # Release pool lock if we acquired one
+        if pool_lock_context is not None:
+            pool_lock_context.__exit__(None, None, None)
-    # Print results
+    # Build structured output
+    eval_output = format_evaluate_result(result, target=resolved_target)
+    collector._result = eval_output
+    # Print results based on output format
     if result.success:
-        typer.echo("")
-        typer.echo("=" * 60)
-        status = "PASS" if result.all_correct else "FAIL"
-        typer.echo(f"Result: {status}")
-        score_pct = f"{result.correctness_score:.1%}"
-        typer.echo(f"Correctness: {result.passed_tests}/{result.total_tests} ({score_pct})")
-        if result.geomean_speedup > 0:
-            typer.echo(f"Speedup: {result.geomean_speedup:.2f}x")
-        typer.echo("=" * 60)
+        collector.output_text_result(result)
+        collector.finalize()
         if not result.all_correct:
             raise typer.Exit(1)
     else:
-        typer.echo(f"Error: {result.error_message}", err=True)
+        collector.output_text_error(result.error_message or "Unknown error")
+        collector.finalize()
         raise typer.Exit(1)
@@ -1659,7 +1965,14 @@ def kernelbench_make_template(
         # Overwrite existing
         wafer evaluate kernelbench make-template level1/1 --force
     """
-    # Parse problem ID
+    # Get problems root (downloaded or legacy)
+    kb_root = _get_kernelbench_root()
+    if kb_root is None:
+        typer.echo("Error: KernelBench problems not found.", err=True)
+        typer.echo("Run 'wafer evaluate kernelbench download' to download problems.", err=True)
+        raise typer.Exit(1)
+    # Parse problem ID
     parts = problem.split("/")
     if len(parts) != 2:
         typer.echo(f"Error: Invalid problem ID '{problem}'. Expected format: level1/1", err=True)
@@ -1670,10 +1983,10 @@ def kernelbench_make_template(
         level_str = f"level{level_str}"
     # Find the problem file
-    problem_dir = KERNELBENCH_ROOT / "KernelBench" / level_str
+    problem_dir = kb_root / level_str
     if not problem_dir.exists():
         typer.echo(f"Error: KernelBench level directory not found: {problem_dir}", err=True)
-        typer.echo(f"Make sure KernelBench is at: {KERNELBENCH_ROOT}", err=True)
+        typer.echo("Run 'wafer evaluate kernelbench download' to download problems.", err=True)
         raise typer.Exit(1)
     # Find matching problem file
@@ -1740,6 +2053,306 @@ def kernelbench_make_template(
     typer.echo(f"     wafer evaluate kernelbench --impl my_kernel.py --reference {output}")
+# =============================================================================
+# GPUMode format evaluation
+# =============================================================================
+@gpumode_app.command("download")
+def gpumode_download(
+    force: bool = typer.Option(False, "--force", "-f", help="Re-download even if exists"),
+) -> None:
+    """Download GPUMode reference kernels from GitHub.
+    Downloads the problem set to ~/.cache/wafer/problems/gpumode/
+    Examples:
+        wafer evaluate gpumode download
+        wafer evaluate gpumode download --force  # Re-download
+    """
+    try:
+        path = download_problems("gpumode", force=force, verbose=True)
+        typer.echo("")
+        typer.echo(f"Problems available at: {path}")
+        typer.echo("Run 'wafer evaluate gpumode list-problems' to see available problems.")
+    except Exception as e:
+        typer.echo(f"Error downloading problems: {e}", err=True)
+        raise typer.Exit(1) from None
+@gpumode_app.command("list-problems")
+def gpumode_list_problems() -> None:
+    """List available GPUMode problems.
+    Examples:
+        wafer evaluate gpumode list-problems
+    """
+    try:
+        list_problems_fn("gpumode", verbose=True)
+    except ValueError as e:
+        typer.echo(str(e), err=True)
+        raise typer.Exit(1) from None
+@gpumode_app.command("make-template")
+def gpumode_make_template(
+    problem: str = typer.Option(
+        ...,
+        "--problem",
+        "-p",
+        help="Problem ID (e.g., 'pmpp/vectoradd_py' or 'amd/fp8-mm')",
+    ),
+    output: Path = typer.Option(
+        None, "--output", "-o", help="Output directory (default: ./<problem_name>/)"
+    ),
+    force: bool = typer.Option(False, "--force", "-f", help="Overwrite existing files"),
+) -> None:
+    """Extract a GPUMode problem as template files.
+    Creates a directory with reference.py, task.yml, and other problem files.
+    You then create kernel.py with your custom_kernel implementation.
+    Examples:
+        # Extract pmpp vectoradd problem
+        wafer evaluate gpumode make-template --problem pmpp/vectoradd_py
+        # Extract to specific directory
+        wafer evaluate gpumode make-template --problem pmpp/vectoradd_py --output ./my-kernel/
+    """
+    import shutil
+    # Get problem path
+    problem_path = get_problem_path("gpumode", problem)
+    if problem_path is None:
+        # Check if problems are downloaded
+        if get_problems_path("gpumode") is None:
+            typer.echo("Error: GPUMode problems not downloaded.", err=True)
+            typer.echo("Run 'wafer evaluate gpumode download' first.", err=True)
+        else:
+            typer.echo(f"Error: Problem '{problem}' not found.", err=True)
+            typer.echo(
+                "Run 'wafer evaluate gpumode list-problems' to see available problems.", err=True
+            )
+        raise typer.Exit(1)
+    # Determine output path
+    if output is None:
+        output = Path.cwd() / problem.replace("/", "_")
+    output = output.resolve()
+    # Check if exists
+    if output.exists() and not force:
+        typer.echo(f"Error: {output} already exists. Use --force to overwrite.", err=True)
+        raise typer.Exit(1)
+    # Copy the problem directory
+    if output.exists():
+        shutil.rmtree(output)
+    shutil.copytree(problem_path, output)
+    typer.echo(f"Created {output}/")
+    typer.echo("")
+    typer.echo("Contents:")
+    for f in sorted(output.iterdir()):
+        if not f.name.startswith("."):
+            typer.echo(f"  {f.name}")
+    typer.echo("")
+    typer.echo("Next steps:")
+    typer.echo("  1. Read reference.py to understand the kernel interface")
+    typer.echo("  2. Create kernel.py with your custom_kernel implementation:")
+    typer.echo("")
+    typer.echo("     def custom_kernel(data):")
+    typer.echo("         # Your optimized implementation")
+    typer.echo("         ...")
+    typer.echo("")
+    typer.echo("  3. Run evaluation:")
+    typer.echo(
+        f"     wafer evaluate gpumode --impl {output}/kernel.py --reference {output}/reference.py \\"
+    )
+    typer.echo(f"         --test-cases {output}/test_cases.json --target <target>")
+@gpumode_app.callback(invoke_without_command=True)
+def gpumode_evaluate(  # noqa: PLR0913, PLR0915
+    ctx: typer.Context,
+    implementation: Path | None = typer.Option(
+        None, "--impl", "-i", help="Path to implementation kernel file"
+    ),
+    reference: Path | None = typer.Option(
+        None, "--reference", help="Path to reference kernel file"
+    ),
+    test_cases: Path | None = typer.Option(
+        None, "--test-cases", help="Path to test cases JSON file"
+    ),
+    target: str | None = typer.Option(
+        None,
+        "--target",
+        "-t",
+        help="GPU target name. See 'wafer config targets list' for available targets.",
+        autocompletion=complete_target_name,
+    ),
+    pool: str | None = typer.Option(
+        None,
+        "--pool",
+        "-p",
+        help="Target pool name. Acquires first available target from the pool. "
+        "Define pools in ~/.wafer/config.toml under [pools.<name>].",
+    ),
+    benchmark: bool = typer.Option(False, "--benchmark", help="Run performance benchmarks"),
+    profile: bool = typer.Option(False, "--profile", help="Enable profiling"),
+    defensive: bool = typer.Option(
+        False, "--defensive", help="Enable defensive timing to detect evaluation hacking"
+    ),
+    sync_artifacts: bool = typer.Option(
+        True, "--sync-artifacts/--no-sync-artifacts", help="Download artifacts"
+    ),
+    gpu_id: int | None = typer.Option(None, "--gpu-id", help="Override GPU ID"),
+) -> None:
+    """Run kernel evaluation in GPUMode format (functional).
+    This format expects:
+    - Implementation: Python file with `custom_kernel(inputs)` function
+    - Reference: Python file with `ref_kernel(inputs)` and `generate_input(**kwargs)` functions
+    - Test cases: JSON file with test parameters
+    Examples:
+        # Basic correctness check
+        wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json
+        # With benchmarking
+        wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json \\
+            --target vultr-b200 --benchmark
+    Subcommands:
+        download       Download GPUMode problems from GitHub
+        list-problems  List available problems
+        make-template  Extract a problem as template files
+    """
+    # If a subcommand is being invoked, skip the main evaluation logic
+    if ctx.invoked_subcommand is not None:
+        return
+    # Validate required args when running evaluation (not subcommands)
+    missing_args = []
+    if implementation is None:
+        missing_args.append("--impl/-i")
+    if reference is None:
+        missing_args.append("--reference")
+    if test_cases is None:
+        missing_args.append("--test-cases")
+    if missing_args:
+        typer.echo("Error: Missing required arguments", err=True)
+        typer.echo(f"  Required: {', '.join(missing_args)}", err=True)
+        typer.echo("", err=True)
+        typer.echo(
+            "Usage: wafer evaluate gpumode --impl KERNEL.py --reference REF.py --test-cases TESTS.json",
+            err=True,
+        )
+        typer.echo("", err=True)
+        typer.echo("Run 'wafer evaluate gpumode --help' for full options.", err=True)
+        typer.echo("Run 'wafer evaluate gpumode download' to download problem sets.", err=True)
+        raise typer.Exit(1)
+    # Validate --target and --pool are mutually exclusive
+    if target and pool:
+        typer.echo("Error: Cannot specify both --target and --pool", err=True)
+        raise typer.Exit(1)
+    from .evaluate import EvaluateArgs, run_evaluate
+    # If pool specified, acquire a target from the pool
+    resolved_target = target or ""
+    pool_lock_context = None
+    if pool:
+        from .target_lock import acquire_from_pool
+        from .targets import filter_pool_by_auth, get_pool
+        try:
+            pool_targets = get_pool(pool)
+        except FileNotFoundError as e:
+            typer.echo(f"Error: {e}", err=True)
+            raise typer.Exit(1) from None
+        # Filter to only targets with valid auth
+        usable_targets, skipped = filter_pool_by_auth(pool_targets)
+        if skipped:
+            typer.echo(f"Skipping targets without auth: {', '.join(skipped)}", err=True)
+        if not usable_targets:
+            typer.echo(f"Error: No usable targets in pool '{pool}'", err=True)
+            typer.echo("  All targets require authentication that is not configured.", err=True)
+            typer.echo("  Run 'wafer auth status' to see which providers need setup.", err=True)
+            raise typer.Exit(1) from None
+        typer.echo(f"Acquiring target from pool '{pool}' ({len(usable_targets)} targets)...")
+        pool_lock_context = acquire_from_pool(usable_targets)
+        acquired_target = pool_lock_context.__enter__()
+        if acquired_target is None:
+            # Exit context manager before raising to avoid resource leak
+            pool_lock_context.__exit__(None, None, None)
+            typer.echo(f"Error: All targets in pool '{pool}' are busy", err=True)
+            typer.echo(f"  Targets: {', '.join(usable_targets)}", err=True)
+            raise typer.Exit(1)
+        typer.echo(f"Acquired target: {acquired_target}")
+        resolved_target = acquired_target
+    args = EvaluateArgs(
+        implementation=implementation,
+        reference=reference,
+        test_cases=test_cases,
+        target_name=resolved_target,
+        benchmark=benchmark,
+        profile=profile,
+        defensive=defensive,
+        sync_artifacts=sync_artifacts,
+        gpu_id=gpu_id,
+    )
+    try:
+        import trio_asyncio
+        result = trio_asyncio.run(run_evaluate, args)
+    except KeyboardInterrupt:
+        typer.echo("\nInterrupted by user", err=True)
+        raise typer.Exit(130) from None
+    except Exception as e:
+        if hasattr(e, "exceptions") and e.exceptions:
+            for exc in e.exceptions:
+                typer.echo(f"Error: {type(exc).__name__}: {exc}", err=True)
+        else:
+            typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    finally:
+        # Release pool lock if we acquired one
+        if pool_lock_context is not None:
+            pool_lock_context.__exit__(None, None, None)
+    # Print results
+    if result.success:
+        typer.echo("")
+        typer.echo("=" * 60)
+        status = "PASS" if result.all_correct else "FAIL"
+        typer.echo(f"Result: {status}")
+        score_pct = f"{result.correctness_score:.1%}"
+        typer.echo(f"Correctness: {result.passed_tests}/{result.total_tests} ({score_pct})")
+        if result.geomean_speedup > 0:
+            typer.echo(f"Speedup: {result.geomean_speedup:.2f}x")
+        if result.artifact_path:
+            typer.echo(f"Artifacts: {result.artifact_path}")
+        typer.echo("=" * 60)
+        if not result.all_correct:
+            raise typer.Exit(1)
+    else:
+        typer.echo(f"Error: {result.error_message}", err=True)
+        raise typer.Exit(1)
 # =============================================================================
 # Push and Remote-Run commands
 # =============================================================================
@@ -1871,7 +2484,7 @@ def _run_direct_mode(
         typer.echo(f"Uploading {upload_dir.name}...")
         try:
             push_result = push_direct(upload_dir, target)
-            workspace_name = push_result.workspace_path
+            workspace_name = push_result.workspace_name
             typer.echo(f"Uploaded {len(push_result.files_uploaded)} files")
         except Exception as e:
             typer.echo(f"Error uploading: {e}", err=True)
@@ -1901,6 +2514,7 @@ def _run_api_mode(  # noqa: PLR0913
     upload_dir: Path | None,
     workspace_id: str | None,
     gpu_id: int | None,
+    gpu_count: int,
     docker_image: str | None,
     docker_entrypoint: str | None,
     pull_image: bool,
@@ -1915,6 +2529,8 @@ def _run_api_mode(  # noqa: PLR0913
         typer.echo(f"Workspace: {workspace_id}")
     if gpu_id is not None:
         typer.echo(f"GPU: {gpu_id}")
+    if gpu_count > 1:
+        typer.echo(f"GPU count: {gpu_count}")
     if docker_image:
         typer.echo(f"Image: {docker_image}")
     if docker_entrypoint:
@@ -1932,6 +2548,7 @@ def _run_api_mode(  # noqa: PLR0913
             upload_dir=upload_dir,
             workspace_id=workspace_id,
             gpu_id=gpu_id,
+            gpu_count=gpu_count,
             docker_image=docker_image,
             docker_entrypoint=docker_entrypoint,
             pull_image=pull_image,
@@ -1955,6 +2572,7 @@ def remote_run(  # noqa: PLR0913
         None, "--workspace-id", "-w", help="Workspace ID (from wafer push)"
     ),
     gpu_id: int | None = typer.Option(None, "--gpu", "-g", help="GPU ID"),
+    gpu_count: int = typer.Option(1, "--gpu-count", "-n", help="Number of GPUs (1-8)"),
     docker_image: str | None = typer.Option(None, "--image", "-i", help="Docker image override"),
     docker_entrypoint: str | None = typer.Option(
         None, "--docker-entrypoint", help="Override Docker entrypoint (e.g., 'bash')"
@@ -2024,6 +2642,7 @@ def remote_run(  # noqa: PLR0913
             upload_dir,
             workspace_id,
             gpu_id,
+            gpu_count,
             docker_image,
             docker_entrypoint,
             pull_image,
@@ -2044,27 +2663,41 @@ def login(
         None, "--token", "-t", help="Access token (skip browser OAuth)"
     ),
     port: int | None = typer.Option(
-        None, "--port", "-p", help="Port for OAuth callback server (default: 8765 for SSH, random for local)"
+        None,
+        "--port",
+        "-p",
+        help="Port for OAuth callback server (local only, ignored for SSH)",
+    ),
+    no_device_code: bool = typer.Option(
+        False,
+        "--no-device-code",
+        help="Force browser OAuth even on SSH (requires port forwarding)",
     ),
 ) -> None:
     """Authenticate CLI with wafer-api via GitHub OAuth.
-    Opens browser for GitHub authentication. Use --token to skip browser.
+    Local: Opens browser for GitHub authentication.
+    SSH: Uses device code flow (no port forwarding needed).
     Uses the API environment from config (see 'wafer config show').
-    SSH Users:
-    - Automatically uses port 8765 (just set up port forwarding once)
-    - On local machine: ssh -L 8765:localhost:8765 user@host
-    - On remote machine: wafer login
-    - Browser opens locally, redirect works through tunnel
+    SSH Users (Easiest):
+    - Just run: wafer login
+    - Visit the URL and enter the code shown
+    - No port forwarding needed!
+    SSH with browser (Advanced):
+    - Use --no-device-code to force browser flow
+    - Requires: ssh -L 8765:localhost:8765 user@host
     Manual token option:
     - Visit auth.wafer.ai, authenticate, copy token from URL
     - Run: wafer login --token <paste-token>
     Examples:
-        wafer login                    # auto-detects SSH, uses appropriate port
-        wafer login --port 9000        # override port
+        wafer login                    # device code on SSH, browser on local
+        wafer login --no-device-code   # force browser (needs port forwarding on SSH)
+        wafer login --port 9000        # custom port for browser flow
         wafer login --token xyz        # manual token (no browser)
         # Change environment:
@@ -2073,7 +2706,7 @@ def login(
     """
     import httpx
-    from .auth import browser_login, save_credentials, verify_token
+    from .auth import browser_login, device_code_login, save_credentials, verify_token
     from .global_config import get_api_url, get_supabase_url, load_global_config
     # Show which environment we're logging into
@@ -2083,21 +2716,31 @@ def login(
     typer.echo(f"Auth: {get_supabase_url()}")
     typer.echo("")
-    # Auto-detect SSH and use fixed port
-    if port is None:
-        is_ssh = bool(os.environ.get("SSH_CONNECTION") or os.environ.get("SSH_CLIENT"))
-        if is_ssh:
-            port = 8765
-            typer.echo("🔒 SSH session detected - using port 8765 for OAuth callback")
-            typer.echo("   Make sure you have port forwarding set up:")
-            typer.echo("   ssh -L 8765:localhost:8765 user@host")
-            typer.echo("")
+    # Auto-detect SSH
+    is_ssh = bool(os.environ.get("SSH_CONNECTION") or os.environ.get("SSH_CLIENT"))
-    # Browser OAuth if no token provided
+    # Choose auth method
     refresh_token = None
     if token is None:
         try:
-            token, refresh_token = browser_login(port=port)
+            if is_ssh and not no_device_code:
+                # Use device code flow for SSH (no port forwarding needed)
+                typer.echo("🔒 SSH session detected - using device code authentication")
+                typer.echo("   (No port forwarding required!)")
+                typer.echo("")
+                token, refresh_token = device_code_login()
+            else:
+                # Use browser OAuth for local or if explicitly requested
+                if is_ssh:
+                    typer.echo("🔒 SSH session detected - using browser authentication")
+                    typer.echo("   Make sure you have port forwarding set up:")
+                    if port is None:
+                        port = 8765
+                        typer.echo(f"   ssh -L {port}:localhost:{port} user@host")
+                    else:
+                        typer.echo(f"   ssh -L {port}:localhost:{port} user@host")
+                    typer.echo("")
+                token, refresh_token = browser_login(port=port)
         except TimeoutError as e:
             typer.echo(f"Error: {e}", err=True)
             raise typer.Exit(1) from None
@@ -2146,9 +2789,8 @@ def login(
 @app.command("logout")
 def logout() -> None:
     """Remove stored credentials."""
-    from .auth import clear_credentials
     from . import analytics
+    from .auth import clear_credentials
     # Track logout event first (while credentials still exist for user identification)
     # Note: track_logout() handles the case where user is not logged in
@@ -2625,6 +3267,7 @@ init_app = typer.Typer(
 Choose based on your GPU access:
+  local        GPU on current machine (no SSH)
   ssh          Your own hardware via SSH
   runpod       RunPod cloud GPUs (needs WAFER_RUNPOD_API_KEY)
   digitalocean DigitalOcean AMD MI300X (needs WAFER_AMD_DIGITALOCEAN_API_KEY)"""
@@ -2632,57 +3275,143 @@ Choose based on your GPU access:
 targets_app.add_typer(init_app, name="init")
-@init_app.command("runpod")
-def init_runpod(
-    name: str = typer.Option("runpod-mi300x", "--name", "-n", help="Target name"),
-    gpu_type: str = typer.Option("MI300X", "--gpu", "-g", help="GPU type (MI300X, H100, A100)"),
-    ssh_key: str = typer.Option("~/.ssh/id_ed25519", "--ssh-key", "-k", help="Path to SSH key"),
-    keep_alive: bool = typer.Option(
-        True, "--keep-alive/--no-keep-alive", help="Keep pod running after eval"
-    ),
+@init_app.command("local")
+def init_local(
+    name: str = typer.Option("local", "--name", "-n", help="Target name"),
+    gpu_ids: str = typer.Option("0", "--gpu-ids", "-g", help="Comma-separated GPU IDs"),
 ) -> None:
-    """Initialize a RunPod target.
+    """Initialize a local target for GPU on current machine.
-    Creates a target config for auto-provisioned RunPod GPUs.
-    Requires WAFER_RUNPOD_API_KEY environment variable.
+    Detects your local GPU and configures a target for direct execution
+    (no SSH). Use this when running wafer on the same machine as the GPU.
     Examples:
-        wafer config targets init runpod
-        wafer config targets init runpod --name my-runpod --gpu H100
+        wafer config targets init local
+        wafer config targets init local --name my-5090 --gpu-ids 0,1
     """
-    import os
     from .targets import save_target
-    # Check for API key
-    api_key = os.environ.get("WAFER_RUNPOD_API_KEY", "")
-    if not api_key:
-        typer.echo("Error: WAFER_RUNPOD_API_KEY environment variable not set.", err=True)
-        typer.echo("", err=True)
-        typer.echo("Get your API key from: https://runpod.io/console/user/settings", err=True)
-        typer.echo("Then run: export WAFER_RUNPOD_API_KEY=your_key_here", err=True)
-        raise typer.Exit(1)
+    # Parse GPU IDs
+    try:
+        parsed_gpu_ids = [int(g.strip()) for g in gpu_ids.split(",")]
+    except ValueError:
+        typer.echo(f"Error: Invalid GPU IDs '{gpu_ids}'. Use comma-separated integers.", err=True)
+        raise typer.Exit(1) from None
-    # GPU type mappings
-    gpu_configs = {
-        "MI300X": {
-            "gpu_type_id": "AMD Instinct MI300X OAM",
-            "image": "runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04",
-            "compute_capability": "9.4",
-        },
-        "H100": {
-            "gpu_type_id": "NVIDIA H100 80GB HBM3",
-            "image": "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04",
-            "compute_capability": "9.0",
-        },
-        "A100": {
-            "gpu_type_id": "NVIDIA A100 80GB PCIe",
-            "image": "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04",
-            "compute_capability": "8.0",
-        },
-    }
+    typer.echo("Detecting local GPU...")
-    if gpu_type not in gpu_configs:
+    try:
+        from wafer_core.gpu_detect import (
+            detect_local_gpu,
+            get_compute_capability,
+            get_torch_requirements,
+        )
+        detected_gpu = detect_local_gpu()
+        if detected_gpu:
+            typer.echo(f"  Found: {detected_gpu.gpu_name}")
+            if detected_gpu.vendor == "nvidia":
+                typer.echo(f"  CUDA: {detected_gpu.driver_version}")
+            else:
+                typer.echo(f"  ROCm: {detected_gpu.driver_version}")
+            typer.echo(f"  GPU count: {detected_gpu.gpu_count}")
+            # Get torch requirements and compute capability
+            torch_reqs = get_torch_requirements(detected_gpu)
+            compute_capability = get_compute_capability(detected_gpu)
+            gpu_type = _extract_gpu_type(detected_gpu.gpu_name)
+            typer.echo(f"  PyTorch: {torch_reqs.packages[0]}")
+        else:
+            typer.echo("  No GPU detected (nvidia-smi/rocm-smi not found)", err=True)
+            raise typer.Exit(1)
+    except ImportError as e:
+        typer.echo(f"Error: Missing dependency: {e}", err=True)
+        raise typer.Exit(1) from None
+    # Build target data
+    target_data = {
+        "name": name,
+        "type": "local",
+        "gpu_ids": parsed_gpu_ids,
+        "gpu_type": gpu_type,
+        "compute_capability": compute_capability,
+        "torch_package": torch_reqs.packages[0],
+        "torch_index_url": torch_reqs.index_url,
+        "vendor": detected_gpu.vendor,
+        "driver_version": detected_gpu.driver_version,
+    }
+    try:
+        target = save_target(target_data)
+        typer.echo(f"✓ Created target: {target.name}")
+        typer.echo("  Type: Local (no SSH)")
+        typer.echo(f"  GPU IDs: {parsed_gpu_ids}")
+        typer.echo(f"  GPU Type: {gpu_type}")
+        typer.echo(f"  Compute: {compute_capability}")
+        typer.echo(f"  Torch: {torch_reqs.packages[0]}")
+        typer.echo("")
+        typer.echo(
+            f"Usage: wafer evaluate --target {name} --impl kernel.py --reference ref.py --test-cases tests.json"
+        )
+    except (ValueError, AssertionError) as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+@init_app.command("runpod")
+def init_runpod(
+    name: str = typer.Option("runpod-mi300x", "--name", "-n", help="Target name"),
+    gpu_type: str = typer.Option("MI300X", "--gpu", "-g", help="GPU type (MI300X, H100, A100)"),
+    ssh_key: str = typer.Option("~/.ssh/id_ed25519", "--ssh-key", "-k", help="Path to SSH key"),
+    keep_alive: bool = typer.Option(
+        True, "--keep-alive/--no-keep-alive", help="Keep pod running after eval"
+    ),
+) -> None:
+    """Initialize a RunPod target.
+    Creates a target config for auto-provisioned RunPod GPUs.
+    Requires WAFER_RUNPOD_API_KEY environment variable.
+    Examples:
+        wafer config targets init runpod
+        wafer config targets init runpod --name my-runpod --gpu H100
+    """
+    import os
+    from .targets import save_target
+    # Check for API key
+    api_key = os.environ.get("WAFER_RUNPOD_API_KEY", "")
+    if not api_key:
+        typer.echo("Error: WAFER_RUNPOD_API_KEY environment variable not set.", err=True)
+        typer.echo("", err=True)
+        typer.echo("Get your API key from: https://runpod.io/console/user/settings", err=True)
+        typer.echo("Then run: export WAFER_RUNPOD_API_KEY=your_key_here", err=True)
+        raise typer.Exit(1)
+    # GPU type mappings
+    gpu_configs = {
+        "MI300X": {
+            "gpu_type_id": "AMD Instinct MI300X OAM",
+            "image": "runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04",
+            "compute_capability": "9.4",
+        },
+        "H100": {
+            "gpu_type_id": "NVIDIA H100 80GB HBM3",
+            "image": "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04",
+            "compute_capability": "9.0",
+        },
+        "A100": {
+            "gpu_type_id": "NVIDIA A100 80GB PCIe",
+            "image": "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04",
+            "compute_capability": "8.0",
+        },
+    }
+    if gpu_type not in gpu_configs:
         typer.echo(
             f"Error: Unknown GPU type '{gpu_type}'. Available: {', '.join(gpu_configs.keys())}",
             err=True,
@@ -2795,23 +3524,29 @@ def init_ssh(
     host: str = typer.Option(..., "--host", "-H", help="SSH host (user@hostname:port)"),
     ssh_key: str = typer.Option("~/.ssh/id_ed25519", "--ssh-key", "-k", help="Path to SSH key"),
     gpu_ids: str = typer.Option("0", "--gpu-ids", "-g", help="Comma-separated GPU IDs"),
-    gpu_type: str = typer.Option(
-        "H100", "--gpu-type", help="GPU type (H100, A100, B200, MI300X, etc.)"
+    gpu_type: str | None = typer.Option(
+        None, "--gpu-type", help="GPU type (auto-detected if not specified)"
     ),
     docker_image: str | None = typer.Option(
         None, "--docker-image", "-d", help="Docker image (optional)"
     ),
     ncu: bool = typer.Option(False, "--ncu/--no-ncu", help="NCU profiling available"),
+    no_detect: bool = typer.Option(False, "--no-detect", help="Skip GPU auto-detection"),
 ) -> None:
     """Initialize an SSH target for your own GPU hardware.
     Creates a target config for direct SSH access to a GPU machine.
-    Use for baremetal servers, VMs, or any machine you have SSH access to.
+    Automatically detects GPU type and selects compatible PyTorch version.
     Examples:
+        # Auto-detect GPU (recommended)
         wafer config targets init ssh --name my-gpu --host user@192.168.1.100:22
+        # Multiple GPUs with NCU profiling
         wafer config targets init ssh --name lab-h100 --host ubuntu@gpu.lab.com:22 --gpu-ids 0,1 --ncu
-        wafer config targets init ssh --name docker-gpu --host user@host:22 --docker-image nvcr.io/nvidia/pytorch:24.01-py3
+        # Skip detection, specify manually
+        wafer config targets init ssh --name my-gpu --host user@host:22 --gpu-type H100 --no-detect
     """
     from .targets import save_target
@@ -2828,17 +3563,86 @@ def init_ssh(
         typer.echo("Example: user@192.168.1.100:22", err=True)
         raise typer.Exit(1)
+    # Auto-detect GPU if not specified
+    detected_gpu = None
+    torch_package = None
+    torch_index_url = None
+    if not no_detect:
+        typer.echo(f"Connecting to {host}...")
+        try:
+            import trio
+            import trio_asyncio
+            from wafer_core.async_ssh import AsyncSSHClient
+            from wafer_core.gpu_detect import (
+                detect_remote_gpu,
+                get_compute_capability,
+                get_torch_requirements,
+            )
+            expanded_key = str(Path(ssh_key).expanduser())
+            async def _detect() -> None:
+                nonlocal detected_gpu, torch_package, torch_index_url
+                # Need trio_asyncio.open_loop() for asyncssh bridge
+                async with trio_asyncio.open_loop():
+                    async with AsyncSSHClient(host, expanded_key) as client:
+                        detected_gpu = await detect_remote_gpu(client)
+            trio.run(_detect)
+            if detected_gpu:
+                typer.echo(f"  Found: {detected_gpu.gpu_name}")
+                if detected_gpu.vendor == "nvidia":
+                    typer.echo(f"  CUDA: {detected_gpu.driver_version}")
+                else:
+                    typer.echo(f"  ROCm: {detected_gpu.driver_version}")
+                # Get torch requirements
+                torch_reqs = get_torch_requirements(detected_gpu)
+                torch_package = torch_reqs.packages[0]  # Just torch, not all packages
+                torch_index_url = torch_reqs.index_url
+                typer.echo(f"  PyTorch: {torch_package}")
+                # Use detected GPU type if not specified
+                if not gpu_type:
+                    # Extract GPU name (e.g., "H100" from "NVIDIA H100 80GB HBM3")
+                    gpu_type = _extract_gpu_type(detected_gpu.gpu_name)
+            else:
+                typer.echo("  No GPU detected (nvidia-smi/rocm-smi not found)")
+                if not gpu_type:
+                    gpu_type = "H100"  # Default fallback
+                    typer.echo(f"  Using default: {gpu_type}")
+        except Exception as e:
+            typer.echo(f"  Detection failed: {e}", err=True)
+            if not gpu_type:
+                gpu_type = "H100"
+                typer.echo(f"  Using default: {gpu_type}")
+    # Fallback if no detection
+    if not gpu_type:
+        gpu_type = "H100"
     # Compute capability mappings
-    compute_caps = {
-        "B200": "10.0",
-        "H100": "9.0",
-        "A100": "8.0",
-        "A10": "8.6",
-        "V100": "7.0",
-        "MI300X": "9.4",
-        "MI250X": "9.0",
-    }
-    compute_capability = compute_caps.get(gpu_type, "8.0")
+    if detected_gpu:
+        from wafer_core.gpu_detect import get_compute_capability
+        compute_capability = get_compute_capability(detected_gpu)
+    else:
+        compute_caps = {
+            "B200": "10.0",
+            "H100": "9.0",
+            "A100": "8.0",
+            "A10": "8.6",
+            "V100": "7.0",
+            "MI300X": "9.4",
+            "MI250X": "9.0",
+            "RTX 5090": "10.0",
+            "RTX 4090": "8.9",
+            "RTX 3090": "8.6",
+        }
+        compute_capability = compute_caps.get(gpu_type, "8.0")
     # Build target data
     target_data = {
@@ -2855,6 +3659,12 @@ def init_ssh(
     if docker_image:
         target_data["docker_image"] = docker_image
+    # Add torch requirements if detected
+    if torch_package:
+        target_data["torch_package"] = torch_package
+    if torch_index_url:
+        target_data["torch_index_url"] = torch_index_url
     try:
         target = save_target(target_data)
         typer.echo(f"✓ Created target: {target.name}")
@@ -2862,9 +3672,12 @@ def init_ssh(
         typer.echo(f"  Host: {host}")
         typer.echo(f"  GPU IDs: {parsed_gpu_ids}")
         typer.echo(f"  GPU Type: {gpu_type}")
+        typer.echo(f"  Compute: {compute_capability}")
         typer.echo(f"  NCU: {'Yes' if ncu else 'No'}")
         if docker_image:
             typer.echo(f"  Docker: {docker_image}")
+        if torch_package:
+            typer.echo(f"  Torch: {torch_package}")
         typer.echo("")
         typer.echo(
             f"Usage: wafer evaluate --target {name} --impl kernel.py --reference ref.py --test-cases tests.json"
@@ -2874,6 +3687,44 @@ def init_ssh(
         raise typer.Exit(1) from None
+def _extract_gpu_type(gpu_name: str) -> str:
+    """Extract the short GPU type from a full GPU product name.
+    The name is upper-cased and searched for each known model in list order;
+    the first hit is returned. A model only counts as a hit when it is not
+    immediately followed by another digit, so "RTX A1000" is no longer
+    mistaken for an "A100", while suffixed names such as "A10G" still map
+    to "A10". If no known model is found, the "NVIDIA " / "AMD " vendor
+    prefixes are removed and the remaining name is returned stripped.
+    Args:
+        gpu_name: Full product name of the GPU.
+    Returns:
+        The matching entry from the known-model list, or the cleaned name.
+    Examples:
+        "NVIDIA H100 80GB HBM3" -> "H100"
+        "NVIDIA GeForce RTX 4090" -> "RTX 4090"
+        "AMD Instinct MI300X OAM" -> "MI300X"
+        "NVIDIA RTX A1000 Laptop GPU" -> "RTX A1000 Laptop GPU"
+    """
+    import re
+    # Order matters: the first model found in the name wins.
+    known_types = (
+        "B200",
+        "B100",
+        "H200",
+        "H100",
+        "A100",
+        "A10",
+        "V100",
+        "RTX 5090",
+        "RTX 5080",
+        "RTX 4090",
+        "RTX 4080",
+        "RTX 3090",
+        "RTX 3080",
+        "MI300X",
+        "MI250X",
+        "MI100",
+    )
+    gpu_name_upper = gpu_name.upper()
+    for gpu_type in known_types:
+        # (?!\d) rejects a hit that is only the numeric prefix of a longer
+        # model number (e.g. "A100" inside "A1000").
+        if re.search(re.escape(gpu_type) + r"(?!\d)", gpu_name_upper):
+            return gpu_type
+    # Fallback: return cleaned name
+    return gpu_name.replace("NVIDIA ", "").replace("AMD ", "").strip()
 @targets_app.command("add")
 def targets_add(
     file_path: Path = typer.Argument(..., help="Path to target TOML file"),
@@ -2956,6 +3807,93 @@ def targets_show(
         raise typer.Exit(1) from None
+@targets_app.command("probe")
+def targets_probe(
+    name: str = typer.Argument(..., help="Target name"),
+) -> None:
+    """Probe a target to discover available compilation backends.
+    Connects to the target and checks what's available:
+    - Triton
+    - torch.compile/inductor
+    - HIP/hipcc or CUDA/nvcc
+    - ROCm or CUDA version
+    - Python packages (torch, triton, etc.)
+    Example:
+        wafer config targets probe runpod-mi300x
+    """
+    import trio
+    from .targets import ProbeError, load_target, probe_target_capabilities
+    # A missing target config is a user error: report it and exit 1.
+    try:
+        target = load_target(name)
+    except FileNotFoundError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    typer.echo(f"Probing target: {name}...")
+    try:
+        capabilities = trio.run(probe_target_capabilities, target)
+    except ProbeError as e:
+        # ProbeError already has actionable context
+        typer.echo(f"\nError: {e}", err=True)
+        raise typer.Exit(1) from None
+    except Exception as e:
+        # Unexpected errors - include type for debugging
+        typer.echo(f"\nUnexpected error probing target: {type(e).__name__}: {e}", err=True)
+        raise typer.Exit(1) from None
+    # Display results; optional fields are printed only when present.
+    typer.echo(f"\nTarget: {name}")
+    if capabilities.get("gpu_name"):
+        typer.echo(f"  GPU: {capabilities['gpu_name']}")
+    if capabilities.get("compute_capability"):
+        typer.echo(f"  Compute: {capabilities['compute_capability']}")
+    typer.echo("\n  Compilation Backends:")
+    backends = capabilities.get("backends", {})
+    # Triton
+    triton_ver = backends.get("triton")
+    if triton_ver:
+        typer.echo(f"    ✓ Triton: {triton_ver}")
+    else:
+        typer.echo("    ✗ Triton: not installed")
+    # torch.compile is reported as available only when both Triton and torch
+    # were found; on failure, name the component that is actually missing
+    # instead of always blaming Triton.
+    # NOTE(review): torch is read from `backends` here but from `packages`
+    # further down - confirm which mapping the probe populates.
+    if triton_ver and backends.get("torch"):
+        typer.echo("    ✓ torch.compile/inductor: available")
+    elif not triton_ver:
+        typer.echo("    ✗ torch.compile/inductor: requires Triton")
+    else:
+        typer.echo("    ✗ torch.compile/inductor: requires PyTorch")
+    # HIP/CUDA compiler (hipcc takes precedence when both are reported)
+    if backends.get("hipcc"):
+        typer.echo(f"    ✓ HIP/hipcc: {backends['hipcc']}")
+    elif backends.get("nvcc"):
+        typer.echo(f"    ✓ CUDA/nvcc: {backends['nvcc']}")
+    else:
+        typer.echo("    ✗ No GPU compiler found")
+    # ROCm/CUDA version
+    if capabilities.get("rocm_version"):
+        typer.echo(f"    ROCm: {capabilities['rocm_version']}")
+    if capabilities.get("cuda_version"):
+        typer.echo(f"    CUDA: {capabilities['cuda_version']}")
+    typer.echo("\n  Python Environment:")
+    typer.echo(f"    Python: {capabilities.get('python_version', 'unknown')}")
+    packages = capabilities.get("packages", {})
+    if packages.get("torch"):
+        typer.echo(f"    PyTorch: {packages['torch']}")
+    if triton_ver:
+        typer.echo(f"    Triton: {triton_ver}")
 @targets_app.command("remove")
 def targets_remove(
     name: str = typer.Argument(..., help="Target name"),
@@ -3086,6 +4024,92 @@ def targets_pods() -> None:
         typer.echo()
+# ── Pool commands ───────────────────────────────────────────────────────────
+@targets_app.command("pool-list")
+def targets_pool_list() -> None:
+    """List all configured target pools.
+    Example:
+        wafer config targets pool-list
+    """
+    from .targets import get_pool, list_pools
+    pool_names = list_pools()
+    if pool_names:
+        typer.echo("Configured pools:\n")
+        for pool_name in pool_names:
+            # Best effort: a pool that fails to load is shown with its error
+            # inline so the remaining pools are still listed.
+            try:
+                summary = ", ".join(get_pool(pool_name))
+            except Exception as e:
+                summary = f"(error: {e})"
+            typer.echo(f"  {pool_name}: {summary}")
+        return
+    # Nothing configured: print a short hint showing the config format.
+    for hint_line in (
+        "No pools configured",
+        "",
+        "Define pools in ~/.wafer/config.toml:",
+        "  [pools.my-pool]",
+        '  targets = ["target-1", "target-2"]',
+    ):
+        typer.echo(hint_line)
+@targets_app.command("pool-create")
+def targets_pool_create(
+    name: str = typer.Argument(..., help="Pool name"),
+    targets: list[str] = typer.Argument(..., help="Target names to include in pool"),
+) -> None:
+    """Create or update a target pool.
+    Example:
+        wafer config targets pool-create mi300x-pool mi300x-1 mi300x-2 mi300x-3
+    """
+    from .targets import save_pool
+    # Only FileNotFoundError is turned into a CLI error (exit code 1);
+    # any other exception from save_pool propagates unchanged.
+    try:
+        save_pool(name, targets)
+    except FileNotFoundError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    else:
+        member_count = len(targets)
+        typer.echo(f"Pool '{name}' created with {member_count} targets")
+@targets_app.command("pool-status")
+def targets_pool_status(
+    name: str = typer.Argument(..., help="Pool name"),
+) -> None:
+    """Show status of targets in a pool (locked/available).
+    Example:
+        wafer config targets pool-status mi300x-pool
+    """
+    from .target_lock import get_lock_holder, is_target_locked
+    from .targets import get_pool
+    try:
+        members = get_pool(name)
+    except FileNotFoundError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    total = len(members)
+    typer.echo(f"Pool '{name}' ({total} targets):\n")
+    free_count = 0
+    for member in members:
+        if not is_target_locked(member):
+            typer.echo(f"  [free]  {member}")
+            free_count += 1
+            continue
+        # Locked: show the holder's pid when one is reported.
+        holder = get_lock_holder(member)
+        suffix = f" (pid {holder})" if holder else ""
+        typer.echo(f"  [busy]  {member}{suffix}")
+    typer.echo("")
+    typer.echo(f"Available: {free_count}/{total}")
 # =============================================================================
 # Billing commands
 # =============================================================================
@@ -3119,7 +4143,9 @@ def billing_usage(
 @billing_app.command("topup")
 def billing_topup(
     amount: int = typer.Argument(25, help="Amount in dollars ($10-$500)"),
-    no_browser: bool = typer.Option(False, "--no-browser", help="Print URL instead of opening browser"),
+    no_browser: bool = typer.Option(
+        False, "--no-browser", help="Print URL instead of opening browser"
+    ),
 ) -> None:
     """Add credits to your account.
@@ -3165,7 +4191,9 @@ def billing_topup(
 @billing_app.command("portal")
 def billing_portal(
-    no_browser: bool = typer.Option(False, "--no-browser", help="Print URL instead of opening browser"),
+    no_browser: bool = typer.Option(
+        False, "--no-browser", help="Print URL instead of opening browser"
+    ),
 ) -> None:
     """Open Stripe billing portal.
@@ -3198,6 +4226,81 @@ def billing_portal(
         raise typer.Exit(1) from None
+# =============================================================================
+# SSH Keys commands (BYOK - Bring Your Own Key)
+# =============================================================================
+@ssh_keys_app.command("list")
+def ssh_keys_list(
+    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
+) -> None:
+    """List all registered SSH public keys.
+    Example:
+        wafer ssh-keys list
+        wafer ssh-keys list --json
+    """
+    from .ssh_keys import list_ssh_keys
+    # A RuntimeError becomes a CLI error (exit code 1) chained to its cause.
+    try:
+        listing = list_ssh_keys(json_output=json_output)
+    except RuntimeError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from e
+    else:
+        typer.echo(listing)
+@ssh_keys_app.command("add")
+def ssh_keys_add(
+    pubkey_path: Path | None = typer.Argument(
+        None, help="Path to public key file (auto-detects ~/.ssh/id_ed25519.pub if not specified)"
+    ),
+    name: str | None = typer.Option(None, "--name", "-n", help="Friendly name for the key"),
+    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
+) -> None:
+    """Add an SSH public key.
+    If no path is specified, auto-detects keys from ~/.ssh/ in preference order:
+    id_ed25519.pub, id_rsa.pub, id_ecdsa.pub.
+    Example:
+        wafer ssh-keys add                              # Auto-detect
+        wafer ssh-keys add ~/.ssh/id_rsa.pub            # Specific file
+        wafer ssh-keys add ~/.ssh/id_ed25519.pub --name laptop
+    """
+    from .ssh_keys import add_ssh_key
+    # All arguments are forwarded as-is; a RuntimeError becomes a CLI error
+    # (exit code 1) chained to its cause.
+    try:
+        outcome = add_ssh_key(
+            pubkey_path=pubkey_path,
+            name=name,
+            json_output=json_output,
+        )
+    except RuntimeError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from e
+    else:
+        typer.echo(outcome)
+@ssh_keys_app.command("remove")
+def ssh_keys_remove(
+    key_id: str = typer.Argument(..., help="UUID of the SSH key to remove"),
+    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
+) -> None:
+    """Remove an SSH public key.
+    Get the key ID from 'wafer ssh-keys list'.
+    Example:
+        wafer ssh-keys remove abc123-def456-...
+    """
+    from .ssh_keys import remove_ssh_key
+    # A RuntimeError becomes a CLI error (exit code 1) chained to its cause.
+    try:
+        outcome = remove_ssh_key(key_id=key_id, json_output=json_output)
+    except RuntimeError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from e
+    else:
+        typer.echo(outcome)
 # =============================================================================
 # Workspaces commands
 # =============================================================================
@@ -3226,21 +4329,34 @@ def workspaces_list(
 @workspaces_app.command("create")
 def workspaces_create(
     name: str = typer.Argument(..., help="Workspace name"),
-    gpu_type: str = typer.Option("B200", "--gpu", "-g", help="GPU type (default: B200)"),
+    gpu_type: str = typer.Option("B200", "--gpu", "-g", help="GPU type: MI300X (AMD) or B200 (NVIDIA, default)"),
     image: str | None = typer.Option(None, "--image", "-i", help="Docker image (optional)"),
+    wait: bool = typer.Option(False, "--wait", "-w", help="Wait for provisioning and show SSH credentials"),
     json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
 ) -> None:
     """Create a new workspace.
+    Available GPUs:
+        MI300X  AMD Instinct MI300X (192GB HBM3, ROCm)
+        B200    NVIDIA Blackwell B200 (180GB HBM3e, CUDA)
     Example:
-        wafer workspaces create my-kernel
-        wafer workspaces create my-kernel --gpu H100
+        wafer workspaces create my-kernel                # B200 (default)
+        wafer workspaces create my-kernel --gpu MI300X   # AMD MI300X
+        wafer workspaces create my-kernel --gpu B200     # NVIDIA B200
         wafer workspaces create my-kernel --image pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel
+        wafer workspaces create my-kernel --wait
     """
     from .workspaces import create_workspace
     try:
-        result = create_workspace(name, gpu_type=gpu_type, image=image, json_output=json_output)
+        result = create_workspace(
+            name,
+            gpu_type=gpu_type,
+            image=image,
+            wait=wait,
+            json_output=json_output,
+        )
         typer.echo(result)
     except RuntimeError as e:
         typer.echo(f"Error: {e}", err=True)
@@ -3250,16 +4366,23 @@ def workspaces_create(
 @workspaces_app.command("delete")
 def workspaces_delete(
     workspace_id: str = typer.Argument(..., help="Workspace ID to delete"),
+    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation prompt"),
     json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
 ) -> None:
     """Delete a workspace.
     Example:
         wafer workspaces delete ws_abc123
+        wafer workspaces delete ws_abc123 -y
     """
     from .workspaces import delete_workspace
     try:
+        if not yes:
+            confirm = typer.confirm(f"Delete workspace '{workspace_id}'?")
+            if not confirm:
+                typer.echo("Cancelled.")
+                raise typer.Exit(0)
         result = delete_workspace(workspace_id, json_output=json_output)
         typer.echo(result)
     except RuntimeError as e:
@@ -3267,32 +4390,6 @@ def workspaces_delete(
         raise typer.Exit(1) from None
-@workspaces_app.command("attach")
-def workspaces_attach(
-    workspace_id: str = typer.Argument(..., help="Workspace ID to attach to"),
-    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
-) -> None:
-    """Attach to a workspace (get SSH credentials).
-    This will:
-    1. Start the workspace if needed
-    2. Return SSH connection details
-    3. Save the private key to ~/.wafer/keys/
-    Example:
-        wafer workspaces attach ws_abc123
-        wafer workspaces attach ws_abc123 --json
-    """
-    from .workspaces import attach_workspace
-    try:
-        result = attach_workspace(workspace_id, json_output=json_output)
-        typer.echo(result)
-    except RuntimeError as e:
-        typer.echo(f"Error: {e}", err=True)
-        raise typer.Exit(1) from None
 @workspaces_app.command("show")
 def workspaces_show(
     workspace_id: str = typer.Argument(..., help="Workspace ID to show"),
@@ -3314,12 +4411,19 @@ def workspaces_show(
         raise typer.Exit(1) from None
-@workspaces_app.command("exec", context_settings={"allow_interspersed_args": False})
+@workspaces_app.command(
+    "exec",
+    context_settings={
+        "allow_interspersed_args": False,
+        "ignore_unknown_options": True,
+        "allow_extra_args": True,
+    },
+)
 def workspaces_exec(
+    ctx: typer.Context,
     workspace: str | None = typer.Argument(
         None, help="Workspace name or ID (optional if default set)"
     ),
-    command: list[str] = typer.Argument(..., help="Command to execute on GPU"),
     timeout: int | None = typer.Option(
         None,
         "--timeout",
@@ -3332,17 +4436,30 @@ def workspaces_exec(
         "-s",
         help="Sync local directory to workspace before executing",
     ),
+    gpu: bool = typer.Option(False, "--gpu", help="Force GPU routing (default behavior)"),
+    cpu: bool = typer.Option(False, "--cpu", help="Run in workspace container (no GPU)"),
+    baremetal: bool = typer.Option(
+        False, "--baremetal", help="Force baremetal target (for hardware counters like ncu/nsys)"
+    ),
+    pull_image: bool = typer.Option(False, "--pull-image", help="Pull image on target if missing"),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
     quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
 ) -> None:
-    """Execute a command in workspace with GPU routing.
+    """Execute a command in workspace.
+    By default, auto-detects whether to route to GPU based on the command.
+    Use --gpu, --cpu, or --baremetal to override.
-    Runs the command on the workspace's configured GPU target (Modal, baremetal, etc.)
-    and streams output back. No SSH or zsh plugin required.
+    Routing options:
+      --gpu       Force GPU container (Modal or baremetal with GPU)
+      --cpu       Run in workspace container directly (no GPU)
+      --baremetal Force baremetal target (for ncu, nsys, hardware counters)
     If workspace is not specified, uses the default workspace from config,
     or the only workspace if you have exactly one.
+    IMPORTANT: Options must come before the workspace name.
     Examples:
         wafer workspaces exec dev -- python train.py
         wafer workspaces exec dev -- python -c "import torch; print(torch.cuda.is_available())"
@@ -3353,6 +4470,49 @@ def workspaces_exec(
     from .global_config import get_defaults, get_preferences
     from .workspaces import exec_command, resolve_workspace, sync_files
+    # Enforce option ordering to avoid treating CLI flags as remote commands
+    known_options = {
+        "--timeout",
+        "-t",
+        "--sync",
+        "-s",
+        "--gpu",
+        "--cpu",
+        "--baremetal",
+        "--pull-image",
+        "--verbose",
+        "-v",
+        "--quiet",
+        "-q",
+        "--help",
+        "-h",
+    }
+    for arg in ctx.args:
+        if arg == "--":
+            break
+        if arg in known_options:
+            typer.echo(
+                "Error: options must come before the workspace name. "
+                "Example: wafer workspaces exec --pull-image dev -- python -V",
+                err=True,
+            )
+            raise typer.Exit(1)
+    # Validate mutually exclusive routing flags
+    routing_flags = sum([gpu, cpu, baremetal])
+    if routing_flags > 1:
+        typer.echo("Error: --gpu, --cpu, and --baremetal are mutually exclusive", err=True)
+        raise typer.Exit(1)
+    # Determine routing (None = auto-detect)
+    routing: str | None = None
+    if gpu:
+        routing = "gpu"
+    elif cpu:
+        routing = "cpu"
+    elif baremetal:
+        routing = "baremetal"
     # Resolve workspace (specified, config default, or single workspace)
     try:
         resolved_workspace = resolve_workspace(workspace)
@@ -3377,7 +4537,8 @@ def workspaces_exec(
         show_status = prefs.mode == "explicit"
     if show_status:
-        typer.echo(f"[wafer] Workspace: {resolved_workspace}", err=True)
+        routing_label = routing or "auto"
+        typer.echo(f"[wafer] Workspace: {resolved_workspace} (routing: {routing_label})", err=True)
     # Sync files if requested
     if sync is not None:
@@ -3403,114 +4564,617 @@ def workspaces_exec(
             typer.echo(f"Error: {e}", err=True)
             raise typer.Exit(1) from None
+    # Get command from context args (passthrough after --)
+    import shlex
+    command = list(ctx.args)
+    if command and command[0] == "--":
+        command = command[1:]
+    if not command:
+        typer.echo("Error: No command specified", err=True)
+        raise typer.Exit(1)
     if show_status:
         typer.echo(f"[wafer] Executing (timeout: {effective_timeout}s)...", err=True)
-    # Join command list into shell command string, stripping leading "--" separator
+    # Build command string
+    # Handle two cases:
+    # 1. Single element: user quoted the whole command (e.g., "echo hello world")
+    #    -> use directly, don't re-quote
+    # 2. Multiple elements: user passed separate args (e.g., -- python -c "print(1)")
+    #    -> use shlex.join to properly quote args with spaces
+    if len(command) == 1:
+        command_str = command[0]
+    else:
+        command_str = shlex.join(command)
+    try:
+        exit_code = exec_command(
+            workspace_id=resolved_workspace,
+            command=command_str,
+            timeout_seconds=effective_timeout,
+            routing=routing,
+            pull_image=pull_image,
+        )
+    except RuntimeError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Exit code: {exit_code}", err=True)
+    raise typer.Exit(exit_code)
+@workspaces_app.command("ssh")
+def workspaces_ssh(
+    workspace: str | None = typer.Argument(
+        None, help="Workspace name or ID (optional if default set)"
+    ),
+) -> None:
+    """SSH into a workspace.
+    Uses workspace SSH credentials once the workspace is running.
+    If workspace is not specified, uses the default workspace.
+    Examples:
+        wafer workspaces ssh dev
+        wafer workspaces ssh           # uses default workspace
+    """
+    import os
+    from .workspaces import VALID_STATUSES, get_workspace_raw, resolve_workspace
+    # Resolve workspace
+    try:
+        resolved_workspace = resolve_workspace(workspace)
+    except RuntimeError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    typer.echo(f"Connecting to workspace: {resolved_workspace}...", err=True)
+    # Get SSH credentials from workspace
+    try:
+        ws = get_workspace_raw(resolved_workspace)
+    except RuntimeError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    workspace_status = ws.get("status")
+    # Explicit check rather than `assert`: asserts are removed under
+    # `python -O`, and a failing one would show the user a raw traceback
+    # instead of a CLI error.
+    if workspace_status not in VALID_STATUSES:
+        typer.echo(
+            f"Error: Workspace {resolved_workspace} has invalid status '{workspace_status}'. "
+            f"Valid statuses: {VALID_STATUSES}",
+            err=True,
+        )
+        raise typer.Exit(1)
+    if workspace_status != "running":
+        typer.echo(f"Error: Workspace is {workspace_status}. Wait for it to be running.", err=True)
+        raise typer.Exit(1)
+    ssh_host = ws.get("ssh_host")
+    ssh_port = ws.get("ssh_port")
+    ssh_user = ws.get("ssh_user")
+    if not (ssh_host and ssh_port and ssh_user):
+        typer.echo("Error: SSH credentials not available yet.", err=True)
+        raise typer.Exit(1)
+    # No -i option is passed, so ssh picks the identity itself (its default
+    # keys / agent). Host-key verification is disabled and nothing is written
+    # to known_hosts.
+    # NOTE(review): this accepts any host key without verification - confirm
+    # that is acceptable for workspace hosts.
+    ssh_args = [
+        "ssh",
+        "-p",
+        str(ssh_port),
+        "-o",
+        "StrictHostKeyChecking=no",
+        "-o",
+        "UserKnownHostsFile=/dev/null",
+        f"{ssh_user}@{ssh_host}",
+    ]
+    # Replace current process with SSH (does not return on success)
+    os.execvp("ssh", ssh_args)
+@workspaces_app.command("sync")
+def workspaces_sync(
+    workspace: str | None = typer.Argument(
+        None, help="Workspace name or ID (optional if default set)"
+    ),
+    path: Path = typer.Argument(..., help="Local file or directory to sync"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
+    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
+) -> None:
+    """Sync local files to workspace.
+    Uses rsync over SSH to sync files to the workspace's /workspace directory.
+    If workspace is not specified, uses the default workspace.
+    Examples:
+        wafer workspaces sync dev ./my-project
+        wafer workspaces sync ./my-project        # uses default workspace
+        wafer workspaces sync dev .               # sync current directory
+        wafer workspaces sync dev ./script.py     # sync single file
+    """
+    # NOTE(review): confirm the one-argument form in the examples parses as
+    # intended - `workspace` is declared before the required `path`.
+    from .global_config import get_preferences
+    from .workspaces import resolve_workspace, sync_files
+    # Verbosity: --quiet wins, then --verbose, otherwise the configured mode.
+    prefs = get_preferences()
+    show_status = (not quiet) and (verbose or prefs.mode == "explicit")
+    # Reject a missing path before contacting the workspace.
+    if not path.exists():
+        typer.echo(f"Error: Path not found: {path}", err=True)
+        raise typer.Exit(1)
+    try:
+        resolved_workspace = resolve_workspace(workspace)
+    except RuntimeError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    def on_progress(msg: str) -> None:
+        # Progress messages are shown only when status output is enabled.
+        if not show_status:
+            return
+        typer.echo(f"[wafer] {msg}", err=True)
+    if show_status:
+        typer.echo(f"[wafer] Syncing {path} to workspace {resolved_workspace}...", err=True)
+    local_path = path.resolve()
+    try:
+        # NOTE(review): the returned pair is unpacked but never reported;
+        # confirm whether the warning should be shown to the user.
+        _file_count, _warning = sync_files(
+            resolved_workspace, local_path, on_progress=on_progress
+        )
+    except RuntimeError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+# =============================================================================
+# Target operations commands (exec/ssh/sync)
+# =============================================================================
+@targets_ops_app.command("exec", context_settings={"allow_interspersed_args": False})
+def targets_exec(
+    target: str = typer.Argument(
+        ...,
+        help="Target name",
+        autocompletion=complete_target_name,
+    ),
+    command: list[str] = typer.Argument(..., help="Command to execute"),
+    timeout: int | None = typer.Option(
+        None,
+        "--timeout",
+        "-t",
+        help="Execution timeout in seconds (default: 300)",
+    ),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
+    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
+) -> None:
+    """Execute a command on a configured target.
+    Provisions the target if needed (RunPod, DigitalOcean), then runs the command via SSH.
+    For cloud targets, the instance is kept alive after execution - use
+    'wafer config targets cleanup <name>' to terminate.
+    Supported targets: RunPod, DigitalOcean, SSH (baremetal/vm).
+    Not supported: Modal (serverless), Local (no SSH), Workspace (use 'wafer workspaces exec').
+    Examples:
+        wafer targets exec runpod-mi300x -- python -c "import torch; print(torch.cuda.is_available())"
+        wafer targets exec runpod-mi300x -- rocm-smi
+        wafer targets exec my-ssh-server -- nvidia-smi
+        wafer targets exec --timeout 60 runpod-mi300x "echo hello && ls -la"
+    """
+    # Because this command is registered with allow_interspersed_args=False,
+    # option parsing stops at the first positional token. wafer's own flags
+    # (--timeout, --verbose, --quiet) therefore have to precede TARGET; anything
+    # after it, including flag-looking tokens, is collected into `command`.
+    from .global_config import get_preferences
+    from .targets import load_target
+    from .targets_ops import TargetExecError, exec_on_target_sync, get_target_ssh_info
+    # Determine verbosity: --quiet wins over --verbose; with neither flag the
+    # saved preference decides (status lines only when mode is "explicit").
+    prefs = get_preferences()
+    if quiet:
+        show_status = False
+    elif verbose:
+        show_status = True
+    else:
+        show_status = prefs.mode == "explicit"
+    # Load target; both failure modes are reported on stderr and exit with 1.
+    try:
+        target_config = load_target(target)
+    except FileNotFoundError as e:
+        typer.echo(f"Error: {e}", err=True)
+        typer.echo("List available targets with: wafer config targets list", err=True)
+        raise typer.Exit(1) from None
+    except ValueError as e:
+        typer.echo(f"Error loading target config: {e}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Target: {target} ({type(target_config).__name__})", err=True)
+    # Get SSH info (may provision)
+    if show_status:
+        typer.echo("[wafer] Connecting to target...", err=True)
+    try:
+        ssh_info = trio.run(get_target_ssh_info, target_config)
+    except TargetExecError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Connected: {ssh_info.user}@{ssh_info.host}:{ssh_info.port}", err=True)
+    # Build command string
     if isinstance(command, list):
         import shlex
-        # Remove leading "--" if present (typer passes it through with allow_interspersed_args=False)
+        # Remove leading "--" if present
         if command and command[0] == "--":
             command = command[1:]
-        # Use shlex.join to properly quote args containing spaces/special chars
-        command_str = shlex.join(command)
+        if not command:
+            typer.echo("Error: No command specified", err=True)
+            raise typer.Exit(1)
+        # A single argument is forwarded verbatim, so a quoted snippet such as
+        # "echo hello && ls -la" is not re-quoted; several arguments are joined
+        # with shell quoting so each one stays a separate word.
+        if len(command) == 1:
+            command_str = command[0]
+        else:
+            command_str = shlex.join(command)
     else:
         command_str = command
+    # Default timeout
+    effective_timeout = timeout if timeout is not None else 300
+    if show_status:
+        typer.echo(f"[wafer] Executing (timeout: {effective_timeout}s)...", err=True)
+    # Execute
+    try:
+        exit_code = exec_on_target_sync(ssh_info, command_str, effective_timeout)
+    except TargetExecError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Exit code: {exit_code}", err=True)
+    # Propagate the remote command's exit code as this CLI's exit status.
+    raise typer.Exit(exit_code)
+@targets_ops_app.command("ssh")
+def targets_ssh(
+    target: str = typer.Argument(
+        ...,
+        help="Target name",
+        autocompletion=complete_target_name,
+    ),
+) -> None:
+    """SSH into a configured target.
+    Provisions the target if needed (RunPod, DigitalOcean), then starts an interactive SSH session.
+    For cloud targets, the instance is kept alive - use 'wafer config targets cleanup <name>' to terminate.
+    Examples:
+        wafer targets ssh runpod-mi300x
+        wafer targets ssh my-baremetal-server
+    """
+    from .targets import load_target
+    from .targets_ops import TargetExecError, get_target_ssh_info
+    # Load target; both failure modes are reported on stderr and exit with 1.
+    try:
+        target_config = load_target(target)
+    except FileNotFoundError as e:
+        typer.echo(f"Error: {e}", err=True)
+        typer.echo("List available targets with: wafer config targets list", err=True)
+        raise typer.Exit(1) from None
+    except ValueError as e:
+        typer.echo(f"Error loading target config: {e}", err=True)
+        raise typer.Exit(1) from None
+    typer.echo(f"Connecting to target: {target}...", err=True)
+    # Get SSH info (may provision)
+    try:
+        ssh_info = trio.run(get_target_ssh_info, target_config)
+    except TargetExecError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    # Build SSH command. Host-key checking is disabled and no known_hosts file
+    # is written for this session.
+    ssh_args = [
+        "ssh",
+        "-i",
+        str(ssh_info.key_path),
+        "-p",
+        str(ssh_info.port),
+        "-o",
+        "StrictHostKeyChecking=no",
+        "-o",
+        "UserKnownHostsFile=/dev/null",
+        f"{ssh_info.user}@{ssh_info.host}",
+    ]
+    # Replace current process with SSH. execvp never returns on success; it
+    # raises OSError when the ssh binary cannot be found or executed, which is
+    # reported like every other failure here instead of as a traceback.
+    try:
+        os.execvp("ssh", ssh_args)
+    except OSError as e:
+        typer.echo(f"Error: Failed to launch ssh: {e}", err=True)
+        raise typer.Exit(1) from None
+@targets_ops_app.command("sync")
+def targets_sync(
+    target: str = typer.Argument(
+        ...,
+        help="Target name",
+        autocompletion=complete_target_name,
+    ),
+    path: Path = typer.Argument(..., help="Local file or directory to sync"),
+    dest: str | None = typer.Option(
+        None,
+        "--dest",
+        "-d",
+        help="Remote destination path (default: /tmp/<basename>)",
+    ),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
+    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
+) -> None:
+    """Sync local files to a configured target.
+    Uses rsync over SSH to copy files to the target. Provisions the target if needed.
+    Examples:
+        wafer targets sync runpod-mi300x ./my-project
+        wafer targets sync runpod-mi300x ./script.py --dest /workspace/script.py
+        wafer targets sync my-server ./kernels --dest /tmp/kernels
+    """
+    from .global_config import get_preferences
+    from .targets import load_target
+    from .targets_ops import TargetExecError, get_target_ssh_info, sync_to_target
+    # Determine verbosity: --quiet wins over --verbose; with neither flag the
+    # saved preference decides (status lines only when mode is "explicit").
+    prefs = get_preferences()
+    if quiet:
+        show_status = False
+    elif verbose:
+        show_status = True
+    else:
+        show_status = prefs.mode == "explicit"
+    # Validate path before loading or contacting the target, so a typo in the
+    # local path fails without any remote work.
+    if not path.exists():
+        typer.echo(f"Error: Path not found: {path}", err=True)
+        raise typer.Exit(1)
+    # Load target; both failure modes are reported on stderr and exit with 1.
+    try:
+        target_config = load_target(target)
+    except FileNotFoundError as e:
+        typer.echo(f"Error: {e}", err=True)
+        typer.echo("List available targets with: wafer config targets list", err=True)
+        raise typer.Exit(1) from None
+    except ValueError as e:
+        typer.echo(f"Error loading target config: {e}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Target: {target} ({type(target_config).__name__})", err=True)
+    # Get SSH info (may provision)
+    if show_status:
+        typer.echo("[wafer] Connecting to target...", err=True)
+    try:
+        ssh_info = trio.run(get_target_ssh_info, target_config)
+    except TargetExecError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Connected: {ssh_info.user}@{ssh_info.host}:{ssh_info.port}", err=True)
+    # Sync. Progress messages from sync_to_target are forwarded to stderr only
+    # when status output is enabled.
+    def on_progress(msg: str) -> None:
+        if show_status:
+            typer.echo(f"[wafer] {msg}", err=True)
     try:
-        exit_code = exec_command(
-            workspace_id=resolved_workspace,
-            command=command_str,
-            timeout_seconds=effective_timeout,
-        )
-    except RuntimeError as e:
+        # The local path is made absolute before the transfer; `dest` is passed
+        # through unchanged and may be None.
+        file_count = sync_to_target(ssh_info, path.resolve(), dest, on_progress)
+    except TargetExecError as e:
         typer.echo(f"Error: {e}", err=True)
         raise typer.Exit(1) from None
     if show_status:
-        typer.echo(f"[wafer] Exit code: {exit_code}", err=True)
-    raise typer.Exit(exit_code)
+        typer.echo(f"[wafer] Done. Synced {file_count} files.", err=True)
-@workspaces_app.command("ssh")
-def workspaces_ssh(
-    workspace: str | None = typer.Argument(
-        None, help="Workspace name or ID (optional if default set)"
-    ),
+@targets_ops_app.command("scp")
+def targets_scp(
+    source: str = typer.Argument(..., help="Source path (prefix with target: for remote)"),
+    dest: str = typer.Argument(..., help="Destination path (prefix with target: for remote)"),
+    recursive: bool = typer.Option(False, "-r", "--recursive", help="Copy directories recursively"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
+    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
 ) -> None:
-    """SSH into a workspace.
+    """Copy files to/from a target using scp-style syntax.
-    Gets SSH credentials via attach, then execs into SSH.
-    If workspace is not specified, uses the default workspace.
+    Use target: prefix to indicate remote paths. Exactly one of source or dest
+    must be remote.
     Examples:
-        wafer workspaces ssh dev
-        wafer workspaces ssh           # uses default workspace
+        wafer targets scp runpod-mi300x:/tmp/trace.json ./trace.json  # download
+        wafer targets scp ./script.py runpod-mi300x:/tmp/script.py    # upload
+        wafer targets scp -r ./kernels runpod-mi300x:/tmp/kernels     # upload dir
+        wafer targets scp -r runpod-mi300x:/tmp/results ./results     # download dir
     """
-    import os
+    from .global_config import get_preferences
+    from .targets import load_target
+    from .targets_ops import TargetExecError, get_target_ssh_info, parse_scp_path, scp_transfer
+    # Determine verbosity: --quiet wins over --verbose; with neither flag the
+    # saved preference decides (status lines only when mode is "explicit").
+    prefs = get_preferences()
+    if quiet:
+        show_status = False
+    elif verbose:
+        show_status = True
+    else:
+        show_status = prefs.mode == "explicit"
-    from .workspaces import get_ssh_credentials, resolve_workspace
+    # Parse source and dest into (target name, path) pairs. The checks below
+    # treat a falsy target name as "local path".
+    source_target, source_path = parse_scp_path(source)
+    dest_target, dest_path = parse_scp_path(dest)
-    # Resolve workspace
+    # Validate: exactly one must be remote
+    if source_target and dest_target:
+        typer.echo("Error: Both paths are remote. Use ssh to transfer between remotes.", err=True)
+        raise typer.Exit(1)
+    if not source_target and not dest_target:
+        typer.echo("Error: Both paths are local. Use regular cp command.", err=True)
+        raise typer.Exit(1)
+    # Determine direction and target
+    # NOTE(review): the validation above uses truthiness while this uses
+    # `is not None`; the two agree only if parse_scp_path never returns an empty
+    # string as the target name - confirm against parse_scp_path.
+    is_download = source_target is not None
+    target_name = source_target if is_download else dest_target
+    # Load target
     try:
-        resolved_workspace = resolve_workspace(workspace)
-    except RuntimeError as e:
-        typer.echo(f"Error: {e}", err=True)
+        target_config = load_target(target_name)
+    except FileNotFoundError:
+        typer.echo(f"Error: Target '{target_name}' not found.", err=True)
+        typer.echo("Run 'wafer config targets list' to see available targets.", err=True)
+        raise typer.Exit(1) from None
+    except ValueError as e:
+        typer.echo(f"Error loading target config: {e}", err=True)
         raise typer.Exit(1) from None
-    typer.echo(f"Connecting to workspace: {resolved_workspace}...", err=True)
+    # Validate local path exists (for upload). This runs before connecting, so
+    # a bad local path or a missing -r fails without contacting the target.
+    if not is_download:
+        local_path = Path(source_path)
+        if not local_path.exists():
+            typer.echo(f"Error: Local path '{source_path}' does not exist.", err=True)
+            raise typer.Exit(1)
+        if local_path.is_dir() and not recursive:
+            typer.echo(
+                f"Error: '{source_path}' is a directory. Use -r flag for recursive copy.", err=True
+            )
+            raise typer.Exit(1)
-    # Get SSH credentials (this calls attach)
+    if show_status:
+        typer.echo(f"[wafer] Target: {target_name} ({type(target_config).__name__})", err=True)
+        typer.echo("[wafer] Connecting to target...", err=True)
+    # Get SSH info (may provision)
     try:
-        creds = get_ssh_credentials(resolved_workspace)
-    except RuntimeError as e:
+        ssh_info = trio.run(get_target_ssh_info, target_config)
+    except TargetExecError as e:
         typer.echo(f"Error: {e}", err=True)
         raise typer.Exit(1) from None
-    # Exec into SSH - replaces this process
-    ssh_args = [
-        "ssh",
-        "-i",
-        str(creds.key_path),
-        "-p",
-        str(creds.port),
-        "-o",
-        "StrictHostKeyChecking=no",
-        "-o",
-        "UserKnownHostsFile=/dev/null",
-        f"{creds.user}@{creds.host}",
-    ]
+    if show_status:
+        typer.echo(f"[wafer] Connected: {ssh_info.user}@{ssh_info.host}:{ssh_info.port}", err=True)
+        direction = "Downloading" if is_download else "Uploading"
+        typer.echo(f"[wafer] {direction}...", err=True)
-    # Replace current process with SSH
-    os.execvp("ssh", ssh_args)
+    # Transfer
+    # NOTE(review): both branches pass the same arguments apart from the literal
+    # is_download value, so they could collapse into a single call with
+    # is_download=is_download.
+    try:
+        if is_download:
+            scp_transfer(ssh_info, source_path, dest_path, is_download=True, recursive=recursive)
+        else:
+            scp_transfer(ssh_info, source_path, dest_path, is_download=False, recursive=recursive)
+    except TargetExecError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    if show_status:
+        typer.echo("[wafer] Done.", err=True)
-@workspaces_app.command("sync")
-def workspaces_sync(
-    workspace: str | None = typer.Argument(
-        None, help="Workspace name or ID (optional if default set)"
+@targets_ops_app.command("ensure")
+def targets_ensure(  # noqa: PLR0915
+    target: str = typer.Argument(
+        None,
+        help="Target name",
+        autocompletion=complete_target_name,
     ),
-    path: Path = typer.Argument(..., help="Local file or directory to sync"),
+    tool: str = typer.Argument(None, help="Tool to ensure is installed"),
+    check_only: bool = typer.Option(False, "--check-only", "-c", help="Only check, don't install"),
+    force: bool = typer.Option(False, "--force", "-f", help="Reinstall even if present"),
+    list_tools: bool = typer.Option(False, "--list", "-l", help="List available tools"),
+    timeout: int = typer.Option(300, "--timeout", "-t", help="Installation timeout in seconds"),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
     quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
 ) -> None:
-    """Sync local files to workspace.
+    """Ensure a tool is installed on a target.
-    Uses rsync over SSH to sync files to the workspace's /workspace directory.
-    If workspace is not specified, uses the default workspace.
+    Checks if a tool exists on the target and installs it if missing.
+    Useful for profiling tools like rocprof-compute that aren't pre-installed.
     Examples:
-        wafer workspaces sync dev ./my-project
-        wafer workspaces sync ./my-project        # uses default workspace
-        wafer workspaces sync dev .               # sync current directory
-        wafer workspaces sync dev ./script.py     # sync single file
+        wafer targets ensure runpod-mi300x rocprof-compute
+        wafer targets ensure runpod-mi300x rocprof-compute --check-only
+        wafer targets ensure runpod-mi300x rocprof-compute --force
+        wafer targets ensure --list
     """
     from .global_config import get_preferences
-    from .workspaces import resolve_workspace, sync_files
+    from .targets import load_target
+    from .targets_ops import (
+        TOOL_REGISTRY,
+        TargetExecError,
+        ensure_tool,
+        get_target_platform,
+        get_target_ssh_info,
+    )
-    # Determine verbosity based on mode
+    # Handle --list flag
+    if list_tools:
+        typer.echo("Available tools:\n")
+        typer.echo("AMD tools:")
+        for name, spec in sorted(TOOL_REGISTRY.items()):
+            if spec.platform == "amd":
+                auto = "auto-install" if spec.install_cmd else "manual"
+                typer.echo(f"  {name:20} ({auto}) - {spec.description}")
+        typer.echo("\nNVIDIA tools:")
+        for name, spec in sorted(TOOL_REGISTRY.items()):
+            if spec.platform == "nvidia":
+                auto = "auto-install" if spec.install_cmd else "manual"
+                typer.echo(f"  {name:20} ({auto}) - {spec.description}")
+        typer.echo("\nCross-platform:")
+        for name, spec in sorted(TOOL_REGISTRY.items()):
+            if spec.platform == "any":
+                auto = "auto-install" if spec.install_cmd else "manual"
+                typer.echo(f"  {name:20} ({auto}) - {spec.description}")
+        return
+    # Require target and tool if not listing
+    if not target:
+        typer.echo("Error: Missing argument 'TARGET'", err=True)
+        typer.echo("Usage: wafer targets ensure TARGET TOOL", err=True)
+        typer.echo("   or: wafer targets ensure --list", err=True)
+        raise typer.Exit(1)
+    if not tool:
+        typer.echo("Error: Missing argument 'TOOL'", err=True)
+        typer.echo("Usage: wafer targets ensure TARGET TOOL", err=True)
+        typer.echo("   or: wafer targets ensure --list", err=True)
+        raise typer.Exit(1)
+    # Check tool exists
+    if tool not in TOOL_REGISTRY:
+        typer.echo(f"Error: Unknown tool '{tool}'", err=True)
+        typer.echo(f"Available tools: {', '.join(sorted(TOOL_REGISTRY.keys()))}", err=True)
+        typer.echo("Run 'wafer targets ensure --list' for details.", err=True)
+        raise typer.Exit(1)
+    spec = TOOL_REGISTRY[tool]
+    # Determine verbosity
     prefs = get_preferences()
     if quiet:
         show_status = False
@@ -3519,33 +5183,72 @@ def workspaces_sync(
     else:
         show_status = prefs.mode == "explicit"
-    # Validate path
-    if not path.exists():
-        typer.echo(f"Error: Path not found: {path}", err=True)
-        raise typer.Exit(1)
-    # Resolve workspace
+    # Load target
     try:
-        resolved_workspace = resolve_workspace(workspace)
-    except RuntimeError as e:
+        target_config = load_target(target)
+    except FileNotFoundError as e:
         typer.echo(f"Error: {e}", err=True)
+        typer.echo("List available targets with: wafer config targets list", err=True)
+        raise typer.Exit(1) from None
+    except ValueError as e:
+        typer.echo(f"Error loading target config: {e}", err=True)
         raise typer.Exit(1) from None
-    if show_status:
-        typer.echo(f"[wafer] Syncing {path} to workspace {resolved_workspace}...", err=True)
+    # Platform validation
+    platform = get_target_platform(target_config)
+    if spec.platform != "any" and spec.platform != platform:
+        typer.echo(
+            f"Error: {tool} is an {spec.platform.upper()} tool but target '{target}' "
+            f"is {platform.upper()}",
+            err=True,
+        )
+        raise typer.Exit(1)
-    def on_progress(msg: str) -> None:
-        if show_status:
-            typer.echo(f"[wafer] {msg}", err=True)
+    if show_status:
+        typer.echo(f"[wafer] Target: {target} ({platform.upper()})", err=True)
+        typer.echo(f"[wafer] Checking for {tool}...", err=True)
+    # Get SSH info (may provision)
     try:
-        file_count, warning = sync_files(
-            resolved_workspace, path.resolve(), on_progress=on_progress
-        )
-    except RuntimeError as e:
+        ssh_info = trio.run(get_target_ssh_info, target_config)
+    except TargetExecError as e:
         typer.echo(f"Error: {e}", err=True)
         raise typer.Exit(1) from None
+    if show_status:
+        typer.echo(f"[wafer] Connected: {ssh_info.user}@{ssh_info.host}:{ssh_info.port}", err=True)
+    # Check-only mode
+    if check_only:
+        from .targets_ops import TargetExecError, exec_on_target_sync
+        try:
+            exit_code = exec_on_target_sync(ssh_info, spec.check_cmd, timeout_seconds=30)
+        except TargetExecError as e:
+            typer.echo(f"Error: {e}", err=True)
+            raise typer.Exit(1) from None
+        if exit_code == 0:
+            typer.echo(f"{tool} is installed")
+        else:
+            typer.echo(f"{tool} is NOT installed", err=True)
+            raise typer.Exit(1)
+        return
+    # Ensure tool is installed
+    result = ensure_tool(ssh_info, tool, force=force, timeout=timeout)
+    if result.error:
+        typer.echo(f"Error: {result.error}", err=True)
+        raise typer.Exit(1)
+    if result.already_installed:
+        typer.echo(f"{tool} is already installed")
+    elif result.installed:
+        if result.verified:
+            typer.echo(f"{tool} installed successfully")
+        else:
+            typer.echo(f"{tool} installed (verification skipped)")
 # =============================================================================
 # Perfetto trace analysis commands
@@ -3830,13 +5533,39 @@ def ncu_analyze(
 # =============================================================================
-# NSYS Analyze command
+# NSYS commands
 # =============================================================================
+@nsys_app.command("check")
+def nsys_check() -> None:
+    """Check if NSYS (Nsight Systems) is installed and show version.
+    NSYS is required for local analysis. If not installed, shows install instructions.
+    Examples:
+        wafer nvidia nsys check
+    """
+    from .nsys_analyze import check_nsys_installation
+    status = check_nsys_installation()
+    # Missing installation: report it, add the install hint when one is
+    # available, and stop.
+    if not status.installed:
+        typer.echo("✗ NSYS not installed")
+        if status.install_command:
+            typer.echo(f"  Install with: {status.install_command}")
+        return
+    # Installed: show the resolved path, plus the version when it is known.
+    typer.echo(f"✓ NSYS installed: {status.path}")
+    if status.version:
+        typer.echo(f"  Version: {status.version}")
 @nsys_app.command("analyze")
 def nsys_analyze(
     filepath: Path = typer.Argument(..., help="Path to .nsys-rep profile file"),
+    output_dir: Path | None = typer.Option(
+        None, "--output-dir", "-o", help="Output directory for analysis files"
+    ),
     json_output: bool = typer.Option(
         False, "--json", help="Output raw JSON instead of formatted text"
     ),
@@ -3845,6 +5574,12 @@ def nsys_analyze(
         "--remote/--local",
         help="Force remote (via API) or local analysis. Default: auto-detect (remote if nsys not installed locally)",
     ),
+    target: str | None = typer.Option(
+        None,
+        "--target",
+        "-t",
+        help="Remote target: 'workspace:id' for workspace execution, or target name from ~/.wafer/targets/",
+    ),
 ) -> None:
     """Analyze an NVIDIA Nsight Systems profile (.nsys-rep file).
@@ -3853,10 +5588,20 @@ def nsys_analyze(
     By default, uses local nsys if available, otherwise runs analysis
     remotely via wafer-api (requires authentication: wafer login).
+    Supports multiple execution modes:
+    - Local: Uses local nsys CLI (no GPU required for analysis)
+    - Remote API: Uploads file and runs analysis on Modal
+    - Workspace: Runs analysis on a Wafer workspace via SSH
+    - Target: Runs analysis on a configured target machine via SSH
     Examples:
         wafer nvidia nsys analyze profile.nsys-rep
         wafer nvidia nsys analyze profile.nsys-rep --json
+        wafer nvidia nsys analyze profile.nsys-rep --local
         wafer nvidia nsys analyze profile.nsys-rep --remote
+        wafer nvidia nsys analyze profile.nsys-rep --target workspace:abc123
+        wafer nvidia nsys analyze profile.nsys-rep --target vultr-b200
+        wafer nvidia nsys analyze profile.nsys-rep -o ./results/
     """
     from .nsys_analyze import analyze_nsys_profile
@@ -3868,11 +5613,20 @@ def nsys_analyze(
         typer.echo(f"Error: Expected .nsys-rep file, got: {filepath.suffix}", err=True)
         raise typer.Exit(1)
+    # Warn if both remote flag and target are specified
+    if target and remote is not None:
+        typer.echo(
+            "Warning: --target overrides --remote/--local flag",
+            err=True,
+        )
     try:
         result = analyze_nsys_profile(
             filepath,
             json_output=json_output,
             remote=remote,
+            target=target,
+            output_dir=output_dir,
         )
         typer.echo(result)
     except FileNotFoundError as e:
@@ -3883,6 +5637,150 @@ def nsys_analyze(
         raise typer.Exit(1) from None
+@nsys_app.command("profile", context_settings={"allow_interspersed_args": False})
+def nsys_profile(
+    command: list[str] = typer.Argument(..., help="Command to profile"),
+    output: str = typer.Option(
+        "profile",
+        "--output",
+        "-o",
+        help="Output filename (without .nsys-rep extension)",
+    ),
+    trace: str | None = typer.Option(
+        None,
+        "--trace",
+        "-t",
+        help="Trace APIs to capture (comma-separated: cuda,nvtx,osrt,cudnn,cublas). Default: cuda",
+    ),
+    duration: int | None = typer.Option(
+        None,
+        "--duration",
+        "-d",
+        help="Maximum profiling duration in seconds",
+    ),
+    target: str | None = typer.Option(
+        None,
+        "--target",
+        help="Remote target: 'workspace:id' for workspace execution, or target name from ~/.wafer/targets/",
+    ),
+    analyze: bool = typer.Option(
+        False,
+        "--analyze",
+        "-a",
+        help="Automatically analyze the profile after completion",
+    ),
+    json_output: bool = typer.Option(
+        False,
+        "--json",
+        help="Output analysis as JSON (only with --analyze)",
+    ),
+    verbose: bool = typer.Option(
+        False,
+        "--verbose",
+        "-v",
+        help="Show verbose progress messages",
+    ),
+    extra_args: str | None = typer.Option(
+        None,
+        "--extra",
+        help="Extra arguments to pass to nsys profile",
+    ),
+) -> None:
+    """Profile a command with NVIDIA Nsight Systems.
+    Runs nsys profile on the specified command and generates a .nsys-rep file.
+    Profiling requires an NVIDIA GPU. Use --target to run on a remote GPU server
+    or workspace.
+    Examples:
+        wafer nvidia nsys profile -- python train.py
+        wafer nvidia nsys profile -o gemm_profile -- ./gemm_kernel
+        wafer nvidia nsys profile --trace cuda,nvtx -- python model.py
+        wafer nvidia nsys profile --duration 60 -- ./long_running_app
+        wafer nvidia nsys profile --target workspace:abc123 -- python test.py
+        wafer nvidia nsys profile --target vultr-b200 -- ./benchmark
+        wafer nvidia nsys profile --analyze -- python train.py
+        wafer nvidia nsys profile --analyze --json -- ./kernel > results.json
+    """
+    # Parse command
+    import shlex
+    from .nsys_analyze import _parse_target
+    from .nsys_profile import (
+        NSYSProfileOptions,
+        profile_and_analyze,
+        profile_local,
+        profile_remote_ssh,
+        profile_workspace,
+    )
+    if isinstance(command, list):
+        # Remove leading "--" if present
+        if command and command[0] == "--":
+            command = command[1:]
+        # A single argument is used verbatim (it may already be a full shell
+        # snippet); several arguments are joined with shell quoting.
+        if len(command) == 1:
+            command_str = command[0]
+        else:
+            command_str = shlex.join(command)
+    else:
+        command_str = command
+    # An empty list joins to "", so this also covers a bare "--".
+    if not command_str:
+        typer.echo("Error: No command specified", err=True)
+        raise typer.Exit(1)
+    # Parse trace options. Entries are stripped and empty ones dropped so that
+    # input like "cuda, nvtx," does not produce " nvtx" or "" as API names; if
+    # nothing usable remains, fall back to None (same as omitting --trace).
+    trace_list = None
+    if trace:
+        trace_list = [api.strip() for api in trace.split(",") if api.strip()] or None
+    # Build options
+    options = NSYSProfileOptions(
+        command=command_str,
+        output=output,
+        trace=trace_list,
+        duration=duration,
+        extra_args=extra_args,
+    )
+    if verbose:
+        typer.echo(f"[nsys] Command: {command_str}", err=True)
+        if target:
+            typer.echo(f"[nsys] Target: {target}", err=True)
+    # Execute. With --analyze a single helper does profiling and analysis;
+    # otherwise the profiler is chosen from the target: a "workspace" target
+    # type, any other named target over SSH, or the local machine.
+    if analyze:
+        profile_result, analysis_result = profile_and_analyze(
+            options,
+            target=target,
+            json_output=json_output,
+            verbose=verbose,
+        )
+    else:
+        if target:
+            target_type, target_id = _parse_target(target)
+            if target_type == "workspace":
+                profile_result = profile_workspace(target_id, options, verbose=verbose)
+            else:
+                profile_result = profile_remote_ssh(target_id, options, verbose=verbose)
+        else:
+            profile_result = profile_local(options, verbose=verbose)
+        analysis_result = None
+    # Report results: a failed profile run exits with 1 after printing the
+    # error and any captured stderr.
+    if not profile_result.success:
+        typer.echo(f"Error: {profile_result.error}", err=True)
+        if profile_result.stderr:
+            typer.echo(f"stderr: {profile_result.stderr}", err=True)
+        raise typer.Exit(1)
+    # With --analyze the profile path is only printed in verbose mode.
+    if verbose or not analyze:
+        typer.echo(f"Profile created: {profile_result.output_path}")
+    if analysis_result:
+        if not analysis_result.success:
+            typer.echo(f"Analysis error: {analysis_result.error}", err=True)
+            raise typer.Exit(1)
 # =============================================================================
 # ROCprof-Compute commands
 # =============================================================================
@@ -4441,8 +6339,8 @@ def _setup_wafer_core_env() -> None:
     - WAFER_API_URL: If already set, uses that instead of config
     - WAFER_AUTH_TOKEN: If already set, uses that instead of cached token
     """
-    from .global_config import get_api_url
     from .auth import get_valid_token
+    from .global_config import get_api_url
     # Set API URL (get_api_url already respects WAFER_API_URL env var)
     os.environ["WAFER_API_URL"] = get_api_url()
@@ -4746,8 +6644,8 @@ def capture_command(  # noqa: PLR0915
     import os
     import tomllib
-    from .global_config import get_api_url
     from .auth import get_valid_token
+    from .global_config import get_api_url
     # Set environment variables for wafer-core BEFORE importing it
     # wafer-core backend.py reads WAFER_API_URL and WAFER_AUTH_TOKEN from env
@@ -4951,8 +6849,8 @@ def capture_list_command(
     """
     import os
-    from .global_config import get_api_url
     from .auth import get_valid_token
+    from .global_config import get_api_url
     # Set environment variables for wafer-core BEFORE importing it
     os.environ["WAFER_API_URL"] = get_api_url()
@@ -5015,13 +6913,14 @@ def capture_list_command(
 @corpus_app.command("download")
 def corpus_download(
-    name: str = typer.Argument(..., help="Corpus name (cuda, cutlass, hip)"),
+    name: str = typer.Argument(..., help="Corpus name (cuda, cutlass, hip, amd)"),
     force: bool = typer.Option(False, "--force", "-f", help="Re-download even if exists"),
 ) -> None:
     """Download a documentation corpus for agent filesystem access.
     Examples:
         wafer corpus download cuda
+        wafer corpus download amd
         wafer corpus download cutlass --force
     """
     from .corpus import CORPORA, download_corpus
@@ -5236,71 +7135,107 @@ def tracelens_collective(
 # =============================================================================
-# ISA Analysis Commands
+# Unified ISA Analysis Commands (wafer amd isa ...)
 # =============================================================================
 @isa_app.command("analyze")
 def isa_analyze(
-    file: Path = typer.Argument(..., help="Path to .co file to analyze"),
-    json_output: bool = typer.Option(False, "--json", help="Output as JSON"),
+    path: Path = typer.Argument(..., help="Path to file or directory to analyze"),
+    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
+    csv_output: bool = typer.Option(False, "--csv", help="Output as CSV"),
+    # NOTE(review): the default is already True, so passing -r/--recursive
+    # changes nothing; --no-recursive is the switch that alters behavior.
+    recursive: bool = typer.Option(
+        True, "--recursive/--no-recursive", "-r", help="Scan directories recursively"
+    ),
+    filter_expr: str | None = typer.Option(
+        None, "--filter", "-f", help="Filter results (e.g., 'spills > 0')"
+    ),
+    output_file: Path | None = typer.Option(None, "--output", "-o", help="Write output to file"),
+    kernel_index: int = typer.Option(0, "--kernel", "-k", help="Kernel index if multiple in file"),
 ) -> None:
-    """Analyze AMD GPU code object (.co file).
+    """Analyze AMD GPU ISA files (.co, .s, .ll, .ttgir).
-    Extracts and analyzes ISA, showing register usage, instruction mix,
-    spills, and other performance-relevant metrics.
+    Performs static analysis to extract performance metrics like register
+    pressure, spills, MFMA density, and occupancy limits.
-    The .co file is uploaded to the Wafer API server which has ROCm tools
-    installed for analysis.
+    Supports:
+      - AMD GPU code objects (.co) - Requires API authentication
+      - AMDGCN ISA assembly (.s, .gcn, .asm) - Local parsing
+      - LLVM-IR files (.ll) - Local parsing
+      - TTGIR files (.ttgir, .ttir, .mlir) - Local parsing
     Examples:
-        wafer isa analyze kernel.co
-        wafer isa analyze kernel.co --json
+        wafer amd isa analyze kernel.co              # Code object (needs login)
+        wafer amd isa analyze kernel.s               # ISA assembly
+        wafer amd isa analyze kernel.s --json        # Output as JSON
+        wafer amd isa analyze ~/.triton/cache/ --filter 'spills > 0'
+        wafer amd isa analyze . -r --csv -o metrics.csv
     """
-    from dataclasses import asdict
-    from wafer_core.tools.isa_analysis_tools import analyze_isa, format_isa_summary
     from .auth import get_auth_headers
     from .global_config import get_api_url
+    from .kernel_scope import analyze_command
-    # Validate file exists
-    if not file.exists():
-        typer.echo(f"Error: File not found: {file}", err=True)
-        raise typer.Exit(1)
-    if not file.suffix == ".co":
-        typer.echo(f"Error: Expected .co file, got: {file.suffix}", err=True)
-        raise typer.Exit(1)
-    # Get API URL and auth
+    # Get API credentials for .co files
     api_url = get_api_url()
     auth_headers = get_auth_headers()
-    if not auth_headers:
-        typer.echo("Error: Not logged in. Run 'wafer login' first.", err=True)
-        raise typer.Exit(1)
+    # No login or path validation happens in this wrapper: auth_headers is
+    # forwarded as returned (presumably analyze_command rejects missing
+    # credentials for .co inputs - TODO confirm), and every CLI option is
+    # handed to analyze_command, whose returned text is echoed to stdout.
     try:
-        result = analyze_isa(
-            co_file_path=file,
+        output = analyze_command(
+            path=str(path),
+            json_output=json_output,
+            csv_output=csv_output,
+            recursive=recursive,
+            filter_expr=filter_expr,
+            # Path options are passed on as plain strings; None when --output is omitted.
+            output_file=str(output_file) if output_file else None,
+            kernel_index=kernel_index,
             api_url=api_url,
             auth_headers=auth_headers,
         )
-        if json_output:
-            typer.echo(json.dumps(asdict(result)))
-        else:
-            typer.echo(format_isa_summary(result))
+        typer.echo(output)
+    # NOTE(review): the three handlers below have identical bodies (message on
+    # stderr, exit code 1); FileNotFoundError and RuntimeError are already
+    # covered by the generic Exception handler.
     except FileNotFoundError as e:
         typer.echo(f"Error: {e}", err=True)
         raise typer.Exit(1) from None
+    except RuntimeError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
     except Exception as e:
         typer.echo(f"Error: {e}", err=True)
         raise typer.Exit(1) from None
+@isa_app.command("metrics")
+def isa_metrics() -> None:
+    """List available metrics for ISA analysis.
+    Shows all metrics that can be extracted from AMD GPU ISA files,
+    along with their derivation.
+    Examples:
+        wafer amd isa metrics
+    """
+    from .kernel_scope import metrics_command
+    output = metrics_command()
+    typer.echo(output)
+@isa_app.command("targets")
+def isa_targets() -> None:
+    """List supported GPU targets and their specifications.
+    Shows hardware specs (VGPRs, SGPRs, LDS, etc.) for each supported
+    AMD GPU architecture.
+    Examples:
+        wafer amd isa targets
+    """
+    from .kernel_scope import targets_command
+    output = targets_command()
+    typer.echo(output)
 def main() -> None:
     """Entry point for wafer CLI: invokes the module-level Typer ``app``."""
     app()

wafer-cli 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl

wafer-cli 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl