PyPI - wafer-cli - Versions diffs - 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl - Mend

wafer-cli 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

wafer/auth.py +85 -0
wafer/cli.py +1196 -160
wafer/evaluate.py +1171 -209
wafer/gpu_run.py +5 -1
wafer/kernel_scope.py +453 -0
wafer/problems.py +357 -0
wafer/target_lock.py +270 -0
wafer/targets.py +490 -0
wafer/wevin_cli.py +2 -0
wafer/workspaces.py +53 -1
{wafer_cli-0.2.7.dist-info → wafer_cli-0.2.9.dist-info}/METADATA +1 -1
{wafer_cli-0.2.7.dist-info → wafer_cli-0.2.9.dist-info}/RECORD +15 -12
{wafer_cli-0.2.7.dist-info → wafer_cli-0.2.9.dist-info}/WHEEL +0 -0
{wafer_cli-0.2.7.dist-info → wafer_cli-0.2.9.dist-info}/entry_points.txt +0 -0
{wafer_cli-0.2.7.dist-info → wafer_cli-0.2.9.dist-info}/top_level.txt +0 -0

wafer/cli.py CHANGED

@@ -30,6 +30,14 @@ import typer
 from .config import WaferConfig, WaferEnvironment
 from .inference import infer_upload_files, resolve_environment
+from .problems import (
+    download_problems,
+    get_problem_path,
+    get_problems_path,
+)
+from .problems import (
+    list_problems as list_problems_fn,
+)
 app = typer.Typer(
     help="GPU development toolkit for LLM coding agents",
@@ -91,11 +99,15 @@ def main_callback(ctx: typer.Context) -> None:
     # Install exception hook to catch SystemExit and mark failures
     original_excepthook = sys.excepthook
-    def custom_excepthook(exc_type, exc_value, exc_traceback):
+    def custom_excepthook(
+        exc_type: type[BaseException],
+        exc_value: BaseException,
+        exc_traceback: object,
+    ) -> None:
         global _command_outcome
         # Mark as failure if SystemExit with non-zero code, or any other exception
         if exc_type is SystemExit:
-            exit_code = exc_value.code if hasattr(exc_value, 'code') else 1
+            exit_code = exc_value.code if hasattr(exc_value, "code") else 1
             if exit_code != 0 and exit_code is not None:
                 _command_outcome = "failure"
         else:
@@ -200,6 +212,13 @@ kernelbench_app = typer.Typer(
 )
 evaluate_app.add_typer(kernelbench_app, name="kernelbench")
+# Nested subcommand for gpumode format
+gpumode_app = typer.Typer(
+    help="Evaluate kernels in GPUMode format (custom_kernel/ref_kernel functions)",
+    invoke_without_command=True,
+)
+evaluate_app.add_typer(gpumode_app, name="gpumode")
 # =============================================================================
 # Dev commands (internal, used by web app proxy)
 # =============================================================================
@@ -242,6 +261,10 @@ app.add_typer(amd_app, name="amd")
 isa_app = typer.Typer(help="ISA analysis for AMD GPU code objects (.co files)")
 amd_app.add_typer(isa_app, name="isa")
+# Kernel Scope - static ISA analysis for Triton kernels
+kernel_scope_app = typer.Typer(help="Static ISA analysis for Triton compilation artifacts")
+amd_app.add_typer(kernel_scope_app, name="kernel-scope")
 # =============================================================================
 # Skill management (wafer skill ...)
 # =============================================================================
@@ -396,6 +419,122 @@ def skill_status() -> None:
             typer.echo(f"{tool_name}: Not installed")
+# =============================================================================
+# Provider auth management (wafer auth ...)
+# =============================================================================
+provider_auth_app = typer.Typer(help="Manage API keys for cloud GPU providers")
+app.add_typer(provider_auth_app, name="auth")
+@provider_auth_app.command("login")
+def provider_auth_login(
+    provider: str = typer.Argument(
+        ...,
+        help="Provider name: runpod, digitalocean, or modal",
+    ),
+    api_key: str | None = typer.Option(
+        None,
+        "--api-key",
+        "-k",
+        help="API key (if not provided, reads from stdin)",
+    ),
+) -> None:
+    """Save API key for a cloud GPU provider.
+    Stores the key in ~/.wafer/auth.json. Environment variables
+    (e.g., WAFER_RUNPOD_API_KEY) take precedence over stored keys.
+    Examples:
+        wafer auth login runpod --api-key rp_xxx
+        wafer auth login digitalocean --api-key dop_v1_xxx
+        echo $API_KEY | wafer auth login runpod
+    """
+    import sys
+    from wafer_core.auth import PROVIDERS, save_api_key
+    # Validate provider
+    if provider not in PROVIDERS:
+        typer.echo(f"Error: Unknown provider '{provider}'", err=True)
+        typer.echo(f"Valid providers: {', '.join(PROVIDERS.keys())}", err=True)
+        raise typer.Exit(1)
+    # Get API key from option or stdin
+    if api_key is None:
+        if sys.stdin.isatty():
+            typer.echo(f"Enter API key for {PROVIDERS[provider]['display_name']}:")
+            api_key = typer.prompt("API key", hide_input=True)
+        else:
+            api_key = sys.stdin.read().strip()
+    if not api_key:
+        typer.echo("Error: No API key provided", err=True)
+        raise typer.Exit(1)
+    # Save the key
+    save_api_key(provider, api_key)
+    typer.echo(f"API key saved for {PROVIDERS[provider]['display_name']}")
+    typer.echo("Stored in: ~/.wafer/auth.json")
+@provider_auth_app.command("logout")
+def provider_auth_logout(
+    provider: str = typer.Argument(
+        ...,
+        help="Provider name: runpod, digitalocean, or modal",
+    ),
+) -> None:
+    """Remove stored API key for a cloud GPU provider.
+    Examples:
+        wafer auth logout runpod
+        wafer auth logout digitalocean
+    """
+    from wafer_core.auth import PROVIDERS, remove_api_key
+    # Validate provider
+    if provider not in PROVIDERS:
+        typer.echo(f"Error: Unknown provider '{provider}'", err=True)
+        typer.echo(f"Valid providers: {', '.join(PROVIDERS.keys())}", err=True)
+        raise typer.Exit(1)
+    if remove_api_key(provider):
+        typer.echo(f"API key removed for {PROVIDERS[provider]['display_name']}")
+    else:
+        typer.echo(f"No stored API key found for {PROVIDERS[provider]['display_name']}")
+@provider_auth_app.command("status")
+def provider_auth_status() -> None:
+    """Show authentication status for all cloud GPU providers.
+    Displays which providers have API keys configured and where
+    the keys are coming from (environment variable or auth.json).
+    Example:
+        wafer auth status
+    """
+    from wafer_core.auth import get_all_auth_status
+    statuses = get_all_auth_status()
+    typer.echo("Cloud GPU Provider Authentication Status")
+    typer.echo("=" * 45)
+    for status in statuses:
+        if status.is_authenticated:
+            source_str = f"({status.source})" if status.source else ""
+            typer.echo(f"  {status.display_name}: ✓ {status.key_preview} {source_str}")
+        else:
+            typer.echo(f"  {status.display_name}: ✗ Not configured")
+            typer.echo(f"      Run: wafer auth login {status.provider}")
+            typer.echo(f"      Or set: {status.key_url}")
+    typer.echo("")
+    typer.echo("Note: Environment variables take precedence over stored keys.")
 @app.command(hidden=True)
 def run(
     command: str = typer.Argument(..., help="Command to run in Docker container"),
@@ -1289,86 +1428,37 @@ def evaluate(  # noqa: PLR0913
             --benchmark --defensive
     Subcommands:
-        make-template  Generate template files for this format
+        gpumode        Use GPUMode format (functional) - RECOMMENDED
         kernelbench    Use KernelBench format (ModelNew class)
+        make-template  Generate template files for this format (deprecated)
     """
     # If a subcommand is being invoked, skip the main evaluation logic
     if ctx.invoked_subcommand is not None:
         return
-    # Validate required args when running evaluation (not subcommands)
-    missing_args = []
-    if implementation is None:
-        missing_args.append("--impl/-i")
-    if reference is None:
-        missing_args.append("--reference")
-    if test_cases is None:
-        missing_args.append("--test-cases")
-    if missing_args:
-        typer.echo("Error: Missing required arguments", err=True)
-        typer.echo(f"  Required: {', '.join(missing_args)}", err=True)
-        typer.echo("", err=True)
-        typer.echo(
-            "Usage: wafer evaluate --impl KERNEL.py --reference REF.py --test-cases TESTS.json",
-            err=True,
-        )
-        typer.echo("", err=True)
-        typer.echo("Run 'wafer evaluate --help' for full options.", err=True)
-        typer.echo("Run 'wafer evaluate make-template DIR' to generate starter files.", err=True)
-        raise typer.Exit(1)
-    from .evaluate import EvaluateArgs, run_evaluate
-    args = EvaluateArgs(
-        implementation=implementation,
-        reference=reference,
-        test_cases=test_cases,
-        target_name=target or "",
-        benchmark=benchmark,
-        profile=profile,
-        defensive=defensive,
-        sync_artifacts=sync_artifacts,
-        gpu_id=gpu_id,
+    # Bare 'wafer evaluate' is no longer supported - must use subcommand
+    typer.echo("Error: 'wafer evaluate' requires a subcommand.", err=True)
+    typer.echo("", err=True)
+    typer.echo("Available subcommands:", err=True)
+    typer.echo(
+        "  gpumode      Evaluate GPUMode format (custom_kernel/ref_kernel functions)", err=True
     )
-    try:
-        # Use trio_asyncio to run async code that uses both trio and asyncio
-        # (AsyncSSHClient uses asyncssh which is asyncio-based, bridged via trio_asyncio)
-        import trio_asyncio
-        result = trio_asyncio.run(run_evaluate, args)
-    except KeyboardInterrupt:
-        typer.echo("\nInterrupted by user", err=True)
-        raise typer.Exit(130) from None
-    except Exception as e:
-        # Unwrap ExceptionGroup (from Trio nurseries) to show actual error
-        if hasattr(e, "exceptions") and e.exceptions:
-            for exc in e.exceptions:
-                typer.echo(f"Error: {type(exc).__name__}: {exc}", err=True)
-        else:
-            typer.echo(f"Error: {e}", err=True)
-        raise typer.Exit(1) from None
-    # Print results
-    if result.success:
-        typer.echo("")
-        typer.echo("=" * 60)
-        status = "PASS" if result.all_correct else "FAIL"
-        typer.echo(f"Result: {status}")
-        score_pct = f"{result.correctness_score:.1%}"
-        typer.echo(f"Correctness: {result.passed_tests}/{result.total_tests} ({score_pct})")
-        if result.geomean_speedup > 0:
-            typer.echo(f"Speedup: {result.geomean_speedup:.2f}x")
-        if result.artifact_path:
-            typer.echo(f"Artifacts: {result.artifact_path}")
-        typer.echo("=" * 60)
-        if not result.all_correct:
-            raise typer.Exit(1)
-    else:
-        typer.echo(f"Error: {result.error_message}", err=True)
-        raise typer.Exit(1)
+    typer.echo("  kernelbench  Evaluate KernelBench format (ModelNew class)", err=True)
+    typer.echo("", err=True)
+    typer.echo("Examples:", err=True)
+    typer.echo(
+        "  wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json",
+        err=True,
+    )
+    typer.echo(
+        "  wafer evaluate kernelbench --impl impl.py --reference ref.py --benchmark", err=True
+    )
+    typer.echo("", err=True)
+    typer.echo(
+        "Run 'wafer evaluate gpumode --help' or 'wafer evaluate kernelbench --help' for options.",
+        err=True,
+    )
+    raise typer.Exit(1)
 TEMPLATE_KERNEL = '''\
@@ -1503,8 +1593,59 @@ def evaluate_make_template(
 # KernelBench format evaluation
 # =============================================================================
-# Path to KernelBench problems (relative to wafer root)
-KERNELBENCH_ROOT = Path(__file__).parent.parent.parent.parent / "research" / "KernelBench"
+def _get_kernelbench_root() -> Path | None:
+    """Get KernelBench problems root, preferring downloaded location."""
+    # First check downloaded location
+    downloaded = get_problems_path("kernelbench")
+    if downloaded is not None:
+        kb_root = downloaded / "KernelBench"
+        if kb_root.exists():
+            return kb_root
+        return downloaded
+    # Fall back to legacy location (for development)
+    legacy = Path(__file__).parent.parent.parent.parent / "research" / "KernelBench" / "KernelBench"
+    if legacy.exists():
+        return legacy
+    return None
+@kernelbench_app.command("download")
+def kernelbench_download(
+    force: bool = typer.Option(False, "--force", "-f", help="Re-download even if exists"),
+) -> None:
+    """Download KernelBench problems from GitHub.
+    Downloads the problem set to ~/.cache/wafer/problems/kernelbench/
+    Examples:
+        wafer evaluate kernelbench download
+        wafer evaluate kernelbench download --force  # Re-download
+    """
+    try:
+        path = download_problems("kernelbench", force=force, verbose=True)
+        typer.echo("")
+        typer.echo(f"Problems available at: {path}")
+        typer.echo("Run 'wafer evaluate kernelbench list-problems' to see available problems.")
+    except Exception as e:
+        typer.echo(f"Error downloading problems: {e}", err=True)
+        raise typer.Exit(1) from None
+@kernelbench_app.command("list-problems")
+def kernelbench_list_problems() -> None:
+    """List available KernelBench problems.
+    Examples:
+        wafer evaluate kernelbench list-problems
+    """
+    try:
+        list_problems_fn("kernelbench", verbose=True)
+    except ValueError as e:
+        typer.echo(str(e), err=True)
+        raise typer.Exit(1) from None
 @kernelbench_app.callback(invoke_without_command=True)
@@ -1528,9 +1669,18 @@ def kernelbench_evaluate(  # noqa: PLR0913
         help="GPU target name. See 'wafer config targets list' for available targets.",
         autocompletion=complete_target_name,
     ),
+    pool: str | None = typer.Option(
+        None,
+        "--pool",
+        "-p",
+        help="Target pool name. Acquires first available target from the pool. "
+        "Define pools in ~/.wafer/config.toml under [pools.<name>].",
+    ),
     benchmark: bool = typer.Option(False, "--benchmark", help="Run performance benchmarks"),
     profile: bool = typer.Option(False, "--profile", help="Enable profiling"),
-    inputs: Path | None = typer.Option(None, "--inputs", help="Custom inputs file to override get_inputs()"),
+    inputs: Path | None = typer.Option(
+        None, "--inputs", help="Custom inputs file to override get_inputs()"
+    ),
     seed: int = typer.Option(42, "--seed", help="Random seed for weight initialization"),
     defensive: bool = typer.Option(
         False, "--defensive", help="Enable defensive timing to detect evaluation hacking"
@@ -1588,12 +1738,54 @@ def kernelbench_evaluate(  # noqa: PLR0913
         )
         raise typer.Exit(1)
+    # Validate --target and --pool are mutually exclusive
+    if target and pool:
+        typer.echo("Error: Cannot specify both --target and --pool", err=True)
+        raise typer.Exit(1)
     from .evaluate import KernelBenchEvaluateArgs, run_evaluate_kernelbench
+    # If pool specified, acquire a target from the pool
+    resolved_target = target or ""
+    pool_lock_context = None
+    if pool:
+        from .target_lock import acquire_from_pool
+        from .targets import filter_pool_by_auth, get_pool
+        try:
+            pool_targets = get_pool(pool)
+        except FileNotFoundError as e:
+            typer.echo(f"Error: {e}", err=True)
+            raise typer.Exit(1) from None
+        # Filter to only targets with valid auth
+        usable_targets, skipped = filter_pool_by_auth(pool_targets)
+        if skipped:
+            typer.echo(f"Skipping targets without auth: {', '.join(skipped)}", err=True)
+        if not usable_targets:
+            typer.echo(f"Error: No usable targets in pool '{pool}'", err=True)
+            typer.echo("  All targets require authentication that is not configured.", err=True)
+            typer.echo("  Run 'wafer auth status' to see which providers need setup.", err=True)
+            raise typer.Exit(1) from None
+        typer.echo(f"Acquiring target from pool '{pool}' ({len(usable_targets)} targets)...")
+        pool_lock_context = acquire_from_pool(usable_targets)
+        acquired_target = pool_lock_context.__enter__()
+        if acquired_target is None:
+            typer.echo(f"Error: All targets in pool '{pool}' are busy", err=True)
+            typer.echo(f"  Targets: {', '.join(usable_targets)}", err=True)
+            raise typer.Exit(1)
+        typer.echo(f"Acquired target: {acquired_target}")
+        resolved_target = acquired_target
     args = KernelBenchEvaluateArgs(
         implementation=implementation,
         reference=reference,
-        target_name=target or "",
+        target_name=resolved_target,
         benchmark=benchmark,
         profile=profile,
         inputs=inputs,
@@ -1613,6 +1805,10 @@ def kernelbench_evaluate(  # noqa: PLR0913
     except Exception as e:
         typer.echo(f"Error: {e}", err=True)
         raise typer.Exit(1) from None
+    finally:
+        # Release pool lock if we acquired one
+        if pool_lock_context is not None:
+            pool_lock_context.__exit__(None, None, None)
     # Print results
     if result.success:
@@ -1659,6 +1855,13 @@ def kernelbench_make_template(
         # Overwrite existing
         wafer evaluate kernelbench make-template level1/1 --force
     """
+    # Get problems root (downloaded or legacy)
+    kb_root = _get_kernelbench_root()
+    if kb_root is None:
+        typer.echo("Error: KernelBench problems not found.", err=True)
+        typer.echo("Run 'wafer evaluate kernelbench download' to download problems.", err=True)
+        raise typer.Exit(1)
     # Parse problem ID
     parts = problem.split("/")
     if len(parts) != 2:
@@ -1670,10 +1873,10 @@ def kernelbench_make_template(
         level_str = f"level{level_str}"
     # Find the problem file
-    problem_dir = KERNELBENCH_ROOT / "KernelBench" / level_str
+    problem_dir = kb_root / level_str
     if not problem_dir.exists():
         typer.echo(f"Error: KernelBench level directory not found: {problem_dir}", err=True)
-        typer.echo(f"Make sure KernelBench is at: {KERNELBENCH_ROOT}", err=True)
+        typer.echo("Run 'wafer evaluate kernelbench download' to download problems.", err=True)
         raise typer.Exit(1)
     # Find matching problem file
@@ -1708,37 +1911,335 @@ def kernelbench_make_template(
     output = output.resolve()
-    # Check if exists
-    if output.exists() and not force:
-        typer.echo(f"Error: {output} already exists. Use --force to overwrite.", err=True)
+    # Check if exists
+    if output.exists() and not force:
+        typer.echo(f"Error: {output} already exists. Use --force to overwrite.", err=True)
+        raise typer.Exit(1)
+    # Copy the file
+    content = problem_file.read_text()
+    output.parent.mkdir(parents=True, exist_ok=True)
+    output.write_text(content)
+    typer.echo(f"Created {output}")
+    typer.echo("")
+    typer.echo("Next steps:")
+    typer.echo(f"  1. Read {output} to understand the Model interface")
+    typer.echo("  2. Create an implementation file with your ModelNew class:")
+    typer.echo("")
+    typer.echo("     import torch.nn as nn")
+    typer.echo("")
+    typer.echo("     class ModelNew(nn.Module):")
+    typer.echo("         def __init__(self, ...):")
+    typer.echo("             # Same signature as Model.__init__")
+    typer.echo("             ...")
+    typer.echo("")
+    typer.echo("         def forward(self, ...):")
+    typer.echo("             # Same signature as Model.forward")
+    typer.echo("             # Your optimized implementation here")
+    typer.echo("             ...")
+    typer.echo("")
+    typer.echo("  3. Run evaluation:")
+    typer.echo(f"     wafer evaluate kernelbench --impl my_kernel.py --reference {output}")
+# =============================================================================
+# GPUMode format evaluation
+# =============================================================================
+@gpumode_app.command("download")
+def gpumode_download(
+    force: bool = typer.Option(False, "--force", "-f", help="Re-download even if exists"),
+) -> None:
+    """Download GPUMode reference kernels from GitHub.
+    Downloads the problem set to ~/.cache/wafer/problems/gpumode/
+    Examples:
+        wafer evaluate gpumode download
+        wafer evaluate gpumode download --force  # Re-download
+    """
+    try:
+        path = download_problems("gpumode", force=force, verbose=True)
+        typer.echo("")
+        typer.echo(f"Problems available at: {path}")
+        typer.echo("Run 'wafer evaluate gpumode list-problems' to see available problems.")
+    except Exception as e:
+        typer.echo(f"Error downloading problems: {e}", err=True)
+        raise typer.Exit(1) from None
+@gpumode_app.command("list-problems")
+def gpumode_list_problems() -> None:
+    """List available GPUMode problems.
+    Examples:
+        wafer evaluate gpumode list-problems
+    """
+    try:
+        list_problems_fn("gpumode", verbose=True)
+    except ValueError as e:
+        typer.echo(str(e), err=True)
+        raise typer.Exit(1) from None
+@gpumode_app.command("make-template")
+def gpumode_make_template(
+    problem: str = typer.Option(
+        ...,
+        "--problem",
+        "-p",
+        help="Problem ID (e.g., 'pmpp/vectoradd_py' or 'amd/fp8-mm')",
+    ),
+    output: Path = typer.Option(
+        None, "--output", "-o", help="Output directory (default: ./<problem_name>/)"
+    ),
+    force: bool = typer.Option(False, "--force", "-f", help="Overwrite existing files"),
+) -> None:
+    """Extract a GPUMode problem as template files.
+    Creates a directory with reference.py, task.yml, and other problem files.
+    You then create kernel.py with your custom_kernel implementation.
+    Examples:
+        # Extract pmpp vectoradd problem
+        wafer evaluate gpumode make-template --problem pmpp/vectoradd_py
+        # Extract to specific directory
+        wafer evaluate gpumode make-template --problem pmpp/vectoradd_py --output ./my-kernel/
+    """
+    import shutil
+    # Get problem path
+    problem_path = get_problem_path("gpumode", problem)
+    if problem_path is None:
+        # Check if problems are downloaded
+        if get_problems_path("gpumode") is None:
+            typer.echo("Error: GPUMode problems not downloaded.", err=True)
+            typer.echo("Run 'wafer evaluate gpumode download' first.", err=True)
+        else:
+            typer.echo(f"Error: Problem '{problem}' not found.", err=True)
+            typer.echo(
+                "Run 'wafer evaluate gpumode list-problems' to see available problems.", err=True
+            )
+        raise typer.Exit(1)
+    # Determine output path
+    if output is None:
+        output = Path.cwd() / problem.replace("/", "_")
+    output = output.resolve()
+    # Check if exists
+    if output.exists() and not force:
+        typer.echo(f"Error: {output} already exists. Use --force to overwrite.", err=True)
+        raise typer.Exit(1)
+    # Copy the problem directory
+    if output.exists():
+        shutil.rmtree(output)
+    shutil.copytree(problem_path, output)
+    typer.echo(f"Created {output}/")
+    typer.echo("")
+    typer.echo("Contents:")
+    for f in sorted(output.iterdir()):
+        if not f.name.startswith("."):
+            typer.echo(f"  {f.name}")
+    typer.echo("")
+    typer.echo("Next steps:")
+    typer.echo("  1. Read reference.py to understand the kernel interface")
+    typer.echo("  2. Create kernel.py with your custom_kernel implementation:")
+    typer.echo("")
+    typer.echo("     def custom_kernel(data):")
+    typer.echo("         # Your optimized implementation")
+    typer.echo("         ...")
+    typer.echo("")
+    typer.echo("  3. Run evaluation:")
+    typer.echo(
+        f"     wafer evaluate gpumode --impl {output}/kernel.py --reference {output}/reference.py \\"
+    )
+    typer.echo(f"         --test-cases {output}/test_cases.json --target <target>")
+@gpumode_app.callback(invoke_without_command=True)
+def gpumode_evaluate(  # noqa: PLR0913, PLR0915
+    ctx: typer.Context,
+    implementation: Path | None = typer.Option(
+        None, "--impl", "-i", help="Path to implementation kernel file"
+    ),
+    reference: Path | None = typer.Option(
+        None, "--reference", help="Path to reference kernel file"
+    ),
+    test_cases: Path | None = typer.Option(
+        None, "--test-cases", help="Path to test cases JSON file"
+    ),
+    target: str | None = typer.Option(
+        None,
+        "--target",
+        "-t",
+        help="GPU target name. See 'wafer config targets list' for available targets.",
+        autocompletion=complete_target_name,
+    ),
+    pool: str | None = typer.Option(
+        None,
+        "--pool",
+        "-p",
+        help="Target pool name. Acquires first available target from the pool. "
+        "Define pools in ~/.wafer/config.toml under [pools.<name>].",
+    ),
+    benchmark: bool = typer.Option(False, "--benchmark", help="Run performance benchmarks"),
+    profile: bool = typer.Option(False, "--profile", help="Enable profiling"),
+    defensive: bool = typer.Option(
+        False, "--defensive", help="Enable defensive timing to detect evaluation hacking"
+    ),
+    sync_artifacts: bool = typer.Option(
+        True, "--sync-artifacts/--no-sync-artifacts", help="Download artifacts"
+    ),
+    gpu_id: int | None = typer.Option(None, "--gpu-id", help="Override GPU ID"),
+) -> None:
+    """Run kernel evaluation in GPUMode format (functional).
+    This format expects:
+    - Implementation: Python file with `custom_kernel(inputs)` function
+    - Reference: Python file with `ref_kernel(inputs)` and `generate_input(**kwargs)` functions
+    - Test cases: JSON file with test parameters
+    Examples:
+        # Basic correctness check
+        wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json
+        # With benchmarking
+        wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json \\
+            --target vultr-b200 --benchmark
+    Subcommands:
+        download       Download GPUMode problems from GitHub
+        list-problems  List available problems
+        make-template  Extract a problem as template files
+    """
+    # If a subcommand is being invoked, skip the main evaluation logic
+    if ctx.invoked_subcommand is not None:
+        return
+    # Validate required args when running evaluation (not subcommands)
+    missing_args = []
+    if implementation is None:
+        missing_args.append("--impl/-i")
+    if reference is None:
+        missing_args.append("--reference")
+    if test_cases is None:
+        missing_args.append("--test-cases")
+    if missing_args:
+        typer.echo("Error: Missing required arguments", err=True)
+        typer.echo(f"  Required: {', '.join(missing_args)}", err=True)
+        typer.echo("", err=True)
+        typer.echo(
+            "Usage: wafer evaluate gpumode --impl KERNEL.py --reference REF.py --test-cases TESTS.json",
+            err=True,
+        )
+        typer.echo("", err=True)
+        typer.echo("Run 'wafer evaluate gpumode --help' for full options.", err=True)
+        typer.echo("Run 'wafer evaluate gpumode download' to download problem sets.", err=True)
+        raise typer.Exit(1)
+    # Validate --target and --pool are mutually exclusive
+    if target and pool:
+        typer.echo("Error: Cannot specify both --target and --pool", err=True)
+        raise typer.Exit(1)
+    from .evaluate import EvaluateArgs, run_evaluate
+    # If pool specified, acquire a target from the pool
+    resolved_target = target or ""
+    pool_lock_context = None
+    if pool:
+        from .target_lock import acquire_from_pool
+        from .targets import filter_pool_by_auth, get_pool
+        try:
+            pool_targets = get_pool(pool)
+        except FileNotFoundError as e:
+            typer.echo(f"Error: {e}", err=True)
+            raise typer.Exit(1) from None
+        # Filter to only targets with valid auth
+        usable_targets, skipped = filter_pool_by_auth(pool_targets)
+        if skipped:
+            typer.echo(f"Skipping targets without auth: {', '.join(skipped)}", err=True)
+        if not usable_targets:
+            typer.echo(f"Error: No usable targets in pool '{pool}'", err=True)
+            typer.echo("  All targets require authentication that is not configured.", err=True)
+            typer.echo("  Run 'wafer auth status' to see which providers need setup.", err=True)
+            raise typer.Exit(1) from None
+        typer.echo(f"Acquiring target from pool '{pool}' ({len(usable_targets)} targets)...")
+        pool_lock_context = acquire_from_pool(usable_targets)
+        acquired_target = pool_lock_context.__enter__()
+        if acquired_target is None:
+            typer.echo(f"Error: All targets in pool '{pool}' are busy", err=True)
+            typer.echo(f"  Targets: {', '.join(usable_targets)}", err=True)
+            raise typer.Exit(1)
+        typer.echo(f"Acquired target: {acquired_target}")
+        resolved_target = acquired_target
+    args = EvaluateArgs(
+        implementation=implementation,
+        reference=reference,
+        test_cases=test_cases,
+        target_name=resolved_target,
+        benchmark=benchmark,
+        profile=profile,
+        defensive=defensive,
+        sync_artifacts=sync_artifacts,
+        gpu_id=gpu_id,
+    )
+    try:
+        import trio_asyncio
+        result = trio_asyncio.run(run_evaluate, args)
+    except KeyboardInterrupt:
+        typer.echo("\nInterrupted by user", err=True)
+        raise typer.Exit(130) from None
+    except Exception as e:
+        if hasattr(e, "exceptions") and e.exceptions:
+            for exc in e.exceptions:
+                typer.echo(f"Error: {type(exc).__name__}: {exc}", err=True)
+        else:
+            typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    finally:
+        # Release pool lock if we acquired one
+        if pool_lock_context is not None:
+            pool_lock_context.__exit__(None, None, None)
+    # Print results
+    if result.success:
+        typer.echo("")
+        typer.echo("=" * 60)
+        status = "PASS" if result.all_correct else "FAIL"
+        typer.echo(f"Result: {status}")
+        score_pct = f"{result.correctness_score:.1%}"
+        typer.echo(f"Correctness: {result.passed_tests}/{result.total_tests} ({score_pct})")
+        if result.geomean_speedup > 0:
+            typer.echo(f"Speedup: {result.geomean_speedup:.2f}x")
+        if result.artifact_path:
+            typer.echo(f"Artifacts: {result.artifact_path}")
+        typer.echo("=" * 60)
+        if not result.all_correct:
+            raise typer.Exit(1)
+    else:
+        typer.echo(f"Error: {result.error_message}", err=True)
         raise typer.Exit(1)
-    # Copy the file
-    content = problem_file.read_text()
-    output.parent.mkdir(parents=True, exist_ok=True)
-    output.write_text(content)
-    typer.echo(f"Created {output}")
-    typer.echo("")
-    typer.echo("Next steps:")
-    typer.echo(f"  1. Read {output} to understand the Model interface")
-    typer.echo("  2. Create an implementation file with your ModelNew class:")
-    typer.echo("")
-    typer.echo("     import torch.nn as nn")
-    typer.echo("")
-    typer.echo("     class ModelNew(nn.Module):")
-    typer.echo("         def __init__(self, ...):")
-    typer.echo("             # Same signature as Model.__init__")
-    typer.echo("             ...")
-    typer.echo("")
-    typer.echo("         def forward(self, ...):")
-    typer.echo("             # Same signature as Model.forward")
-    typer.echo("             # Your optimized implementation here")
-    typer.echo("             ...")
-    typer.echo("")
-    typer.echo("  3. Run evaluation:")
-    typer.echo(f"     wafer evaluate kernelbench --impl my_kernel.py --reference {output}")
 # =============================================================================
 # Push and Remote-Run commands
@@ -1871,7 +2372,7 @@ def _run_direct_mode(
         typer.echo(f"Uploading {upload_dir.name}...")
         try:
             push_result = push_direct(upload_dir, target)
-            workspace_name = push_result.workspace_path
+            workspace_name = push_result.workspace_name
             typer.echo(f"Uploaded {len(push_result.files_uploaded)} files")
         except Exception as e:
             typer.echo(f"Error uploading: {e}", err=True)
@@ -2044,27 +2545,41 @@ def login(
         None, "--token", "-t", help="Access token (skip browser OAuth)"
     ),
     port: int | None = typer.Option(
-        None, "--port", "-p", help="Port for OAuth callback server (default: 8765 for SSH, random for local)"
+        None,
+        "--port",
+        "-p",
+        help="Port for OAuth callback server (local only, ignored for SSH)",
+    ),
+    no_device_code: bool = typer.Option(
+        False,
+        "--no-device-code",
+        help="Force browser OAuth even on SSH (requires port forwarding)",
     ),
 ) -> None:
     """Authenticate CLI with wafer-api via GitHub OAuth.
-    Opens browser for GitHub authentication. Use --token to skip browser.
+    Local: Opens browser for GitHub authentication.
+    SSH: Uses device code flow (no port forwarding needed).
     Uses the API environment from config (see 'wafer config show').
-    SSH Users:
-    - Automatically uses port 8765 (just set up port forwarding once)
-    - On local machine: ssh -L 8765:localhost:8765 user@host
-    - On remote machine: wafer login
-    - Browser opens locally, redirect works through tunnel
+    SSH Users (Easiest):
+    - Just run: wafer login
+    - Visit the URL and enter the code shown
+    - No port forwarding needed!
+    SSH with browser (Advanced):
+    - Use --no-device-code to force browser flow
+    - Requires: ssh -L 8765:localhost:8765 user@host
     Manual token option:
     - Visit auth.wafer.ai, authenticate, copy token from URL
     - Run: wafer login --token <paste-token>
     Examples:
-        wafer login                    # auto-detects SSH, uses appropriate port
-        wafer login --port 9000        # override port
+        wafer login                    # device code on SSH, browser on local
+        wafer login --no-device-code   # force browser (needs port forwarding on SSH)
+        wafer login --port 9000        # custom port for browser flow
         wafer login --token xyz        # manual token (no browser)
         # Change environment:
@@ -2073,7 +2588,7 @@ def login(
     """
     import httpx
-    from .auth import browser_login, save_credentials, verify_token
+    from .auth import browser_login, device_code_login, save_credentials, verify_token
     from .global_config import get_api_url, get_supabase_url, load_global_config
     # Show which environment we're logging into
@@ -2083,21 +2598,31 @@ def login(
     typer.echo(f"Auth: {get_supabase_url()}")
     typer.echo("")
-    # Auto-detect SSH and use fixed port
-    if port is None:
-        is_ssh = bool(os.environ.get("SSH_CONNECTION") or os.environ.get("SSH_CLIENT"))
-        if is_ssh:
-            port = 8765
-            typer.echo("🔒 SSH session detected - using port 8765 for OAuth callback")
-            typer.echo("   Make sure you have port forwarding set up:")
-            typer.echo("   ssh -L 8765:localhost:8765 user@host")
-            typer.echo("")
+    # Auto-detect SSH
+    is_ssh = bool(os.environ.get("SSH_CONNECTION") or os.environ.get("SSH_CLIENT"))
-    # Browser OAuth if no token provided
+    # Choose auth method
     refresh_token = None
     if token is None:
         try:
-            token, refresh_token = browser_login(port=port)
+            if is_ssh and not no_device_code:
+                # Use device code flow for SSH (no port forwarding needed)
+                typer.echo("🔒 SSH session detected - using device code authentication")
+                typer.echo("   (No port forwarding required!)")
+                typer.echo("")
+                token, refresh_token = device_code_login()
+            else:
+                # Use browser OAuth for local or if explicitly requested
+                if is_ssh:
+                    typer.echo("🔒 SSH session detected - using browser authentication")
+                    typer.echo("   Make sure you have port forwarding set up:")
+                    if port is None:
+                        port = 8765
+                        typer.echo(f"   ssh -L {port}:localhost:{port} user@host")
+                    else:
+                        typer.echo(f"   ssh -L {port}:localhost:{port} user@host")
+                    typer.echo("")
+                token, refresh_token = browser_login(port=port)
         except TimeoutError as e:
             typer.echo(f"Error: {e}", err=True)
             raise typer.Exit(1) from None
@@ -2146,9 +2671,8 @@ def login(
 @app.command("logout")
 def logout() -> None:
     """Remove stored credentials."""
-    from .auth import clear_credentials
     from . import analytics
+    from .auth import clear_credentials
     # Track logout event first (while credentials still exist for user identification)
     # Note: track_logout() handles the case where user is not logged in
@@ -2625,6 +3149,7 @@ init_app = typer.Typer(
 Choose based on your GPU access:
+  local        GPU on current machine (no SSH)
   ssh          Your own hardware via SSH
   runpod       RunPod cloud GPUs (needs WAFER_RUNPOD_API_KEY)
   digitalocean DigitalOcean AMD MI300X (needs WAFER_AMD_DIGITALOCEAN_API_KEY)"""
@@ -2632,6 +3157,92 @@ Choose based on your GPU access:
 targets_app.add_typer(init_app, name="init")
+@init_app.command("local")
+def init_local(
+    name: str = typer.Option("local", "--name", "-n", help="Target name"),
+    gpu_ids: str = typer.Option("0", "--gpu-ids", "-g", help="Comma-separated GPU IDs"),
+) -> None:
+    """Initialize a local target for GPU on current machine.
+    Detects your local GPU and configures a target for direct execution
+    (no SSH). Use this when running wafer on the same machine as the GPU.
+    Examples:
+        wafer config targets init local
+        wafer config targets init local --name my-5090 --gpu-ids 0,1
+    """
+    from .targets import save_target
+    # Turn the comma-separated option into a list of ints; any non-integer
+    # piece aborts the command with exit code 1.
+    try:
+        parsed_gpu_ids = [int(piece.strip()) for piece in gpu_ids.split(",")]
+    except ValueError:
+        typer.echo(f"Error: Invalid GPU IDs '{gpu_ids}'. Use comma-separated integers.", err=True)
+        raise typer.Exit(1) from None
+    typer.echo("Detecting local GPU...")
+    try:
+        from wafer_core.gpu_detect import (
+            detect_local_gpu,
+            get_compute_capability,
+            get_torch_requirements,
+        )
+        detected_gpu = detect_local_gpu()
+        # Nothing to configure without a GPU, so bail out early.
+        if not detected_gpu:
+            typer.echo("  No GPU detected (nvidia-smi/rocm-smi not found)", err=True)
+            raise typer.Exit(1)
+        typer.echo(f"  Found: {detected_gpu.gpu_name}")
+        # The driver version is reported under the vendor's toolkit name.
+        toolkit_label = "CUDA" if detected_gpu.vendor == "nvidia" else "ROCm"
+        typer.echo(f"  {toolkit_label}: {detected_gpu.driver_version}")
+        typer.echo(f"  GPU count: {detected_gpu.gpu_count}")
+        # Derive the torch requirements, compute capability and short GPU type
+        # from the detected device.
+        torch_reqs = get_torch_requirements(detected_gpu)
+        compute_capability = get_compute_capability(detected_gpu)
+        gpu_type = _extract_gpu_type(detected_gpu.gpu_name)
+        # Only the first listed package is stored and displayed.
+        torch_package = torch_reqs.packages[0]
+        typer.echo(f"  PyTorch: {torch_package}")
+    except ImportError as e:
+        typer.echo(f"Error: Missing dependency: {e}", err=True)
+        raise typer.Exit(1) from None
+    # Assemble the target record handed to save_target().
+    target_data = {
+        "name": name,
+        "type": "local",
+        "gpu_ids": parsed_gpu_ids,
+        "gpu_type": gpu_type,
+        "compute_capability": compute_capability,
+        "torch_package": torch_package,
+        "torch_index_url": torch_reqs.index_url,
+        "vendor": detected_gpu.vendor,
+        "driver_version": detected_gpu.driver_version,
+    }
+    try:
+        target = save_target(target_data)
+        summary_lines = (
+            f"✓ Created target: {target.name}",
+            "  Type: Local (no SSH)",
+            f"  GPU IDs: {parsed_gpu_ids}",
+            f"  GPU Type: {gpu_type}",
+            f"  Compute: {compute_capability}",
+            f"  Torch: {torch_package}",
+            "",
+            f"Usage: wafer evaluate --target {name} --impl kernel.py --reference ref.py --test-cases tests.json",
+        )
+        for summary_line in summary_lines:
+            typer.echo(summary_line)
+    except (ValueError, AssertionError) as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
 @init_app.command("runpod")
 def init_runpod(
     name: str = typer.Option("runpod-mi300x", "--name", "-n", help="Target name"),
@@ -2795,23 +3406,29 @@ def init_ssh(
     host: str = typer.Option(..., "--host", "-H", help="SSH host (user@hostname:port)"),
     ssh_key: str = typer.Option("~/.ssh/id_ed25519", "--ssh-key", "-k", help="Path to SSH key"),
     gpu_ids: str = typer.Option("0", "--gpu-ids", "-g", help="Comma-separated GPU IDs"),
-    gpu_type: str = typer.Option(
-        "H100", "--gpu-type", help="GPU type (H100, A100, B200, MI300X, etc.)"
+    gpu_type: str | None = typer.Option(
+        None, "--gpu-type", help="GPU type (auto-detected if not specified)"
     ),
     docker_image: str | None = typer.Option(
         None, "--docker-image", "-d", help="Docker image (optional)"
     ),
     ncu: bool = typer.Option(False, "--ncu/--no-ncu", help="NCU profiling available"),
+    no_detect: bool = typer.Option(False, "--no-detect", help="Skip GPU auto-detection"),
 ) -> None:
     """Initialize an SSH target for your own GPU hardware.
     Creates a target config for direct SSH access to a GPU machine.
-    Use for baremetal servers, VMs, or any machine you have SSH access to.
+    Automatically detects GPU type and selects compatible PyTorch version.
     Examples:
+        # Auto-detect GPU (recommended)
         wafer config targets init ssh --name my-gpu --host user@192.168.1.100:22
+        # Multiple GPUs with NCU profiling
         wafer config targets init ssh --name lab-h100 --host ubuntu@gpu.lab.com:22 --gpu-ids 0,1 --ncu
-        wafer config targets init ssh --name docker-gpu --host user@host:22 --docker-image nvcr.io/nvidia/pytorch:24.01-py3
+        # Skip detection, specify manually
+        wafer config targets init ssh --name my-gpu --host user@host:22 --gpu-type H100 --no-detect
     """
     from .targets import save_target
@@ -2828,17 +3445,86 @@ def init_ssh(
         typer.echo("Example: user@192.168.1.100:22", err=True)
         raise typer.Exit(1)
+    # Auto-detect GPU if not specified
+    detected_gpu = None
+    torch_package = None
+    torch_index_url = None
+    if not no_detect:
+        typer.echo(f"Connecting to {host}...")
+        try:
+            import trio
+            import trio_asyncio
+            from wafer_core.async_ssh import AsyncSSHClient
+            from wafer_core.gpu_detect import (
+                detect_remote_gpu,
+                get_compute_capability,
+                get_torch_requirements,
+            )
+            expanded_key = str(Path(ssh_key).expanduser())
+            async def _detect() -> None:
+                nonlocal detected_gpu, torch_package, torch_index_url
+                # Need trio_asyncio.open_loop() for asyncssh bridge
+                async with trio_asyncio.open_loop():
+                    async with AsyncSSHClient(host, expanded_key) as client:
+                        detected_gpu = await detect_remote_gpu(client)
+            trio.run(_detect)
+            if detected_gpu:
+                typer.echo(f"  Found: {detected_gpu.gpu_name}")
+                if detected_gpu.vendor == "nvidia":
+                    typer.echo(f"  CUDA: {detected_gpu.driver_version}")
+                else:
+                    typer.echo(f"  ROCm: {detected_gpu.driver_version}")
+                # Get torch requirements
+                torch_reqs = get_torch_requirements(detected_gpu)
+                torch_package = torch_reqs.packages[0]  # Just torch, not all packages
+                torch_index_url = torch_reqs.index_url
+                typer.echo(f"  PyTorch: {torch_package}")
+                # Use detected GPU type if not specified
+                if not gpu_type:
+                    # Extract GPU name (e.g., "H100" from "NVIDIA H100 80GB HBM3")
+                    gpu_type = _extract_gpu_type(detected_gpu.gpu_name)
+            else:
+                typer.echo("  No GPU detected (nvidia-smi/rocm-smi not found)")
+                if not gpu_type:
+                    gpu_type = "H100"  # Default fallback
+                    typer.echo(f"  Using default: {gpu_type}")
+        except Exception as e:
+            typer.echo(f"  Detection failed: {e}", err=True)
+            if not gpu_type:
+                gpu_type = "H100"
+                typer.echo(f"  Using default: {gpu_type}")
+    # Fallback if no detection
+    if not gpu_type:
+        gpu_type = "H100"
     # Compute capability mappings
-    compute_caps = {
-        "B200": "10.0",
-        "H100": "9.0",
-        "A100": "8.0",
-        "A10": "8.6",
-        "V100": "7.0",
-        "MI300X": "9.4",
-        "MI250X": "9.0",
-    }
-    compute_capability = compute_caps.get(gpu_type, "8.0")
+    if detected_gpu:
+        from wafer_core.gpu_detect import get_compute_capability
+        compute_capability = get_compute_capability(detected_gpu)
+    else:
+        compute_caps = {
+            "B200": "10.0",
+            "H100": "9.0",
+            "A100": "8.0",
+            "A10": "8.6",
+            "V100": "7.0",
+            "MI300X": "9.4",
+            "MI250X": "9.0",
+            "RTX 5090": "10.0",
+            "RTX 4090": "8.9",
+            "RTX 3090": "8.6",
+        }
+        compute_capability = compute_caps.get(gpu_type, "8.0")
     # Build target data
     target_data = {
@@ -2855,6 +3541,12 @@ def init_ssh(
     if docker_image:
         target_data["docker_image"] = docker_image
+    # Add torch requirements if detected
+    if torch_package:
+        target_data["torch_package"] = torch_package
+    if torch_index_url:
+        target_data["torch_index_url"] = torch_index_url
     try:
         target = save_target(target_data)
         typer.echo(f"✓ Created target: {target.name}")
@@ -2862,9 +3554,12 @@ def init_ssh(
         typer.echo(f"  Host: {host}")
         typer.echo(f"  GPU IDs: {parsed_gpu_ids}")
         typer.echo(f"  GPU Type: {gpu_type}")
+        typer.echo(f"  Compute: {compute_capability}")
         typer.echo(f"  NCU: {'Yes' if ncu else 'No'}")
         if docker_image:
             typer.echo(f"  Docker: {docker_image}")
+        if torch_package:
+            typer.echo(f"  Torch: {torch_package}")
         typer.echo("")
         typer.echo(
             f"Usage: wafer evaluate --target {name} --impl kernel.py --reference ref.py --test-cases tests.json"
@@ -2874,6 +3569,44 @@ def init_ssh(
         raise typer.Exit(1) from None
+def _extract_gpu_type(gpu_name: str) -> str:
+    """Extract the short GPU type from a full GPU name.
+    Known types are matched case-insensitively, in list order. A match is
+    rejected when it is immediately followed by another digit, so a longer
+    model number such as "A1000" is not mistaken for "A100" or "A10".
+    Args:
+        gpu_name: Full device name as reported by the GPU tooling.
+    Returns:
+        The first known type found in the name, or the name with the
+        "NVIDIA " / "AMD " vendor prefix removed when nothing matches.
+    Examples:
+        "NVIDIA H100 80GB HBM3" -> "H100"
+        "NVIDIA GeForce RTX 4090" -> "RTX 4090"
+        "AMD Instinct MI300X OAM" -> "MI300X"
+        "NVIDIA RTX A1000" -> "RTX A1000"
+    """
+    import re
+    gpu_name_upper = gpu_name.upper()
+    # Check for known GPU types
+    known_types = [
+        "B200",
+        "B100",
+        "H200",
+        "H100",
+        "A100",
+        "A10",
+        "V100",
+        "RTX 5090",
+        "RTX 5080",
+        "RTX 4090",
+        "RTX 4080",
+        "RTX 3090",
+        "RTX 3080",
+        "MI300X",
+        "MI250X",
+        "MI100",
+    ]
+    for gpu_type in known_types:
+        # The negative lookahead keeps a type from matching the leading
+        # digits of a longer model number (e.g. "A10" inside "A1000").
+        if re.search(rf"{re.escape(gpu_type)}(?![0-9])", gpu_name_upper):
+            return gpu_type
+    # Fallback: return cleaned name
+    return gpu_name.replace("NVIDIA ", "").replace("AMD ", "").strip()
 @targets_app.command("add")
 def targets_add(
     file_path: Path = typer.Argument(..., help="Path to target TOML file"),
@@ -2956,6 +3689,93 @@ def targets_show(
         raise typer.Exit(1) from None
+@targets_app.command("probe")
+def targets_probe(
+    name: str = typer.Argument(..., help="Target name"),
+) -> None:
+    """Probe a target to discover available compilation backends.
+    Connects to the target and checks what's available:
+    - Triton
+    - torch.compile/inductor
+    - HIP/hipcc or CUDA/nvcc
+    - ROCm or CUDA version
+    - Python packages (torch, triton, etc.)
+    Example:
+        wafer config targets probe runpod-mi300x
+    """
+    import trio
+    from .targets import ProbeError, load_target, probe_target_capabilities
+    # Load the saved target config; an unknown name exits with code 1.
+    try:
+        target = load_target(name)
+    except FileNotFoundError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    typer.echo(f"Probing target: {name}...")
+    try:
+        capabilities = trio.run(probe_target_capabilities, target)
+    except ProbeError as e:
+        # ProbeError already has actionable context
+        typer.echo(f"\nError: {e}", err=True)
+        raise typer.Exit(1) from None
+    except Exception as e:
+        # Unexpected errors - include type for debugging
+        typer.echo(f"\nUnexpected error probing target: {type(e).__name__}: {e}", err=True)
+        raise typer.Exit(1) from None
+    # Display results
+    typer.echo(f"\nTarget: {name}")
+    if capabilities.get("gpu_name"):
+        typer.echo(f"  GPU: {capabilities['gpu_name']}")
+    if capabilities.get("compute_capability"):
+        typer.echo(f"  Compute: {capabilities['compute_capability']}")
+    typer.echo("\n  Compilation Backends:")
+    backends = capabilities.get("backends", {})
+    # Triton
+    triton_ver = backends.get("triton")
+    if triton_ver:
+        typer.echo(f"    ✓ Triton: {triton_ver}")
+    else:
+        typer.echo("    ✗ Triton: not installed")
+    # torch.compile is reported available only when both Triton and torch
+    # were found; otherwise name the component that is actually missing
+    # instead of always blaming Triton.
+    if triton_ver and backends.get("torch"):
+        typer.echo("    ✓ torch.compile/inductor: available")
+    elif not triton_ver:
+        typer.echo("    ✗ torch.compile/inductor: requires Triton")
+    else:
+        typer.echo("    ✗ torch.compile/inductor: requires PyTorch")
+    # HIP/CUDA compiler (hipcc takes precedence when both are reported)
+    if backends.get("hipcc"):
+        typer.echo(f"    ✓ HIP/hipcc: {backends['hipcc']}")
+    elif backends.get("nvcc"):
+        typer.echo(f"    ✓ CUDA/nvcc: {backends['nvcc']}")
+    else:
+        typer.echo("    ✗ No GPU compiler found")
+    # ROCm/CUDA version
+    if capabilities.get("rocm_version"):
+        typer.echo(f"    ROCm: {capabilities['rocm_version']}")
+    if capabilities.get("cuda_version"):
+        typer.echo(f"    CUDA: {capabilities['cuda_version']}")
+    typer.echo("\n  Python Environment:")
+    typer.echo(f"    Python: {capabilities.get('python_version', 'unknown')}")
+    packages = capabilities.get("packages", {})
+    if packages.get("torch"):
+        typer.echo(f"    PyTorch: {packages['torch']}")
+    if triton_ver:
+        typer.echo(f"    Triton: {triton_ver}")
 @targets_app.command("remove")
 def targets_remove(
     name: str = typer.Argument(..., help="Target name"),
@@ -3086,6 +3906,92 @@ def targets_pods() -> None:
         typer.echo()
+# ── Pool commands ───────────────────────────────────────────────────────────
+@targets_app.command("pool-list")
+def targets_pool_list() -> None:
+    """List all configured target pools.
+    Example:
+        wafer config targets pool-list
+    """
+    from .targets import get_pool, list_pools
+    pool_names = list_pools()
+    # With no pools defined, print a short how-to and stop.
+    if not pool_names:
+        for hint in (
+            "No pools configured",
+            "",
+            "Define pools in ~/.wafer/config.toml:",
+            "  [pools.my-pool]",
+            '  targets = ["target-1", "target-2"]',
+        ):
+            typer.echo(hint)
+        return
+    typer.echo("Configured pools:\n")
+    for pool_name in pool_names:
+        # A pool that fails to load is shown inline and does not abort the listing.
+        try:
+            members = get_pool(pool_name)
+            typer.echo(f"  {pool_name}: {', '.join(members)}")
+        except Exception as e:
+            typer.echo(f"  {pool_name}: (error: {e})")
+@targets_app.command("pool-create")
+def targets_pool_create(
+    name: str = typer.Argument(..., help="Pool name"),
+    targets: list[str] = typer.Argument(..., help="Target names to include in pool"),
+) -> None:
+    """Create or update a target pool.
+    Example:
+        wafer config targets pool-create mi300x-pool mi300x-1 mi300x-2 mi300x-3
+    """
+    from .targets import save_pool
+    # save_pool() signals a FileNotFoundError, which is turned into a
+    # one-line error and exit code 1.
+    try:
+        save_pool(name, targets)
+    except FileNotFoundError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    else:
+        typer.echo(f"Pool '{name}' created with {len(targets)} targets")
+@targets_app.command("pool-status")
+def targets_pool_status(
+    name: str = typer.Argument(..., help="Pool name"),
+) -> None:
+    """Show status of targets in a pool (locked/available).
+    Example:
+        wafer config targets pool-status mi300x-pool
+    """
+    from .target_lock import get_lock_holder, is_target_locked
+    from .targets import get_pool
+    try:
+        members = get_pool(name)
+    except FileNotFoundError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    total = len(members)
+    typer.echo(f"Pool '{name}' ({total} targets):\n")
+    free_count = 0
+    for member in members:
+        # Unlocked targets are counted as available.
+        if not is_target_locked(member):
+            typer.echo(f"  [free]  {member}")
+            free_count += 1
+            continue
+        # The lock holder's pid is shown only when one is reported.
+        holder = get_lock_holder(member)
+        holder_suffix = f" (pid {holder})" if holder else ""
+        typer.echo(f"  [busy]  {member}{holder_suffix}")
+    typer.echo("")
+    typer.echo(f"Available: {free_count}/{total}")
 # =============================================================================
 # Billing commands
 # =============================================================================
@@ -3119,7 +4025,9 @@ def billing_usage(
 @billing_app.command("topup")
 def billing_topup(
     amount: int = typer.Argument(25, help="Amount in dollars ($10-$500)"),
-    no_browser: bool = typer.Option(False, "--no-browser", help="Print URL instead of opening browser"),
+    no_browser: bool = typer.Option(
+        False, "--no-browser", help="Print URL instead of opening browser"
+    ),
 ) -> None:
     """Add credits to your account.
@@ -3165,7 +4073,9 @@ def billing_topup(
 @billing_app.command("portal")
 def billing_portal(
-    no_browser: bool = typer.Option(False, "--no-browser", help="Print URL instead of opening browser"),
+    no_browser: bool = typer.Option(
+        False, "--no-browser", help="Print URL instead of opening browser"
+    ),
 ) -> None:
     """Open Stripe billing portal.
@@ -3319,7 +4229,7 @@ def workspaces_exec(
     workspace: str | None = typer.Argument(
         None, help="Workspace name or ID (optional if default set)"
     ),
-    command: list[str] = typer.Argument(..., help="Command to execute on GPU"),
+    command: list[str] = typer.Argument(..., help="Command to execute"),
     timeout: int | None = typer.Option(
         None,
         "--timeout",
@@ -3332,13 +4242,23 @@ def workspaces_exec(
         "-s",
         help="Sync local directory to workspace before executing",
     ),
+    gpu: bool = typer.Option(False, "--gpu", help="Force GPU routing (default behavior)"),
+    cpu: bool = typer.Option(False, "--cpu", help="Run in workspace container (no GPU)"),
+    baremetal: bool = typer.Option(
+        False, "--baremetal", help="Force baremetal target (for hardware counters like ncu/nsys)"
+    ),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
     quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
 ) -> None:
-    """Execute a command in workspace with GPU routing.
+    """Execute a command in workspace.
+    By default, auto-detects whether to route to GPU based on the command.
+    Use --gpu, --cpu, or --baremetal to override.
-    Runs the command on the workspace's configured GPU target (Modal, baremetal, etc.)
-    and streams output back. No SSH or zsh plugin required.
+    Routing options:
+      --gpu       Force GPU container (Modal or baremetal with GPU)
+      --cpu       Run in workspace container directly (no GPU)
+      --baremetal Force baremetal target (for ncu, nsys, hardware counters)
     If workspace is not specified, uses the default workspace from config,
     or the only workspace if you have exactly one.
@@ -3353,6 +4273,21 @@ def workspaces_exec(
     from .global_config import get_defaults, get_preferences
     from .workspaces import exec_command, resolve_workspace, sync_files
+    # Validate mutually exclusive routing flags
+    routing_flags = sum([gpu, cpu, baremetal])
+    if routing_flags > 1:
+        typer.echo("Error: --gpu, --cpu, and --baremetal are mutually exclusive", err=True)
+        raise typer.Exit(1)
+    # Determine routing (None = auto-detect)
+    routing: str | None = None
+    if gpu:
+        routing = "gpu"
+    elif cpu:
+        routing = "cpu"
+    elif baremetal:
+        routing = "baremetal"
     # Resolve workspace (specified, config default, or single workspace)
     try:
         resolved_workspace = resolve_workspace(workspace)
@@ -3377,7 +4312,8 @@ def workspaces_exec(
         show_status = prefs.mode == "explicit"
     if show_status:
-        typer.echo(f"[wafer] Workspace: {resolved_workspace}", err=True)
+        routing_label = routing or "auto"
+        typer.echo(f"[wafer] Workspace: {resolved_workspace} (routing: {routing_label})", err=True)
     # Sync files if requested
     if sync is not None:
@@ -3413,8 +4349,15 @@ def workspaces_exec(
         # Remove leading "--" if present (typer passes it through with allow_interspersed_args=False)
         if command and command[0] == "--":
             command = command[1:]
-        # Use shlex.join to properly quote args containing spaces/special chars
-        command_str = shlex.join(command)
+        # Handle two cases:
+        # 1. Single element: user quoted the whole command (e.g., "echo hello world")
+        #    -> use directly, don't re-quote
+        # 2. Multiple elements: user passed separate args (e.g., -- python -c "print(1)")
+        #    -> use shlex.join to properly quote args with spaces
+        if len(command) == 1:
+            command_str = command[0]
+        else:
+            command_str = shlex.join(command)
     else:
         command_str = command
@@ -3423,6 +4366,7 @@ def workspaces_exec(
             workspace_id=resolved_workspace,
             command=command_str,
             timeout_seconds=effective_timeout,
+            routing=routing,
         )
     except RuntimeError as e:
         typer.echo(f"Error: {e}", err=True)
@@ -4441,8 +5385,8 @@ def _setup_wafer_core_env() -> None:
     - WAFER_API_URL: If already set, uses that instead of config
     - WAFER_AUTH_TOKEN: If already set, uses that instead of cached token
     """
-    from .global_config import get_api_url
     from .auth import get_valid_token
+    from .global_config import get_api_url
     # Set API URL (get_api_url already respects WAFER_API_URL env var)
     os.environ["WAFER_API_URL"] = get_api_url()
@@ -4746,8 +5690,8 @@ def capture_command(  # noqa: PLR0915
     import os
     import tomllib
-    from .global_config import get_api_url
     from .auth import get_valid_token
+    from .global_config import get_api_url
     # Set environment variables for wafer-core BEFORE importing it
     # wafer-core backend.py reads WAFER_API_URL and WAFER_AUTH_TOKEN from env
@@ -4951,8 +5895,8 @@ def capture_list_command(
     """
     import os
-    from .global_config import get_api_url
     from .auth import get_valid_token
+    from .global_config import get_api_url
     # Set environment variables for wafer-core BEFORE importing it
     os.environ["WAFER_API_URL"] = get_api_url()
@@ -5301,6 +6245,98 @@ def isa_analyze(
         raise typer.Exit(1) from None
+# =============================================================================
+# Kernel Scope Commands (wafer amd kernel-scope ...)
+# =============================================================================
+@kernel_scope_app.command("analyze")
+def kernel_scope_analyze(
+    path: Path = typer.Argument(..., help="Path to file or directory to analyze"),
+    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
+    csv_output: bool = typer.Option(False, "--csv", help="Output as CSV"),
+    recursive: bool = typer.Option(
+        True, "--recursive/--no-recursive", "-r", help="Scan directories recursively"
+    ),
+    filter_expr: str | None = typer.Option(
+        None, "--filter", "-f", help="Filter results (e.g., 'spills > 0')"
+    ),
+    output_file: Path | None = typer.Option(None, "--output", "-o", help="Write output to file"),
+    kernel_index: int = typer.Option(0, "--kernel", "-k", help="Kernel index if multiple in file"),
+) -> None:
+    """Analyze Triton compilation artifacts (ISA, LLVM-IR, TTGIR).
+    Performs static analysis to extract performance metrics like register
+    pressure, spills, MFMA density, and occupancy limits.
+    Supports:
+      - AMDGCN ISA files (.s, .gcn, .asm)
+      - LLVM-IR files (.ll)
+      - TTGIR files (.ttgir, .ttir, .mlir)
+    Examples:
+        wafer amd kernel-scope analyze kernel.s
+        wafer amd kernel-scope analyze kernel.s --json
+        wafer amd kernel-scope analyze ~/.triton/cache/ --filter 'spills > 0'
+        wafer amd kernel-scope analyze . -r --csv -o metrics.csv
+    """
+    from .kernel_scope import analyze_command
+    # analyze_command() is called with plain strings rather than Path objects.
+    source_path = str(path)
+    destination = str(output_file) if output_file else None
+    try:
+        report = analyze_command(
+            path=source_path,
+            json_output=json_output,
+            csv_output=csv_output,
+            recursive=recursive,
+            filter_expr=filter_expr,
+            output_file=destination,
+            kernel_index=kernel_index,
+        )
+        typer.echo(report)
+    except Exception as e:
+        # Every failure is reported the same way: a one-line message on
+        # stderr and exit code 1.
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+@kernel_scope_app.command("metrics")
+def kernel_scope_metrics() -> None:
+    """List available metrics for kernel scope analysis.
+    Shows all metrics that can be extracted from Triton compilation
+    artifacts, along with their derivation.
+    Examples:
+        wafer amd kernel-scope metrics
+    """
+    from .kernel_scope import metrics_command
+    # Print whatever metrics_command() returns.
+    typer.echo(metrics_command())
+@kernel_scope_app.command("targets")
+def kernel_scope_targets() -> None:
+    """List supported GPU targets and their specifications.
+    Shows hardware specs (VGPRs, SGPRs, LDS, etc.) for each supported
+    AMD GPU architecture.
+    Examples:
+        wafer amd kernel-scope targets
+    """
+    from .kernel_scope import targets_command
+    # Print whatever targets_command() returns.
+    typer.echo(targets_command())
 def main() -> None:
     """Entry point for wafer CLI.
     Invokes the module-level Typer application ``app``, which parses the
     command line and dispatches to the matching registered command.
     """
     app()

wafer-cli 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

wafer-cli 0.2.7py3-none-any.whl → 0.2.9py3-none-any.whl