PyPI - wafer-cli - Versions diffs - 0.2.32__py3-none-any.whl → 0.2.33__py3-none-any.whl - Mend

wafer-cli: 0.2.32__py3-none-any.whl → 0.2.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26):

wafer/GUIDE.md +1 -1
wafer/agent_defaults.py +157 -2
wafer/billing.py +6 -6
wafer/cli.py +432 -348
wafer/corpus.py +6 -72
wafer/evaluate.py +143 -81
wafer/global_config.py +0 -13
wafer/kernel_scope.py +1 -1
wafer/ncu_analyze.py +1 -1
wafer/nsys_analyze.py +1 -1
wafer/skills/wafer-guide/SKILL.md +6 -22
wafer/ssh_keys.py +6 -6
wafer/targets_ops.py +2 -29
wafer/templates/aiter_optimize.py +59 -0
wafer/templates/optimize_kernel.py +2 -4
wafer/templates/optimize_kernelbench.py +62 -17
wafer/templates/optimize_vllm.py +156 -0
wafer/trace_compare.py +48 -139
wafer/wevin_cli.py +1 -12
wafer/workspaces.py +8 -8
wafer_cli-0.2.33.dist-info/METADATA +260 -0
{wafer_cli-0.2.32.dist-info → wafer_cli-0.2.33.dist-info}/RECORD +25 -23
wafer_cli-0.2.32.dist-info/METADATA +0 -107
{wafer_cli-0.2.32.dist-info → wafer_cli-0.2.33.dist-info}/WHEEL +0 -0
{wafer_cli-0.2.32.dist-info → wafer_cli-0.2.33.dist-info}/entry_points.txt +0 -0
{wafer_cli-0.2.32.dist-info → wafer_cli-0.2.33.dist-info}/top_level.txt +0 -0

wafer/cli.py (changed)

@@ -8,7 +8,6 @@
 Core commands:
   agent       AI assistant for GPU kernel development
   evaluate    Test kernel correctness and performance
-  baseline    Discover what kernel PyTorch uses for an op
   corpus      Download GPU documentation for local access
   workspaces  Manage cloud GPU environments
@@ -195,16 +194,11 @@ def complete_target_name(incomplete: str) -> list[str]:
 # =============================================================================
 # Core subcommand groups (visible in --help)
-#
-# TODO: Further consolidate top-level commands to reduce --help surface area.
-# Candidates:
-#   - compare → wafer nvidia compare or keep top-level (cross-platform)
-#   - guide/skill/demo → wafer onboard {guide,skill,demo}
 # =============================================================================
 # Config management (includes targets as nested subcommand)
 config_app = typer.Typer(help="Manage CLI configuration and local GPU targets")
-app.add_typer(config_app, name="config", rich_help_panel="Configuration")
+app.add_typer(config_app, name="config")
 # Target management - nested under config
 targets_app = typer.Typer(
@@ -224,7 +218,7 @@ config_app.add_typer(targets_app, name="targets")
 workspaces_app = typer.Typer(
     help="""Manage cloud GPU workspaces for remote development.
-Workspaces are on-demand cloud GPU environments. Requires authentication (wafer auth login).
+Workspaces are on-demand cloud GPU environments. Requires authentication (wafer login).
 Available GPUs:
   MI300X  AMD Instinct MI300X (192GB HBM3, ROCm)
@@ -237,21 +231,21 @@ Commands:
   wafer workspaces sync dev ./project      # Sync files
   wafer workspaces delete dev              # Clean up"""
 )
-app.add_typer(workspaces_app, name="workspaces", rich_help_panel="Infrastructure")
+app.add_typer(workspaces_app, name="workspaces")
-# SSH Key management (BYOK - Bring Your Own Key) - nested under config
+# SSH Key management (BYOK - Bring Your Own Key)
 ssh_keys_app = typer.Typer(
     help="""Manage SSH public keys for workspace access.
 Register your SSH public keys here. These keys are installed in all workspaces
 you provision, enabling SSH access from any machine with your private key.
-  wafer config ssh-keys list              # List registered keys
-  wafer config ssh-keys add               # Add key (auto-detects ~/.ssh/id_ed25519.pub)
-  wafer config ssh-keys add ~/.ssh/id_rsa.pub --name laptop  # Add specific key
-  wafer config ssh-keys remove <key-id>   # Remove a key"""
+  wafer ssh-keys list              # List registered keys
+  wafer ssh-keys add               # Add key (auto-detects ~/.ssh/id_ed25519.pub)
+  wafer ssh-keys add ~/.ssh/id_rsa.pub --name laptop  # Add specific key
+  wafer ssh-keys remove <key-id>   # Remove a key"""
 )
-config_app.add_typer(ssh_keys_app, name="ssh-keys")
+app.add_typer(ssh_keys_app, name="ssh-keys")
 # Target operations (exec/ssh/sync on configured targets)
 targets_ops_app = typer.Typer(
@@ -267,48 +261,22 @@ Useful for exploratory work, debugging, or custom scripts.
 Supports: RunPod, DigitalOcean (auto-provisions), SSH targets (baremetal/vm).
 Configure targets with: wafer config targets init ..."""
 )
-app.add_typer(targets_ops_app, name="targets", rich_help_panel="Infrastructure")
+app.add_typer(targets_ops_app, name="targets")
-# Specs management (new: local TOML configs)
-from wafer.specs_cli import specs_app
-app.add_typer(specs_app, name="specs", rich_help_panel="Configuration")
-# Live resource management (new: API-backed commands on `wafer targets`)
-# These become: wafer targets list, wafer targets terminate, etc.
-from wafer.targets_cli import (
-    targets_list as _targets_list_cmd,
-)
-from wafer.targets_cli import (
-    targets_pools as _targets_pools_cmd,
-)
-from wafer.targets_cli import (
-    targets_probe as _targets_probe_cmd,
-)
-from wafer.targets_cli import (
-    targets_provision as _targets_provision_cmd,
-)
-from wafer.targets_cli import (
-    targets_reconcile as _targets_reconcile_cmd,
-)
-from wafer.targets_cli import (
-    targets_terminate as _targets_terminate_cmd,
-)
-# Billing management - nested under config
+# Billing management
 billing_app = typer.Typer(help="Manage billing, credits, and subscription")
-config_app.add_typer(billing_app, name="billing")
+app.add_typer(billing_app, name="billing")
 # Corpus management
 corpus_app = typer.Typer(help="Download and manage GPU documentation")
-app.add_typer(corpus_app, name="corpus", rich_help_panel="Kernel Development")
+app.add_typer(corpus_app, name="corpus")
 # Evaluate (supports multiple kernel formats)
 evaluate_app = typer.Typer(
     help="Test kernel correctness and performance",
     invoke_without_command=True,
 )
-app.add_typer(evaluate_app, name="evaluate", rich_help_panel="Kernel Development")
+app.add_typer(evaluate_app, name="evaluate")
 # Nested subcommand for kernelbench format
 kernelbench_app = typer.Typer(
@@ -324,11 +292,6 @@ gpumode_app = typer.Typer(
 )
 evaluate_app.add_typer(gpumode_app, name="gpumode")
-# Baseline discovery (what kernel does PyTorch use?)
-from wafer.baseline import baseline_app
-app.add_typer(baseline_app, name="baseline", rich_help_panel="Kernel Development")
 # =============================================================================
 # Dev commands (internal, used by web app proxy)
 # =============================================================================
@@ -342,7 +305,7 @@ app.add_typer(dev_app, name="dev")
 # =============================================================================
 nvidia_app = typer.Typer(help="NVIDIA GPU profiling and analysis tools")
-app.add_typer(nvidia_app, name="nvidia", rich_help_panel="Profiling")
+app.add_typer(nvidia_app, name="nvidia")
 # NCU analysis - under nvidia
 ncu_app = typer.Typer(help="Nsight Compute profile analysis")
@@ -365,7 +328,7 @@ nvidia_app.add_typer(tracelens_app, name="tracelens")
 # =============================================================================
 amd_app = typer.Typer(help="AMD GPU profiling and analysis tools")
-app.add_typer(amd_app, name="amd", rich_help_panel="Profiling")
+app.add_typer(amd_app, name="amd")
 # Unified ISA Analyzer - supports both .co files and Triton artifacts
 isa_app = typer.Typer(help="ISA analysis for AMD GPU kernels (.co, .s, .ll, .ttgir files)")
@@ -376,14 +339,14 @@ amd_app.add_typer(isa_app, name="isa")
 # =============================================================================
 compare_app = typer.Typer(help="Compare GPU traces across platforms (AMD vs NVIDIA)")
-app.add_typer(compare_app, name="compare", rich_help_panel="Profiling")
+app.add_typer(compare_app, name="compare")
 # =============================================================================
 # Roofline analysis (wafer roofline)
 # =============================================================================
-@app.command("roofline", rich_help_panel="Kernel Development")
+@app.command("roofline")
 def roofline_cmd(
     gpu: str | None = typer.Option(
         None, "--gpu", "-g", help="GPU name (e.g., H100, B200, MI300X, A100)"
@@ -474,7 +437,7 @@ def roofline_cmd(
 # =============================================================================
 skill_app = typer.Typer(help="Manage AI coding assistant skills (Claude Code, Codex)")
-app.add_typer(skill_app, name="skill", rich_help_panel="Onboarding")
+app.add_typer(skill_app, name="skill")
 @skill_app.command("install")
@@ -638,19 +601,14 @@ def skill_status() -> None:
 # =============================================================================
-# Authentication (wafer auth ...)
+# Provider auth management (wafer auth ...)
 # =============================================================================
-auth_app = typer.Typer(help="Authenticate with Wafer and cloud GPU providers")
-app.add_typer(auth_app, name="auth", rich_help_panel="Configuration")
-providers_app = typer.Typer(
-    help="Manage API keys for cloud GPU providers (RunPod, DigitalOcean, etc.)"
-)
-auth_app.add_typer(providers_app, name="providers")
+provider_auth_app = typer.Typer(help="Manage API keys for cloud GPU providers")
+app.add_typer(provider_auth_app, name="auth")
-@providers_app.command("login")
+@provider_auth_app.command("login")
 def provider_auth_login(
     provider: str = typer.Argument(
         ...,
@@ -669,10 +627,10 @@ def provider_auth_login(
     (e.g., ANTHROPIC_API_KEY) take precedence over stored keys.
     Examples:
-        wafer auth providers login anthropic --api-key sk-ant-xxx
-        wafer auth providers login runpod --api-key rp_xxx
-        wafer auth providers login openai --api-key sk-xxx
-        echo $API_KEY | wafer auth providers login anthropic
+        wafer auth login anthropic --api-key sk-ant-xxx
+        wafer auth login runpod --api-key rp_xxx
+        wafer auth login openai --api-key sk-xxx
+        echo $API_KEY | wafer auth login anthropic
     """
     import sys
@@ -702,7 +660,7 @@ def provider_auth_login(
     typer.echo("Stored in: ~/.wafer/auth.json")
-@providers_app.command("logout")
+@provider_auth_app.command("logout")
 def provider_auth_logout(
     provider: str = typer.Argument(
         ...,
@@ -712,8 +670,8 @@ def provider_auth_logout(
     """Remove stored API key for a cloud GPU provider.
     Examples:
-        wafer auth providers logout runpod
-        wafer auth providers logout digitalocean
+        wafer auth logout runpod
+        wafer auth logout digitalocean
     """
     from wafer_core.auth import PROVIDERS, remove_api_key
@@ -729,7 +687,7 @@ def provider_auth_logout(
         typer.echo(f"No stored API key found for {PROVIDERS[provider]['display_name']}")
-@providers_app.command("status")
+@provider_auth_app.command("status")
 def provider_auth_status() -> None:
     """Show authentication status for all cloud GPU providers.
@@ -737,7 +695,7 @@ def provider_auth_status() -> None:
     the keys are coming from (environment variable or auth.json).
     Example:
-        wafer auth providers status
+        wafer auth status
     """
     from wafer_core.auth import get_all_auth_status
@@ -752,7 +710,7 @@ def provider_auth_status() -> None:
             typer.echo(f"  {status.display_name}: ✓ {status.key_preview} {source_str}")
         else:
             typer.echo(f"  {status.display_name}: ✗ Not configured")
-            typer.echo(f"      Run: wafer auth providers login {status.provider}")
+            typer.echo(f"      Run: wafer auth login {status.provider}")
             typer.echo(f"      Or set: {status.key_url}")
     typer.echo("")
@@ -1297,7 +1255,7 @@ def config_show_legacy() -> None:
     config_show_new()
-@app.command(rich_help_panel="Kernel Development")
+@app.command()
 def agent(  # noqa: PLR0913
     prompt: str | None = typer.Argument(
         None,
@@ -1539,7 +1497,6 @@ def _make_agent_alias(name: str, doc: str) -> None:
         template_args: list[str] | None = typer.Option(None, "--args"),
         corpus: str | None = typer.Option(None, "--corpus"),
         no_sandbox: bool = typer.Option(False, "--no-sandbox"),
-        no_proxy: bool = typer.Option(False, "--no-proxy", help="Skip wafer proxy, use ANTHROPIC_API_KEY directly"),
     ) -> None:
         agent(
             prompt=prompt,
@@ -1560,7 +1517,6 @@ def _make_agent_alias(name: str, doc: str) -> None:
             template_args=template_args,
             corpus=corpus,
             no_sandbox=no_sandbox,
-            no_proxy=no_proxy,  # Must explicitly pass to avoid Typer default object being truthy
         )
     alias_cmd.__doc__ = doc
@@ -1584,11 +1540,7 @@ def evaluate(  # noqa: PLR0913
         None, "--reference", help="Path to reference kernel file"
     ),
     test_cases: Path | None = typer.Option(
-        None,
-        "--test-cases",
-        help="Path to test cases JSON file. "
-        'Format: [{"name": "small", "n": 1024, "seed": 42}, ...]. '
-        "Run 'wafer evaluate make-template' to generate an example.",
+        None, "--test-cases", help="Path to test cases JSON file"
     ),
     target: str | None = typer.Option(
         None,
@@ -1600,9 +1552,7 @@ def evaluate(  # noqa: PLR0913
     benchmark: bool = typer.Option(False, "--benchmark", help="Run performance benchmarks"),
     profile: bool = typer.Option(False, "--profile", help="Enable profiling"),
     defensive: bool = typer.Option(
-        True,
-        "--defense/--no-defense",
-        help="Run reward hack defense checks after benchmarking. Enabled by default.",
+        False, "--defensive", help="Enable defensive timing to detect evaluation hacking"
     ),
     sync_artifacts: bool = typer.Option(
         True, "--sync-artifacts/--no-sync-artifacts", help="Download artifacts"
@@ -1616,24 +1566,24 @@ def evaluate(  # noqa: PLR0913
     The evaluation checks:
       1. Correctness: Does the kernel produce the same output as the reference?
       2. Performance (--benchmark): How fast is it compared to the reference?
-      3. Defense: Detects reward hacking (runs automatically with benchmark, disable with --no-defense)
+      3. Defense (--defensive): Detects evaluation hacking (stream injection, etc.)
     Examples:
         # Basic correctness check
-        wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json
+        wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json
-        # With benchmarking (defense checks run automatically)
-        wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json \\
+        # With benchmarking on a specific target
+        wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json \\
             --target vultr-b200 --benchmark
-        # Benchmarking without defense checks
-        wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json \\
-            --benchmark --no-defense
+        # Full evaluation with defensive timing (detects cheating)
+        wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json \\
+            --benchmark --defensive
     Subcommands:
         gpumode        Use GPUMode format (functional) - RECOMMENDED
         kernelbench    Use KernelBench format (ModelNew class)
-        make-template  Generate template files for this format
+        make-template  Generate template files for this format (deprecated)
     """
     # If a subcommand is being invoked, skip the main evaluation logic
     if ctx.invoked_subcommand is not None:
@@ -1787,7 +1737,7 @@ def evaluate_make_template(
     typer.echo(f"  2. Edit {output_dir / 'reference.py'} with the ground truth + input generator")
     typer.echo(f"  3. Edit {output_dir / 'test_cases.json'} with your test parameters")
     typer.echo("  4. Run:")
-    typer.echo(f"     wafer evaluate gpumode --impl {output_dir / 'kernel.py'} \\")
+    typer.echo(f"     wafer evaluate --impl {output_dir / 'kernel.py'} \\")
     typer.echo(f"         --reference {output_dir / 'reference.py'} \\")
     typer.echo(f"         --test-cases {output_dir / 'test_cases.json'} --benchmark")
@@ -1851,95 +1801,6 @@ def kernelbench_list_problems() -> None:
         raise typer.Exit(1) from None
-def _resolve_pool_query(pool: str, collector) -> tuple[str, object]:
-    """Resolve a PoolQuery pool to a target spec name + lock context.
-    Queries live providers, matches by pool query, locks one target,
-    returns (spec_name, lock_context) for the evaluator.
-    """
-    import trio
-    from wafer_core.targets.pool import resolve_pool
-    from .target_lock import acquire_from_pool
-    matched_targets = trio.run(resolve_pool, pool)
-    if not matched_targets:
-        collector.set_error("pool", "NoMatchingTargets", pool=pool)
-        collector.finalize()
-        raise typer.Exit(1)
-    # Filter to targets with a spec (evaluator needs spec fields)
-    spec_targets = [t for t in matched_targets if t.spec_name]
-    if not spec_targets:
-        collector.set_error(
-            "pool",
-            "NoSpecTargets",
-            pool=pool,
-            message="Matched targets have no spec binding — evaluator needs spec fields",
-        )
-        collector.finalize()
-        raise typer.Exit(1)
-    # Lock one by resource_id
-    resource_ids = [t.resource_id for t in spec_targets]
-    collector.emit("pool_acquire", pool=pool, count=len(resource_ids))
-    lock_ctx = acquire_from_pool(resource_ids)
-    acquired_id = lock_ctx.__enter__()
-    if acquired_id is None:
-        lock_ctx.__exit__(None, None, None)
-        collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=resource_ids)
-        collector.finalize()
-        raise typer.Exit(1)
-    # Map resource_id back to spec_name
-    acquired_target = next(t for t in spec_targets if t.resource_id == acquired_id)
-    spec_name = acquired_target.spec_name
-    collector.emit("pool_acquired", target=spec_name, resource_id=acquired_id)
-    return spec_name, lock_ctx
-def _resolve_pool_legacy(pool: str, collector) -> tuple[str, object]:
-    """Resolve an old-style pool (static target name list) to a target name + lock context.
-    Old format: [pools.name] targets = ["t1", "t2"]
-    """
-    from .target_lock import acquire_from_pool
-    from .targets import filter_pool_by_auth, get_pool
-    try:
-        pool_targets = get_pool(pool)
-    except FileNotFoundError as e:
-        collector.set_error("pool", "PoolNotFound", pool=pool, message=str(e))
-        collector.finalize()
-        raise typer.Exit(1) from None
-    usable_targets, skipped = filter_pool_by_auth(pool_targets)
-    if skipped:
-        collector.emit("pool_auth_skip", targets=skipped)
-    if not usable_targets:
-        collector.set_error("pool", "NoUsableTargets", pool=pool)
-        collector.finalize()
-        raise typer.Exit(1) from None
-    collector.emit("pool_acquire", pool=pool, count=len(usable_targets))
-    lock_ctx = acquire_from_pool(usable_targets)
-    acquired_target = lock_ctx.__enter__()
-    if acquired_target is None:
-        lock_ctx.__exit__(None, None, None)
-        collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=usable_targets)
-        collector.finalize()
-        raise typer.Exit(1)
-    collector.emit("pool_acquired", target=acquired_target)
-    return acquired_target, lock_ctx
 @kernelbench_app.callback(invoke_without_command=True)
 def kernelbench_evaluate(  # noqa: PLR0913, PLR0915
     ctx: typer.Context,
@@ -1975,9 +1836,7 @@ def kernelbench_evaluate(  # noqa: PLR0913, PLR0915
     ),
     seed: int = typer.Option(42, "--seed", help="Random seed for weight initialization"),
     defensive: bool = typer.Option(
-        True,
-        "--defense/--no-defense",
-        help="Run reward hack defense checks after benchmarking. Enabled by default.",
+        False, "--defensive", help="Enable defensive timing to detect evaluation hacking"
     ),
     backend: str | None = typer.Option(
         None,
@@ -2017,20 +1876,16 @@ def kernelbench_evaluate(  # noqa: PLR0913, PLR0915
     The evaluation checks:
       1. Correctness: Does ModelNew.forward() produce same output as Model.forward()?
       2. Performance (--benchmark): How fast is it compared to the reference?
-      3. Defense: Detects reward hacking (runs automatically with benchmark, disable with --no-defense)
+      3. Defense (--defensive): Detects evaluation hacking
     Examples:
         # Basic correctness check
         wafer evaluate kernelbench --impl my_kernel.py --reference problem.py
-        # With benchmarking (defense checks run automatically)
+        # With benchmarking
         wafer evaluate kernelbench --impl my_kernel.py --reference problem.py \\
             --target vultr-b200 --benchmark
-        # Benchmarking without defense checks
-        wafer evaluate kernelbench --impl my_kernel.py --reference problem.py \\
-            --target vultr-b200 --benchmark --no-defense
     Subcommands:
         make-template  Extract a KernelBench problem as template
     """
@@ -2076,12 +1931,39 @@ def kernelbench_evaluate(  # noqa: PLR0913, PLR0915
     pool_lock_context = None
     if pool:
-        from wafer_core.targets.pool import is_query_pool
+        from .target_lock import acquire_from_pool
+        from .targets import filter_pool_by_auth, get_pool
-        if is_query_pool(pool):
-            resolved_target, pool_lock_context = _resolve_pool_query(pool, collector)
-        else:
-            resolved_target, pool_lock_context = _resolve_pool_legacy(pool, collector)
+        try:
+            pool_targets = get_pool(pool)
+        except FileNotFoundError as e:
+            collector.set_error("pool", "PoolNotFound", pool=pool, message=str(e))
+            collector.finalize()
+            raise typer.Exit(1) from None
+        # Filter to only targets with valid auth
+        usable_targets, skipped = filter_pool_by_auth(pool_targets)
+        if skipped:
+            collector.emit("pool_auth_skip", targets=skipped)
+        if not usable_targets:
+            collector.set_error("pool", "NoUsableTargets", pool=pool)
+            collector.finalize()
+            raise typer.Exit(1) from None
+        collector.emit("pool_acquire", pool=pool, count=len(usable_targets))
+        pool_lock_context = acquire_from_pool(usable_targets)
+        acquired_target = pool_lock_context.__enter__()
+        if acquired_target is None:
+            # Exit context manager before raising to avoid resource leak
+            pool_lock_context.__exit__(None, None, None)
+            collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=usable_targets)
+            collector.finalize()
+            raise typer.Exit(1)
+        collector.emit("pool_acquired", target=acquired_target)
+        resolved_target = acquired_target
     collector.target = resolved_target
@@ -2090,15 +1972,12 @@ def kernelbench_evaluate(  # noqa: PLR0913, PLR0915
     if stages == "all":
         resolved_stages = "compile,correctness,benchmark,defense"
-    # Handle --benchmark and --defense/--no-defense flags
+    # Handle backward compat: --benchmark and --defensive flags add to stages
     stage_set = set(resolved_stages.split(","))
     if benchmark and "benchmark" not in stage_set:
         stage_set.add("benchmark")
-    # Defense runs automatically when benchmarking, unless --no-defense
-    if defensive and "benchmark" in stage_set and "defense" not in stage_set:
+    if defensive and "defense" not in stage_set:
         stage_set.add("defense")
-    if not defensive:
-        stage_set.discard("defense")
     resolved_stages = ",".join(
         sorted(
             stage_set,
@@ -2409,11 +2288,7 @@ def gpumode_evaluate(  # noqa: PLR0913, PLR0915
         None, "--reference", help="Path to reference kernel file"
     ),
     test_cases: Path | None = typer.Option(
-        None,
-        "--test-cases",
-        help="Path to test cases JSON file. "
-        'Format: [{"name": "small", "n": 1024, "seed": 42}, ...]. '
-        "Run 'wafer evaluate make-template' to generate an example.",
+        None, "--test-cases", help="Path to test cases JSON file"
     ),
     target: str | None = typer.Option(
         None,
@@ -2432,9 +2307,7 @@ def gpumode_evaluate(  # noqa: PLR0913, PLR0915
     benchmark: bool = typer.Option(False, "--benchmark", help="Run performance benchmarks"),
     profile: bool = typer.Option(False, "--profile", help="Enable profiling"),
     defensive: bool = typer.Option(
-        True,
-        "--defense/--no-defense",
-        help="Run reward hack defense checks after benchmarking. Enabled by default.",
+        False, "--defensive", help="Enable defensive timing to detect evaluation hacking"
     ),
     sync_artifacts: bool = typer.Option(
         True, "--sync-artifacts/--no-sync-artifacts", help="Download artifacts"
@@ -2483,13 +2356,6 @@ def gpumode_evaluate(  # noqa: PLR0913, PLR0915
             err=True,
         )
         typer.echo("", err=True)
-        if "--test-cases" in missing_args:
-            typer.echo(
-                "Tip: Run 'wafer evaluate make-template' to generate template files "
-                "including test_cases.json.",
-                err=True,
-            )
-            typer.echo("", err=True)
         typer.echo("Run 'wafer evaluate gpumode --help' for full options.", err=True)
         typer.echo("Run 'wafer evaluate gpumode download' to download problem sets.", err=True)
         raise typer.Exit(1)
@@ -2590,12 +2456,313 @@ def gpumode_evaluate(  # noqa: PLR0913, PLR0915
     else:
         typer.echo(f"Error: {result.error_message}", err=True)
         raise typer.Exit(1)
+# =============================================================================
+# Push and Remote-Run commands
+# =============================================================================
+@app.command("push", hidden=True)
+def push(
+    local_path: Path = typer.Argument(..., help="Local directory to upload"),
+    workspace: str | None = typer.Option(None, "--workspace", "-w", help="Workspace name override"),
+    direct: bool = typer.Option(False, "--direct", "-d", help="Use direct SSH instead of API"),
+    target_name: str | None = typer.Option(
+        None,
+        "--target",
+        "-t",
+        help="Target for --direct mode. See 'wafer config targets list'.",
+        autocompletion=complete_target_name,
+    ),
+) -> None:
+    """Push directory to remote GPU.
+    By default, uses wafer-api. Use --direct for direct SSH mode.
+    Examples:
+        wafer push ./my_project
+        wafer push . --workspace my-kernel
+        wafer push ./my_project --direct --target vultr-b200
+    """
+    # Validate path
+    if not local_path.exists():
+        typer.echo(f"Error: Path not found: {local_path}", err=True)
+        raise typer.Exit(1)
+    if not local_path.is_dir():
+        typer.echo(f"Error: Not a directory: {local_path}", err=True)
+        raise typer.Exit(1)
+    # Resolve to absolute path
+    local_path = local_path.resolve()
+    if direct:
+        # Direct SSH mode (requires target)
+        if not target_name:
+            typer.echo("Error: --target required for --direct mode", err=True)
+            raise typer.Exit(1)
+        from wafer_core.utils.kernel_utils.targets.config import ModalTarget
+        from .gpu_run import push_directory as push_direct
+        from .targets import load_target
+        try:
+            target = load_target(target_name)
+        except FileNotFoundError:
+            typer.echo(f"Error: Target not found: {target_name}", err=True)
+            typer.echo("List targets with: wafer config targets list", err=True)
+            raise typer.Exit(1) from None
+        if isinstance(target, ModalTarget):
+            typer.echo(
+                f"Error: Target '{target_name}' is a Modal target. Direct push requires SSH.",
+                err=True,
+            )
+            raise typer.Exit(1) from None
+        typer.echo(f"Connecting to {target.ssh_target}...")
+        try:
+            result = push_direct(local_path, target)
+        except Exception as e:
+            typer.echo(f"Error: {e}", err=True)
+            raise typer.Exit(1) from None
+        typer.echo(f"Uploading {len(result.files_uploaded)} files to {result.workspace_path}")
+        for f in result.files_uploaded:
+            typer.echo(f"  ✓ {f}")
+        typer.echo(f"Pushed to: {result.workspace_path}")
+    else:
+        # API mode (default)
+        from .api_client import push_directory as push_api
+        workspace_name = workspace or local_path.name
+        typer.echo(f"Pushing {local_path.name} to wafer-api...")
+        try:
+            result = push_api(local_path, workspace_name)
+        except Exception as e:
+            typer.echo(f"Error: {e}", err=True)
+            raise typer.Exit(1) from None
+        typer.echo(f"Uploaded {len(result.files_uploaded)} files")
+        for f in result.files_uploaded:
+            typer.echo(f"  ✓ {f}")
+        typer.echo(f"Workspace ID: {result.workspace_id}")
+def _run_direct_mode(
+    cmd_str: str,
+    target_name: str,
+    upload_dir: Path | None,
+    workspace_id: str | None,
+    gpu_id: int | None,
+) -> int:
+    """Run command via direct SSH mode. Returns exit code."""
+    from wafer_core.utils.kernel_utils.targets.config import ModalTarget
+    from .gpu_run import push_directory as push_direct
+    from .gpu_run import run_command as run_direct
+    from .targets import load_target
+    try:
+        target = load_target(target_name)
+    except FileNotFoundError:
+        typer.echo(f"Error: Target not found: {target_name}", err=True)
+        typer.echo("List targets with: wafer config targets list", err=True)
+        raise typer.Exit(1) from None
+    if isinstance(target, ModalTarget):
+        typer.echo(
+            f"Error: Target '{target_name}' is a Modal target. Direct mode requires SSH.", err=True
+        )
+        raise typer.Exit(1) from None
+    if not target.docker_image:
+        typer.echo(f"Error: Target '{target_name}' has no docker_image configured", err=True)
+        raise typer.Exit(1)
+    # If upload_dir provided, push first
+    workspace_name = workspace_id
+    if upload_dir:
+        typer.echo(f"Uploading {upload_dir.name}...")
+        try:
+            push_result = push_direct(upload_dir, target)
+            workspace_name = push_result.workspace_name
+            typer.echo(f"Uploaded {len(push_result.files_uploaded)} files")
+        except Exception as e:
+            typer.echo(f"Error uploading: {e}", err=True)
+            raise typer.Exit(1) from None
+    elif not workspace_name:
+        workspace_name = "tmp"
+    effective_gpu = gpu_id if gpu_id is not None else target.gpu_ids[0]
+    typer.echo(f"Target: {target_name} (docker: {target.docker_image})")
+    typer.echo(f"Workspace: {workspace_name}")
+    typer.echo(f"GPU: {effective_gpu}")
+    typer.echo(f"Command: {cmd_str}")
+    typer.echo("-" * 60)
+    try:
+        return run_direct(cmd_str, workspace_name, target, gpu_id)
+    except KeyboardInterrupt:
+        typer.echo("\nInterrupted by user", err=True)
+        raise typer.Exit(130) from None
+    except Exception as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+def _run_api_mode(  # noqa: PLR0913
+    cmd_str: str,
+    upload_dir: Path | None,
+    workspace_id: str | None,
+    gpu_id: int | None,
+    gpu_count: int,
+    docker_image: str | None,
+    docker_entrypoint: str | None,
+    pull_image: bool,
+    require_hwc: bool,
+) -> int:
+    """Run command via wafer-api. Returns exit code."""
+    from .api_client import run_command_stream
+    if upload_dir:
+        typer.echo(f"Uploading: {upload_dir}")
+    elif workspace_id:
+        typer.echo(f"Workspace: {workspace_id}")
+    if gpu_id is not None:
+        typer.echo(f"GPU: {gpu_id}")
+    if gpu_count > 1:
+        typer.echo(f"GPU count: {gpu_count}")
+    if docker_image:
+        typer.echo(f"Image: {docker_image}")
+    if docker_entrypoint:
+        typer.echo(f"Entrypoint: {docker_entrypoint}")
+    if pull_image:
+        typer.echo("Pull image: yes")
+    typer.echo(f"Command: {cmd_str}")
+    if require_hwc:
+        typer.echo("Hardware counters: required (baremetal)")
+    typer.echo("-" * 60)
+    try:
+        return run_command_stream(
+            command=cmd_str,
+            upload_dir=upload_dir,
+            workspace_id=workspace_id,
+            gpu_id=gpu_id,
+            gpu_count=gpu_count,
+            docker_image=docker_image,
+            docker_entrypoint=docker_entrypoint,
+            pull_image=pull_image,
+            require_hardware_counters=require_hwc,
+        )
+    except KeyboardInterrupt:
+        typer.echo("\nInterrupted by user", err=True)
+        raise typer.Exit(130) from None
+    except Exception as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+@app.command("remote-run", hidden=True)
+def remote_run(  # noqa: PLR0913
+    command: list[str] = typer.Argument(..., help="Command to run"),
+    upload_dir: Path | None = typer.Option(
+        None, "--upload-dir", "-u", help="Directory to upload (stateless mode)"
+    ),
+    workspace_id: str | None = typer.Option(
+        None, "--workspace-id", "-w", help="Workspace ID (from wafer push)"
+    ),
+    gpu_id: int | None = typer.Option(None, "--gpu", "-g", help="GPU ID"),
+    gpu_count: int = typer.Option(1, "--gpu-count", "-n", help="Number of GPUs (1-8)"),
+    docker_image: str | None = typer.Option(None, "--image", "-i", help="Docker image override"),
+    docker_entrypoint: str | None = typer.Option(
+        None, "--docker-entrypoint", help="Override Docker entrypoint (e.g., 'bash')"
+    ),
+    pull_image: bool = typer.Option(
+        False, "--pull-image", help="Pull image if not available on target"
+    ),
+    require_hwc: bool = typer.Option(
+        False, "--require-hwc", help="Require hardware counters (baremetal)"
+    ),
+    direct: bool = typer.Option(False, "--direct", "-d", help="Use direct SSH instead of API"),
+    target_name: str | None = typer.Option(
+        None,
+        "--target",
+        "-t",
+        help="Target for --direct mode. See 'wafer config targets list'.",
+        autocompletion=complete_target_name,
+    ),
+) -> None:
+    """Run command on remote GPU in Docker.
+    Two modes:
+    - High-level (stateless): --upload-dir uploads files and runs command
+    - Low-level: --workspace-id uses existing workspace from 'wafer push'
+    By default, uses wafer-api. Use --direct for direct SSH mode.
+    Examples:
+        # Stateless: upload and run
+        wafer remote-run --upload-dir ./my_project -- python train.py
+        # Run without files
+        wafer remote-run -- nvidia-smi
+        # Low-level: use existing workspace
+        wafer remote-run --workspace-id ws_abc123 -- python train.py
+        # Direct SSH mode
+        wafer remote-run --upload-dir ./my_project --direct --target vultr-b200 -- python train.py
+    """
+    cmd_str = " ".join(command)
+    if not cmd_str.strip():
+        typer.echo("Error: Empty command", err=True)
+        raise typer.Exit(1)
+    if upload_dir and workspace_id:
+        typer.echo("Error: --upload-dir and --workspace-id are mutually exclusive", err=True)
+        raise typer.Exit(1)
+    if upload_dir:
+        if not upload_dir.exists():
+            typer.echo(f"Error: Directory not found: {upload_dir}", err=True)
+            raise typer.Exit(1)
+        if not upload_dir.is_dir():
+            typer.echo(f"Error: Not a directory: {upload_dir}", err=True)
+            raise typer.Exit(1)
+        upload_dir = upload_dir.resolve()
+    if direct:
+        if not target_name:
+            typer.echo("Error: --target required for --direct mode", err=True)
+            raise typer.Exit(1)
+        exit_code = _run_direct_mode(cmd_str, target_name, upload_dir, workspace_id, gpu_id)
+    else:
+        exit_code = _run_api_mode(
+            cmd_str,
+            upload_dir,
+            workspace_id,
+            gpu_id,
+            gpu_count,
+            docker_image,
+            docker_entrypoint,
+            pull_image,
+            require_hwc,
+        )
+    raise typer.Exit(exit_code)
 # =============================================================================
 # Authentication commands
 # =============================================================================
-@auth_app.command("login")
+@app.command("login")
 def login(
     token: str | None = typer.Option(
         None, "--token", "-t", help="Access token (skip browser OAuth)"
@@ -2620,7 +2787,7 @@ def login(
     Uses the API environment from config (see 'wafer config show').
     SSH Users (Easiest):
-    - Just run: wafer auth login
+    - Just run: wafer login
     - Visit the URL and enter the code shown
     - No port forwarding needed!
@@ -2630,17 +2797,17 @@ def login(
     Manual token option:
     - Visit auth.wafer.ai, authenticate, copy token from URL
-    - Run: wafer auth login --token <paste-token>
+    - Run: wafer login --token <paste-token>
     Examples:
-        wafer auth login                    # device code on SSH, browser on local
-        wafer auth login --no-device-code   # force browser (needs port forwarding on SSH)
-        wafer auth login --port 9000        # custom port for browser flow
-        wafer auth login --token xyz        # manual token (no browser)
+        wafer login                    # device code on SSH, browser on local
+        wafer login --no-device-code   # force browser (needs port forwarding on SSH)
+        wafer login --port 9000        # custom port for browser flow
+        wafer login --token xyz        # manual token (no browser)
         # Change environment:
         wafer config set api.environment staging
-        wafer auth login
+        wafer login
     """
     import httpx
@@ -2724,7 +2891,7 @@ def login(
     typer.echo("Token saved to ~/.wafer/credentials.json")
-@auth_app.command("logout")
+@app.command("logout")
 def logout() -> None:
     """Remove stored credentials."""
     from . import analytics
@@ -2741,7 +2908,7 @@ def logout() -> None:
         typer.echo("Not logged in (no credentials found).")
-@auth_app.command("whoami")
+@app.command("whoami")
 def whoami(
     verify: bool = typer.Option(False, "--verify", "-v", help="Verify token with API"),
     refresh: bool = typer.Option(False, "--refresh", "-r", help="Refresh token if expired"),
@@ -2755,7 +2922,7 @@ def whoami(
     creds = load_credentials()
     if creds is None:
-        typer.echo("Not logged in. Run: wafer auth login")
+        typer.echo("Not logged in. Run: wafer login")
         raise typer.Exit(1)
     if verify or refresh:
@@ -2763,7 +2930,7 @@ def whoami(
             # Try to get valid token with auto-refresh
             token = get_valid_token()
             if token is None:
-                typer.echo("Token expired and refresh failed. Run: wafer auth login", err=True)
+                typer.echo("Token expired and refresh failed. Run: wafer login", err=True)
                 raise typer.Exit(1)
             if token != creds.access_token:
                 typer.echo("Token refreshed successfully")
@@ -2776,10 +2943,10 @@ def whoami(
         except Exception as e:
             if creds.refresh_token and not refresh:
                 typer.echo(f"Token expired: {e}", err=True)
-                typer.echo("Try: wafer auth whoami --refresh", err=True)
+                typer.echo("Try: wafer whoami --refresh", err=True)
             else:
                 typer.echo(f"Token invalid or expired: {e}", err=True)
-                typer.echo("Run: wafer auth login", err=True)
+                typer.echo("Run: wafer login", err=True)
             raise typer.Exit(1) from None
     elif creds.email:
         typer.echo(creds.email)
@@ -2787,7 +2954,7 @@ def whoami(
         typer.echo("Logged in (email not available)")
-@app.command("guide", rich_help_panel="Onboarding")
+@app.command("guide")
 def guide() -> None:
     """Show the Wafer CLI usage guide.
@@ -2818,7 +2985,7 @@ demo_app = typer.Typer(
   wafer demo trace  Analyze a sample performance trace
   wafer demo eval   Run kernel evaluation on cloud GPU (requires login)"""
 )
-app.add_typer(demo_app, name="demo", rich_help_panel="Onboarding")
+app.add_typer(demo_app, name="demo")
 DEMO_TRACES_URL = "https://github.com/wafer-ai/wafer/raw/main/apps/wafer-cli/wafer/demo_data"
 DEMO_DIR = Path.home() / ".cache" / "wafer" / "demo"
@@ -3038,7 +3205,7 @@ def demo_eval(
     """Demo: Evaluate a kernel on a cloud GPU.
     Creates a workspace, runs a sample Triton kernel evaluation, and cleans up.
-    Requires authentication (wafer auth login).
+    Requires authentication (wafer login).
     Example:
         wafer demo eval
@@ -3053,7 +3220,7 @@ def demo_eval(
     # Check auth first
     creds = load_credentials()
     if not creds:
-        typer.echo("Error: Not authenticated. Run: wafer auth login")
+        typer.echo("Error: Not authenticated. Run: wafer login")
         raise typer.Exit(1)
     if not yes:
@@ -4411,8 +4578,8 @@ def billing_usage(
     """Show current billing usage and subscription info.
     Example:
-        wafer config billing
-        wafer config billing --json
+        wafer billing
+        wafer billing --json
     """
     # Only show usage if no subcommand was invoked
     if ctx.invoked_subcommand is not None:
@@ -4440,9 +4607,9 @@ def billing_topup(
     Opens a Stripe checkout page to add credits. Default amount is $25.
     Example:
-        wafer config billing topup        # Add $25
-        wafer config billing topup 100    # Add $100
-        wafer config billing topup --no-browser  # Print URL instead
+        wafer billing topup        # Add $25
+        wafer billing topup 100    # Add $100
+        wafer billing topup --no-browser  # Print URL instead
     """
     import webbrowser
@@ -4488,8 +4655,8 @@ def billing_portal(
     Manage your subscription, update payment method, or view invoices.
     Example:
-        wafer config billing portal
-        wafer config billing portal --no-browser
+        wafer billing portal
+        wafer billing portal --no-browser
     """
     import webbrowser
@@ -4526,8 +4693,8 @@ def ssh_keys_list(
     """List all registered SSH public keys.
     Example:
-        wafer config ssh-keys list
-        wafer config ssh-keys list --json
+        wafer ssh-keys list
+        wafer ssh-keys list --json
     """
     from .ssh_keys import list_ssh_keys
@@ -4553,9 +4720,9 @@ def ssh_keys_add(
     id_ed25519.pub, id_rsa.pub, id_ecdsa.pub.
     Example:
-        wafer config ssh-keys add                              # Auto-detect
-        wafer config ssh-keys add ~/.ssh/id_rsa.pub            # Specific file
-        wafer config ssh-keys add ~/.ssh/id_ed25519.pub --name laptop
+        wafer ssh-keys add                              # Auto-detect
+        wafer ssh-keys add ~/.ssh/id_rsa.pub            # Specific file
+        wafer ssh-keys add ~/.ssh/id_ed25519.pub --name laptop
     """
     from .ssh_keys import add_ssh_key
@@ -4574,10 +4741,10 @@ def ssh_keys_remove(
 ) -> None:
     """Remove an SSH public key.
-    Get the key ID from 'wafer config ssh-keys list'.
+    Get the key ID from 'wafer ssh-keys list'.
     Example:
-        wafer config ssh-keys remove abc123-def456-...
+        wafer ssh-keys remove abc123-def456-...
     """
     from .ssh_keys import remove_ssh_key
@@ -5064,18 +5231,6 @@ def workspaces_pull(
         raise typer.Exit(1) from None
-# =============================================================================
-# Live resource commands (list/terminate/reconcile/provision)
-# =============================================================================
-targets_ops_app.command("list")(_targets_list_cmd)
-targets_ops_app.command("terminate")(_targets_terminate_cmd)
-targets_ops_app.command("reconcile")(_targets_reconcile_cmd)
-targets_ops_app.command("provision")(_targets_provision_cmd)
-targets_ops_app.command("pools")(_targets_pools_cmd)
-targets_ops_app.command("probe")(_targets_probe_cmd)
 # =============================================================================
 # Target operations commands (exec/ssh/sync)
 # =============================================================================
@@ -5834,9 +5989,9 @@ def ncu_analyze(
     compute/memory throughput, and optimization recommendations.
     By default, uses local NCU if available, otherwise runs analysis
-    remotely via wafer-api (requires authentication: wafer auth login).
+    remotely via wafer-api (requires authentication: wafer login).
-    Use --target for direct SSH mode.
+    Use --target for direct SSH mode (like wafer remote-run --direct).
     Use --include-source to fetch SASS assembly with register/instruction data.
     Examples:
@@ -5929,7 +6084,7 @@ def nsys_analyze(
     Returns timeline events, kernel information, memory usage, and diagnostics.
     By default, uses local nsys if available, otherwise runs analysis
-    remotely via wafer-api (requires authentication: wafer auth login).
+    remotely via wafer-api (requires authentication: wafer login).
     Supports multiple execution modes:
     - Local: Uses local nsys CLI (no GPU required for analysis)
@@ -6914,7 +7069,7 @@ def autotuner_results(
         raise typer.Exit(1) from None
-@app.command("capture", rich_help_panel="Kernel Development")
+@app.command("capture")
 def capture_command(  # noqa: PLR0915
     label: str = typer.Argument(
         ..., help="Label for this capture (e.g., 'baseline', 'optimized-v2')"
@@ -7594,29 +7749,18 @@ def compare_analyze(
         "-f",
         help="Output format: text, text-layers, csv, csv-layers, json",
     ),
-    output: Path | None = typer.Option(
-        None, "--output", "-o", help="Output file (default: stdout)"
-    ),
+    output: Path | None = typer.Option(None, "--output", "-o", help="Output file (default: stdout)"),
     phase: str = typer.Option(
         "all",
         "--phase",
         help="Filter by phase: all, prefill, decode",
     ),
     layers: bool = typer.Option(False, "--layers", help="Show layer-wise performance breakdown"),
-    all: bool = typer.Option(
-        False, "--all", help="Show all items (no truncation for layers, operations, kernels)"
-    ),
-    stack_traces: bool = typer.Option(
-        False, "--stack-traces", help="Show Python stack traces for operations"
-    ),
-    recommendations: bool = typer.Option(
-        False, "--recommendations", help="Generate prioritized recommendations for kernel team"
-    ),
-    json: bool = typer.Option(
-        False, "--json", hidden=True, help="Ignored (for compatibility with cliExecutor)"
-    ),
+    all: bool = typer.Option(False, "--all", help="Show all items (no truncation for layers, operations, kernels)"),
+    stack_traces: bool = typer.Option(False, "--stack-traces", help="Show Python stack traces for operations"),
+    json: bool = typer.Option(False, "--json", hidden=True, help="Ignored (for compatibility with cliExecutor)"),
 ) -> None:
-    """Compare GPU traces from AMD and NVIDIA platforms.
+    """Compare GPU traces from two platforms.
     Analyzes performance differences between traces, identifying which operations
     are faster/slower on each platform and providing kernel-level details.
@@ -7664,7 +7808,6 @@ def compare_analyze(
         show_layers=layers,
         show_all=all,
         show_stack_traces=stack_traces,
-        recommendations=recommendations,
     )
     _mark_command_success()
@@ -7679,17 +7822,13 @@ def compare_fusion_cmd(
         "-f",
         help="Output format: text, csv, json",
     ),
-    output: Path | None = typer.Option(
-        None, "--output", "-o", help="Output file (default: stdout)"
-    ),
+    output: Path | None = typer.Option(None, "--output", "-o", help="Output file (default: stdout)"),
     min_group_size: int = typer.Option(
         50,
         "--min-group-size",
         help="Minimum correlation group size to analyze",
     ),
-    json: bool = typer.Option(
-        False, "--json", hidden=True, help="Ignored (for compatibility with cliExecutor)"
-    ),
+    json: bool = typer.Option(False, "--json", hidden=True, help="Ignored (for compatibility with cliExecutor)"),
 ) -> None:
     """Analyze kernel fusion differences between AMD and NVIDIA traces.
@@ -7709,69 +7848,14 @@ def compare_fusion_cmd(
         # CSV output to file
         wafer compare fusion amd_trace.json nvidia_trace.json --format csv -o fusion.csv
     """
-    from .trace_compare import compare_align
-    compare_align(
-        trace1=trace1,
-        trace2=trace2,
-        output=output,
-        output_format=format,
-        phase="all",
-    )
-    _mark_command_success()
-@compare_app.command("align")
-def compare_align_cmd(
-    trace1: Path = typer.Argument(..., help="First trace file (AMD or NVIDIA)", exists=True),
-    trace2: Path = typer.Argument(..., help="Second trace file (AMD or NVIDIA)", exists=True),
-    format: str = typer.Option(
-        "json",
-        "--format",
-        "-f",
-        help="Output format: json",
-    ),
-    output: Path | None = typer.Option(
-        None, "--output", "-o", help="Output file (default: stdout)"
-    ),
-    phase: str = typer.Option(
-        "all",
-        "--phase",
-        help="Filter by phase: all, prefill, decode",
-    ),
-    layer: int | None = typer.Option(
-        None,
-        "--layer",
-        help="Focus on specific layer number",
-    ),
-) -> None:
-    """Align kernels at layer level for exact kernel-to-kernel comparison.
-    Provides kernel-to-kernel mapping across AMD and NVIDIA platforms,
-    showing which kernels correspond to each other at each layer position.
-    Examples:
-        # Basic alignment (stdout JSON)
-        wafer compare align amd_trace.json nvidia_trace.json
-        # Save to file
-        wafer compare align amd_trace.json nvidia_trace.json -o alignment.json
-        # Focus on decode phase only
-        wafer compare align amd_trace.json nvidia_trace.json --phase decode
-        # Focus on specific layer
-        wafer compare align amd_trace.json nvidia_trace.json --layer 5
-    """
-    from .trace_compare import compare_align
+    from .trace_compare import compare_fusion
-    compare_align(
+    compare_fusion(
         trace1=trace1,
         trace2=trace2,
         output=output,
-        output_format=format,
-        phase=phase,
-        layer=layer,
+        format_type=format,
+        min_group_size=min_group_size,
     )
     _mark_command_success()

wafer-cli 0.2.32__py3-none-any.whl → 0.2.33__py3-none-any.whl

wafer-cli 0.2.32__py3-none-any.whl → 0.2.33__py3-none-any.whl