wafer-cli 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/evaluate.py CHANGED
@@ -14,6 +14,7 @@ logger = logging.getLogger(__name__)
 from wafer_core.utils.kernel_utils.targets.config import (
     BaremetalTarget,
     DigitalOceanTarget,
+    LocalTarget,
     ModalTarget,
     RunPodTarget,
     VMTarget,
@@ -21,6 +22,30 @@ from wafer_core.utils.kernel_utils.targets.config import (
 )
 
 
+# Map AMD compute capability to ROCm architecture
+# Used to set PYTORCH_ROCM_ARCH for faster compilation (compile only for target arch)
+AMD_CC_TO_ARCH = {
+    "9.4": "gfx942",  # MI300X
+    "9.0a": "gfx90a",  # MI200 series
+    "9.08": "gfx908",  # MI100
+    "9.06": "gfx906",  # MI50/60
+    "10.30": "gfx1030",  # RDNA2
+    "11.0": "gfx1100",  # RDNA3
+}
+
+
+def _get_rocm_arch(compute_capability: str) -> str | None:
+    """Get ROCm architecture string from compute capability.
+
+    Returns gfx* string for PYTORCH_ROCM_ARCH, or None if not found.
+    """
+    # Already a gfx string
+    if compute_capability.startswith("gfx"):
+        return compute_capability
+    # Map from numeric CC
+    return AMD_CC_TO_ARCH.get(compute_capability)
+
+
 def _build_docker_run_command(
     image: str,
     command: str,
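Aside: a minimal sketch (not part of the package) of how the mapping above is meant to be consumed. The table here is abbreviated, and the environment handling mirrors the `env_dict` wiring later in this diff:

```python
# Sketch: pin PYTORCH_ROCM_ARCH so HIP extensions compile for one arch only.
# Abbreviated copy of the AMD_CC_TO_ARCH table from this diff.
import os

AMD_CC_TO_ARCH = {"9.4": "gfx942", "9.0a": "gfx90a"}

def _get_rocm_arch(compute_capability: str) -> str | None:
    if compute_capability.startswith("gfx"):  # already a gfx string
        return compute_capability
    return AMD_CC_TO_ARCH.get(compute_capability)

arch = _get_rocm_arch("9.4")
if arch:
    # Restrict the HIP build to the target architecture instead of all gfx targets
    os.environ["PYTORCH_ROCM_ARCH"] = arch  # "gfx942" on MI300X
```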
@@ -161,6 +186,7 @@ class KernelBenchEvaluateArgs:
     inputs: Path | None = None  # Custom inputs file to override get_inputs()
     seed: int = 42  # Random seed for reproducibility
     defensive: bool = False
+    backend: str | None = None  # Kernel backend for static validation
     sync_artifacts: bool = True
     gpu_id: int | None = None
 
@@ -396,33 +422,6 @@ async def run_evaluate_docker(
     print(f"Connecting to {target.ssh_target}...")
 
     async with AsyncSSHClient(target.ssh_target, target.ssh_key) as client:
-        # Upload wafer-core to remote
-        try:
-            wafer_root = _get_wafer_root()
-            wafer_core_path = wafer_root / "packages" / "wafer-core"
-            print(f"Uploading wafer-core from {wafer_core_path}...")
-
-            # Create workspace and upload
-            workspace_name = wafer_core_path.name
-            remote_workspace = f"{REMOTE_WORKSPACE_BASE}/{workspace_name}"
-            await client.exec(f"mkdir -p {remote_workspace}")
-            wafer_core_workspace = await client.expand_path(remote_workspace)
-
-            upload_result = await client.upload_files(
-                str(wafer_core_path), wafer_core_workspace, recursive=True
-            )
-            print(f"Uploaded {upload_result.files_copied} files")
-        except Exception as e:
-            return EvaluateResult(
-                success=False,
-                all_correct=False,
-                correctness_score=0.0,
-                geomean_speedup=0.0,
-                passed_tests=0,
-                total_tests=0,
-                error_message=f"Failed to upload wafer-core: {e}",
-            )
-
         print(f"Using Docker image: {target.docker_image}")
         print(f"Using GPU {gpu_id}...")
 
@@ -431,10 +430,13 @@ async def run_evaluate_docker(
         ref_code = args.reference.read_text()
         test_cases_data = json.loads(args.test_cases.read_text())
 
-        # Create a unique run directory
+        # Create workspace for evaluation files
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         run_dir = f"wafer_eval_{timestamp}"
-        run_path = f"{wafer_core_workspace}/{run_dir}"
+        eval_workspace = f"{REMOTE_WORKSPACE_BASE}/eval_{timestamp}"
+        await client.exec(f"mkdir -p {eval_workspace}")
+        eval_workspace_expanded = await client.expand_path(eval_workspace)
+        run_path = f"{eval_workspace_expanded}/{run_dir}"
 
         print("Uploading evaluation files...")
 
@@ -521,17 +523,16 @@ async def run_evaluate_docker(
         container_impl_path = f"{container_run_path}/implementation.py"
         container_ref_path = f"{container_run_path}/reference.py"
         container_test_cases_path = f"{container_run_path}/test_cases.json"
-        container_evaluate_script = (
-            f"{CONTAINER_WORKSPACE}/wafer_core/utils/kernel_utils/evaluate.py"
-        )
 
-        # Build pip install command for torch and other deps (no wafer-core install needed)
+        # Build pip install command for torch and other deps, plus wafer-core
         pip_install_cmd = _build_docker_pip_install_cmd(target)
+        install_cmd = (
+            f"{pip_install_cmd} && uv pip install --system --break-system-packages wafer-core"
+        )
 
-        # Build evaluate command - use PYTHONPATH instead of installing wafer-core
+        # Build evaluate command using installed wafer-core module
         python_cmd_parts = [
-            f"PYTHONPATH={CONTAINER_WORKSPACE}:$PYTHONPATH",
-            f"python3 {container_evaluate_script}",
+            "python3 -m wafer_core.utils.kernel_utils.evaluate",
             f"--implementation {container_impl_path}",
             f"--reference {container_ref_path}",
             f"--test-cases {container_test_cases_path}",
@@ -547,8 +548,8 @@ async def run_evaluate_docker(
 
         eval_cmd = " ".join(python_cmd_parts)
 
-        # Full command: install torch deps, then run evaluate with PYTHONPATH
-        full_cmd = f"{pip_install_cmd} && cd {container_run_path} && {eval_cmd}"
+        # Full command: install deps + wafer-core, then run evaluate
+        full_cmd = f"{install_cmd} && cd {container_run_path} && {eval_cmd}"
 
         # Build Docker run command
         # Add SYS_ADMIN capability when profiling (needed for NCU GPU performance counters)
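For orientation, the shell string this assembles looks roughly like the following; the values are illustrative placeholders, not the CLI's actual strings:

```python
# Sketch of the assembled container command (placeholder values).
pip_install_cmd = "uv pip install --system --break-system-packages torch"  # stand-in for _build_docker_pip_install_cmd(target)
install_cmd = f"{pip_install_cmd} && uv pip install --system --break-system-packages wafer-core"
container_run_path = "/workspace/wafer_eval_20250101_000000"  # hypothetical path
eval_cmd = (
    "python3 -m wafer_core.utils.kernel_utils.evaluate "
    f"--implementation {container_run_path}/implementation.py"
)
full_cmd = f"{install_cmd} && cd {container_run_path} && {eval_cmd}"
print(full_cmd)  # one shell invocation: install deps, cd, run the module
```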
@@ -558,7 +559,7 @@ async def run_evaluate_docker(
558
559
  working_dir=container_run_path,
559
560
  env={"CUDA_VISIBLE_DEVICES": str(gpu_id), "PYTHONUNBUFFERED": "1"},
560
561
  gpus="all",
561
- volumes={wafer_core_workspace: CONTAINER_WORKSPACE},
562
+ volumes={eval_workspace_expanded: CONTAINER_WORKSPACE},
562
563
  cap_add=["SYS_ADMIN"] if args.profile else None,
563
564
  )
564
565
 
@@ -567,7 +568,7 @@ async def run_evaluate_docker(
567
568
  # Run Docker command and stream output
568
569
  log_lines = []
569
570
  async for line in client.exec_stream(docker_cmd):
570
- print(line)
571
+ print(line, flush=True)
571
572
  log_lines.append(line)
572
573
 
573
574
  # Read results
@@ -665,6 +666,181 @@ async def run_evaluate_docker(
665
666
  )
666
667
 
667
668
 
669
+ async def run_evaluate_local(
670
+ args: EvaluateArgs,
671
+ target: LocalTarget,
672
+ ) -> EvaluateResult:
673
+ """Run evaluation locally on the current machine.
674
+
675
+ For LocalTarget - no SSH needed, runs directly.
676
+
677
+ Args:
678
+ args: Evaluate arguments
679
+ target: Local target config
680
+
681
+ Returns:
682
+ Evaluation result
683
+ """
684
+ import os
685
+ import subprocess
686
+ import tempfile
687
+ from datetime import datetime
688
+
689
+ # Select GPU
690
+ gpu_id = args.gpu_id if args.gpu_id is not None else target.gpu_ids[0]
691
+
692
+ print(f"Running local evaluation on GPU {gpu_id}...")
693
+
694
+ # Create temp directory for eval files
695
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
696
+ with tempfile.TemporaryDirectory(prefix=f"wafer_eval_{timestamp}_") as run_path:
697
+ run_path = Path(run_path)
698
+
699
+ # Write implementation
700
+ impl_path = run_path / "implementation.py"
701
+ impl_path.write_text(args.implementation.read_text())
702
+
703
+ # Write reference
704
+ ref_path = run_path / "reference.py"
705
+ ref_path.write_text(args.reference.read_text())
706
+
707
+ # Write custom inputs if provided
708
+ inputs_path = None
709
+ if args.inputs:
710
+ inputs_path = run_path / "custom_inputs.py"
711
+ inputs_path.write_text(args.inputs.read_text())
712
+
713
+ # Write eval script
714
+ eval_script_path = run_path / "kernelbench_eval.py"
715
+ eval_script_path.write_text(KERNELBENCH_EVAL_SCRIPT)
716
+
717
+ # Write defense module if defensive mode is enabled
718
+ defense_module_path = None
719
+ if args.defensive:
720
+ defense_src = (
721
+ Path(__file__).parent.parent.parent.parent
722
+ / "packages"
723
+ / "wafer-core"
724
+ / "wafer_core"
725
+ / "utils"
726
+ / "kernel_utils"
727
+ / "defense.py"
728
+ )
729
+ if defense_src.exists():
730
+ defense_module_path = run_path / "defense.py"
731
+ defense_module_path.write_text(defense_src.read_text())
732
+ else:
733
+ print(f"Warning: defense.py not found at {defense_src}")
734
+
735
+ # Output file
736
+ output_path = run_path / "results.json"
737
+
738
+ # Build eval command
739
+ cmd_parts = [
740
+ "python3",
741
+ str(eval_script_path),
742
+ "--impl",
743
+ str(impl_path),
744
+ "--reference",
745
+ str(ref_path),
746
+ "--output",
747
+ str(output_path),
748
+ "--seed",
749
+ str(args.seed),
750
+ ]
751
+
752
+ if args.benchmark:
753
+ cmd_parts.append("--benchmark")
754
+ if args.profile:
755
+ cmd_parts.append("--profile")
756
+ if inputs_path:
757
+ cmd_parts.extend(["--inputs", str(inputs_path)])
758
+ if args.defensive and defense_module_path:
759
+ cmd_parts.extend(["--defensive", "--defense-module", str(defense_module_path)])
760
+
761
+ # Set environment for GPU selection
762
+ env = os.environ.copy()
763
+ if target.vendor == "nvidia":
764
+ env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
765
+ else: # AMD
766
+ env["HIP_VISIBLE_DEVICES"] = str(gpu_id)
767
+ env["ROCM_PATH"] = "/opt/rocm"
768
+
769
+ print(f"Running: {' '.join(cmd_parts[:4])} ...")
770
+
771
+ # Run evaluation
772
+ try:
773
+ result = subprocess.run(
774
+ cmd_parts,
775
+ cwd=str(run_path),
776
+ env=env,
777
+ capture_output=True,
778
+ text=True,
779
+ timeout=args.timeout or 600,
780
+ )
781
+ except subprocess.TimeoutExpired:
782
+ return EvaluateResult(
783
+ success=False,
784
+ all_correct=False,
785
+ correctness_score=0.0,
786
+ geomean_speedup=0.0,
787
+ passed_tests=0,
788
+ total_tests=0,
789
+ error_message="Evaluation timed out",
790
+ )
791
+
792
+ if result.returncode != 0:
793
+ error_msg = result.stderr or result.stdout or "Unknown error"
794
+ # Truncate long errors
795
+ if len(error_msg) > 1000:
796
+ error_msg = error_msg[:500] + "\n...\n" + error_msg[-500:]
797
+ return EvaluateResult(
798
+ success=False,
799
+ all_correct=False,
800
+ correctness_score=0.0,
801
+ geomean_speedup=0.0,
802
+ passed_tests=0,
803
+ total_tests=0,
804
+ error_message=f"Evaluation failed:\n{error_msg}",
805
+ )
806
+
807
+ # Parse results
808
+ if not output_path.exists():
809
+ return EvaluateResult(
810
+ success=False,
811
+ all_correct=False,
812
+ correctness_score=0.0,
813
+ geomean_speedup=0.0,
814
+ passed_tests=0,
815
+ total_tests=0,
816
+ error_message="No results.json produced",
817
+ )
818
+
819
+ try:
820
+ results = json.loads(output_path.read_text())
821
+ except json.JSONDecodeError as e:
822
+ return EvaluateResult(
823
+ success=False,
824
+ all_correct=False,
825
+ correctness_score=0.0,
826
+ geomean_speedup=0.0,
827
+ passed_tests=0,
828
+ total_tests=0,
829
+ error_message=f"Failed to parse results: {e}",
830
+ )
831
+
832
+ # Extract results
833
+ return EvaluateResult(
834
+ success=True,
835
+ all_correct=results.get("all_correct", False),
836
+ correctness_score=results.get("correctness_score", 0.0),
837
+ geomean_speedup=results.get("geomean_speedup", 0.0),
838
+ passed_tests=results.get("passed_tests", 0),
839
+ total_tests=results.get("total_tests", 0),
840
+ benchmark_results=results.get("benchmark", {}),
841
+ )
842
+
843
+
668
844
  async def run_evaluate_ssh(
669
845
  args: EvaluateArgs,
670
846
  target: BaremetalTarget | VMTarget,
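A hypothetical driver for the new local path; the field names come from this diff, but the exact `EvaluateArgs`/`LocalTarget` constructors are assumptions:

```python
# Hypothetical usage sketch for run_evaluate_local (constructor details assumed).
import asyncio
from pathlib import Path

async def demo() -> None:
    args = EvaluateArgs(  # assumed constructor; fields per this diff
        implementation=Path("impl.py"),
        reference=Path("ref.py"),
        test_cases=Path("tests.json"),
    )
    target = LocalTarget(vendor="nvidia", gpu_ids=[0])  # assumed constructor
    result = await run_evaluate_local(args, target)
    print(result.all_correct, result.geomean_speedup)

# asyncio.run(demo())
```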
@@ -982,6 +1158,7 @@ def _build_modal_sandbox_script(
     test_cases_b64: str,
     run_benchmarks: bool,
     run_defensive: bool,
+    defense_code_b64: str | None = None,
 ) -> str:
     """Build Python script to create sandbox and run evaluation.
 
@@ -1062,6 +1239,20 @@ print('Files written')
         print(json.dumps({{"error": f"Failed to write files: {{proc.stderr.read()}}"}}))
         return
 
+    # Write defense module if defensive mode is enabled
+    # NOTE: Check for actual base64 content, not just truthy string (None becomes "None")
+    if {run_defensive} and "{defense_code_b64}" and "{defense_code_b64}" != "None":
+        proc = sandbox.exec("python", "-c", f"""
+import base64
+with open('/workspace/defense.py', 'w') as f:
+    f.write(base64.b64decode('{defense_code_b64}').decode())
+print('Defense module written')
+""")
+        proc.wait()
+        if proc.returncode != 0:
+            print(json.dumps({{"error": f"Failed to write defense module: {{proc.stderr.read()}}"}}))
+            return
+
     # Build inline evaluation script
     eval_script = """
 import json
@@ -1089,6 +1280,26 @@ generate_input = load_fn('reference.py', 'generate_input')
 
 import torch
 
+# Load defense module if available and defensive mode is enabled
+run_defensive = {run_defensive}
+defense = None
+if run_defensive:
+    try:
+        defense = load_fn('defense.py', 'run_all_defenses')
+        time_with_defenses = load_fn('defense.py', 'time_execution_with_defenses')
+        print('[Defense] Defense module loaded')
+
+        # Wrap kernels for defense API compatibility
+        # Defense API calls kernel(*args), but functional format expects kernel(inputs_tuple)
+        # These wrappers repack the unpacked args back into a tuple
+        def _wrap_for_defense(kernel):
+            return lambda *args: kernel(args)
+        custom_kernel_for_defense = _wrap_for_defense(custom_kernel)
+        ref_kernel_for_defense = _wrap_for_defense(ref_kernel)
+    except Exception as e:
+        print(f'[Defense] Warning: Could not load defense module: {{e}}')
+        defense = None
+
 results = []
 all_correct = True
 total_time_ms = 0.0
@@ -1116,36 +1327,63 @@ for tc in test_cases:
     impl_time_ms = 0.0
     ref_time_ms = 0.0
     if {run_benchmarks}:
-        # Warmup
-        for _ in range(3):
-            custom_kernel(inputs)
-        torch.cuda.synchronize()
-
-        # Measure with defensive timing if requested
-        # Defensive: sync before recording end event to catch stream injection
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-        start.record()
-        for _ in range(10):
-            custom_kernel(inputs)
-        if {run_defensive}:
-            torch.cuda.synchronize()  # DEFENSE: sync all streams before end
-        end.record()
-        torch.cuda.synchronize()
-        impl_time_ms = start.elapsed_time(end) / 10
-
-        # Reference timing (same defensive approach)
-        for _ in range(3):
-            ref_kernel(inputs)
-        torch.cuda.synchronize()
-        start.record()
-        for _ in range(10):
-            ref_kernel(inputs)
-        if {run_defensive}:
-            torch.cuda.synchronize()  # DEFENSE: sync all streams before end
-        end.record()
-        torch.cuda.synchronize()
-        ref_time_ms = start.elapsed_time(end) / 10
+        if run_defensive and defense is not None:
+            # Use full defense suite with wrapped kernels
+            # inputs_list unpacks the tuple so defense can infer dtype/device from tensors
+            inputs_list = list(inputs) if hasattr(inputs, '__iter__') and not isinstance(inputs, torch.Tensor) else [inputs]
+
+            # Run defense checks
+            all_passed, defense_results, _ = defense(custom_kernel_for_defense, *inputs_list)
+            if not all_passed:
+                failed = [name for name, passed, _ in defense_results if not passed]
+                raise ValueError(f"Defense checks failed: {{failed}}")
+
+            # Time with defensive timing (using wrapped kernels)
+            impl_times, _ = time_with_defenses(
+                custom_kernel_for_defense,
+                inputs_list,
+                num_warmup=3,
+                num_trials=10,
+                verbose=False,
+                run_defenses=False,
+            )
+            impl_time_ms = sum(impl_times) / len(impl_times)
+
+            ref_times, _ = time_with_defenses(
+                ref_kernel_for_defense,
+                inputs_list,
+                num_warmup=3,
+                num_trials=10,
+                verbose=False,
+                run_defenses=False,
+            )
+            ref_time_ms = sum(ref_times) / len(ref_times)
+        else:
+            # Standard timing without full defenses
+            # Warmup
+            for _ in range(3):
+                custom_kernel(inputs)
+            torch.cuda.synchronize()
+
+            start = torch.cuda.Event(enable_timing=True)
+            end = torch.cuda.Event(enable_timing=True)
+            start.record()
+            for _ in range(10):
+                custom_kernel(inputs)
+            end.record()
+            torch.cuda.synchronize()
+            impl_time_ms = start.elapsed_time(end) / 10
+
+            # Reference timing
+            for _ in range(3):
+                ref_kernel(inputs)
+            torch.cuda.synchronize()
+            start.record()
+            for _ in range(10):
+                ref_kernel(inputs)
+            end.record()
+            torch.cuda.synchronize()
+            ref_time_ms = start.elapsed_time(end) / 10
 
     total_time_ms += impl_time_ms
     ref_total_time_ms += ref_time_ms
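The timing split above replaces the older inline defensive flag. The idea the removed lines encoded, synchronizing all streams before recording the end event so work queued on a side stream still counts, can be sketched on its own (needs a CUDA/ROCm device):

```python
# Sketch of event-based timing with the optional pre-end synchronize defense.
import torch

def time_kernel(kernel, inputs, trials=10, defensive=False):
    for _ in range(3):  # warmup
        kernel(inputs)
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(trials):
        kernel(inputs)
    if defensive:
        torch.cuda.synchronize()  # flush every stream before stopping the clock
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / trials  # ms per call
```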
@@ -1197,7 +1435,7 @@ print(json.dumps({{
     # Find the last JSON line in output
     for line in reversed(stdout.strip().split("\\n")):
         if line.startswith("{{"):
-            print(line)
+            print(line, flush=True)
             return
 
     print(json.dumps({{"error": f"No result JSON in output: {{stdout[:500]}}"}}))
@@ -1238,6 +1476,23 @@ async def run_evaluate_modal(
     ref_code_b64 = base64.b64encode(args.reference.read_bytes()).decode()
     test_cases_b64 = base64.b64encode(args.test_cases.read_bytes()).decode()
 
+    # Encode defense module if defensive mode is enabled
+    defense_code_b64 = None
+    if args.defensive:
+        defense_path = (
+            Path(__file__).parent.parent.parent.parent
+            / "packages"
+            / "wafer-core"
+            / "wafer_core"
+            / "utils"
+            / "kernel_utils"
+            / "defense.py"
+        )
+        if defense_path.exists():
+            defense_code_b64 = base64.b64encode(defense_path.read_bytes()).decode()
+        else:
+            print(f"Warning: defense.py not found at {defense_path}, falling back to basic defense")
+
     # Build the script that creates sandbox and runs eval
     script = _build_modal_sandbox_script(
         target=target,
@@ -1246,6 +1501,7 @@ async def run_evaluate_modal(
         test_cases_b64=test_cases_b64,
         run_benchmarks=args.benchmark,
         run_defensive=args.defensive,
+        defense_code_b64=defense_code_b64,
     )
 
     def _run_subprocess() -> tuple[str, str, int]:
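Why base64 here: the defense source is spliced into a generated script as a literal, so it must survive quoting and brace formatting; base64 keeps it inert. The round trip is just:

```python
# Round trip illustrating the encode-embed-decode flow above.
import base64

defense_src = 'print("defense loaded")\n'
encoded = base64.b64encode(defense_src.encode()).decode()  # safe inside any quoted literal
assert base64.b64decode(encoded).decode() == defense_src
```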
@@ -1343,6 +1599,7 @@ def _build_workspace_eval_script(
     test_cases_json: str,
     run_benchmarks: bool,
     run_defensive: bool = False,
+    defense_code: str | None = None,
 ) -> str:
     """Build inline evaluation script for workspace exec.
 
@@ -1353,6 +1610,7 @@ def _build_workspace_eval_script(
     impl_b64 = base64.b64encode(impl_code.encode()).decode()
     ref_b64 = base64.b64encode(ref_code.encode()).decode()
     tests_b64 = base64.b64encode(test_cases_json.encode()).decode()
+    defense_b64 = base64.b64encode(defense_code.encode()).decode() if defense_code else ""
 
     return f'''
 import base64
@@ -1372,6 +1630,15 @@ with open("/tmp/kernel.py", "w") as f:
 with open("/tmp/reference.py", "w") as f:
     f.write(ref_code)
 
+# Write defense module if available
+run_defensive = {run_defensive}
+defense_b64 = "{defense_b64}"
+# NOTE: Check defense_b64 is not empty and not the string "None" (from None formatting)
+if run_defensive and defense_b64 and defense_b64 != "None":
+    defense_code = base64.b64decode(defense_b64).decode()
+    with open("/tmp/defense.py", "w") as f:
+        f.write(defense_code)
+
 # Load kernels
 def load_fn(path, name):
     spec = importlib.util.spec_from_file_location("mod", path)
@@ -1385,6 +1652,24 @@ generate_input = load_fn("/tmp/reference.py", "generate_input")
 
 import torch
 
+# Load defense module if available
+defense = None
+if run_defensive and defense_b64 and defense_b64 != "None":
+    try:
+        defense = load_fn("/tmp/defense.py", "run_all_defenses")
+        time_with_defenses = load_fn("/tmp/defense.py", "time_execution_with_defenses")
+        print("[Defense] Defense module loaded")
+
+        # Wrap kernels for defense API compatibility
+        # Defense API calls kernel(*args), but functional format expects kernel(inputs_tuple)
+        def _wrap_for_defense(kernel):
+            return lambda *args: kernel(args)
+        custom_kernel_for_defense = _wrap_for_defense(custom_kernel)
+        ref_kernel_for_defense = _wrap_for_defense(ref_kernel)
+    except Exception as e:
+        print(f"[Defense] Warning: Could not load defense module: {{e}}")
+        defense = None
+
 results = []
 all_correct = True
 total_time_ms = 0.0
@@ -1412,36 +1697,60 @@ for tc in test_cases:
     impl_time_ms = 0.0
     ref_time_ms = 0.0
     if {run_benchmarks}:
-        # Warmup
-        for _ in range(3):
-            custom_kernel(inputs)
-        torch.cuda.synchronize()
-
-        # Measure with defensive timing if requested
-        # Defensive: sync before recording end event to catch stream injection
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-        start.record()
-        for _ in range(10):
-            custom_kernel(inputs)
-        if {run_defensive}:
-            torch.cuda.synchronize()  # DEFENSE: sync all streams before end
-        end.record()
-        torch.cuda.synchronize()
-        impl_time_ms = start.elapsed_time(end) / 10
-
-        # Reference timing (same defensive approach)
-        for _ in range(3):
-            ref_kernel(inputs)
-        torch.cuda.synchronize()
-        start.record()
-        for _ in range(10):
-            ref_kernel(inputs)
-        if {run_defensive}:
-            torch.cuda.synchronize()  # DEFENSE: sync all streams before end
-        end.record()
-        torch.cuda.synchronize()
-        ref_time_ms = start.elapsed_time(end) / 10
+        if run_defensive and defense is not None:
+            # Use full defense suite with wrapped kernels
+            inputs_list = list(inputs) if hasattr(inputs, '__iter__') and not isinstance(inputs, torch.Tensor) else [inputs]
+
+            # Run defense checks
+            all_passed, defense_results, _ = defense(custom_kernel_for_defense, *inputs_list)
+            if not all_passed:
+                failed = [name for name, passed, _ in defense_results if not passed]
+                raise ValueError(f"Defense checks failed: {{failed}}")
+
+            # Time with defensive timing (using wrapped kernels)
+            impl_times, _ = time_with_defenses(
+                custom_kernel_for_defense,
+                inputs_list,
+                num_warmup=3,
+                num_trials=10,
+                verbose=False,
+                run_defenses=False,
+            )
+            impl_time_ms = sum(impl_times) / len(impl_times)
+
+            ref_times, _ = time_with_defenses(
+                ref_kernel_for_defense,
+                inputs_list,
+                num_warmup=3,
+                num_trials=10,
+                verbose=False,
+                run_defenses=False,
+            )
+            ref_time_ms = sum(ref_times) / len(ref_times)
+        else:
+            # Standard timing
+            for _ in range(3):
+                custom_kernel(inputs)
+            torch.cuda.synchronize()
+
+            start = torch.cuda.Event(enable_timing=True)
+            end = torch.cuda.Event(enable_timing=True)
+            start.record()
+            for _ in range(10):
+                custom_kernel(inputs)
+            end.record()
+            torch.cuda.synchronize()
+            impl_time_ms = start.elapsed_time(end) / 10
+
+            for _ in range(3):
+                ref_kernel(inputs)
+            torch.cuda.synchronize()
+            start.record()
+            for _ in range(10):
+                ref_kernel(inputs)
+            end.record()
+            torch.cuda.synchronize()
+            ref_time_ms = start.elapsed_time(end) / 10
 
     total_time_ms += impl_time_ms
     ref_total_time_ms += ref_time_ms
@@ -1503,6 +1812,23 @@ async def run_evaluate_workspace(
     ref_code = args.reference.read_text()
     test_cases_json = args.test_cases.read_text()
 
+    # Read defense module if defensive mode is enabled
+    defense_code = None
+    if args.defensive:
+        defense_path = (
+            Path(__file__).parent.parent.parent.parent
+            / "packages"
+            / "wafer-core"
+            / "wafer_core"
+            / "utils"
+            / "kernel_utils"
+            / "defense.py"
+        )
+        if defense_path.exists():
+            defense_code = defense_path.read_text()
+        else:
+            print(f"Warning: defense.py not found at {defense_path}, falling back to basic defense")
+
     # Build inline eval script
     eval_script = _build_workspace_eval_script(
         impl_code=impl_code,
@@ -1510,6 +1836,7 @@ async def run_evaluate_workspace(
         test_cases_json=test_cases_json,
         run_benchmarks=args.benchmark,
        run_defensive=args.defensive,
+        defense_code=defense_code,
     )
 
     # Execute via workspace exec
@@ -1855,15 +2182,12 @@ async def run_evaluate_runpod(
     # Add venv bin to PATH so ninja (from pip) is found by torch.utils.cpp_extension
     venv_bin = env_state.venv_bin
     env_vars = f"PATH={venv_bin}:$PATH HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm"
-    pythonpath = f"PYTHONPATH={wafer_core_workspace}"
-    evaluate_script = (
-        f"{wafer_core_workspace}/wafer_core/utils/kernel_utils/evaluate.py"
-    )
 
     # Run from run_path so reference_kernel.py is importable
+    # Use installed wafer-core module
     eval_cmd = (
         f"cd {run_path} && "
-        f"{env_vars} {pythonpath} {python_exe} {evaluate_script} "
+        f"{env_vars} {python_exe} -m wafer_core.utils.kernel_utils.evaluate "
         f"--implementation {impl_path} "
         f"--reference {ref_path} "
         f"--test-cases {test_cases_path} "
@@ -2219,15 +2543,12 @@ async def run_evaluate_digitalocean(
     env_vars = (
         f"PATH={venv_bin}:$PATH HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm"
     )
-    pythonpath = f"PYTHONPATH={wafer_core_workspace}"
-    evaluate_script = (
-        f"{wafer_core_workspace}/wafer_core/utils/kernel_utils/evaluate.py"
-    )
 
     # Run from run_path so reference_kernel.py is importable
+    # Use installed wafer-core module
    eval_cmd = (
         f"cd {run_path} && "
-        f"{env_vars} {pythonpath} {python_exe} {evaluate_script} "
+        f"{env_vars} {python_exe} -m wafer_core.utils.kernel_utils.evaluate "
         f"--implementation {impl_path} "
         f"--reference {ref_path} "
         f"--test-cases {test_cases_path} "
@@ -2407,7 +2728,9 @@ async def run_evaluate(args: EvaluateArgs) -> EvaluateResult:
     print(f"Using target: {target_name}")
 
     # Dispatch to appropriate executor
-    if isinstance(target, BaremetalTarget | VMTarget):
+    if isinstance(target, LocalTarget):
+        return await run_evaluate_local(args, target)
+    elif isinstance(target, BaremetalTarget | VMTarget):
         return await run_evaluate_ssh(args, target)
     elif isinstance(target, ModalTarget):
         return await run_evaluate_modal(args, target)
@@ -2436,6 +2759,7 @@ async def run_evaluate(args: EvaluateArgs) -> EvaluateResult:
 # Inline evaluation script for KernelBench format
 # This runs inside the Docker container on the remote GPU
 KERNELBENCH_EVAL_SCRIPT = """
+import gc
 import json
 import os
 import sys
@@ -2444,6 +2768,68 @@ import torch
 import torch.nn as nn
 from pathlib import Path
 
+# Use a unique per-run PyTorch extension cache directory to ensure fresh compilation.
+# This prevents stale cached extensions from being loaded when the pod is reused.
+# Without this, if a kernel is modified but uses the same extension name,
+# PyTorch would load the old cached .so instead of recompiling.
+# We use a UUID-based directory instead of clearing the cache to avoid race conditions
+# with other processes that might be using the cache.
+import uuid
+unique_cache_dir = f"/tmp/torch_extensions_{uuid.uuid4().hex[:8]}"
+os.environ["TORCH_EXTENSIONS_DIR"] = unique_cache_dir
+print(f"[KernelBench] Using unique extension cache: {unique_cache_dir}")
+
+# Clear any stale GPU memory from previous runs at startup
+# NOTE: empty_cache only frees memory from THIS process's PyTorch allocator.
+# It won't free memory from dead/zombie processes - rocm-smi --showpids can show
+# PIDs that no longer exist but still hold GPU memory. Those require a GPU reset
+# (rocm-smi --gpureset) to fully clear. TODO: detect and warn about orphaned memory.
+if torch.cuda.is_available():
+    gc.collect()
+    torch.cuda.empty_cache()
+    torch.cuda.reset_peak_memory_stats()
+
+
+def _calculate_timing_stats(times: list[float]) -> dict:
+    '''Calculate median and IQR from timing samples.
+
+    Returns dict with median, iqr_low (25th percentile), iqr_high (75th percentile),
+    mean, min, max, and std.
+    '''
+    import statistics
+
+    if not times:
+        return {"median": 0, "iqr_low": 0, "iqr_high": 0, "mean": 0, "min": 0, "max": 0, "std": 0}
+
+    sorted_times = sorted(times)
+    n = len(sorted_times)
+
+    # Median
+    median = statistics.median(sorted_times)
+
+    # Quartiles (25th and 75th percentile)
+    # For small samples, use simple interpolation
+    q1_idx = (n - 1) * 0.25
+    q3_idx = (n - 1) * 0.75
+
+    q1_low = int(q1_idx)
+    q1_frac = q1_idx - q1_low
+    iqr_low = sorted_times[q1_low] * (1 - q1_frac) + sorted_times[min(q1_low + 1, n - 1)] * q1_frac
+
+    q3_low = int(q3_idx)
+    q3_frac = q3_idx - q3_low
+    iqr_high = sorted_times[q3_low] * (1 - q3_frac) + sorted_times[min(q3_low + 1, n - 1)] * q3_frac
+
+    return {
+        "median": median,
+        "iqr_low": iqr_low,
+        "iqr_high": iqr_high,
+        "mean": statistics.mean(sorted_times),
+        "min": min(sorted_times),
+        "max": max(sorted_times),
+        "std": statistics.stdev(sorted_times) if n > 1 else 0,
+    }
+
 
 def run_profiling(model, inputs, name, output_dir):
     '''Run torch.profiler and return summary stats.'''
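A quick sanity check of the interpolation in `_calculate_timing_stats`: for five samples, `q1_idx = 1.0` and `q3_idx = 3.0`, so both quartiles land exactly on sorted elements with no fractional blending:

```python
# Verifies the (n - 1) * q linear-interpolation quartiles used above.
times = [5.0, 1.0, 4.0, 2.0, 3.0]
s = sorted(times)
n = len(s)
for q, expected in ((0.25, 2.0), (0.75, 4.0)):
    idx = (n - 1) * q
    lo, frac = int(idx), idx - int(idx)
    value = s[lo] * (1 - frac) + s[min(lo + 1, n - 1)] * frac
    assert value == expected
```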
@@ -2674,12 +3060,26 @@ def main():
     parser.add_argument("--inputs", help="Custom inputs file to override get_inputs()/get_init_inputs()")
     parser.add_argument("--benchmark", action="store_true")
     parser.add_argument("--profile", action="store_true")
+    parser.add_argument("--defensive", action="store_true", help="Run full defense checks against reward hacking")
+    parser.add_argument("--defense-module", help="Path to defense.py module")
     parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
     parser.add_argument("--num-correct-trials", type=int, default=3)
     parser.add_argument("--num-perf-trials", type=int, default=10)
     parser.add_argument("--output", required=True)
     args = parser.parse_args()
 
+    # Load defense module if defensive mode is enabled
+    defense_module = None
+    if args.defensive and args.defense_module:
+        try:
+            import importlib.util
+            defense_spec = importlib.util.spec_from_file_location("defense", args.defense_module)
+            defense_module = importlib.util.module_from_spec(defense_spec)
+            defense_spec.loader.exec_module(defense_module)
+            print("[KernelBench] Defense module loaded")
+        except Exception as e:
+            print(f"[KernelBench] Warning: Could not load defense module: {e}")
+
     # Create output directory for profiles
     output_dir = Path(args.output).parent
     profile_dir = output_dir / "profiles"
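Putting the new flags together, an invocation of the inline eval script would look like this (paths are illustrative):

```python
# Example invocation of kernelbench_eval.py with the new defense flags.
import subprocess

subprocess.run(
    [
        "python3", "kernelbench_eval.py",
        "--impl", "implementation.py",
        "--reference", "reference.py",
        "--output", "results.json",
        "--defensive",                     # enable the full defense suite
        "--defense-module", "defense.py",  # written alongside the script
        "--seed", "42",
    ],
    check=True,
)
```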
@@ -2813,47 +3213,102 @@ def main():
         inputs = get_inputs()
         inputs = [x.cuda() if isinstance(x, torch.Tensor) else x for x in inputs]
 
-        # Warmup
-        for _ in range(5):
-            with torch.no_grad():
-                _ = new_model(*inputs)
-        torch.cuda.synchronize()
-
-        # Benchmark new model
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-
-        times = []
-        for _ in range(args.num_perf_trials):
-            start.record()
-            with torch.no_grad():
-                _ = new_model(*inputs)
-            end.record()
-            torch.cuda.synchronize()
-            times.append(start.elapsed_time(end))
-
-        new_time = sum(times) / len(times)
-        results["runtime_ms"] = new_time
-
-        # Benchmark reference model
-        for _ in range(5):
-            with torch.no_grad():
-                _ = ref_model(*inputs)
-        torch.cuda.synchronize()
-
-        times = []
-        for _ in range(args.num_perf_trials):
-            start.record()
-            with torch.no_grad():
-                _ = ref_model(*inputs)
-            end.record()
-            torch.cuda.synchronize()
-            times.append(start.elapsed_time(end))
-
-        ref_time = sum(times) / len(times)
-        results["reference_runtime_ms"] = ref_time
-        results["speedup"] = ref_time / new_time if new_time > 0 else 0
-        print(f"[KernelBench] New: {new_time:.3f}ms, Ref: {ref_time:.3f}ms, Speedup: {results['speedup']:.2f}x")
+        if args.defensive and defense_module is not None:
+            # Use full defense suite
+            print("[KernelBench] Running defense checks on implementation...")
+            run_all_defenses = defense_module.run_all_defenses
+            time_with_defenses = defense_module.time_execution_with_defenses
+
+            # Run defense checks on implementation
+            all_passed, defense_results, _ = run_all_defenses(
+                lambda *x: new_model(*x),
+                *inputs,
+            )
+            results["defense_results"] = {
+                name: {"passed": passed, "message": msg}
+                for name, passed, msg in defense_results
+            }
+            if not all_passed:
+                failed = [name for name, passed, _ in defense_results if not passed]
+                results["error"] = f"Defense checks failed: {failed}"
+                print(f"[KernelBench] Defense checks FAILED: {failed}")
+                for name, passed, msg in defense_results:
+                    status = "PASS" if passed else "FAIL"
+                    print(f"  [{status}] {name}: {msg}")
+            else:
+                print("[KernelBench] All defense checks passed")
+
+            # Time with defensive timing
+            impl_times, _ = time_with_defenses(
+                lambda: new_model(*inputs),
+                [],
+                num_warmup=5,
+                num_trials=args.num_perf_trials,
+                verbose=False,
+                run_defenses=False,  # Already ran above
+            )
+            # Calculate stats for new model
+            new_stats = _calculate_timing_stats(impl_times)
+            results["runtime_ms"] = new_stats["median"]
+            results["runtime_stats"] = new_stats
+
+            # Reference timing
+            ref_times, _ = time_with_defenses(
+                lambda: ref_model(*inputs),
+                [],
+                num_warmup=5,
+                num_trials=args.num_perf_trials,
+                verbose=False,
+                run_defenses=False,
+            )
+            ref_stats = _calculate_timing_stats(ref_times)
+            results["reference_runtime_ms"] = ref_stats["median"]
+            results["reference_runtime_stats"] = ref_stats
+            results["speedup"] = ref_stats["median"] / new_stats["median"] if new_stats["median"] > 0 else 0
+            print(f"[KernelBench] New: {new_stats['median']:.3f}ms (IQR: {new_stats['iqr_low']:.3f}-{new_stats['iqr_high']:.3f}), Ref: {ref_stats['median']:.3f}ms (IQR: {ref_stats['iqr_low']:.3f}-{ref_stats['iqr_high']:.3f}), Speedup: {results['speedup']:.2f}x")
+        else:
+            # Standard timing without full defenses
+            # Warmup BOTH models before benchmarking either
+            # This ensures consistent GPU state and avoids MIOpen cache effects
+            # that cause variance when warming up models sequentially
+            for _ in range(5):
+                with torch.no_grad():
+                    _ = new_model(*inputs)
+                    _ = ref_model(*inputs)
+            torch.cuda.synchronize()
+
+            # Benchmark new model
+            start = torch.cuda.Event(enable_timing=True)
+            end = torch.cuda.Event(enable_timing=True)
+
+            new_times = []
+            for _ in range(args.num_perf_trials):
+                start.record()
+                with torch.no_grad():
+                    _ = new_model(*inputs)
+                end.record()
+                torch.cuda.synchronize()
+                new_times.append(start.elapsed_time(end))
+
+            new_stats = _calculate_timing_stats(new_times)
+            results["runtime_ms"] = new_stats["median"]
+            results["runtime_stats"] = new_stats
+
+            # Benchmark reference model
+            ref_times = []
+            for _ in range(args.num_perf_trials):
+                start.record()
+                with torch.no_grad():
+                    _ = ref_model(*inputs)
+                end.record()
+                torch.cuda.synchronize()
+                ref_times.append(start.elapsed_time(end))
+
+            ref_stats = _calculate_timing_stats(ref_times)
+            results["reference_runtime_ms"] = ref_stats["median"]
+            results["reference_runtime_stats"] = ref_stats
+            results["speedup"] = ref_stats["median"] / new_stats["median"] if new_stats["median"] > 0 else 0
+            print(f"[KernelBench] New: {new_stats['median']:.3f}ms (IQR: {new_stats['iqr_low']:.3f}-{new_stats['iqr_high']:.3f}), Ref: {ref_stats['median']:.3f}ms (IQR: {ref_stats['iqr_low']:.3f}-{ref_stats['iqr_high']:.3f}), Speedup: {results['speedup']:.2f}x")
 
         # Run profiling if requested and correctness passed
         if args.profile and all_correct:
@@ -2898,6 +3353,16 @@ def main():
         json.dump(results, f, indent=2)
     print(f"[KernelBench] Results written to {args.output}")
 
+    # Cleanup GPU memory
+    try:
+        del ref_model, new_model
+    except NameError:
+        pass
+    import gc
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
 if __name__ == "__main__":
     main()
 """
@@ -2947,6 +3412,27 @@ def _validate_kernelbench_files(args: KernelBenchEvaluateArgs) -> str | None:
             " KernelBench format requires: 'class Model', 'get_inputs()', 'get_init_inputs()'"
         )
 
+    # Static kernel validation if backend specified
+    if args.backend:
+        from wafer_core.utils.kernel_utils.static_checker import validate_kernel_static
+
+        code = args.implementation.read_text()
+        valid, errors, warnings = validate_kernel_static(code, backend=args.backend)
+
+        # Print warnings (don't fail)
+        for warning in warnings:
+            logger.warning(f"Static check warning: {warning}")
+
+        # Fail on errors
+        if not valid:
+            error_list = "\n  - ".join(errors)
+            return (
+                f"Static kernel validation failed for backend '{args.backend}':\n"
+                f"  - {error_list}\n\n"
+                f"The implementation must use {args.backend.upper()} kernel primitives.\n"
+                "See KernelBench documentation for valid kernel patterns."
+            )
+
     return None
 
 
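A sketch of calling the static gate directly; the `(valid, errors, warnings)` contract is taken from this diff, while the backend value and file name are illustrative:

```python
# Illustrative standalone use of the static checker imported above.
from pathlib import Path

from wafer_core.utils.kernel_utils.static_checker import validate_kernel_static

code = Path("implementation.py").read_text()  # hypothetical kernel file
valid, errors, warnings = validate_kernel_static(code, backend="triton")  # backend value assumed
for w in warnings:
    print(f"warning: {w}")  # warnings are reported but do not fail validation
if not valid:
    raise SystemExit("static validation failed:\n - " + "\n - ".join(errors))
```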
@@ -3059,6 +3545,30 @@ async def run_evaluate_kernelbench_docker(
                 error_message=f"Failed to write eval script: {write_result.stderr}",
             )
 
+        # Write defense module if defensive mode is enabled
+        defense_module_path = None
+        if args.defensive:
+            defense_path = (
+                Path(__file__).parent.parent.parent.parent
+                / "packages"
+                / "wafer-core"
+                / "wafer_core"
+                / "utils"
+                / "kernel_utils"
+                / "defense.py"
+            )
+            if defense_path.exists():
+                defense_code = defense_path.read_text()
+                defense_module_path = f"{run_path}/defense.py"
+                write_result = await client.exec(
+                    f"cat > '{defense_module_path}' << 'DEFENSE_EOF'\n{defense_code}\nDEFENSE_EOF"
+                )
+                if write_result.exit_code != 0:
+                    print(f"Warning: Failed to write defense module: {write_result.stderr}")
+                    defense_module_path = None
+            else:
+                print(f"Warning: defense.py not found at {defense_path}")
+
         print("Running KernelBench evaluation in Docker container...")
 
         # Paths inside container
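The remote writes above rely on a quoted heredoc; a minimal sketch of the pattern follows. One caveat, stated as an assumption: a payload containing the delimiter line itself would terminate the heredoc early.

```python
# Sketch of the quoted-heredoc remote write used above.
defense_code = "print('hi')"                 # payload to place on the remote host
defense_module_path = "/tmp/run/defense.py"  # hypothetical destination
# The quoted 'DEFENSE_EOF' delimiter disables shell expansion inside the body.
cmd = f"cat > '{defense_module_path}' << 'DEFENSE_EOF'\n{defense_code}\nDEFENSE_EOF"
print(cmd)  # this string is what gets passed to client.exec(...) over SSH
```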
@@ -3068,6 +3578,7 @@ async def run_evaluate_kernelbench_docker(
         container_inputs_path = f"{container_run_path}/custom_inputs.py" if args.inputs else None
         container_eval_script = f"{container_run_path}/kernelbench_eval.py"
         container_output = f"{container_run_path}/results.json"
+        container_defense_path = f"{container_run_path}/defense.py" if defense_module_path else None
 
         # Build eval command
         python_cmd_parts = [
@@ -3083,6 +3594,9 @@ async def run_evaluate_kernelbench_docker(
             python_cmd_parts.append("--profile")
         if container_inputs_path:
             python_cmd_parts.append(f"--inputs {container_inputs_path}")
+        if args.defensive and container_defense_path:
+            python_cmd_parts.append("--defensive")
+            python_cmd_parts.append(f"--defense-module {container_defense_path}")
         python_cmd_parts.append(f"--seed {args.seed}")
 
         eval_cmd = " ".join(python_cmd_parts)
@@ -3106,7 +3620,7 @@ async def run_evaluate_kernelbench_docker(
         # Run and stream output
         log_lines = []
         async for line in client.exec_stream(docker_cmd):
-            print(line)
+            print(line, flush=True)
             log_lines.append(line)
 
         # Read results
@@ -3298,15 +3812,44 @@ async def run_evaluate_kernelbench_digitalocean(
                 error_message=f"Failed to write eval script: {write_result.stderr}",
             )
 
+        # Write defense module if defensive mode is enabled
+        defense_module_path = None
+        if args.defensive:
+            defense_path = (
+                Path(__file__).parent.parent.parent.parent
+                / "packages"
+                / "wafer-core"
+                / "wafer_core"
+                / "utils"
+                / "kernel_utils"
+                / "defense.py"
+            )
+            if defense_path.exists():
+                defense_code = defense_path.read_text()
+                defense_module_path = f"{run_path}/defense.py"
+                write_result = await client.exec(
+                    f"cat > '{defense_module_path}' << 'DEFENSE_EOF'\n{defense_code}\nDEFENSE_EOF"
+                )
+                if write_result.exit_code != 0:
+                    print(f"Warning: Failed to write defense module: {write_result.stderr}")
+                    defense_module_path = None
+            else:
+                print(f"Warning: defense.py not found at {defense_path}")
+
         print("Running KernelBench evaluation in Docker container (AMD/ROCm)...")
 
         # Paths inside container
         container_run_path = f"{CONTAINER_WORKSPACE}/{run_dir}"
         container_impl_path = f"{container_run_path}/implementation.py"
         container_ref_path = f"{container_run_path}/reference.py"
-        container_inputs_path = f"{container_run_path}/custom_inputs.py" if args.inputs else None
+        container_inputs_path = (
+            f"{container_run_path}/custom_inputs.py" if args.inputs else None
+        )
         container_eval_script = f"{container_run_path}/kernelbench_eval.py"
         container_output = f"{container_run_path}/results.json"
+        container_defense_path = (
+            f"{container_run_path}/defense.py" if defense_module_path else None
+        )
 
         # Build eval command
         python_cmd_parts = [
@@ -3322,6 +3865,9 @@ async def run_evaluate_kernelbench_digitalocean(
             python_cmd_parts.append("--profile")
         if container_inputs_path:
             python_cmd_parts.append(f"--inputs {container_inputs_path}")
+        if args.defensive and container_defense_path:
+            python_cmd_parts.append("--defensive")
+            python_cmd_parts.append(f"--defense-module {container_defense_path}")
         python_cmd_parts.append(f"--seed {args.seed}")
 
         eval_cmd = " ".join(python_cmd_parts)
@@ -3330,14 +3876,20 @@ async def run_evaluate_kernelbench_digitalocean(
         full_cmd = f"cd {container_run_path} && {eval_cmd}"
 
         # Build Docker command for AMD
+        # PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
+        rocm_arch = _get_rocm_arch(target.compute_capability)
+        env_dict = {
+            "HIP_VISIBLE_DEVICES": str(gpu_id),
+            "PYTHONUNBUFFERED": "1",
+        }
+        if rocm_arch:
+            env_dict["PYTORCH_ROCM_ARCH"] = rocm_arch
+
         docker_cmd = _build_docker_run_command_amd(
             image=docker_image,
             command=full_cmd,
             working_dir=container_run_path,
-            env={
-                "HIP_VISIBLE_DEVICES": str(gpu_id),
-                "PYTHONUNBUFFERED": "1",
-            },
+            env=env_dict,
             volumes={workspace_path: CONTAINER_WORKSPACE},
         )
 
@@ -3346,7 +3898,7 @@ async def run_evaluate_kernelbench_digitalocean(
         # Run and stream output
         log_lines = []
         async for line in client.exec_stream(docker_cmd):
-            print(line)
+            print(line, flush=True)
             log_lines.append(line)
 
         # Read results
@@ -3407,55 +3959,528 @@ async def run_evaluate_kernelbench_digitalocean(
3407
3959
  )
3408
3960
 
3409
3961
 
3410
- async def run_evaluate_kernelbench(args: KernelBenchEvaluateArgs) -> EvaluateResult:
3411
- """Run KernelBench format evaluation on configured target.
3412
-
3413
- Args:
3414
- args: KernelBench evaluate arguments
3962
+ async def run_evaluate_kernelbench_runpod(
3963
+ args: KernelBenchEvaluateArgs,
3964
+ target: RunPodTarget,
3965
+ ) -> EvaluateResult:
3966
+ """Run KernelBench format evaluation directly on RunPod AMD GPU.
3415
3967
 
3416
- Returns:
3417
- Evaluation result
3968
+ Runs evaluation script directly on host (no Docker) since RunPod pods
3969
+ already have PyTorch/ROCm installed.
3418
3970
  """
3419
- from .targets import get_default_target, load_target
3971
+ from datetime import datetime
3420
3972
 
3421
- # Validate input files
3422
- err = _validate_kernelbench_files(args)
3423
- if err:
3424
- return EvaluateResult(
3425
- success=False,
3426
- all_correct=False,
3427
- correctness_score=0.0,
3428
- geomean_speedup=0.0,
3429
- passed_tests=0,
3430
- total_tests=0,
3431
- error_message=err,
3432
- )
3973
+ from wafer_core.async_ssh import AsyncSSHClient
3974
+ from wafer_core.targets.runpod import RunPodError, runpod_ssh_context
3433
3975
 
3434
- # Load target
3435
- target_name = args.target_name
3436
- if not target_name:
3437
- target_name = get_default_target()
3438
- if not target_name:
3439
- return EvaluateResult(
3440
- success=False,
3441
- all_correct=False,
3442
- correctness_score=0.0,
3443
- geomean_speedup=0.0,
3444
- passed_tests=0,
3445
- total_tests=0,
3446
- error_message=(
3447
- "No target specified and no default set.\n"
3448
- "Set up a target first:\n"
3449
- " wafer config targets init ssh --name my-gpu --host user@host:22\n"
3450
- " wafer config targets init runpod --gpu MI300X\n"
3451
- "Then use: --target my-gpu (or set default: wafer config targets default my-gpu)"
3452
- ),
3453
- )
3976
+ REMOTE_WORKSPACE_BASE = "/tmp/wafer_eval"
3977
+
3978
+ # Select GPU
3979
+ gpu_id = args.gpu_id if args.gpu_id is not None else target.gpu_ids[0]
3980
+
3981
+ print(f"Provisioning RunPod ({target.gpu_type_id})...")
3454
3982
 
3455
3983
  try:
3456
- target = load_target(target_name)
3457
- except FileNotFoundError:
3458
- return EvaluateResult(
3984
+ async with runpod_ssh_context(target) as ssh_info:
3985
+ ssh_target = f"{ssh_info.user}@{ssh_info.host}:{ssh_info.port}"
3986
+ print(f"Connected to RunPod: {ssh_target}")
3987
+
3988
+ async with AsyncSSHClient(ssh_target, target.ssh_key) as client:
3989
+ # Create workspace
3990
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
3991
+ run_dir = f"kernelbench_eval_{timestamp}"
3992
+ run_path = f"{REMOTE_WORKSPACE_BASE}/{run_dir}"
3993
+
3994
+ await client.exec(f"mkdir -p {run_path}")
3995
+ print(f"Created run directory: {run_path}")
3996
+
3997
+ # Read and upload files
3998
+ impl_code = args.implementation.read_text()
3999
+ ref_code = args.reference.read_text()
4000
+
4001
+ # Write implementation
4002
+ impl_path = f"{run_path}/implementation.py"
4003
+ write_result = await client.exec(
4004
+ f"cat > '{impl_path}' << 'IMPL_EOF'\n{impl_code}\nIMPL_EOF"
4005
+ )
4006
+ if write_result.exit_code != 0:
4007
+ return EvaluateResult(
4008
+ success=False,
4009
+ all_correct=False,
4010
+ correctness_score=0.0,
4011
+ geomean_speedup=0.0,
4012
+ passed_tests=0,
4013
+ total_tests=0,
4014
+ error_message=f"Failed to write implementation: {write_result.stderr}",
4015
+ )
4016
+
4017
+ # Write reference
4018
+ ref_path = f"{run_path}/reference.py"
4019
+ write_result = await client.exec(
4020
+ f"cat > '{ref_path}' << 'REF_EOF'\n{ref_code}\nREF_EOF"
4021
+ )
4022
+ if write_result.exit_code != 0:
4023
+ return EvaluateResult(
4024
+ success=False,
4025
+ all_correct=False,
4026
+ correctness_score=0.0,
4027
+ geomean_speedup=0.0,
4028
+ passed_tests=0,
4029
+ total_tests=0,
4030
+ error_message=f"Failed to write reference: {write_result.stderr}",
4031
+ )
4032
+
4033
+ # Write custom inputs if provided
4034
+ inputs_path = None
4035
+ if args.inputs:
4036
+ inputs_code = args.inputs.read_text()
4037
+ inputs_path = f"{run_path}/custom_inputs.py"
4038
+ write_result = await client.exec(
4039
+ f"cat > '{inputs_path}' << 'INPUTS_EOF'\n{inputs_code}\nINPUTS_EOF"
4040
+ )
4041
+ if write_result.exit_code != 0:
4042
+ return EvaluateResult(
4043
+ success=False,
4044
+ all_correct=False,
4045
+ correctness_score=0.0,
4046
+ geomean_speedup=0.0,
4047
+ passed_tests=0,
4048
+ total_tests=0,
4049
+ error_message=f"Failed to write custom inputs: {write_result.stderr}",
4050
+ )
4051
+
4052
+ # Write eval script
4053
+ eval_script_path = f"{run_path}/kernelbench_eval.py"
4054
+ write_result = await client.exec(
4055
+ f"cat > '{eval_script_path}' << 'EVAL_EOF'\n{KERNELBENCH_EVAL_SCRIPT}\nEVAL_EOF"
4056
+ )
4057
+ if write_result.exit_code != 0:
4058
+ return EvaluateResult(
4059
+ success=False,
4060
+ all_correct=False,
4061
+ correctness_score=0.0,
4062
+ geomean_speedup=0.0,
4063
+ passed_tests=0,
4064
+ total_tests=0,
4065
+ error_message=f"Failed to write eval script: {write_result.stderr}",
4066
+ )
4067
+
4068
+ # Write defense module if defensive mode is enabled
4069
+ defense_module_path = None
4070
+ if args.defensive:
4071
+ defense_path = (
4072
+ Path(__file__).parent.parent.parent.parent
4073
+ / "packages"
4074
+ / "wafer-core"
4075
+ / "wafer_core"
4076
+ / "utils"
4077
+ / "kernel_utils"
4078
+ / "defense.py"
4079
+ )
4080
+ if defense_path.exists():
4081
+ defense_code = defense_path.read_text()
4082
+ defense_module_path = f"{run_path}/defense.py"
4083
+ write_result = await client.exec(
4084
+ f"cat > '{defense_module_path}' << 'DEFENSE_EOF'\n{defense_code}\nDEFENSE_EOF"
4085
+ )
4086
+ if write_result.exit_code != 0:
4087
+ print(f"Warning: Failed to write defense module: {write_result.stderr}")
4088
+ defense_module_path = None
4089
+ else:
4090
+ print(f"Warning: defense.py not found at {defense_path}")
4091
+
4092
+ print("Running KernelBench evaluation (AMD/ROCm)...")
4093
+
4094
+ # Find Python with PyTorch - check common locations on RunPod
4095
+ python_exe = "python3"
4096
+ for candidate in [
4097
+ "/opt/conda/envs/py_3.10/bin/python3",
4098
+ "/opt/conda/bin/python3",
4099
+ ]:
4100
+ check = await client.exec(
4101
+ f"{candidate} -c 'import torch' 2>/dev/null && echo OK"
4102
+ )
4103
+ if "OK" in check.stdout:
4104
+ python_exe = candidate
4105
+ print(f"Using Python: {python_exe}")
4106
+ break
4107
+
4108
+ # Build eval command - run directly on host
4109
+ output_path = f"{run_path}/results.json"
4110
+ python_cmd_parts = [
4111
+ f"{python_exe} {eval_script_path}",
4112
+ f"--impl {impl_path}",
4113
+ f"--reference {ref_path}",
4114
+ f"--output {output_path}",
4115
+ ]
4116
+
4117
+ if args.benchmark:
4118
+ python_cmd_parts.append("--benchmark")
4119
+ if args.profile:
4120
+ python_cmd_parts.append("--profile")
4121
+ if inputs_path:
4122
+ python_cmd_parts.append(f"--inputs {inputs_path}")
4123
+ if args.defensive and defense_module_path:
4124
+ python_cmd_parts.append("--defensive")
4125
+ python_cmd_parts.append(f"--defense-module {defense_module_path}")
4126
+ python_cmd_parts.append(f"--seed {args.seed}")
4127
+
4128
+ eval_cmd = " ".join(python_cmd_parts)
4129
+
4130
+ # Set environment for AMD GPU and run
4131
+ # PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
4132
+ rocm_arch = _get_rocm_arch(target.compute_capability)
4133
+ arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
4134
+ env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
4135
+ full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"
4136
+
4137
+ # Run and stream output
4138
+ log_lines = []
4139
+ async for line in client.exec_stream(full_cmd):
4140
+ print(line, flush=True)
4141
+ log_lines.append(line)
4142
+
4143
+ # Read results
4144
+ cat_result = await client.exec(f"cat {output_path}")
4145
+
4146
+ if cat_result.exit_code != 0:
4147
+ log_tail = "\n".join(log_lines[-50:])
4148
+ return EvaluateResult(
4149
+ success=False,
4150
+ all_correct=False,
4151
+ correctness_score=0.0,
4152
+ geomean_speedup=0.0,
4153
+ passed_tests=0,
4154
+ total_tests=0,
4155
+ error_message=f"Evaluation failed. Log tail:\n{log_tail}",
4156
+ )
4157
+
4158
+ # Parse results
4159
+ try:
4160
+ results_data = json.loads(cat_result.stdout)
4161
+ except json.JSONDecodeError as e:
4162
+ return EvaluateResult(
4163
+ success=False,
4164
+ all_correct=False,
4165
+ correctness_score=0.0,
4166
+ geomean_speedup=0.0,
4167
+ passed_tests=0,
4168
+ total_tests=0,
4169
+ error_message=f"Failed to parse results: {e}",
4170
+ )
4171
+
4172
+ # Convert to EvaluateResult
4173
+ correct = results_data.get("correct", False)
4174
+ speedup = results_data.get("speedup", 0.0) or 0.0
4175
+ error = results_data.get("error")
4176
+
4177
+ if error:
4178
+ return EvaluateResult(
4179
+ success=False,
4180
+ all_correct=False,
4181
+ correctness_score=0.0,
4182
+ geomean_speedup=0.0,
4183
+ passed_tests=0,
4184
+ total_tests=1,
4185
+ error_message=error,
4186
+ )
4187
+
4188
+ return EvaluateResult(
4189
+ success=True,
4190
+ all_correct=correct,
4191
+ correctness_score=1.0 if correct else 0.0,
4192
+ geomean_speedup=speedup,
4193
+ passed_tests=1 if correct else 0,
4194
+ total_tests=1,
4195
+ )
4196
+
4197
+ except RunPodError as e:
4198
+ return EvaluateResult(
4199
+ success=False,
4200
+ all_correct=False,
4201
+ correctness_score=0.0,
4202
+ geomean_speedup=0.0,
4203
+ passed_tests=0,
4204
+ total_tests=0,
4205
+ error_message=f"RunPod error: {e}",
4206
+ )
4207
+
4208
+
4209
+ async def run_evaluate_kernelbench_baremetal_amd(
4210
+ args: KernelBenchEvaluateArgs,
4211
+ target: BaremetalTarget,
4212
+ ) -> EvaluateResult:
4213
+ """Run KernelBench format evaluation directly on AMD baremetal target.
4214
+
4215
+ Runs evaluation script directly on host (no Docker) for AMD GPUs
4216
+ that have PyTorch/ROCm installed.
4217
+ """
4218
+ from datetime import datetime
4219
+
+ from wafer_core.async_ssh import AsyncSSHClient
+
+ REMOTE_WORKSPACE_BASE = "/tmp/wafer_eval"
+
+ # Select GPU
+ gpu_id = args.gpu_id if args.gpu_id is not None else target.gpu_ids[0]
+
+ print(f"Connecting to {target.ssh_target}...")
+
+ async with AsyncSSHClient(target.ssh_target, target.ssh_key) as client:
+ # Create workspace
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ run_dir = f"kernelbench_eval_{timestamp}"
+ run_path = f"{REMOTE_WORKSPACE_BASE}/{run_dir}"
+
+ await client.exec(f"mkdir -p {run_path}")
+ print(f"Created run directory: {run_path}")
+
+ # Read and upload files
+ impl_code = args.implementation.read_text()
+ ref_code = args.reference.read_text()
+
+ # Write implementation
+ impl_path = f"{run_path}/implementation.py"
+ write_result = await client.exec(
+ f"cat > '{impl_path}' << 'IMPL_EOF'\n{impl_code}\nIMPL_EOF"
+ )
+ if write_result.exit_code != 0:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Failed to write implementation: {write_result.stderr}",
+ )
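The quoted heredoc delimiter ('IMPL_EOF') makes the shell write the payload verbatim, with no $variable or backtick expansion. The write would still truncate if the kernel source itself contained a line reading IMPL_EOF; a common workaround — sketched here, not what this diff does — is to ship the bytes base64-encoded:

import base64

def encode_upload(remote_path: str, code: str) -> str:
    """Build a shell command that writes `code` to `remote_path` byte-for-byte."""
    encoded = base64.b64encode(code.encode()).decode()
    return f"echo '{encoded}' | base64 -d > '{remote_path}'"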
+
+ # Write reference
+ ref_path = f"{run_path}/reference.py"
+ write_result = await client.exec(f"cat > '{ref_path}' << 'REF_EOF'\n{ref_code}\nREF_EOF")
+ if write_result.exit_code != 0:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Failed to write reference: {write_result.stderr}",
+ )
+
+ # Write custom inputs if provided
+ inputs_path = None
+ if args.inputs:
+ inputs_code = args.inputs.read_text()
+ inputs_path = f"{run_path}/custom_inputs.py"
+ write_result = await client.exec(
+ f"cat > '{inputs_path}' << 'INPUTS_EOF'\n{inputs_code}\nINPUTS_EOF"
+ )
+ if write_result.exit_code != 0:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Failed to write custom inputs: {write_result.stderr}",
+ )
+
+ # Write eval script
+ eval_script_path = f"{run_path}/kernelbench_eval.py"
+ write_result = await client.exec(
+ f"cat > '{eval_script_path}' << 'EVAL_EOF'\n{KERNELBENCH_EVAL_SCRIPT}\nEVAL_EOF"
+ )
+ if write_result.exit_code != 0:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Failed to write eval script: {write_result.stderr}",
+ )
+
+ # Write defense module if defensive mode is enabled
+ defense_module_path = None
+ if args.defensive:
+ defense_path = (
+ Path(__file__).parent.parent.parent.parent
+ / "packages"
+ / "wafer-core"
+ / "wafer_core"
+ / "utils"
+ / "kernel_utils"
+ / "defense.py"
+ )
+ if defense_path.exists():
+ defense_code = defense_path.read_text()
+ defense_module_path = f"{run_path}/defense.py"
+ write_result = await client.exec(
+ f"cat > '{defense_module_path}' << 'DEFENSE_EOF'\n{defense_code}\nDEFENSE_EOF"
+ )
+ if write_result.exit_code != 0:
+ print(f"Warning: Failed to write defense module: {write_result.stderr}")
+ defense_module_path = None
+ else:
+ print(f"Warning: defense.py not found at {defense_path}")
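The four .parent hops resolve defense.py relative to a source-tree layout (repo root, then packages/wafer-core/...), which only exists in a monorepo checkout — hence the warning fallback. When wafer-core is installed as a wheel, the usual approach is to resolve the file from the package itself; a sketch, assuming defense.py actually ships inside the wafer_core.utils.kernel_utils package:

from importlib import resources

# Sketch only: reads defense.py out of the installed wafer_core package
# instead of walking up the source tree (assumes the file is packaged).
defense_code = (
    resources.files("wafer_core.utils.kernel_utils")
    .joinpath("defense.py")
    .read_text()
)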
+
+ print("Running KernelBench evaluation (AMD/ROCm)...")
+
+ # Find Python with PyTorch - check common locations
+ python_exe = "python3"
+ for candidate in [
+ "/opt/conda/envs/py_3.10/bin/python3",
+ "/opt/conda/bin/python3",
+ ]:
+ check = await client.exec(f"{candidate} -c 'import torch' 2>/dev/null && echo OK")
+ if "OK" in check.stdout:
+ python_exe = candidate
+ print(f"Using Python: {python_exe}")
+ break
+
+ # Build eval command - run directly on host
+ output_path = f"{run_path}/results.json"
+ python_cmd_parts = [
+ f"{python_exe} {eval_script_path}",
+ f"--impl {impl_path}",
+ f"--reference {ref_path}",
+ f"--output {output_path}",
+ ]
+
+ if args.benchmark:
+ python_cmd_parts.append("--benchmark")
+ if args.profile:
+ python_cmd_parts.append("--profile")
+ if inputs_path:
+ python_cmd_parts.append(f"--inputs {inputs_path}")
+ if args.defensive and defense_module_path:
+ python_cmd_parts.append("--defensive")
+ python_cmd_parts.append(f"--defense-module {defense_module_path}")
+ python_cmd_parts.append(f"--seed {args.seed}")
+
+ eval_cmd = " ".join(python_cmd_parts)
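With --benchmark set and the other options left at their defaults, the joined command comes out roughly as below (a sketch; the interpreter and run-directory paths are illustrative placeholders):

# Sketch of the flag assembly above with benchmark enabled.
parts = [
    "/opt/conda/bin/python3 /tmp/wafer_eval/run/kernelbench_eval.py",
    "--impl /tmp/wafer_eval/run/implementation.py",
    "--reference /tmp/wafer_eval/run/reference.py",
    "--output /tmp/wafer_eval/run/results.json",
    "--benchmark",
    "--seed 42",
]
print(" ".join(parts))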
+
+ # Set environment for AMD GPU and run
+ # PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
+ rocm_arch = _get_rocm_arch(target.compute_capability)
+ arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
+ env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
+ full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"
+
+ # Run and stream output
+ log_lines = []
+ async for line in client.exec_stream(full_cmd):
+ print(line, flush=True)
+ log_lines.append(line)
+
+ # Read results
+ cat_result = await client.exec(f"cat {output_path}")
+
+ if cat_result.exit_code != 0:
+ log_tail = "\n".join(log_lines[-50:])
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Evaluation failed. Log tail:\n{log_tail}",
+ )
+
+ # Parse results
+ try:
+ results_data = json.loads(cat_result.stdout)
+ except json.JSONDecodeError as e:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Failed to parse results: {e}",
+ )
+
+ # Convert to EvaluateResult
+ correct = results_data.get("correct", False)
+ speedup = results_data.get("speedup", 0.0) or 0.0
+ error = results_data.get("error")
+
+ if error:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=1,
+ error_message=error,
+ )
+
+ return EvaluateResult(
+ success=True,
+ all_correct=correct,
+ correctness_score=1.0 if correct else 0.0,
+ geomean_speedup=speedup,
+ passed_tests=1 if correct else 0,
+ total_tests=1,
+ )
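This read/parse/convert tail is repeated verbatim from the RunPod path earlier in the diff. A shared helper — hypothetical, not part of the diff — would keep the two code paths in sync:

# Hypothetical helper (not in the diff) that both paths could share to turn
# a parsed results.json payload into an EvaluateResult.
def _result_from_json(results_data: dict) -> EvaluateResult:
    error = results_data.get("error")
    if error:
        return EvaluateResult(
            success=False, all_correct=False, correctness_score=0.0,
            geomean_speedup=0.0, passed_tests=0, total_tests=1,
            error_message=error,
        )
    correct = results_data.get("correct", False)
    return EvaluateResult(
        success=True, all_correct=correct,
        correctness_score=1.0 if correct else 0.0,
        geomean_speedup=results_data.get("speedup", 0.0) or 0.0,
        passed_tests=1 if correct else 0, total_tests=1,
    )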
+
+
+ async def run_evaluate_kernelbench(args: KernelBenchEvaluateArgs) -> EvaluateResult:
+ """Run KernelBench format evaluation on the configured target.
+
+ Args:
+ args: KernelBench evaluate arguments
+
+ Returns:
+ Evaluation result
+ """
+ from .targets import get_default_target, load_target
+
+ # Validate input files
+ err = _validate_kernelbench_files(args)
+ if err:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=err,
+ )
+
+ # Load target
+ target_name = args.target_name
+ if not target_name:
+ target_name = get_default_target()
+ if not target_name:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=(
+ "No target specified and no default set.\n"
+ "Set up a target first:\n"
+ " wafer config targets init ssh --name my-gpu --host user@host:22\n"
+ " wafer config targets init runpod --gpu MI300X\n"
+ "Then use: --target my-gpu (or set default: wafer config targets default my-gpu)"
+ ),
+ )
+
+ try:
+ target = load_target(target_name)
+ except FileNotFoundError:
+ return EvaluateResult(
  success=False,
  all_correct=False,
  correctness_score=0.0,
@@ -3471,7 +4496,13 @@ async def run_evaluate_kernelbench(args: KernelBenchEvaluateArgs) -> EvaluateRes
  if isinstance(target, DigitalOceanTarget):
  # DigitalOcean AMD MI300X - uses ROCm Docker with device passthrough
  return await run_evaluate_kernelbench_digitalocean(args, target)
+ elif isinstance(target, RunPodTarget):
+ # RunPod AMD MI300X - uses ROCm Docker with device passthrough
+ return await run_evaluate_kernelbench_runpod(args, target)
  elif isinstance(target, BaremetalTarget | VMTarget):
+ # Check if this is an AMD target (gfx* compute capability) - run directly
+ if target.compute_capability and target.compute_capability.startswith("gfx"):
+ return await run_evaluate_kernelbench_baremetal_amd(args, target)
  # NVIDIA targets - require docker_image to be set
  if not target.docker_image:
  return EvaluateResult(
@@ -3497,6 +4528,6 @@ async def run_evaluate_kernelbench(args: KernelBenchEvaluateArgs) -> EvaluateRes
  total_tests=0,
  error_message=(
  f"Target type '{type(target).__name__}' not yet supported for KernelBench format. "
- "Use a DigitalOcean, Baremetal, or VM target."
+ "Use a DigitalOcean, RunPod, Baremetal, or VM target."
  ),
  )
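End to end, the dispatcher above routes a Baremetal/VM target whose compute_capability starts with "gfx" to the direct-host AMD path, and DigitalOcean/RunPod targets to their Docker-based runners. A usage sketch, assuming a configured target named "my-mi300x" and that KernelBenchEvaluateArgs accepts the fields the code above reads (implementation, reference, target_name, benchmark); other required fields may exist:

import asyncio
from pathlib import Path

# Illustrative values only; "my-mi300x" is a hypothetical target name.
args = KernelBenchEvaluateArgs(
    implementation=Path("implementation.py"),
    reference=Path("reference.py"),
    target_name="my-mi300x",  # e.g. a BaremetalTarget with compute_capability "gfx942"
    benchmark=True,
)
result = asyncio.run(run_evaluate_kernelbench(args))
print(result.success, result.geomean_speedup)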