wafer-cli 0.2.3-py3-none-any.whl → 0.2.5-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
wafer/evaluate.py CHANGED
@@ -14,6 +14,7 @@ logger = logging.getLogger(__name__)
 from wafer_core.utils.kernel_utils.targets.config import (
     BaremetalTarget,
     DigitalOceanTarget,
+    LocalTarget,
     ModalTarget,
     RunPodTarget,
     VMTarget,
@@ -158,6 +159,8 @@ class KernelBenchEvaluateArgs:
     target_name: str
     benchmark: bool = False
     profile: bool = False
+    inputs: Path | None = None  # Custom inputs file to override get_inputs()
+    seed: int = 42  # Random seed for reproducibility
     defensive: bool = False
     sync_artifacts: bool = True
     gpu_id: int | None = None
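
The new `inputs` and `seed` fields feed the `--inputs` and `--seed` flags added to the eval script later in this diff. As an illustrative sketch (shapes and dtypes here are hypothetical), a custom inputs file has to mirror the reference's `get_inputs()` signature, since the eval script validates tensor count, dtype, and rank before overriding it:

    # custom_inputs.py (hypothetical example)
    import torch

    def get_inputs():
        # Same number of tensors, dtypes, and ranks as the reference's
        # get_inputs(), but e.g. a different batch size to stress the kernel.
        return [torch.randn(64, 1024, dtype=torch.float32)]

    def get_init_inputs():
        # Optional: if present, this also overrides the reference's
        # constructor arguments.
        return []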
@@ -394,33 +397,6 @@ async def run_evaluate_docker(
     print(f"Connecting to {target.ssh_target}...")
 
     async with AsyncSSHClient(target.ssh_target, target.ssh_key) as client:
-        # Upload wafer-core to remote
-        try:
-            wafer_root = _get_wafer_root()
-            wafer_core_path = wafer_root / "packages" / "wafer-core"
-            print(f"Uploading wafer-core from {wafer_core_path}...")
-
-            # Create workspace and upload
-            workspace_name = wafer_core_path.name
-            remote_workspace = f"{REMOTE_WORKSPACE_BASE}/{workspace_name}"
-            await client.exec(f"mkdir -p {remote_workspace}")
-            wafer_core_workspace = await client.expand_path(remote_workspace)
-
-            upload_result = await client.upload_files(
-                str(wafer_core_path), wafer_core_workspace, recursive=True
-            )
-            print(f"Uploaded {upload_result.files_copied} files")
-        except Exception as e:
-            return EvaluateResult(
-                success=False,
-                all_correct=False,
-                correctness_score=0.0,
-                geomean_speedup=0.0,
-                passed_tests=0,
-                total_tests=0,
-                error_message=f"Failed to upload wafer-core: {e}",
-            )
-
         print(f"Using Docker image: {target.docker_image}")
         print(f"Using GPU {gpu_id}...")
 
@@ -429,10 +405,13 @@ async def run_evaluate_docker(
         ref_code = args.reference.read_text()
         test_cases_data = json.loads(args.test_cases.read_text())
 
-        # Create a unique run directory
+        # Create workspace for evaluation files
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         run_dir = f"wafer_eval_{timestamp}"
-        run_path = f"{wafer_core_workspace}/{run_dir}"
+        eval_workspace = f"{REMOTE_WORKSPACE_BASE}/eval_{timestamp}"
+        await client.exec(f"mkdir -p {eval_workspace}")
+        eval_workspace_expanded = await client.expand_path(eval_workspace)
+        run_path = f"{eval_workspace_expanded}/{run_dir}"
 
         print("Uploading evaluation files...")
 
@@ -519,17 +498,16 @@ async def run_evaluate_docker(
         container_impl_path = f"{container_run_path}/implementation.py"
         container_ref_path = f"{container_run_path}/reference.py"
         container_test_cases_path = f"{container_run_path}/test_cases.json"
-        container_evaluate_script = (
-            f"{CONTAINER_WORKSPACE}/wafer_core/utils/kernel_utils/evaluate.py"
-        )
 
-        # Build pip install command for torch and other deps (no wafer-core install needed)
+        # Build pip install command for torch and other deps, plus wafer-core
         pip_install_cmd = _build_docker_pip_install_cmd(target)
+        install_cmd = (
+            f"{pip_install_cmd} && uv pip install --system --break-system-packages wafer-core"
+        )
 
-        # Build evaluate command - use PYTHONPATH instead of installing wafer-core
+        # Build evaluate command using installed wafer-core module
         python_cmd_parts = [
-            f"PYTHONPATH={CONTAINER_WORKSPACE}:$PYTHONPATH",
-            f"python3 {container_evaluate_script}",
+            "python3 -m wafer_core.utils.kernel_utils.evaluate",
             f"--implementation {container_impl_path}",
            f"--reference {container_ref_path}",
            f"--test-cases {container_test_cases_path}",
@@ -545,8 +523,8 @@ async def run_evaluate_docker(
 
         eval_cmd = " ".join(python_cmd_parts)
 
-        # Full command: install torch deps, then run evaluate with PYTHONPATH
-        full_cmd = f"{pip_install_cmd} && cd {container_run_path} && {eval_cmd}"
+        # Full command: install deps + wafer-core, then run evaluate
+        full_cmd = f"{install_cmd} && cd {container_run_path} && {eval_cmd}"
 
         # Build Docker run command
         # Add SYS_ADMIN capability when profiling (needed for NCU GPU performance counters)
@@ -556,7 +534,7 @@ async def run_evaluate_docker(
             working_dir=container_run_path,
             env={"CUDA_VISIBLE_DEVICES": str(gpu_id), "PYTHONUNBUFFERED": "1"},
             gpus="all",
-            volumes={wafer_core_workspace: CONTAINER_WORKSPACE},
+            volumes={eval_workspace_expanded: CONTAINER_WORKSPACE},
             cap_add=["SYS_ADMIN"] if args.profile else None,
         )
 
@@ -663,6 +641,181 @@ async def run_evaluate_docker(
         )
 
 
+async def run_evaluate_local(
+    args: EvaluateArgs,
+    target: LocalTarget,
+) -> EvaluateResult:
+    """Run evaluation locally on the current machine.
+
+    For LocalTarget - no SSH needed, runs directly.
+
+    Args:
+        args: Evaluate arguments
+        target: Local target config
+
+    Returns:
+        Evaluation result
+    """
+    import os
+    import subprocess
+    import tempfile
+    from datetime import datetime
+
+    # Select GPU
+    gpu_id = args.gpu_id if args.gpu_id is not None else target.gpu_ids[0]
+
+    print(f"Running local evaluation on GPU {gpu_id}...")
+
+    # Create temp directory for eval files
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    with tempfile.TemporaryDirectory(prefix=f"wafer_eval_{timestamp}_") as run_path:
+        run_path = Path(run_path)
+
+        # Write implementation
+        impl_path = run_path / "implementation.py"
+        impl_path.write_text(args.implementation.read_text())
+
+        # Write reference
+        ref_path = run_path / "reference.py"
+        ref_path.write_text(args.reference.read_text())
+
+        # Write custom inputs if provided
+        inputs_path = None
+        if args.inputs:
+            inputs_path = run_path / "custom_inputs.py"
+            inputs_path.write_text(args.inputs.read_text())
+
+        # Write eval script
+        eval_script_path = run_path / "kernelbench_eval.py"
+        eval_script_path.write_text(KERNELBENCH_EVAL_SCRIPT)
+
+        # Write defense module if defensive mode is enabled
+        defense_module_path = None
+        if args.defensive:
+            defense_src = (
+                Path(__file__).parent.parent.parent.parent
+                / "packages"
+                / "wafer-core"
+                / "wafer_core"
+                / "utils"
+                / "kernel_utils"
+                / "defense.py"
+            )
+            if defense_src.exists():
+                defense_module_path = run_path / "defense.py"
+                defense_module_path.write_text(defense_src.read_text())
+            else:
+                print(f"Warning: defense.py not found at {defense_src}")
+
+        # Output file
+        output_path = run_path / "results.json"
+
+        # Build eval command
+        cmd_parts = [
+            "python3",
+            str(eval_script_path),
+            "--impl",
+            str(impl_path),
+            "--reference",
+            str(ref_path),
+            "--output",
+            str(output_path),
+            "--seed",
+            str(args.seed),
+        ]
+
+        if args.benchmark:
+            cmd_parts.append("--benchmark")
+        if args.profile:
+            cmd_parts.append("--profile")
+        if inputs_path:
+            cmd_parts.extend(["--inputs", str(inputs_path)])
+        if args.defensive and defense_module_path:
+            cmd_parts.extend(["--defensive", "--defense-module", str(defense_module_path)])
+
+        # Set environment for GPU selection
+        env = os.environ.copy()
+        if target.vendor == "nvidia":
+            env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+        else:  # AMD
+            env["HIP_VISIBLE_DEVICES"] = str(gpu_id)
+            env["ROCM_PATH"] = "/opt/rocm"
+
+        print(f"Running: {' '.join(cmd_parts[:4])} ...")
+
+        # Run evaluation
+        try:
+            result = subprocess.run(
+                cmd_parts,
+                cwd=str(run_path),
+                env=env,
+                capture_output=True,
+                text=True,
+                timeout=args.timeout or 600,
+            )
+        except subprocess.TimeoutExpired:
+            return EvaluateResult(
+                success=False,
+                all_correct=False,
+                correctness_score=0.0,
+                geomean_speedup=0.0,
+                passed_tests=0,
+                total_tests=0,
+                error_message="Evaluation timed out",
+            )
+
+        if result.returncode != 0:
+            error_msg = result.stderr or result.stdout or "Unknown error"
+            # Truncate long errors
+            if len(error_msg) > 1000:
+                error_msg = error_msg[:500] + "\n...\n" + error_msg[-500:]
+            return EvaluateResult(
+                success=False,
+                all_correct=False,
+                correctness_score=0.0,
+                geomean_speedup=0.0,
+                passed_tests=0,
+                total_tests=0,
+                error_message=f"Evaluation failed:\n{error_msg}",
+            )
+
+        # Parse results
+        if not output_path.exists():
+            return EvaluateResult(
+                success=False,
+                all_correct=False,
+                correctness_score=0.0,
+                geomean_speedup=0.0,
+                passed_tests=0,
+                total_tests=0,
+                error_message="No results.json produced",
+            )
+
+        try:
+            results = json.loads(output_path.read_text())
+        except json.JSONDecodeError as e:
+            return EvaluateResult(
+                success=False,
+                all_correct=False,
+                correctness_score=0.0,
+                geomean_speedup=0.0,
+                passed_tests=0,
+                total_tests=0,
+                error_message=f"Failed to parse results: {e}",
+            )
+
+        # Extract results
+        return EvaluateResult(
+            success=True,
+            all_correct=results.get("all_correct", False),
+            correctness_score=results.get("correctness_score", 0.0),
+            geomean_speedup=results.get("geomean_speedup", 0.0),
+            passed_tests=results.get("passed_tests", 0),
+            total_tests=results.get("total_tests", 0),
+            benchmark_results=results.get("benchmark", {}),
+        )
+
+
 async def run_evaluate_ssh(
     args: EvaluateArgs,
     target: BaremetalTarget | VMTarget,
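
Note that `run_evaluate_local` pins the GPU through vendor-specific environment variables rather than Docker flags. A minimal standalone sketch of that selection logic (the `vendor` and `gpu_id` names mirror the fields used above; this is an illustration, not the shipped helper):

    import os

    def gpu_env(vendor: str, gpu_id: int) -> dict:
        env = os.environ.copy()
        if vendor == "nvidia":
            env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)   # NVIDIA runtime
        else:
            env["HIP_VISIBLE_DEVICES"] = str(gpu_id)    # AMD ROCm runtime
            env["ROCM_PATH"] = "/opt/rocm"
        return env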
@@ -980,6 +1133,7 @@ def _build_modal_sandbox_script(
     test_cases_b64: str,
     run_benchmarks: bool,
     run_defensive: bool,
+    defense_code_b64: str | None = None,
 ) -> str:
     """Build Python script to create sandbox and run evaluation.
 
@@ -1060,6 +1214,20 @@ print('Files written')
         print(json.dumps({{"error": f"Failed to write files: {{proc.stderr.read()}}"}}))
         return
 
+    # Write defense module if defensive mode is enabled
+    # NOTE: Check for actual base64 content, not just truthy string (None becomes "None")
+    if {run_defensive} and "{defense_code_b64}" and "{defense_code_b64}" != "None":
+        proc = sandbox.exec("python", "-c", f"""
+import base64
+with open('/workspace/defense.py', 'w') as f:
+    f.write(base64.b64decode('{defense_code_b64}').decode())
+print('Defense module written')
+""")
+        proc.wait()
+        if proc.returncode != 0:
+            print(json.dumps({{"error": f"Failed to write defense module: {{proc.stderr.read()}}"}}))
+            return
+
     # Build inline evaluation script
     eval_script = """
 import json
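
The sandbox receives defense.py as a base64 string because the source is spliced into an f-string-generated script, where raw quotes and braces would break quoting. A minimal round-trip sketch of the idea (file name illustrative):

    import base64

    source = open("defense.py").read()                    # host side
    encoded = base64.b64encode(source.encode()).decode()  # safe to embed in a template
    decoded = base64.b64decode(encoded).decode()          # sandbox side
    assert decoded == source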
@@ -1087,6 +1255,26 @@ generate_input = load_fn('reference.py', 'generate_input')
 
 import torch
 
+# Load defense module if available and defensive mode is enabled
+run_defensive = {run_defensive}
+defense = None
+if run_defensive:
+    try:
+        defense = load_fn('defense.py', 'run_all_defenses')
+        time_with_defenses = load_fn('defense.py', 'time_execution_with_defenses')
+        print('[Defense] Defense module loaded')
+
+        # Wrap kernels for defense API compatibility
+        # Defense API calls kernel(*args), but functional format expects kernel(inputs_tuple)
+        # These wrappers repack the unpacked args back into a tuple
+        def _wrap_for_defense(kernel):
+            return lambda *args: kernel(args)
+        custom_kernel_for_defense = _wrap_for_defense(custom_kernel)
+        ref_kernel_for_defense = _wrap_for_defense(ref_kernel)
+    except Exception as e:
+        print(f'[Defense] Warning: Could not load defense module: {{e}}')
+        defense = None
+
 results = []
 all_correct = True
 total_time_ms = 0.0
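
The `_wrap_for_defense` adapter bridges two calling conventions: the defense API invokes `kernel(*args)`, while the functional kernels here take one tuple argument. A toy demonstration of the same adapter in isolation:

    def kernel(inputs):                       # functional format: one tuple
        a, b = inputs
        return a + b

    def _wrap_for_defense(kernel):
        return lambda *args: kernel(args)     # repack *args into a tuple

    wrapped = _wrap_for_defense(kernel)
    assert wrapped(1, 2) == kernel((1, 2)) == 3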
@@ -1114,36 +1302,63 @@ for tc in test_cases:
     impl_time_ms = 0.0
     ref_time_ms = 0.0
     if {run_benchmarks}:
-        # Warmup
-        for _ in range(3):
-            custom_kernel(inputs)
-        torch.cuda.synchronize()
-
-        # Measure with defensive timing if requested
-        # Defensive: sync before recording end event to catch stream injection
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-        start.record()
-        for _ in range(10):
-            custom_kernel(inputs)
-        if {run_defensive}:
-            torch.cuda.synchronize()  # DEFENSE: sync all streams before end
-        end.record()
-        torch.cuda.synchronize()
-        impl_time_ms = start.elapsed_time(end) / 10
-
-        # Reference timing (same defensive approach)
-        for _ in range(3):
-            ref_kernel(inputs)
-        torch.cuda.synchronize()
-        start.record()
-        for _ in range(10):
-            ref_kernel(inputs)
-        if {run_defensive}:
-            torch.cuda.synchronize()  # DEFENSE: sync all streams before end
-        end.record()
-        torch.cuda.synchronize()
-        ref_time_ms = start.elapsed_time(end) / 10
+        if run_defensive and defense is not None:
+            # Use full defense suite with wrapped kernels
+            # inputs_list unpacks the tuple so defense can infer dtype/device from tensors
+            inputs_list = list(inputs) if hasattr(inputs, '__iter__') and not isinstance(inputs, torch.Tensor) else [inputs]
+
+            # Run defense checks
+            all_passed, defense_results, _ = defense(custom_kernel_for_defense, *inputs_list)
+            if not all_passed:
+                failed = [name for name, passed, _ in defense_results if not passed]
+                raise ValueError(f"Defense checks failed: {{failed}}")
+
+            # Time with defensive timing (using wrapped kernels)
+            impl_times, _ = time_with_defenses(
+                custom_kernel_for_defense,
+                inputs_list,
+                num_warmup=3,
+                num_trials=10,
+                verbose=False,
+                run_defenses=False,
+            )
+            impl_time_ms = sum(impl_times) / len(impl_times)
+
+            ref_times, _ = time_with_defenses(
+                ref_kernel_for_defense,
+                inputs_list,
+                num_warmup=3,
+                num_trials=10,
+                verbose=False,
+                run_defenses=False,
+            )
+            ref_time_ms = sum(ref_times) / len(ref_times)
+        else:
+            # Standard timing without full defenses
+            # Warmup
+            for _ in range(3):
+                custom_kernel(inputs)
+            torch.cuda.synchronize()
+
+            start = torch.cuda.Event(enable_timing=True)
+            end = torch.cuda.Event(enable_timing=True)
+            start.record()
+            for _ in range(10):
+                custom_kernel(inputs)
+            end.record()
+            torch.cuda.synchronize()
+            impl_time_ms = start.elapsed_time(end) / 10
+
+            # Reference timing
+            for _ in range(3):
+                ref_kernel(inputs)
+            torch.cuda.synchronize()
+            start.record()
+            for _ in range(10):
+                ref_kernel(inputs)
+            end.record()
+            torch.cuda.synchronize()
+            ref_time_ms = start.elapsed_time(end) / 10
 
     total_time_ms += impl_time_ms
     ref_total_time_ms += ref_time_ms
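
Both the removed inline path and the retained standard path follow the CUDA-event timing pattern below; the defensive variant's extra device-wide synchronize before `end.record()` is what catches work smuggled onto side streams that the timing stream would otherwise not wait for. A standalone sketch, assuming a CUDA-capable torch build (names are illustrative):

    import torch

    def time_kernel(fn, inputs, trials=10, defensive=False):
        for _ in range(3):                    # warmup
            fn(inputs)
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        for _ in range(trials):
            fn(inputs)
        if defensive:
            torch.cuda.synchronize()          # drain all streams before stopping the clock
        end.record()
        torch.cuda.synchronize()
        return start.elapsed_time(end) / trials   # avg milliseconds per call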
@@ -1236,6 +1451,23 @@ async def run_evaluate_modal(
     ref_code_b64 = base64.b64encode(args.reference.read_bytes()).decode()
     test_cases_b64 = base64.b64encode(args.test_cases.read_bytes()).decode()
 
+    # Encode defense module if defensive mode is enabled
+    defense_code_b64 = None
+    if args.defensive:
+        defense_path = (
+            Path(__file__).parent.parent.parent.parent
+            / "packages"
+            / "wafer-core"
+            / "wafer_core"
+            / "utils"
+            / "kernel_utils"
+            / "defense.py"
+        )
+        if defense_path.exists():
+            defense_code_b64 = base64.b64encode(defense_path.read_bytes()).decode()
+        else:
+            print(f"Warning: defense.py not found at {defense_path}, falling back to basic defense")
+
     # Build the script that creates sandbox and runs eval
     script = _build_modal_sandbox_script(
         target=target,
@@ -1244,6 +1476,7 @@ async def run_evaluate_modal(
         test_cases_b64=test_cases_b64,
         run_benchmarks=args.benchmark,
         run_defensive=args.defensive,
+        defense_code_b64=defense_code_b64,
     )
 
     def _run_subprocess() -> tuple[str, str, int]:
@@ -1341,6 +1574,7 @@ def _build_workspace_eval_script(
     test_cases_json: str,
     run_benchmarks: bool,
     run_defensive: bool = False,
+    defense_code: str | None = None,
 ) -> str:
     """Build inline evaluation script for workspace exec.
 
@@ -1351,6 +1585,7 @@ def _build_workspace_eval_script(
     impl_b64 = base64.b64encode(impl_code.encode()).decode()
     ref_b64 = base64.b64encode(ref_code.encode()).decode()
     tests_b64 = base64.b64encode(test_cases_json.encode()).decode()
+    defense_b64 = base64.b64encode(defense_code.encode()).decode() if defense_code else ""
 
     return f'''
 import base64
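
The `!= "None"` guards that appear in the generated scripts address a real f-string pitfall: formatting None into a template produces the truthy four-character string "None", not an empty string. A two-line demonstration:

    defense_code_b64 = None
    assert f'"{defense_code_b64}"' == '"None"'   # truthy, hence the explicit != "None" check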
@@ -1370,6 +1605,15 @@ with open("/tmp/kernel.py", "w") as f:
 with open("/tmp/reference.py", "w") as f:
     f.write(ref_code)
 
+# Write defense module if available
+run_defensive = {run_defensive}
+defense_b64 = "{defense_b64}"
+# NOTE: Check defense_b64 is not empty and not the string "None" (from None formatting)
+if run_defensive and defense_b64 and defense_b64 != "None":
+    defense_code = base64.b64decode(defense_b64).decode()
+    with open("/tmp/defense.py", "w") as f:
+        f.write(defense_code)
+
 # Load kernels
 def load_fn(path, name):
     spec = importlib.util.spec_from_file_location("mod", path)
@@ -1383,6 +1627,24 @@ generate_input = load_fn("/tmp/reference.py", "generate_input")
 
 import torch
 
+# Load defense module if available
+defense = None
+if run_defensive and defense_b64 and defense_b64 != "None":
+    try:
+        defense = load_fn("/tmp/defense.py", "run_all_defenses")
+        time_with_defenses = load_fn("/tmp/defense.py", "time_execution_with_defenses")
+        print("[Defense] Defense module loaded")
+
+        # Wrap kernels for defense API compatibility
+        # Defense API calls kernel(*args), but functional format expects kernel(inputs_tuple)
+        def _wrap_for_defense(kernel):
+            return lambda *args: kernel(args)
+        custom_kernel_for_defense = _wrap_for_defense(custom_kernel)
+        ref_kernel_for_defense = _wrap_for_defense(ref_kernel)
+    except Exception as e:
+        print(f"[Defense] Warning: Could not load defense module: {{e}}")
+        defense = None
+
 results = []
 all_correct = True
 total_time_ms = 0.0
@@ -1410,36 +1672,60 @@ for tc in test_cases:
     impl_time_ms = 0.0
     ref_time_ms = 0.0
     if {run_benchmarks}:
-        # Warmup
-        for _ in range(3):
-            custom_kernel(inputs)
-        torch.cuda.synchronize()
-
-        # Measure with defensive timing if requested
-        # Defensive: sync before recording end event to catch stream injection
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-        start.record()
-        for _ in range(10):
-            custom_kernel(inputs)
-        if {run_defensive}:
-            torch.cuda.synchronize()  # DEFENSE: sync all streams before end
-        end.record()
-        torch.cuda.synchronize()
-        impl_time_ms = start.elapsed_time(end) / 10
-
-        # Reference timing (same defensive approach)
-        for _ in range(3):
-            ref_kernel(inputs)
-        torch.cuda.synchronize()
-        start.record()
-        for _ in range(10):
-            ref_kernel(inputs)
-        if {run_defensive}:
-            torch.cuda.synchronize()  # DEFENSE: sync all streams before end
-        end.record()
-        torch.cuda.synchronize()
-        ref_time_ms = start.elapsed_time(end) / 10
+        if run_defensive and defense is not None:
+            # Use full defense suite with wrapped kernels
+            inputs_list = list(inputs) if hasattr(inputs, '__iter__') and not isinstance(inputs, torch.Tensor) else [inputs]
+
+            # Run defense checks
+            all_passed, defense_results, _ = defense(custom_kernel_for_defense, *inputs_list)
+            if not all_passed:
+                failed = [name for name, passed, _ in defense_results if not passed]
+                raise ValueError(f"Defense checks failed: {{failed}}")
+
+            # Time with defensive timing (using wrapped kernels)
+            impl_times, _ = time_with_defenses(
+                custom_kernel_for_defense,
+                inputs_list,
+                num_warmup=3,
+                num_trials=10,
+                verbose=False,
+                run_defenses=False,
+            )
+            impl_time_ms = sum(impl_times) / len(impl_times)
+
+            ref_times, _ = time_with_defenses(
+                ref_kernel_for_defense,
+                inputs_list,
+                num_warmup=3,
+                num_trials=10,
+                verbose=False,
+                run_defenses=False,
+            )
+            ref_time_ms = sum(ref_times) / len(ref_times)
+        else:
+            # Standard timing
+            for _ in range(3):
+                custom_kernel(inputs)
+            torch.cuda.synchronize()
+
+            start = torch.cuda.Event(enable_timing=True)
+            end = torch.cuda.Event(enable_timing=True)
+            start.record()
+            for _ in range(10):
+                custom_kernel(inputs)
+            end.record()
+            torch.cuda.synchronize()
+            impl_time_ms = start.elapsed_time(end) / 10
+
+            for _ in range(3):
+                ref_kernel(inputs)
+            torch.cuda.synchronize()
+            start.record()
+            for _ in range(10):
+                ref_kernel(inputs)
+            end.record()
+            torch.cuda.synchronize()
+            ref_time_ms = start.elapsed_time(end) / 10
 
     total_time_ms += impl_time_ms
     ref_total_time_ms += ref_time_ms
@@ -1501,6 +1787,23 @@ async def run_evaluate_workspace(
     ref_code = args.reference.read_text()
     test_cases_json = args.test_cases.read_text()
 
+    # Read defense module if defensive mode is enabled
+    defense_code = None
+    if args.defensive:
+        defense_path = (
+            Path(__file__).parent.parent.parent.parent
+            / "packages"
+            / "wafer-core"
+            / "wafer_core"
+            / "utils"
+            / "kernel_utils"
+            / "defense.py"
+        )
+        if defense_path.exists():
+            defense_code = defense_path.read_text()
+        else:
+            print(f"Warning: defense.py not found at {defense_path}, falling back to basic defense")
+
     # Build inline eval script
     eval_script = _build_workspace_eval_script(
         impl_code=impl_code,
@@ -1508,6 +1811,7 @@ async def run_evaluate_workspace(
         test_cases_json=test_cases_json,
         run_benchmarks=args.benchmark,
         run_defensive=args.defensive,
+        defense_code=defense_code,
     )
 
     # Execute via workspace exec
@@ -1853,15 +2157,12 @@ async def run_evaluate_runpod(
             # Add venv bin to PATH so ninja (from pip) is found by torch.utils.cpp_extension
             venv_bin = env_state.venv_bin
             env_vars = f"PATH={venv_bin}:$PATH HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm"
-            pythonpath = f"PYTHONPATH={wafer_core_workspace}"
-            evaluate_script = (
-                f"{wafer_core_workspace}/wafer_core/utils/kernel_utils/evaluate.py"
-            )
 
             # Run from run_path so reference_kernel.py is importable
+            # Use installed wafer-core module
             eval_cmd = (
                 f"cd {run_path} && "
-                f"{env_vars} {pythonpath} {python_exe} {evaluate_script} "
+                f"{env_vars} {python_exe} -m wafer_core.utils.kernel_utils.evaluate "
                 f"--implementation {impl_path} "
                 f"--reference {ref_path} "
                 f"--test-cases {test_cases_path} "
@@ -2217,15 +2518,12 @@ async def run_evaluate_digitalocean(
             env_vars = (
                 f"PATH={venv_bin}:$PATH HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm"
             )
-            pythonpath = f"PYTHONPATH={wafer_core_workspace}"
-            evaluate_script = (
-                f"{wafer_core_workspace}/wafer_core/utils/kernel_utils/evaluate.py"
-            )
 
             # Run from run_path so reference_kernel.py is importable
+            # Use installed wafer-core module
             eval_cmd = (
                 f"cd {run_path} && "
-                f"{env_vars} {pythonpath} {python_exe} {evaluate_script} "
+                f"{env_vars} {python_exe} -m wafer_core.utils.kernel_utils.evaluate "
                 f"--implementation {impl_path} "
                 f"--reference {ref_path} "
                 f"--test-cases {test_cases_path} "
@@ -2405,7 +2703,9 @@ async def run_evaluate(args: EvaluateArgs) -> EvaluateResult:
     print(f"Using target: {target_name}")
 
     # Dispatch to appropriate executor
-    if isinstance(target, BaremetalTarget | VMTarget):
+    if isinstance(target, LocalTarget):
+        return await run_evaluate_local(args, target)
+    elif isinstance(target, BaremetalTarget | VMTarget):
         return await run_evaluate_ssh(args, target)
     elif isinstance(target, ModalTarget):
         return await run_evaluate_modal(args, target)
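
A small note on the dispatch style: `isinstance(target, BaremetalTarget | VMTarget)` passes a PEP 604 union as the second argument, which is accepted on Python 3.10+ and equivalent to the tuple form. Toy demonstration:

    class A: ...
    class B: ...

    assert isinstance(A(), A | B)        # Python 3.10+ spelling
    assert isinstance(A(), (A, B))       # equivalent pre-3.10 spelling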
@@ -2435,10 +2735,233 @@ async def run_evaluate(args: EvaluateArgs) -> EvaluateResult:
 # This runs inside the Docker container on the remote GPU
 KERNELBENCH_EVAL_SCRIPT = """
 import json
+import os
 import sys
 import time
 import torch
 import torch.nn as nn
+from pathlib import Path
+
+
+def run_profiling(model, inputs, name, output_dir):
+    '''Run torch.profiler and return summary stats.'''
+    from torch.profiler import profile, ProfilerActivity
+
+    # Determine activities based on backend
+    activities = [ProfilerActivity.CPU]
+    if torch.cuda.is_available():
+        activities.append(ProfilerActivity.CUDA)
+
+    # Warmup
+    for _ in range(3):
+        with torch.no_grad():
+            _ = model(*inputs)
+    torch.cuda.synchronize()
+
+    # Profile
+    with profile(
+        activities=activities,
+        record_shapes=True,
+        with_stack=False,
+        profile_memory=True,
+    ) as prof:
+        with torch.no_grad():
+            _ = model(*inputs)
+        torch.cuda.synchronize()
+
+    # Get key averages
+    key_averages = prof.key_averages()
+
+    # Find the main kernel (longest GPU time)
+    # Use cuda_time_total for compatibility with both CUDA and ROCm
+    def get_gpu_time(e):
+        # Try different attributes for GPU time
+        if hasattr(e, 'cuda_time_total'):
+            return e.cuda_time_total
+        if hasattr(e, 'device_time_total'):
+            return e.device_time_total
+        if hasattr(e, 'self_cuda_time_total'):
+            return e.self_cuda_time_total
+        return 0
+
+    gpu_events = [e for e in key_averages if get_gpu_time(e) > 0]
+    gpu_events.sort(key=lambda e: get_gpu_time(e), reverse=True)
+
+    stats = {
+        "name": name,
+        "total_gpu_time_ms": sum(get_gpu_time(e) for e in gpu_events) / 1000,
+        "total_cpu_time_ms": sum(e.cpu_time_total for e in key_averages) / 1000,
+        "num_gpu_kernels": len(gpu_events),
+        "top_kernels": [],
+    }
+
+    # Top 5 kernels by GPU time
+    for e in gpu_events[:5]:
+        stats["top_kernels"].append({
+            "name": e.key,
+            "gpu_time_ms": get_gpu_time(e) / 1000,
+            "cpu_time_ms": e.cpu_time_total / 1000,
+            "calls": e.count,
+        })
+
+    # Save trace for visualization
+    trace_path = Path(output_dir) / f"{name}_trace.json"
+    prof.export_chrome_trace(str(trace_path))
+    stats["trace_file"] = str(trace_path)
+
+    return stats
+
+
+def validate_custom_inputs(original_inputs, custom_inputs):
+    '''Validate that custom inputs match the expected signature.
+
+    Returns (is_valid, error_message).
+    '''
+    if len(original_inputs) != len(custom_inputs):
+        return False, f"get_inputs() must return {len(original_inputs)} tensors, got {len(custom_inputs)}"
+
+    for i, (orig, cust) in enumerate(zip(original_inputs, custom_inputs)):
+        if not isinstance(cust, torch.Tensor):
+            if not isinstance(orig, torch.Tensor):
+                continue  # Both non-tensor, ok
+            return False, f"Input {i}: expected Tensor, got {type(cust).__name__}"
+
+        if not isinstance(orig, torch.Tensor):
+            return False, f"Input {i}: expected {type(orig).__name__}, got Tensor"
+
+        if orig.dtype != cust.dtype:
+            return False, f"Input {i}: dtype mismatch - expected {orig.dtype}, got {cust.dtype}"
+
+        if orig.dim() != cust.dim():
+            return False, f"Input {i}: dimension mismatch - expected {orig.dim()}D, got {cust.dim()}D"
+
+    return True, None
+
+
+def analyze_diff(ref_output, new_output, rtol=1e-3, atol=1e-3, max_samples=5):
+    '''Analyze differences between reference and implementation outputs.
+
+    Returns a dict with detailed diff information.
+    '''
+    diff = (ref_output - new_output).abs()
+    threshold = atol + rtol * ref_output.abs()
+    wrong_mask = diff > threshold
+
+    total_elements = ref_output.numel()
+    wrong_count = wrong_mask.sum().item()
+
+    # Basic stats
+    max_diff = diff.max().item()
+    max_diff_idx = tuple(torch.unravel_index(diff.argmax(), diff.shape))
+    max_diff_idx = tuple(int(i) for i in max_diff_idx)  # Convert to Python ints
+
+    # Relative error (avoid div by zero)
+    ref_abs = ref_output.abs()
+    nonzero_mask = ref_abs > 1e-8
+    if nonzero_mask.any():
+        rel_error = diff[nonzero_mask] / ref_abs[nonzero_mask]
+        max_rel_error = rel_error.max().item()
+        mean_rel_error = rel_error.mean().item()
+    else:
+        max_rel_error = float('inf') if max_diff > 0 else 0.0
+        mean_rel_error = max_rel_error
+
+    # Error histogram (buckets: <1e-6, 1e-6 to 1e-4, 1e-4 to 1e-2, 1e-2 to 1, >1)
+    histogram = {
+        '<1e-6': int((diff < 1e-6).sum().item()),
+        '1e-6 to 1e-4': int(((diff >= 1e-6) & (diff < 1e-4)).sum().item()),
+        '1e-4 to 1e-2': int(((diff >= 1e-4) & (diff < 1e-2)).sum().item()),
+        '1e-2 to 1': int(((diff >= 1e-2) & (diff < 1)).sum().item()),
+        '>1': int((diff >= 1).sum().item()),
+    }
+
+    result = {
+        'max_diff': max_diff,
+        'max_diff_idx': max_diff_idx,
+        'mean_diff': diff.mean().item(),
+        'max_rel_error': max_rel_error,
+        'mean_rel_error': mean_rel_error,
+        'total_elements': total_elements,
+        'wrong_count': int(wrong_count),
+        'wrong_pct': 100.0 * wrong_count / total_elements,
+        'histogram': histogram,
+        'samples': [],
+    }
+
+    # Get indices of wrong elements
+    if wrong_count > 0:
+        wrong_indices = torch.nonzero(wrong_mask, as_tuple=False)
+
+        # Take first N samples
+        num_samples = min(max_samples, len(wrong_indices))
+        for i in range(num_samples):
+            idx = tuple(wrong_indices[i].tolist())
+            ref_val = ref_output[idx].item()
+            new_val = new_output[idx].item()
+            diff_val = diff[idx].item()
+            result['samples'].append({
+                'index': idx,
+                'ref': ref_val,
+                'impl': new_val,
+                'diff': diff_val,
+            })
+
+        # Try to detect pattern
+        if wrong_count >= total_elements * 0.99:
+            result['pattern'] = 'all_wrong'
+        elif wrong_count < total_elements * 0.01:
+            # Check if failures are at boundaries
+            shape = ref_output.shape
+            boundary_count = 0
+            for idx in wrong_indices[:min(100, len(wrong_indices))]:
+                idx_list = idx.tolist()
+                is_boundary = any(i == 0 or i == s - 1 for i, s in zip(idx_list, shape))
+                if is_boundary:
+                    boundary_count += 1
+            if boundary_count > len(wrong_indices[:100]) * 0.8:
+                result['pattern'] = 'boundary_issue'
+            else:
+                result['pattern'] = 'scattered'
+        else:
+            result['pattern'] = 'partial'
+
+    return result
+
+
+def print_diff_analysis(analysis):
+    '''Print a human-readable diff analysis.'''
+    print(f"[KernelBench] Diff analysis:")
+
+    # Max diff with location
+    idx_str = ','.join(str(i) for i in analysis['max_diff_idx'])
+    print(f"  Max diff: {analysis['max_diff']:.6f} at index [{idx_str}]")
+    print(f"  Mean diff: {analysis['mean_diff']:.6f}")
+
+    # Relative errors
+    print(f"  Max relative error: {analysis['max_rel_error']:.2%}, Mean: {analysis['mean_rel_error']:.2%}")
+
+    # Wrong count
+    print(f"  Wrong elements: {analysis['wrong_count']:,} / {analysis['total_elements']:,} ({analysis['wrong_pct']:.2f}%)")
+
+    # Histogram
+    hist = analysis['histogram']
+    print(f"  Error distribution: <1e-6: {hist['<1e-6']:,} | 1e-6~1e-4: {hist['1e-6 to 1e-4']:,} | 1e-4~1e-2: {hist['1e-4 to 1e-2']:,} | 1e-2~1: {hist['1e-2 to 1']:,} | >1: {hist['>1']:,}")
+
+    if 'pattern' in analysis:
+        pattern_desc = {
+            'all_wrong': 'ALL elements wrong - likely algorithmic error or wrong weights',
+            'boundary_issue': 'Mostly BOUNDARY elements wrong - check edge handling',
+            'scattered': 'SCATTERED failures - numerical precision issue?',
+            'partial': 'PARTIAL failures - check specific conditions',
+        }
+        print(f"  Pattern: {pattern_desc.get(analysis['pattern'], analysis['pattern'])}")
+
+    if analysis['samples']:
+        print(f"  Sample failures:")
+        for s in analysis['samples']:
+            idx_str = ','.join(str(i) for i in s['index'])
+            print(f"    [{idx_str}]: ref={s['ref']:.6f} impl={s['impl']:.6f} (diff={s['diff']:.6f})")
+
 
 def main():
     # Parse args
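
The helpers above summarize mismatches instead of printing a single max diff. For instance, the bucketed error histogram inside `analyze_diff` can be reproduced standalone like this (bucket edges as in the script):

    import torch

    def error_histogram(diff):
        return {
            "<1e-6": int((diff < 1e-6).sum()),
            "1e-6 to 1e-4": int(((diff >= 1e-6) & (diff < 1e-4)).sum()),
            "1e-4 to 1e-2": int(((diff >= 1e-4) & (diff < 1e-2)).sum()),
            "1e-2 to 1": int(((diff >= 1e-2) & (diff < 1)).sum()),
            ">1": int((diff >= 1).sum()),
        }

    diff = (torch.randn(8, 8) - torch.randn(8, 8)).abs()
    hist = error_histogram(diff)
    assert sum(hist.values()) == diff.numel()   # buckets partition all elements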
@@ -2446,12 +2969,35 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--impl", required=True)
     parser.add_argument("--reference", required=True)
+    parser.add_argument("--inputs", help="Custom inputs file to override get_inputs()/get_init_inputs()")
     parser.add_argument("--benchmark", action="store_true")
+    parser.add_argument("--profile", action="store_true")
+    parser.add_argument("--defensive", action="store_true", help="Run full defense checks against reward hacking")
+    parser.add_argument("--defense-module", help="Path to defense.py module")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
     parser.add_argument("--num-correct-trials", type=int, default=3)
     parser.add_argument("--num-perf-trials", type=int, default=10)
     parser.add_argument("--output", required=True)
     args = parser.parse_args()
 
+    # Load defense module if defensive mode is enabled
+    defense_module = None
+    if args.defensive and args.defense_module:
+        try:
+            import importlib.util
+            defense_spec = importlib.util.spec_from_file_location("defense", args.defense_module)
+            defense_module = importlib.util.module_from_spec(defense_spec)
+            defense_spec.loader.exec_module(defense_module)
+            print("[KernelBench] Defense module loaded")
+        except Exception as e:
+            print(f"[KernelBench] Warning: Could not load defense module: {e}")
+
+    # Create output directory for profiles
+    output_dir = Path(args.output).parent
+    profile_dir = output_dir / "profiles"
+    if args.profile:
+        profile_dir.mkdir(exist_ok=True)
+
     results = {
         "compiled": False,
         "correct": False,
@@ -2472,6 +3018,33 @@ def main():
         get_inputs = ref_module.get_inputs
         get_init_inputs = ref_module.get_init_inputs
 
+        # Load custom inputs if provided
+        if args.inputs:
+            inputs_spec = importlib.util.spec_from_file_location("custom_inputs", args.inputs)
+            inputs_module = importlib.util.module_from_spec(inputs_spec)
+            inputs_spec.loader.exec_module(inputs_module)
+
+            # Validate custom inputs match expected signature
+            original_inputs = get_inputs()
+            custom_get_inputs = inputs_module.get_inputs
+            custom_inputs = custom_get_inputs()
+
+            is_valid, error_msg = validate_custom_inputs(original_inputs, custom_inputs)
+            if not is_valid:
+                print(f"[KernelBench] Custom inputs validation failed: {error_msg}")
+                results["error"] = f"Custom inputs validation failed: {error_msg}"
+                raise ValueError(error_msg)
+
+            # Override get_inputs (and optionally get_init_inputs)
+            get_inputs = custom_get_inputs
+            if hasattr(inputs_module, 'get_init_inputs'):
+                get_init_inputs = inputs_module.get_init_inputs
+
+            # Show what changed
+            orig_shapes = [tuple(t.shape) if hasattr(t, 'shape') else type(t).__name__ for t in original_inputs]
+            cust_shapes = [tuple(t.shape) if hasattr(t, 'shape') else type(t).__name__ for t in custom_inputs]
+            print(f"[KernelBench] Using custom inputs: {orig_shapes} -> {cust_shapes}")
+
         # Load implementation module
         impl_spec = importlib.util.spec_from_file_location("implementation", args.impl)
         impl_module = importlib.util.module_from_spec(impl_spec)
@@ -2481,12 +3054,19 @@ def main():
         results["compiled"] = True
         print("[KernelBench] Modules loaded successfully")
 
-        # Instantiate models
+        # Instantiate models with synchronized seeds for reproducible weights
+        # (matches upstream KernelBench behavior in src/eval.py)
+        seed = args.seed
         init_inputs = get_init_inputs()
         with torch.no_grad():
+            torch.manual_seed(seed)
+            torch.cuda.manual_seed(seed)
             ref_model = Model(*init_inputs).cuda().eval()
+
+            torch.manual_seed(seed)
+            torch.cuda.manual_seed(seed)
             new_model = ModelNew(*init_inputs).cuda().eval()
-        print("[KernelBench] Models instantiated")
+        print(f"[KernelBench] Models instantiated (seed={seed})")
 
         # Run correctness trials
         all_correct = True
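
Resetting the seed before each instantiation matters because nn.Module parameter init draws from the global RNG; without the second `manual_seed`, the two models would get different random weights even with the same seed set once at the top. A toy check:

    import torch

    torch.manual_seed(42)
    a = torch.nn.Linear(4, 4)
    torch.manual_seed(42)          # rewind the RNG before the second init
    b = torch.nn.Linear(4, 4)
    assert torch.equal(a.weight, b.weight)   # identical weights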
@@ -2502,8 +3082,18 @@ def main():
             if isinstance(ref_output, torch.Tensor):
                 if not torch.allclose(ref_output, new_output, rtol=1e-3, atol=1e-3):
                     all_correct = False
-                    max_diff = (ref_output - new_output).abs().max().item()
-                    results["error"] = f"Correctness failed on trial {trial+1}: max diff = {max_diff}"
+                    analysis = analyze_diff(ref_output, new_output)
+                    results["error"] = f"Correctness failed on trial {trial+1}: max diff = {analysis['max_diff']}"
+                    results["diff_analysis"] = analysis
+                    print_diff_analysis(analysis)
+
+                    # Save tensors for debugging
+                    debug_dir = output_dir / "debug"
+                    debug_dir.mkdir(exist_ok=True)
+                    torch.save(ref_output.cpu(), debug_dir / "ref_output.pt")
+                    torch.save(new_output.cpu(), debug_dir / "impl_output.pt")
+                    torch.save(inputs[0].cpu() if inputs else None, debug_dir / "input.pt")
+                    print(f"[KernelBench] Debug tensors saved to: {debug_dir}/")
                     break
             else:
                 # Handle tuple/list outputs
@@ -2511,8 +3101,17 @@ def main():
                     if isinstance(r, torch.Tensor):
                         if not torch.allclose(r, n, rtol=1e-3, atol=1e-3):
                             all_correct = False
-                            max_diff = (r - n).abs().max().item()
-                            results["error"] = f"Correctness failed on trial {trial+1}, output {i}: max diff = {max_diff}"
+                            analysis = analyze_diff(r, n)
+                            results["error"] = f"Correctness failed on trial {trial+1}, output {i}: max diff = {analysis['max_diff']}"
+                            results["diff_analysis"] = analysis
+                            print_diff_analysis(analysis)
+
+                            # Save tensors for debugging
+                            debug_dir = output_dir / "debug"
+                            debug_dir.mkdir(exist_ok=True)
+                            torch.save(r.cpu(), debug_dir / f"ref_output_{i}.pt")
+                            torch.save(n.cpu(), debug_dir / f"impl_output_{i}.pt")
+                            print(f"[KernelBench] Debug tensors saved to: {debug_dir}/")
                             break
             if not all_correct:
                 break
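
For reference, the correctness gate here is `torch.allclose(ref, impl, rtol=1e-3, atol=1e-3)`, i.e. elementwise `|ref - impl| <= atol + rtol * |impl|` (the tolerance scales with the second argument). A small sketch of that semantics:

    import torch

    ref = torch.tensor([1.0, 100.0])
    impl = torch.tensor([1.0005, 100.05])
    ok = torch.allclose(ref, impl, rtol=1e-3, atol=1e-3)
    manual = bool(((ref - impl).abs() <= 1e-3 + 1e-3 * impl.abs()).all())
    assert ok == manual == True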
@@ -2526,47 +3125,132 @@ def main():
         inputs = get_inputs()
         inputs = [x.cuda() if isinstance(x, torch.Tensor) else x for x in inputs]
 
-        # Warmup
-        for _ in range(5):
-            with torch.no_grad():
-                _ = new_model(*inputs)
-        torch.cuda.synchronize()
-
-        # Benchmark new model
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-
-        times = []
-        for _ in range(args.num_perf_trials):
-            start.record()
-            with torch.no_grad():
-                _ = new_model(*inputs)
-            end.record()
-            torch.cuda.synchronize()
-            times.append(start.elapsed_time(end))
-
-        new_time = sum(times) / len(times)
-        results["runtime_ms"] = new_time
-
-        # Benchmark reference model
-        for _ in range(5):
-            with torch.no_grad():
-                _ = ref_model(*inputs)
-        torch.cuda.synchronize()
-
-        times = []
-        for _ in range(args.num_perf_trials):
-            start.record()
-            with torch.no_grad():
-                _ = ref_model(*inputs)
-            end.record()
-            torch.cuda.synchronize()
-            times.append(start.elapsed_time(end))
-
-        ref_time = sum(times) / len(times)
-        results["reference_runtime_ms"] = ref_time
-        results["speedup"] = ref_time / new_time if new_time > 0 else 0
-        print(f"[KernelBench] New: {new_time:.3f}ms, Ref: {ref_time:.3f}ms, Speedup: {results['speedup']:.2f}x")
+        if args.defensive and defense_module is not None:
+            # Use full defense suite
+            print("[KernelBench] Running defense checks on implementation...")
+            run_all_defenses = defense_module.run_all_defenses
+            time_with_defenses = defense_module.time_execution_with_defenses
+
+            # Run defense checks on implementation
+            all_passed, defense_results, _ = run_all_defenses(
+                lambda *x: new_model(*x),
+                *inputs,
+            )
+            results["defense_results"] = {
+                name: {"passed": passed, "message": msg}
+                for name, passed, msg in defense_results
+            }
+            if not all_passed:
+                failed = [name for name, passed, _ in defense_results if not passed]
+                results["error"] = f"Defense checks failed: {failed}"
+                print(f"[KernelBench] Defense checks FAILED: {failed}")
+                for name, passed, msg in defense_results:
+                    status = "PASS" if passed else "FAIL"
+                    print(f"  [{status}] {name}: {msg}")
+            else:
+                print("[KernelBench] All defense checks passed")
+
+            # Time with defensive timing
+            impl_times, _ = time_with_defenses(
+                lambda: new_model(*inputs),
+                [],
+                num_warmup=5,
+                num_trials=args.num_perf_trials,
+                verbose=False,
+                run_defenses=False,  # Already ran above
+            )
+            new_time = sum(impl_times) / len(impl_times)
+            results["runtime_ms"] = new_time
+
+            # Reference timing
+            ref_times, _ = time_with_defenses(
+                lambda: ref_model(*inputs),
+                [],
+                num_warmup=5,
+                num_trials=args.num_perf_trials,
+                verbose=False,
+                run_defenses=False,
+            )
+            ref_time = sum(ref_times) / len(ref_times)
+            results["reference_runtime_ms"] = ref_time
+            results["speedup"] = ref_time / new_time if new_time > 0 else 0
+            print(f"[KernelBench] New: {new_time:.3f}ms, Ref: {ref_time:.3f}ms, Speedup: {results['speedup']:.2f}x")
+        else:
+            # Standard timing without full defenses
+            # Warmup
+            for _ in range(5):
+                with torch.no_grad():
+                    _ = new_model(*inputs)
+            torch.cuda.synchronize()
+
+            # Benchmark new model
+            start = torch.cuda.Event(enable_timing=True)
+            end = torch.cuda.Event(enable_timing=True)
+
+            times = []
+            for _ in range(args.num_perf_trials):
+                start.record()
+                with torch.no_grad():
+                    _ = new_model(*inputs)
+                end.record()
+                torch.cuda.synchronize()
+                times.append(start.elapsed_time(end))
+
+            new_time = sum(times) / len(times)
+            results["runtime_ms"] = new_time
+
+            # Benchmark reference model
+            for _ in range(5):
+                with torch.no_grad():
+                    _ = ref_model(*inputs)
+            torch.cuda.synchronize()
+
+            times = []
+            for _ in range(args.num_perf_trials):
+                start.record()
+                with torch.no_grad():
+                    _ = ref_model(*inputs)
+                end.record()
+                torch.cuda.synchronize()
+                times.append(start.elapsed_time(end))
+
+            ref_time = sum(times) / len(times)
+            results["reference_runtime_ms"] = ref_time
+            results["speedup"] = ref_time / new_time if new_time > 0 else 0
+            print(f"[KernelBench] New: {new_time:.3f}ms, Ref: {ref_time:.3f}ms, Speedup: {results['speedup']:.2f}x")
+
+        # Run profiling if requested and correctness passed
+        if args.profile and all_correct:
+            print("[KernelBench] Running profiler...")
+            inputs = get_inputs()
+            inputs = [x.cuda() if isinstance(x, torch.Tensor) else x for x in inputs]
+
+            try:
+                # Profile implementation
+                impl_stats = run_profiling(new_model, inputs, "implementation", str(profile_dir))
+                results["profile_impl"] = impl_stats
+                print(f"[KernelBench] Implementation profile:")
+                print(f"  Total GPU time: {impl_stats['total_gpu_time_ms']:.3f}ms")
+                print(f"  Kernels launched: {impl_stats['num_gpu_kernels']}")
+                if impl_stats['top_kernels']:
+                    print(f"  Top kernel: {impl_stats['top_kernels'][0]['name'][:60]}...")
+                    print(f"              {impl_stats['top_kernels'][0]['gpu_time_ms']:.3f}ms")
+
+                # Profile reference
+                ref_stats = run_profiling(ref_model, inputs, "reference", str(profile_dir))
+                results["profile_ref"] = ref_stats
+                print(f"[KernelBench] Reference profile:")
+                print(f"  Total GPU time: {ref_stats['total_gpu_time_ms']:.3f}ms")
+                print(f"  Kernels launched: {ref_stats['num_gpu_kernels']}")
+                if ref_stats['top_kernels']:
+                    print(f"  Top kernel: {ref_stats['top_kernels'][0]['name'][:60]}...")
+                    print(f"              {ref_stats['top_kernels'][0]['gpu_time_ms']:.3f}ms")
+
+                print(f"[KernelBench] Profile traces saved to: {profile_dir}/")
+
+            except Exception as prof_err:
+                print(f"[KernelBench] Profiling failed: {prof_err}")
+                results["profile_error"] = str(prof_err)
 
     except Exception as e:
         import traceback
@@ -2705,6 +3389,24 @@ async def run_evaluate_kernelbench_docker(
                 error_message=f"Failed to write reference: {write_result.stderr}",
             )
 
+        # Write custom inputs if provided
+        if args.inputs:
+            inputs_code = args.inputs.read_text()
+            inputs_file_path = f"{run_path}/custom_inputs.py"
+            write_result = await client.exec(
+                f"cat > '{inputs_file_path}' << 'INPUTS_EOF'\n{inputs_code}\nINPUTS_EOF"
+            )
+            if write_result.exit_code != 0:
+                return EvaluateResult(
+                    success=False,
+                    all_correct=False,
+                    correctness_score=0.0,
+                    geomean_speedup=0.0,
+                    passed_tests=0,
+                    total_tests=0,
+                    error_message=f"Failed to write custom inputs: {write_result.stderr}",
+                )
+
         # Write eval script
         eval_script_path = f"{run_path}/kernelbench_eval.py"
         write_result = await client.exec(
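
These remote writes use a quoted heredoc instead of a separate upload step: quoting the delimiter ('INPUTS_EOF') disables shell expansion inside the body, so Python source passes through verbatim, as long as the body itself never contains the delimiter line. A minimal sketch of the command being built (the `client.exec` call mirrors the AsyncSSHClient usage above):

    code = "print('hello')\n"
    remote_cmd = f"cat > '/tmp/custom_inputs.py' << 'INPUTS_EOF'\n{code}\nINPUTS_EOF"
    # await client.exec(remote_cmd)   # executed over SSH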
@@ -2721,14 +3423,40 @@ async def run_evaluate_kernelbench_docker(
                 error_message=f"Failed to write eval script: {write_result.stderr}",
             )
 
+        # Write defense module if defensive mode is enabled
+        defense_module_path = None
+        if args.defensive:
+            defense_path = (
+                Path(__file__).parent.parent.parent.parent
+                / "packages"
+                / "wafer-core"
+                / "wafer_core"
+                / "utils"
+                / "kernel_utils"
+                / "defense.py"
+            )
+            if defense_path.exists():
+                defense_code = defense_path.read_text()
+                defense_module_path = f"{run_path}/defense.py"
+                write_result = await client.exec(
+                    f"cat > '{defense_module_path}' << 'DEFENSE_EOF'\n{defense_code}\nDEFENSE_EOF"
+                )
+                if write_result.exit_code != 0:
+                    print(f"Warning: Failed to write defense module: {write_result.stderr}")
+                    defense_module_path = None
+            else:
+                print(f"Warning: defense.py not found at {defense_path}")
+
         print("Running KernelBench evaluation in Docker container...")
 
         # Paths inside container
         container_run_path = f"{CONTAINER_WORKSPACE}/{run_dir}"
         container_impl_path = f"{container_run_path}/implementation.py"
         container_ref_path = f"{container_run_path}/reference.py"
+        container_inputs_path = f"{container_run_path}/custom_inputs.py" if args.inputs else None
         container_eval_script = f"{container_run_path}/kernelbench_eval.py"
         container_output = f"{container_run_path}/results.json"
+        container_defense_path = f"{container_run_path}/defense.py" if defense_module_path else None
 
         # Build eval command
         python_cmd_parts = [
@@ -2740,6 +3468,14 @@ async def run_evaluate_kernelbench_docker(
 
         if args.benchmark:
             python_cmd_parts.append("--benchmark")
+        if args.profile:
+            python_cmd_parts.append("--profile")
+        if container_inputs_path:
+            python_cmd_parts.append(f"--inputs {container_inputs_path}")
+        if args.defensive and container_defense_path:
+            python_cmd_parts.append("--defensive")
+            python_cmd_parts.append(f"--defense-module {container_defense_path}")
+        python_cmd_parts.append(f"--seed {args.seed}")
 
         eval_cmd = " ".join(python_cmd_parts)
 
@@ -2920,6 +3656,24 @@ async def run_evaluate_kernelbench_digitalocean(
                 error_message=f"Failed to write reference: {write_result.stderr}",
             )
 
+        # Write custom inputs if provided
+        if args.inputs:
+            inputs_code = args.inputs.read_text()
+            inputs_file_path = f"{run_path}/custom_inputs.py"
+            write_result = await client.exec(
+                f"cat > '{inputs_file_path}' << 'INPUTS_EOF'\n{inputs_code}\nINPUTS_EOF"
+            )
+            if write_result.exit_code != 0:
+                return EvaluateResult(
+                    success=False,
+                    all_correct=False,
+                    correctness_score=0.0,
+                    geomean_speedup=0.0,
+                    passed_tests=0,
+                    total_tests=0,
+                    error_message=f"Failed to write custom inputs: {write_result.stderr}",
+                )
+
         # Write eval script
         eval_script_path = f"{run_path}/kernelbench_eval.py"
         write_result = await client.exec(
@@ -2936,14 +3690,44 @@ async def run_evaluate_kernelbench_digitalocean(
                 error_message=f"Failed to write eval script: {write_result.stderr}",
             )
 
+        # Write defense module if defensive mode is enabled
+        defense_module_path = None
+        if args.defensive:
+            defense_path = (
+                Path(__file__).parent.parent.parent.parent
+                / "packages"
+                / "wafer-core"
+                / "wafer_core"
+                / "utils"
+                / "kernel_utils"
+                / "defense.py"
+            )
+            if defense_path.exists():
+                defense_code = defense_path.read_text()
+                defense_module_path = f"{run_path}/defense.py"
+                write_result = await client.exec(
+                    f"cat > '{defense_module_path}' << 'DEFENSE_EOF'\n{defense_code}\nDEFENSE_EOF"
+                )
+                if write_result.exit_code != 0:
+                    print(f"Warning: Failed to write defense module: {write_result.stderr}")
+                    defense_module_path = None
+            else:
+                print(f"Warning: defense.py not found at {defense_path}")
+
         print("Running KernelBench evaluation in Docker container (AMD/ROCm)...")
 
         # Paths inside container
         container_run_path = f"{CONTAINER_WORKSPACE}/{run_dir}"
         container_impl_path = f"{container_run_path}/implementation.py"
         container_ref_path = f"{container_run_path}/reference.py"
+        container_inputs_path = (
+            f"{container_run_path}/custom_inputs.py" if args.inputs else None
+        )
         container_eval_script = f"{container_run_path}/kernelbench_eval.py"
         container_output = f"{container_run_path}/results.json"
+        container_defense_path = (
+            f"{container_run_path}/defense.py" if defense_module_path else None
+        )
 
         # Build eval command
         python_cmd_parts = [
@@ -2955,6 +3739,14 @@ async def run_evaluate_kernelbench_digitalocean(
 
         if args.benchmark:
             python_cmd_parts.append("--benchmark")
+        if args.profile:
+            python_cmd_parts.append("--profile")
+        if container_inputs_path:
+            python_cmd_parts.append(f"--inputs {container_inputs_path}")
+        if args.defensive and container_defense_path:
+            python_cmd_parts.append("--defensive")
+            python_cmd_parts.append(f"--defense-module {container_defense_path}")
+        python_cmd_parts.append(f"--seed {args.seed}")
 
         eval_cmd = " ".join(python_cmd_parts)
 
@@ -3039,11 +3831,478 @@ async def run_evaluate_kernelbench_digitalocean(
3039
3831
  )
3040
3832
 
3041
3833
 
3042
- async def run_evaluate_kernelbench(args: KernelBenchEvaluateArgs) -> EvaluateResult:
3043
- """Run KernelBench format evaluation on configured target.
3044
-
3045
- Args:
3046
- args: KernelBench evaluate arguments
3834
+ async def run_evaluate_kernelbench_runpod(
+     args: KernelBenchEvaluateArgs,
+     target: RunPodTarget,
+ ) -> EvaluateResult:
+     """Run KernelBench format evaluation directly on RunPod AMD GPU.
+
+     Runs evaluation script directly on host (no Docker) since RunPod pods
+     already have PyTorch/ROCm installed.
+     """
+     from datetime import datetime
+
+     from wafer_core.async_ssh import AsyncSSHClient
+     from wafer_core.targets.runpod import RunPodError, runpod_ssh_context
+
+     REMOTE_WORKSPACE_BASE = "/tmp/wafer_eval"
+
+     # Select GPU
+     gpu_id = args.gpu_id if args.gpu_id is not None else target.gpu_ids[0]
+
+     print(f"Provisioning RunPod ({target.gpu_type_id})...")
+
+     try:
+         async with runpod_ssh_context(target) as ssh_info:
+             ssh_target = f"{ssh_info.user}@{ssh_info.host}:{ssh_info.port}"
+             print(f"Connected to RunPod: {ssh_target}")
+
+             async with AsyncSSHClient(ssh_target, target.ssh_key) as client:
+                 # Create workspace
+                 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+                 run_dir = f"kernelbench_eval_{timestamp}"
+                 run_path = f"{REMOTE_WORKSPACE_BASE}/{run_dir}"
+
+                 await client.exec(f"mkdir -p {run_path}")
+                 print(f"Created run directory: {run_path}")
+
+                 # Read and upload files
+                 impl_code = args.implementation.read_text()
+                 ref_code = args.reference.read_text()
+
+                 # Write implementation
+                 impl_path = f"{run_path}/implementation.py"
+                 write_result = await client.exec(
+                     f"cat > '{impl_path}' << 'IMPL_EOF'\n{impl_code}\nIMPL_EOF"
+                 )
+                 if write_result.exit_code != 0:
+                     return EvaluateResult(
+                         success=False,
+                         all_correct=False,
+                         correctness_score=0.0,
+                         geomean_speedup=0.0,
+                         passed_tests=0,
+                         total_tests=0,
+                         error_message=f"Failed to write implementation: {write_result.stderr}",
+                     )
+
+                 # Write reference
+                 ref_path = f"{run_path}/reference.py"
+                 write_result = await client.exec(
+                     f"cat > '{ref_path}' << 'REF_EOF'\n{ref_code}\nREF_EOF"
+                 )
+                 if write_result.exit_code != 0:
+                     return EvaluateResult(
+                         success=False,
+                         all_correct=False,
+                         correctness_score=0.0,
+                         geomean_speedup=0.0,
+                         passed_tests=0,
+                         total_tests=0,
+                         error_message=f"Failed to write reference: {write_result.stderr}",
+                     )
+
+                 # Write custom inputs if provided
+                 inputs_path = None
+                 if args.inputs:
+                     inputs_code = args.inputs.read_text()
+                     inputs_path = f"{run_path}/custom_inputs.py"
+                     write_result = await client.exec(
+                         f"cat > '{inputs_path}' << 'INPUTS_EOF'\n{inputs_code}\nINPUTS_EOF"
+                     )
+                     if write_result.exit_code != 0:
+                         return EvaluateResult(
+                             success=False,
+                             all_correct=False,
+                             correctness_score=0.0,
+                             geomean_speedup=0.0,
+                             passed_tests=0,
+                             total_tests=0,
+                             error_message=f"Failed to write custom inputs: {write_result.stderr}",
+                         )
+
+                 # Write eval script
+                 eval_script_path = f"{run_path}/kernelbench_eval.py"
+                 write_result = await client.exec(
+                     f"cat > '{eval_script_path}' << 'EVAL_EOF'\n{KERNELBENCH_EVAL_SCRIPT}\nEVAL_EOF"
+                 )
+                 if write_result.exit_code != 0:
+                     return EvaluateResult(
+                         success=False,
+                         all_correct=False,
+                         correctness_score=0.0,
+                         geomean_speedup=0.0,
+                         passed_tests=0,
+                         total_tests=0,
+                         error_message=f"Failed to write eval script: {write_result.stderr}",
+                     )
+
+                 # Write defense module if defensive mode is enabled
+                 defense_module_path = None
+                 if args.defensive:
+                     defense_path = (
+                         Path(__file__).parent.parent.parent.parent
+                         / "packages"
+                         / "wafer-core"
+                         / "wafer_core"
+                         / "utils"
+                         / "kernel_utils"
+                         / "defense.py"
+                     )
+                     if defense_path.exists():
+                         defense_code = defense_path.read_text()
+                         defense_module_path = f"{run_path}/defense.py"
+                         write_result = await client.exec(
+                             f"cat > '{defense_module_path}' << 'DEFENSE_EOF'\n{defense_code}\nDEFENSE_EOF"
+                         )
+                         if write_result.exit_code != 0:
+                             print(f"Warning: Failed to write defense module: {write_result.stderr}")
+                             defense_module_path = None
+                     else:
+                         print(f"Warning: defense.py not found at {defense_path}")
+
+                 print("Running KernelBench evaluation (AMD/ROCm)...")
+
+                 # Find Python with PyTorch - check common locations on RunPod
+                 python_exe = "python3"
+                 for candidate in [
+                     "/opt/conda/envs/py_3.10/bin/python3",
+                     "/opt/conda/bin/python3",
+                 ]:
+                     check = await client.exec(
+                         f"{candidate} -c 'import torch' 2>/dev/null && echo OK"
+                     )
+                     if "OK" in check.stdout:
+                         python_exe = candidate
+                         print(f"Using Python: {python_exe}")
+                         break
+
+                 # Build eval command - run directly on host
+                 output_path = f"{run_path}/results.json"
+                 python_cmd_parts = [
+                     f"{python_exe} {eval_script_path}",
+                     f"--impl {impl_path}",
+                     f"--reference {ref_path}",
+                     f"--output {output_path}",
+                 ]
+
+                 if args.benchmark:
+                     python_cmd_parts.append("--benchmark")
+                 if args.profile:
+                     python_cmd_parts.append("--profile")
+                 if inputs_path:
+                     python_cmd_parts.append(f"--inputs {inputs_path}")
+                 if args.defensive and defense_module_path:
+                     python_cmd_parts.append("--defensive")
+                     python_cmd_parts.append(f"--defense-module {defense_module_path}")
+                 python_cmd_parts.append(f"--seed {args.seed}")
+
+                 eval_cmd = " ".join(python_cmd_parts)
+
+                 # Set environment for AMD GPU and run
+                 env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1"
+                 full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"
+
+                 # Run and stream output
+                 log_lines = []
+                 async for line in client.exec_stream(full_cmd):
+                     print(line)
+                     log_lines.append(line)
+
+                 # Read results
+                 cat_result = await client.exec(f"cat {output_path}")
+
+                 if cat_result.exit_code != 0:
+                     log_tail = "\n".join(log_lines[-50:])
+                     return EvaluateResult(
+                         success=False,
+                         all_correct=False,
+                         correctness_score=0.0,
+                         geomean_speedup=0.0,
+                         passed_tests=0,
+                         total_tests=0,
+                         error_message=f"Evaluation failed. Log tail:\n{log_tail}",
+                     )
+
+                 # Parse results
+                 try:
+                     results_data = json.loads(cat_result.stdout)
+                 except json.JSONDecodeError as e:
+                     return EvaluateResult(
+                         success=False,
+                         all_correct=False,
+                         correctness_score=0.0,
+                         geomean_speedup=0.0,
+                         passed_tests=0,
+                         total_tests=0,
+                         error_message=f"Failed to parse results: {e}",
+                     )
+
+                 # Convert to EvaluateResult
+                 correct = results_data.get("correct", False)
+                 speedup = results_data.get("speedup", 0.0) or 0.0
+                 error = results_data.get("error")
+
+                 if error:
+                     return EvaluateResult(
+                         success=False,
+                         all_correct=False,
+                         correctness_score=0.0,
+                         geomean_speedup=0.0,
+                         passed_tests=0,
+                         total_tests=1,
+                         error_message=error,
+                     )
+
+                 return EvaluateResult(
+                     success=True,
+                     all_correct=correct,
+                     correctness_score=1.0 if correct else 0.0,
+                     geomean_speedup=speedup,
+                     passed_tests=1 if correct else 0,
+                     total_tests=1,
+                 )
+
+     except RunPodError as e:
+         return EvaluateResult(
+             success=False,
+             all_correct=False,
+             correctness_score=0.0,
+             geomean_speedup=0.0,
+             passed_tests=0,
+             total_tests=0,
+             error_message=f"RunPod error: {e}",
+         )
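A note on the transfer mechanism above: every file is written over SSH with a quoted heredoc rather than SCP, one exec per file. A minimal sketch of the pattern (helper name hypothetical):

    def heredoc_write_cmd(remote_path: str, content: str, sentinel: str) -> str:
        # Quoting the sentinel ('IMPL_EOF', 'REF_EOF', ...) stops the shell from
        # expanding $vars and backticks inside the body; the write would still
        # break if `content` itself contained the sentinel on a line of its own.
        return f"cat > '{remote_path}' << '{sentinel}'\n{content}\n{sentinel}"

    cmd = heredoc_write_cmd("/tmp/wafer_eval/run/implementation.py", "print('kernel')", "IMPL_EOF")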
+
+
+ async def run_evaluate_kernelbench_baremetal_amd(
+     args: KernelBenchEvaluateArgs,
+     target: BaremetalTarget,
+ ) -> EvaluateResult:
+     """Run KernelBench format evaluation directly on AMD baremetal target.
+
+     Runs evaluation script directly on host (no Docker) for AMD GPUs
+     that have PyTorch/ROCm installed.
+     """
+     from datetime import datetime
+
+     from wafer_core.async_ssh import AsyncSSHClient
+
+     REMOTE_WORKSPACE_BASE = "/tmp/wafer_eval"
+
+     # Select GPU
+     gpu_id = args.gpu_id if args.gpu_id is not None else target.gpu_ids[0]
+
+     print(f"Connecting to {target.ssh_target}...")
+
+     async with AsyncSSHClient(target.ssh_target, target.ssh_key) as client:
+         # Create workspace
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         run_dir = f"kernelbench_eval_{timestamp}"
+         run_path = f"{REMOTE_WORKSPACE_BASE}/{run_dir}"
+
+         await client.exec(f"mkdir -p {run_path}")
+         print(f"Created run directory: {run_path}")
+
+         # Read and upload files
+         impl_code = args.implementation.read_text()
+         ref_code = args.reference.read_text()
+
+         # Write implementation
+         impl_path = f"{run_path}/implementation.py"
+         write_result = await client.exec(
+             f"cat > '{impl_path}' << 'IMPL_EOF'\n{impl_code}\nIMPL_EOF"
+         )
+         if write_result.exit_code != 0:
+             return EvaluateResult(
+                 success=False,
+                 all_correct=False,
+                 correctness_score=0.0,
+                 geomean_speedup=0.0,
+                 passed_tests=0,
+                 total_tests=0,
+                 error_message=f"Failed to write implementation: {write_result.stderr}",
+             )
+
+         # Write reference
+         ref_path = f"{run_path}/reference.py"
+         write_result = await client.exec(f"cat > '{ref_path}' << 'REF_EOF'\n{ref_code}\nREF_EOF")
+         if write_result.exit_code != 0:
+             return EvaluateResult(
+                 success=False,
+                 all_correct=False,
+                 correctness_score=0.0,
+                 geomean_speedup=0.0,
+                 passed_tests=0,
+                 total_tests=0,
+                 error_message=f"Failed to write reference: {write_result.stderr}",
+             )
+
+         # Write custom inputs if provided
+         inputs_path = None
+         if args.inputs:
+             inputs_code = args.inputs.read_text()
+             inputs_path = f"{run_path}/custom_inputs.py"
+             write_result = await client.exec(
+                 f"cat > '{inputs_path}' << 'INPUTS_EOF'\n{inputs_code}\nINPUTS_EOF"
+             )
+             if write_result.exit_code != 0:
+                 return EvaluateResult(
+                     success=False,
+                     all_correct=False,
+                     correctness_score=0.0,
+                     geomean_speedup=0.0,
+                     passed_tests=0,
+                     total_tests=0,
+                     error_message=f"Failed to write custom inputs: {write_result.stderr}",
+                 )
+
+         # Write eval script
+         eval_script_path = f"{run_path}/kernelbench_eval.py"
+         write_result = await client.exec(
+             f"cat > '{eval_script_path}' << 'EVAL_EOF'\n{KERNELBENCH_EVAL_SCRIPT}\nEVAL_EOF"
+         )
+         if write_result.exit_code != 0:
+             return EvaluateResult(
+                 success=False,
+                 all_correct=False,
+                 correctness_score=0.0,
+                 geomean_speedup=0.0,
+                 passed_tests=0,
+                 total_tests=0,
+                 error_message=f"Failed to write eval script: {write_result.stderr}",
+             )
+
+         # Write defense module if defensive mode is enabled
+         defense_module_path = None
+         if args.defensive:
+             defense_path = (
+                 Path(__file__).parent.parent.parent.parent
+                 / "packages"
+                 / "wafer-core"
+                 / "wafer_core"
+                 / "utils"
+                 / "kernel_utils"
+                 / "defense.py"
+             )
+             if defense_path.exists():
+                 defense_code = defense_path.read_text()
+                 defense_module_path = f"{run_path}/defense.py"
+                 write_result = await client.exec(
+                     f"cat > '{defense_module_path}' << 'DEFENSE_EOF'\n{defense_code}\nDEFENSE_EOF"
+                 )
+                 if write_result.exit_code != 0:
+                     print(f"Warning: Failed to write defense module: {write_result.stderr}")
+                     defense_module_path = None
+             else:
+                 print(f"Warning: defense.py not found at {defense_path}")
+
+         print("Running KernelBench evaluation (AMD/ROCm)...")
+
+         # Find Python with PyTorch - check common locations
+         python_exe = "python3"
+         for candidate in [
+             "/opt/conda/envs/py_3.10/bin/python3",
+             "/opt/conda/bin/python3",
+         ]:
+             check = await client.exec(f"{candidate} -c 'import torch' 2>/dev/null && echo OK")
+             if "OK" in check.stdout:
+                 python_exe = candidate
+                 print(f"Using Python: {python_exe}")
+                 break
+
+         # Build eval command - run directly on host
+         output_path = f"{run_path}/results.json"
+         python_cmd_parts = [
+             f"{python_exe} {eval_script_path}",
+             f"--impl {impl_path}",
+             f"--reference {ref_path}",
+             f"--output {output_path}",
+         ]
+
+         if args.benchmark:
+             python_cmd_parts.append("--benchmark")
+         if args.profile:
+             python_cmd_parts.append("--profile")
+         if inputs_path:
+             python_cmd_parts.append(f"--inputs {inputs_path}")
+         if args.defensive and defense_module_path:
+             python_cmd_parts.append("--defensive")
+             python_cmd_parts.append(f"--defense-module {defense_module_path}")
+         python_cmd_parts.append(f"--seed {args.seed}")
+
+         eval_cmd = " ".join(python_cmd_parts)
+
+         # Set environment for AMD GPU and run
+         env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1"
+         full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"
+
+         # Run and stream output
+         log_lines = []
+         async for line in client.exec_stream(full_cmd):
+             print(line)
+             log_lines.append(line)
+
+         # Read results
+         cat_result = await client.exec(f"cat {output_path}")
+
+         if cat_result.exit_code != 0:
+             log_tail = "\n".join(log_lines[-50:])
+             return EvaluateResult(
+                 success=False,
+                 all_correct=False,
+                 correctness_score=0.0,
+                 geomean_speedup=0.0,
+                 passed_tests=0,
+                 total_tests=0,
+                 error_message=f"Evaluation failed. Log tail:\n{log_tail}",
+             )
+
+         # Parse results
+         try:
+             results_data = json.loads(cat_result.stdout)
+         except json.JSONDecodeError as e:
+             return EvaluateResult(
+                 success=False,
+                 all_correct=False,
+                 correctness_score=0.0,
+                 geomean_speedup=0.0,
+                 passed_tests=0,
+                 total_tests=0,
+                 error_message=f"Failed to parse results: {e}",
+             )
+
+         # Convert to EvaluateResult
+         correct = results_data.get("correct", False)
+         speedup = results_data.get("speedup", 0.0) or 0.0
+         error = results_data.get("error")
+
+         if error:
+             return EvaluateResult(
+                 success=False,
+                 all_correct=False,
+                 correctness_score=0.0,
+                 geomean_speedup=0.0,
+                 passed_tests=0,
+                 total_tests=1,
+                 error_message=error,
+             )
+
+         return EvaluateResult(
+             success=True,
+             all_correct=correct,
+             correctness_score=1.0 if correct else 0.0,
+             geomean_speedup=speedup,
+             passed_tests=1 if correct else 0,
+             total_tests=1,
+         )
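Both direct-on-host paths share the interpreter probe and ROCm environment setup seen above. A local, self-contained sketch of the same logic (illustrative only; the real code runs these checks over SSH):

    import subprocess

    # Probe well-known conda locations for an interpreter that can import torch,
    # falling back to whatever python3 is on PATH.
    candidates = ["/opt/conda/envs/py_3.10/bin/python3", "/opt/conda/bin/python3"]
    python_exe = "python3"
    for candidate in candidates:
        try:
            probe = subprocess.run([candidate, "-c", "import torch"], capture_output=True)
        except FileNotFoundError:
            continue  # interpreter not installed at this path
        if probe.returncode == 0:
            python_exe = candidate
            break

    # HIP_VISIBLE_DEVICES is ROCm's analogue of CUDA_VISIBLE_DEVICES: it pins the
    # process to the given GPU index before torch initializes.
    gpu_id = 0  # hypothetical
    env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1"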
+
+
+ async def run_evaluate_kernelbench(args: KernelBenchEvaluateArgs) -> EvaluateResult:
+     """Run KernelBench format evaluation on configured target.
+
+     Args:
+         args: KernelBench evaluate arguments

      Returns:
          Evaluation result
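One more observation before the dispatch hunk below: the .get() calls in both new functions imply a flat results.json contract with "correct", "speedup", and "error" keys. A sketch with a hypothetical payload:

    import json

    raw = '{"correct": true, "speedup": 1.37, "error": null}'  # hypothetical payload
    results_data = json.loads(raw)
    correct = results_data.get("correct", False)
    speedup = results_data.get("speedup", 0.0) or 0.0  # `or 0.0` maps null/None to 0.0
    error = results_data.get("error")
    assert correct and speedup == 1.37 and error is None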
@@ -3103,7 +4362,13 @@ async def run_evaluate_kernelbench(args: KernelBenchEvaluateArgs) -> EvaluateRes
      if isinstance(target, DigitalOceanTarget):
          # DigitalOcean AMD MI300X - uses ROCm Docker with device passthrough
          return await run_evaluate_kernelbench_digitalocean(args, target)
+     elif isinstance(target, RunPodTarget):
+         # RunPod AMD GPU - runs the eval script directly on the host (no Docker)
+         return await run_evaluate_kernelbench_runpod(args, target)
      elif isinstance(target, BaremetalTarget | VMTarget):
+         # Check if this is an AMD target (gfx* compute capability) - run directly
+         if target.compute_capability and target.compute_capability.startswith("gfx"):
+             return await run_evaluate_kernelbench_baremetal_amd(args, target)
          # NVIDIA targets - require docker_image to be set
          if not target.docker_image:
              return EvaluateResult(
@@ -3129,6 +4394,6 @@ async def run_evaluate_kernelbench(args: KernelBenchEvaluateArgs) -> EvaluateRes
          total_tests=0,
          error_message=(
              f"Target type '{type(target).__name__}' not yet supported for KernelBench format. "
-             "Use a DigitalOcean, Baremetal, or VM target."
+             "Use a DigitalOcean, RunPod, Baremetal, or VM target."
          ),
      )
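The AMD detection in the dispatch above keys off the compute-capability string. In isolation (example values are standard LLVM gfx target names, not taken from this diff):

    # AMD GPUs are recorded with LLVM gfx targets (e.g. "gfx90a", "gfx942"),
    # while NVIDIA compute capabilities are numeric strings like "9.0".
    def is_amd_target(compute_capability: str | None) -> bool:
        return bool(compute_capability) and compute_capability.startswith("gfx")

    assert is_amd_target("gfx942")    # MI300-class
    assert not is_amd_target("9.0")   # Hopper-class
    assert not is_amd_target(None)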