wafer-cli 0.2.3-py3-none-any.whl → 0.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
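The headline change in 0.2.4 is how wafer-core reaches the remote evaluation host: the CLI no longer uploads the local wafer-core checkout and points PYTHONPATH at it, but installs wafer-core from the package index and invokes the evaluator as a module. A minimal sketch of the new flow, condensed from the diff below (the helper name and the bare `client`, `venv_bin`, and `run_path` parameters are illustrative assumptions, not code from the package):

    # Sketch only; condensed from the diff below, not verbatim package code.
    async def install_and_evaluate(client, venv_bin: str, run_path: str) -> None:
        # Install wafer-core into the remote venv (replaces the old rsync/SFTP upload).
        install = await client.exec(f"{venv_bin}/uv pip install wafer-core")
        if install.exit_code != 0:
            raise RuntimeError(f"Failed to install wafer-core: {install.stderr}")

        # Invoke the evaluator as an installed module (replaces PYTHONPATH + script path).
        result = await client.exec(
            f"cd {run_path} && python3 -m wafer_core.utils.kernel_utils.evaluate "
            f"--implementation implementation.py "
            f"--reference reference.py "
            f"--test-cases test_cases.json"
        )
        if result.exit_code != 0:
            raise RuntimeError(f"Evaluation failed: {result.stderr}")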
wafer/evaluate.py CHANGED
@@ -158,6 +158,8 @@ class KernelBenchEvaluateArgs:
  target_name: str
  benchmark: bool = False
  profile: bool = False
+ inputs: Path | None = None # Custom inputs file to override get_inputs()
+ seed: int = 42 # Random seed for reproducibility
  defensive: bool = False
  sync_artifacts: bool = True
  gpu_id: int | None = None
@@ -349,18 +351,6 @@ def _build_docker_pip_install_cmd(target: BaremetalTarget | VMTarget) -> str:
  return " && ".join(commands)


- def _get_wafer_root() -> Path:
- """Get wafer monorepo root directory.
-
- Walks up from this file to find the wafer repo root (contains apps/, packages/).
- """
- current = Path(__file__).resolve()
- for parent in [current] + list(current.parents):
- if (parent / "apps").is_dir() and (parent / "packages").is_dir():
- return parent
- raise RuntimeError(f"Could not find wafer root from {__file__}")
-
-
  async def run_evaluate_docker(
  args: EvaluateArgs,
  target: BaremetalTarget | VMTarget,
@@ -394,33 +384,6 @@ async def run_evaluate_docker(
  print(f"Connecting to {target.ssh_target}...")

  async with AsyncSSHClient(target.ssh_target, target.ssh_key) as client:
- # Upload wafer-core to remote
- try:
- wafer_root = _get_wafer_root()
- wafer_core_path = wafer_root / "packages" / "wafer-core"
- print(f"Uploading wafer-core from {wafer_core_path}...")
-
- # Create workspace and upload
- workspace_name = wafer_core_path.name
- remote_workspace = f"{REMOTE_WORKSPACE_BASE}/{workspace_name}"
- await client.exec(f"mkdir -p {remote_workspace}")
- wafer_core_workspace = await client.expand_path(remote_workspace)
-
- upload_result = await client.upload_files(
- str(wafer_core_path), wafer_core_workspace, recursive=True
- )
- print(f"Uploaded {upload_result.files_copied} files")
- except Exception as e:
- return EvaluateResult(
- success=False,
- all_correct=False,
- correctness_score=0.0,
- geomean_speedup=0.0,
- passed_tests=0,
- total_tests=0,
- error_message=f"Failed to upload wafer-core: {e}",
- )
-
  print(f"Using Docker image: {target.docker_image}")
  print(f"Using GPU {gpu_id}...")

@@ -429,10 +392,13 @@ async def run_evaluate_docker(
  ref_code = args.reference.read_text()
  test_cases_data = json.loads(args.test_cases.read_text())

- # Create a unique run directory
+ # Create workspace for evaluation files
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  run_dir = f"wafer_eval_{timestamp}"
- run_path = f"{wafer_core_workspace}/{run_dir}"
+ eval_workspace = f"{REMOTE_WORKSPACE_BASE}/eval_{timestamp}"
+ await client.exec(f"mkdir -p {eval_workspace}")
+ eval_workspace_expanded = await client.expand_path(eval_workspace)
+ run_path = f"{eval_workspace_expanded}/{run_dir}"

  print("Uploading evaluation files...")

@@ -519,17 +485,14 @@ async def run_evaluate_docker(
  container_impl_path = f"{container_run_path}/implementation.py"
  container_ref_path = f"{container_run_path}/reference.py"
  container_test_cases_path = f"{container_run_path}/test_cases.json"
- container_evaluate_script = (
- f"{CONTAINER_WORKSPACE}/wafer_core/utils/kernel_utils/evaluate.py"
- )

- # Build pip install command for torch and other deps (no wafer-core install needed)
+ # Build pip install command for torch and other deps, plus wafer-core
  pip_install_cmd = _build_docker_pip_install_cmd(target)
+ install_cmd = f"{pip_install_cmd} && uv pip install --system --break-system-packages wafer-core"

- # Build evaluate command - use PYTHONPATH instead of installing wafer-core
+ # Build evaluate command using installed wafer-core module
  python_cmd_parts = [
- f"PYTHONPATH={CONTAINER_WORKSPACE}:$PYTHONPATH",
- f"python3 {container_evaluate_script}",
+ "python3 -m wafer_core.utils.kernel_utils.evaluate",
  f"--implementation {container_impl_path}",
  f"--reference {container_ref_path}",
  f"--test-cases {container_test_cases_path}",
@@ -545,8 +508,8 @@ async def run_evaluate_docker(

  eval_cmd = " ".join(python_cmd_parts)

- # Full command: install torch deps, then run evaluate with PYTHONPATH
- full_cmd = f"{pip_install_cmd} && cd {container_run_path} && {eval_cmd}"
+ # Full command: install deps + wafer-core, then run evaluate
+ full_cmd = f"{install_cmd} && cd {container_run_path} && {eval_cmd}"

  # Build Docker run command
  # Add SYS_ADMIN capability when profiling (needed for NCU GPU performance counters)
@@ -556,7 +519,7 @@ async def run_evaluate_docker(
  working_dir=container_run_path,
  env={"CUDA_VISIBLE_DEVICES": str(gpu_id), "PYTHONUNBUFFERED": "1"},
  gpus="all",
- volumes={wafer_core_workspace: CONTAINER_WORKSPACE},
+ volumes={eval_workspace_expanded: CONTAINER_WORKSPACE},
  cap_add=["SYS_ADMIN"] if args.profile else None,
  )

@@ -980,6 +943,7 @@ def _build_modal_sandbox_script(
  test_cases_b64: str,
  run_benchmarks: bool,
  run_defensive: bool,
+ defense_code_b64: str | None = None,
  ) -> str:
  """Build Python script to create sandbox and run evaluation.

@@ -1060,6 +1024,19 @@ print('Files written')
  print(json.dumps({{"error": f"Failed to write files: {{proc.stderr.read()}}"}}))
  return

+ # Write defense module if defensive mode is enabled
+ if {run_defensive} and "{defense_code_b64}":
+ proc = sandbox.exec("python", "-c", f"""
+ import base64
+ with open('/workspace/defense.py', 'w') as f:
+ f.write(base64.b64decode('{defense_code_b64}').decode())
+ print('Defense module written')
+ """)
+ proc.wait()
+ if proc.returncode != 0:
+ print(json.dumps({{"error": f"Failed to write defense module: {{proc.stderr.read()}}"}}))
+ return
+
  # Build inline evaluation script
  eval_script = """
  import json
@@ -1087,6 +1064,18 @@ generate_input = load_fn('reference.py', 'generate_input')

  import torch

+ # Load defense module if available and defensive mode is enabled
+ run_defensive = {run_defensive}
+ defense = None
+ if run_defensive:
+ try:
+ defense = load_fn('defense.py', 'run_all_defenses')
+ time_with_defenses = load_fn('defense.py', 'time_execution_with_defenses')
+ print('[Defense] Defense module loaded')
+ except Exception as e:
+ print(f'[Defense] Warning: Could not load defense module: {{e}}')
+ defense = None
+
  results = []
  all_correct = True
  total_time_ms = 0.0
@@ -1114,36 +1103,63 @@ for tc in test_cases:
  impl_time_ms = 0.0
  ref_time_ms = 0.0
  if {run_benchmarks}:
- # Warmup
- for _ in range(3):
- custom_kernel(inputs)
- torch.cuda.synchronize()
-
- # Measure with defensive timing if requested
- # Defensive: sync before recording end event to catch stream injection
- start = torch.cuda.Event(enable_timing=True)
- end = torch.cuda.Event(enable_timing=True)
- start.record()
- for _ in range(10):
- custom_kernel(inputs)
- if {run_defensive}:
- torch.cuda.synchronize() # DEFENSE: sync all streams before end
- end.record()
- torch.cuda.synchronize()
- impl_time_ms = start.elapsed_time(end) / 10
-
- # Reference timing (same defensive approach)
- for _ in range(3):
- ref_kernel(inputs)
- torch.cuda.synchronize()
- start.record()
- for _ in range(10):
- ref_kernel(inputs)
- if {run_defensive}:
- torch.cuda.synchronize() # DEFENSE: sync all streams before end
- end.record()
- torch.cuda.synchronize()
- ref_time_ms = start.elapsed_time(end) / 10
+ if run_defensive and defense is not None:
+ # Use full defense suite
+ # Run defense checks on implementation kernel
+ all_passed, defense_results, _ = defense(
+ lambda: custom_kernel(inputs),
+ )
+ if not all_passed:
+ failed = [name for name, passed, _ in defense_results if not passed]
+ raise ValueError(f"Defense checks failed: {{failed}}")
+
+ # Time with defensive timing
+ impl_times, _ = time_with_defenses(
+ lambda: custom_kernel(inputs),
+ [],
+ num_warmup=3,
+ num_trials=10,
+ verbose=False,
+ run_defenses=False, # Already ran defenses above
+ )
+ impl_time_ms = sum(impl_times) / len(impl_times)
+
+ # Reference timing (no defense checks needed)
+ ref_times, _ = time_with_defenses(
+ lambda: ref_kernel(inputs),
+ [],
+ num_warmup=3,
+ num_trials=10,
+ verbose=False,
+ run_defenses=False,
+ )
+ ref_time_ms = sum(ref_times) / len(ref_times)
+ else:
+ # Standard timing without full defenses
+ # Warmup
+ for _ in range(3):
+ custom_kernel(inputs)
+ torch.cuda.synchronize()
+
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ for _ in range(10):
+ custom_kernel(inputs)
+ end.record()
+ torch.cuda.synchronize()
+ impl_time_ms = start.elapsed_time(end) / 10
+
+ # Reference timing
+ for _ in range(3):
+ ref_kernel(inputs)
+ torch.cuda.synchronize()
+ start.record()
+ for _ in range(10):
+ ref_kernel(inputs)
+ end.record()
+ torch.cuda.synchronize()
+ ref_time_ms = start.elapsed_time(end) / 10

  total_time_ms += impl_time_ms
  ref_total_time_ms += ref_time_ms
@@ -1236,6 +1252,23 @@ async def run_evaluate_modal(
  ref_code_b64 = base64.b64encode(args.reference.read_bytes()).decode()
  test_cases_b64 = base64.b64encode(args.test_cases.read_bytes()).decode()

+ # Encode defense module if defensive mode is enabled
+ defense_code_b64 = None
+ if args.defensive:
+ defense_path = (
+ Path(__file__).parent.parent.parent.parent
+ / "packages"
+ / "wafer-core"
+ / "wafer_core"
+ / "utils"
+ / "kernel_utils"
+ / "defense.py"
+ )
+ if defense_path.exists():
+ defense_code_b64 = base64.b64encode(defense_path.read_bytes()).decode()
+ else:
+ print(f"Warning: defense.py not found at {defense_path}, falling back to basic defense")
+
  # Build the script that creates sandbox and runs eval
  script = _build_modal_sandbox_script(
  target=target,
@@ -1244,6 +1277,7 @@ async def run_evaluate_modal(
  test_cases_b64=test_cases_b64,
  run_benchmarks=args.benchmark,
  run_defensive=args.defensive,
+ defense_code_b64=defense_code_b64,
  )

  def _run_subprocess() -> tuple[str, str, int]:
@@ -1341,6 +1375,7 @@ def _build_workspace_eval_script(
  test_cases_json: str,
  run_benchmarks: bool,
  run_defensive: bool = False,
+ defense_code: str | None = None,
  ) -> str:
  """Build inline evaluation script for workspace exec.

@@ -1351,6 +1386,7 @@ def _build_workspace_eval_script(
  impl_b64 = base64.b64encode(impl_code.encode()).decode()
  ref_b64 = base64.b64encode(ref_code.encode()).decode()
  tests_b64 = base64.b64encode(test_cases_json.encode()).decode()
+ defense_b64 = base64.b64encode(defense_code.encode()).decode() if defense_code else ""

  return f'''
  import base64
@@ -1370,6 +1406,14 @@ with open("/tmp/kernel.py", "w") as f:
  with open("/tmp/reference.py", "w") as f:
  f.write(ref_code)

+ # Write defense module if available
+ run_defensive = {run_defensive}
+ defense_b64 = "{defense_b64}"
+ if run_defensive and defense_b64:
+ defense_code = base64.b64decode(defense_b64).decode()
+ with open("/tmp/defense.py", "w") as f:
+ f.write(defense_code)
+
  # Load kernels
  def load_fn(path, name):
  spec = importlib.util.spec_from_file_location("mod", path)
@@ -1383,6 +1427,17 @@ generate_input = load_fn("/tmp/reference.py", "generate_input")

  import torch

+ # Load defense module if available
+ defense = None
+ if run_defensive and defense_b64:
+ try:
+ defense = load_fn("/tmp/defense.py", "run_all_defenses")
+ time_with_defenses = load_fn("/tmp/defense.py", "time_execution_with_defenses")
+ print("[Defense] Defense module loaded")
+ except Exception as e:
+ print(f"[Defense] Warning: Could not load defense module: {{e}}")
+ defense = None
+
  results = []
  all_correct = True
  total_time_ms = 0.0
@@ -1410,36 +1465,60 @@ for tc in test_cases:
  impl_time_ms = 0.0
  ref_time_ms = 0.0
  if {run_benchmarks}:
- # Warmup
- for _ in range(3):
- custom_kernel(inputs)
- torch.cuda.synchronize()
-
- # Measure with defensive timing if requested
- # Defensive: sync before recording end event to catch stream injection
- start = torch.cuda.Event(enable_timing=True)
- end = torch.cuda.Event(enable_timing=True)
- start.record()
- for _ in range(10):
- custom_kernel(inputs)
- if {run_defensive}:
- torch.cuda.synchronize() # DEFENSE: sync all streams before end
- end.record()
- torch.cuda.synchronize()
- impl_time_ms = start.elapsed_time(end) / 10
-
- # Reference timing (same defensive approach)
- for _ in range(3):
- ref_kernel(inputs)
- torch.cuda.synchronize()
- start.record()
- for _ in range(10):
- ref_kernel(inputs)
- if {run_defensive}:
- torch.cuda.synchronize() # DEFENSE: sync all streams before end
- end.record()
- torch.cuda.synchronize()
- ref_time_ms = start.elapsed_time(end) / 10
+ if run_defensive and defense is not None:
+ # Use full defense suite
+ all_passed, defense_results, _ = defense(
+ lambda: custom_kernel(inputs),
+ )
+ if not all_passed:
+ failed = [name for name, passed, _ in defense_results if not passed]
+ raise ValueError(f"Defense checks failed: {{failed}}")
+
+ # Time with defensive timing
+ impl_times, _ = time_with_defenses(
+ lambda: custom_kernel(inputs),
+ [],
+ num_warmup=3,
+ num_trials=10,
+ verbose=False,
+ run_defenses=False,
+ )
+ impl_time_ms = sum(impl_times) / len(impl_times)
+
+ # Reference timing
+ ref_times, _ = time_with_defenses(
+ lambda: ref_kernel(inputs),
+ [],
+ num_warmup=3,
+ num_trials=10,
+ verbose=False,
+ run_defenses=False,
+ )
+ ref_time_ms = sum(ref_times) / len(ref_times)
+ else:
+ # Standard timing
+ for _ in range(3):
+ custom_kernel(inputs)
+ torch.cuda.synchronize()
+
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ for _ in range(10):
+ custom_kernel(inputs)
+ end.record()
+ torch.cuda.synchronize()
+ impl_time_ms = start.elapsed_time(end) / 10
+
+ for _ in range(3):
+ ref_kernel(inputs)
+ torch.cuda.synchronize()
+ start.record()
+ for _ in range(10):
+ ref_kernel(inputs)
+ end.record()
+ torch.cuda.synchronize()
+ ref_time_ms = start.elapsed_time(end) / 10

  total_time_ms += impl_time_ms
  ref_total_time_ms += ref_time_ms
@@ -1501,6 +1580,23 @@ async def run_evaluate_workspace(
  ref_code = args.reference.read_text()
  test_cases_json = args.test_cases.read_text()

+ # Read defense module if defensive mode is enabled
+ defense_code = None
+ if args.defensive:
+ defense_path = (
+ Path(__file__).parent.parent.parent.parent
+ / "packages"
+ / "wafer-core"
+ / "wafer_core"
+ / "utils"
+ / "kernel_utils"
+ / "defense.py"
+ )
+ if defense_path.exists():
+ defense_code = defense_path.read_text()
+ else:
+ print(f"Warning: defense.py not found at {defense_path}, falling back to basic defense")
+
  # Build inline eval script
  eval_script = _build_workspace_eval_script(
  impl_code=impl_code,
@@ -1508,6 +1604,7 @@ async def run_evaluate_workspace(
  test_cases_json=test_cases_json,
  run_benchmarks=args.benchmark,
  run_defensive=args.defensive,
+ defense_code=defense_code,
  )

  # Execute via workspace exec
@@ -1691,54 +1788,12 @@ async def run_evaluate_runpod(
  error_message=f"Failed to setup Python environment: {e}",
  )

- # Upload wafer-core to remote
- try:
- wafer_root = _get_wafer_root()
- wafer_core_path = wafer_root / "packages" / "wafer-core"
- print(f"Uploading wafer-core from {wafer_core_path}...")
-
- wafer_core_remote = f"{REMOTE_WORKSPACE}/wafer-core"
- await client.exec(f"mkdir -p {wafer_core_remote}")
- wafer_core_workspace = await client.expand_path(wafer_core_remote)
-
- upload_result = await client.upload_files(
- str(wafer_core_path), wafer_core_workspace, recursive=True
- )
-
- # Wide event logging for upload result
- upload_event = {
- "event": "wafer_core_upload",
- "target": target.name,
- "target_type": "runpod",
- "ssh_host": f"{client.user}@{client.host}:{client.port}",
- "local_path": str(wafer_core_path),
- "remote_path": wafer_core_workspace,
- "success": upload_result.success,
- "files_copied": upload_result.files_copied,
- "duration_seconds": upload_result.duration_seconds,
- "error_message": upload_result.error_message,
- }
- if upload_result.debug_info:
- upload_event["debug_info"] = upload_result.debug_info
- logger.info(json.dumps(upload_event))
-
- # Fail fast if upload failed
- if not upload_result.success:
- print(f"ERROR: Upload failed: {upload_result.error_message}")
- if upload_result.debug_info:
- print(f"Debug info: {json.dumps(upload_result.debug_info, indent=2)}")
- return EvaluateResult(
- success=False,
- all_correct=False,
- correctness_score=0.0,
- geomean_speedup=0.0,
- passed_tests=0,
- total_tests=0,
- error_message=f"Failed to upload wafer-core: {upload_result.error_message}",
- )
-
- print(f"Uploaded {upload_result.files_copied} files")
- except Exception as e:
+ # Install wafer-core in remote venv
+ print("Installing wafer-core...")
+ install_result = await client.exec(
+ f"{env_state.venv_bin}/uv pip install wafer-core"
+ )
+ if install_result.exit_code != 0:
  return EvaluateResult(
  success=False,
  all_correct=False,
@@ -1746,7 +1801,7 @@ async def run_evaluate_runpod(
  geomean_speedup=0.0,
  passed_tests=0,
  total_tests=0,
- error_message=f"Failed to upload wafer-core: {e}",
+ error_message=f"Failed to install wafer-core: {install_result.stderr}",
  )

  # Select GPU (RunPod pods typically have GPU 0)
@@ -1853,15 +1908,12 @@ async def run_evaluate_runpod(
  # Add venv bin to PATH so ninja (from pip) is found by torch.utils.cpp_extension
  venv_bin = env_state.venv_bin
  env_vars = f"PATH={venv_bin}:$PATH HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm"
- pythonpath = f"PYTHONPATH={wafer_core_workspace}"
- evaluate_script = (
- f"{wafer_core_workspace}/wafer_core/utils/kernel_utils/evaluate.py"
- )

  # Run from run_path so reference_kernel.py is importable
+ # Use installed wafer-core module
  eval_cmd = (
  f"cd {run_path} && "
- f"{env_vars} {pythonpath} {python_exe} {evaluate_script} "
+ f"{env_vars} {python_exe} -m wafer_core.utils.kernel_utils.evaluate "
  f"--implementation {impl_path} "
  f"--reference {ref_path} "
  f"--test-cases {test_cases_path} "
@@ -2046,61 +2098,12 @@ async def run_evaluate_digitalocean(
  error_message=f"Failed to setup Python environment: {e}",
  )

- # Upload wafer-core to remote
- try:
- wafer_root = _get_wafer_root()
- wafer_core_path = wafer_root / "packages" / "wafer-core"
- print(f"Uploading wafer-core from {wafer_core_path}...")
-
- wafer_core_remote = f"{REMOTE_WORKSPACE}/wafer-core"
- await client.exec(f"mkdir -p {wafer_core_remote}")
- wafer_core_workspace = await client.expand_path(wafer_core_remote)
-
- # Use SFTP instead of rsync to avoid SSH subprocess timeout issues
- # (DigitalOcean may rate-limit new SSH connections)
- upload_result = await client.upload_files(
- str(wafer_core_path),
- wafer_core_workspace,
- recursive=True,
- use_sftp=True,
- )
-
- # Wide event logging for upload result
- upload_event = {
- "event": "wafer_core_upload",
- "target": target.name,
- "target_type": "digitalocean",
- "ssh_host": f"{client.user}@{client.host}:{client.port}",
- "local_path": str(wafer_core_path),
- "remote_path": wafer_core_workspace,
- "success": upload_result.success,
- "files_copied": upload_result.files_copied,
- "duration_seconds": upload_result.duration_seconds,
- "error_message": upload_result.error_message,
- }
- if upload_result.debug_info:
- upload_event["debug_info"] = upload_result.debug_info
- logger.info(json.dumps(upload_event))
-
- # Fail fast if upload failed
- if not upload_result.success:
- print(f"ERROR: Upload failed: {upload_result.error_message}")
- if upload_result.debug_info:
- print(
- f"Debug info: {json.dumps(upload_result.debug_info, indent=2)}"
- )
- return EvaluateResult(
- success=False,
- all_correct=False,
- correctness_score=0.0,
- geomean_speedup=0.0,
- passed_tests=0,
- total_tests=0,
- error_message=f"Failed to upload wafer-core: {upload_result.error_message}",
- )
-
- print(f"Uploaded {upload_result.files_copied} files")
- except Exception as e:
+ # Install wafer-core in remote venv
+ print("Installing wafer-core...")
+ install_result = await client.exec(
+ f"{env_state.venv_bin}/uv pip install wafer-core"
+ )
+ if install_result.exit_code != 0:
  return EvaluateResult(
  success=False,
  all_correct=False,
@@ -2108,7 +2111,7 @@ async def run_evaluate_digitalocean(
  geomean_speedup=0.0,
  passed_tests=0,
  total_tests=0,
- error_message=f"Failed to upload wafer-core: {e}",
+ error_message=f"Failed to install wafer-core: {install_result.stderr}",
  )

  # Select GPU (DigitalOcean droplets typically have GPU 0)
@@ -2217,15 +2220,12 @@ async def run_evaluate_digitalocean(
  env_vars = (
  f"PATH={venv_bin}:$PATH HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm"
  )
- pythonpath = f"PYTHONPATH={wafer_core_workspace}"
- evaluate_script = (
- f"{wafer_core_workspace}/wafer_core/utils/kernel_utils/evaluate.py"
- )

  # Run from run_path so reference_kernel.py is importable
+ # Use installed wafer-core module
  eval_cmd = (
  f"cd {run_path} && "
- f"{env_vars} {pythonpath} {python_exe} {evaluate_script} "
+ f"{env_vars} {python_exe} -m wafer_core.utils.kernel_utils.evaluate "
  f"--implementation {impl_path} "
  f"--reference {ref_path} "
  f"--test-cases {test_cases_path} "
@@ -2435,10 +2435,233 @@ async def run_evaluate(args: EvaluateArgs) -> EvaluateResult:
  # This runs inside the Docker container on the remote GPU
  KERNELBENCH_EVAL_SCRIPT = """
  import json
+ import os
  import sys
  import time
  import torch
  import torch.nn as nn
+ from pathlib import Path
+
+
+ def run_profiling(model, inputs, name, output_dir):
+ '''Run torch.profiler and return summary stats.'''
+ from torch.profiler import profile, ProfilerActivity
+
+ # Determine activities based on backend
+ activities = [ProfilerActivity.CPU]
+ if torch.cuda.is_available():
+ activities.append(ProfilerActivity.CUDA)
+
+ # Warmup
+ for _ in range(3):
+ with torch.no_grad():
+ _ = model(*inputs)
+ torch.cuda.synchronize()
+
+ # Profile
+ with profile(
+ activities=activities,
+ record_shapes=True,
+ with_stack=False,
+ profile_memory=True,
+ ) as prof:
+ with torch.no_grad():
+ _ = model(*inputs)
+ torch.cuda.synchronize()
+
+ # Get key averages
+ key_averages = prof.key_averages()
+
+ # Find the main kernel (longest GPU time)
+ # Use cuda_time_total for compatibility with both CUDA and ROCm
+ def get_gpu_time(e):
+ # Try different attributes for GPU time
+ if hasattr(e, 'cuda_time_total'):
+ return e.cuda_time_total
+ if hasattr(e, 'device_time_total'):
+ return e.device_time_total
+ if hasattr(e, 'self_cuda_time_total'):
+ return e.self_cuda_time_total
+ return 0
+
+ gpu_events = [e for e in key_averages if get_gpu_time(e) > 0]
+ gpu_events.sort(key=lambda e: get_gpu_time(e), reverse=True)
+
+ stats = {
+ "name": name,
+ "total_gpu_time_ms": sum(get_gpu_time(e) for e in gpu_events) / 1000,
+ "total_cpu_time_ms": sum(e.cpu_time_total for e in key_averages) / 1000,
+ "num_gpu_kernels": len(gpu_events),
+ "top_kernels": [],
+ }
+
+ # Top 5 kernels by GPU time
+ for e in gpu_events[:5]:
+ stats["top_kernels"].append({
+ "name": e.key,
+ "gpu_time_ms": get_gpu_time(e) / 1000,
+ "cpu_time_ms": e.cpu_time_total / 1000,
+ "calls": e.count,
+ })
+
+ # Save trace for visualization
+ trace_path = Path(output_dir) / f"{name}_trace.json"
+ prof.export_chrome_trace(str(trace_path))
+ stats["trace_file"] = str(trace_path)
+
+ return stats
+
+
+ def validate_custom_inputs(original_inputs, custom_inputs):
+ '''Validate that custom inputs match the expected signature.
+
+ Returns (is_valid, error_message).
+ '''
+ if len(original_inputs) != len(custom_inputs):
+ return False, f"get_inputs() must return {len(original_inputs)} tensors, got {len(custom_inputs)}"
+
+ for i, (orig, cust) in enumerate(zip(original_inputs, custom_inputs)):
+ if not isinstance(cust, torch.Tensor):
+ if not isinstance(orig, torch.Tensor):
+ continue # Both non-tensor, ok
+ return False, f"Input {i}: expected Tensor, got {type(cust).__name__}"
+
+ if not isinstance(orig, torch.Tensor):
+ return False, f"Input {i}: expected {type(orig).__name__}, got Tensor"
+
+ if orig.dtype != cust.dtype:
+ return False, f"Input {i}: dtype mismatch - expected {orig.dtype}, got {cust.dtype}"
+
+ if orig.dim() != cust.dim():
+ return False, f"Input {i}: dimension mismatch - expected {orig.dim()}D, got {cust.dim()}D"
+
+ return True, None
+
+
+ def analyze_diff(ref_output, new_output, rtol=1e-3, atol=1e-3, max_samples=5):
+ '''Analyze differences between reference and implementation outputs.
+
+ Returns a dict with detailed diff information.
+ '''
+ diff = (ref_output - new_output).abs()
+ threshold = atol + rtol * ref_output.abs()
+ wrong_mask = diff > threshold
+
+ total_elements = ref_output.numel()
+ wrong_count = wrong_mask.sum().item()
+
+ # Basic stats
+ max_diff = diff.max().item()
+ max_diff_idx = tuple(torch.unravel_index(diff.argmax(), diff.shape))
+ max_diff_idx = tuple(int(i) for i in max_diff_idx) # Convert to Python ints
+
+ # Relative error (avoid div by zero)
+ ref_abs = ref_output.abs()
+ nonzero_mask = ref_abs > 1e-8
+ if nonzero_mask.any():
+ rel_error = diff[nonzero_mask] / ref_abs[nonzero_mask]
+ max_rel_error = rel_error.max().item()
+ mean_rel_error = rel_error.mean().item()
+ else:
+ max_rel_error = float('inf') if max_diff > 0 else 0.0
+ mean_rel_error = max_rel_error
+
+ # Error histogram (buckets: <1e-6, 1e-6 to 1e-4, 1e-4 to 1e-2, 1e-2 to 1, >1)
+ histogram = {
+ '<1e-6': int((diff < 1e-6).sum().item()),
+ '1e-6 to 1e-4': int(((diff >= 1e-6) & (diff < 1e-4)).sum().item()),
+ '1e-4 to 1e-2': int(((diff >= 1e-4) & (diff < 1e-2)).sum().item()),
+ '1e-2 to 1': int(((diff >= 1e-2) & (diff < 1)).sum().item()),
+ '>1': int((diff >= 1).sum().item()),
+ }
+
+ result = {
+ 'max_diff': max_diff,
+ 'max_diff_idx': max_diff_idx,
+ 'mean_diff': diff.mean().item(),
+ 'max_rel_error': max_rel_error,
+ 'mean_rel_error': mean_rel_error,
+ 'total_elements': total_elements,
+ 'wrong_count': int(wrong_count),
+ 'wrong_pct': 100.0 * wrong_count / total_elements,
+ 'histogram': histogram,
+ 'samples': [],
+ }
+
+ # Get indices of wrong elements
+ if wrong_count > 0:
+ wrong_indices = torch.nonzero(wrong_mask, as_tuple=False)
+
+ # Take first N samples
+ num_samples = min(max_samples, len(wrong_indices))
+ for i in range(num_samples):
+ idx = tuple(wrong_indices[i].tolist())
+ ref_val = ref_output[idx].item()
+ new_val = new_output[idx].item()
+ diff_val = diff[idx].item()
+ result['samples'].append({
+ 'index': idx,
+ 'ref': ref_val,
+ 'impl': new_val,
+ 'diff': diff_val,
+ })
+
+ # Try to detect pattern
+ if wrong_count >= total_elements * 0.99:
+ result['pattern'] = 'all_wrong'
+ elif wrong_count < total_elements * 0.01:
+ # Check if failures are at boundaries
+ shape = ref_output.shape
+ boundary_count = 0
+ for idx in wrong_indices[:min(100, len(wrong_indices))]:
+ idx_list = idx.tolist()
+ is_boundary = any(i == 0 or i == s - 1 for i, s in zip(idx_list, shape))
+ if is_boundary:
+ boundary_count += 1
+ if boundary_count > len(wrong_indices[:100]) * 0.8:
+ result['pattern'] = 'boundary_issue'
+ else:
+ result['pattern'] = 'scattered'
+ else:
+ result['pattern'] = 'partial'
+
+ return result
+
+
+ def print_diff_analysis(analysis):
+ '''Print a human-readable diff analysis.'''
+ print(f"[KernelBench] Diff analysis:")
+
+ # Max diff with location
+ idx_str = ','.join(str(i) for i in analysis['max_diff_idx'])
+ print(f" Max diff: {analysis['max_diff']:.6f} at index [{idx_str}]")
+ print(f" Mean diff: {analysis['mean_diff']:.6f}")
+
+ # Relative errors
+ print(f" Max relative error: {analysis['max_rel_error']:.2%}, Mean: {analysis['mean_rel_error']:.2%}")
+
+ # Wrong count
+ print(f" Wrong elements: {analysis['wrong_count']:,} / {analysis['total_elements']:,} ({analysis['wrong_pct']:.2f}%)")
+
+ # Histogram
+ hist = analysis['histogram']
+ print(f" Error distribution: <1e-6: {hist['<1e-6']:,} | 1e-6~1e-4: {hist['1e-6 to 1e-4']:,} | 1e-4~1e-2: {hist['1e-4 to 1e-2']:,} | 1e-2~1: {hist['1e-2 to 1']:,} | >1: {hist['>1']:,}")
+
+ if 'pattern' in analysis:
+ pattern_desc = {
+ 'all_wrong': 'ALL elements wrong - likely algorithmic error or wrong weights',
+ 'boundary_issue': 'Mostly BOUNDARY elements wrong - check edge handling',
+ 'scattered': 'SCATTERED failures - numerical precision issue?',
+ 'partial': 'PARTIAL failures - check specific conditions',
+ }
+ print(f" Pattern: {pattern_desc.get(analysis['pattern'], analysis['pattern'])}")
+
+ if analysis['samples']:
+ print(f" Sample failures:")
+ for s in analysis['samples']:
+ idx_str = ','.join(str(i) for i in s['index'])
+ print(f" [{idx_str}]: ref={s['ref']:.6f} impl={s['impl']:.6f} (diff={s['diff']:.6f})")
+

  def main():
  # Parse args
@@ -2446,12 +2669,35 @@ def main():
  parser = argparse.ArgumentParser()
  parser.add_argument("--impl", required=True)
  parser.add_argument("--reference", required=True)
+ parser.add_argument("--inputs", help="Custom inputs file to override get_inputs()/get_init_inputs()")
  parser.add_argument("--benchmark", action="store_true")
+ parser.add_argument("--profile", action="store_true")
+ parser.add_argument("--defensive", action="store_true", help="Run full defense checks against reward hacking")
+ parser.add_argument("--defense-module", help="Path to defense.py module")
+ parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
  parser.add_argument("--num-correct-trials", type=int, default=3)
  parser.add_argument("--num-perf-trials", type=int, default=10)
  parser.add_argument("--output", required=True)
  args = parser.parse_args()

+ # Load defense module if defensive mode is enabled
+ defense_module = None
+ if args.defensive and args.defense_module:
+ try:
+ import importlib.util
+ defense_spec = importlib.util.spec_from_file_location("defense", args.defense_module)
+ defense_module = importlib.util.module_from_spec(defense_spec)
+ defense_spec.loader.exec_module(defense_module)
+ print("[KernelBench] Defense module loaded")
+ except Exception as e:
+ print(f"[KernelBench] Warning: Could not load defense module: {e}")
+
+ # Create output directory for profiles
+ output_dir = Path(args.output).parent
+ profile_dir = output_dir / "profiles"
+ if args.profile:
+ profile_dir.mkdir(exist_ok=True)
+
  results = {
  "compiled": False,
  "correct": False,
@@ -2472,6 +2718,33 @@ def main():
  get_inputs = ref_module.get_inputs
  get_init_inputs = ref_module.get_init_inputs

+ # Load custom inputs if provided
+ if args.inputs:
+ inputs_spec = importlib.util.spec_from_file_location("custom_inputs", args.inputs)
+ inputs_module = importlib.util.module_from_spec(inputs_spec)
+ inputs_spec.loader.exec_module(inputs_module)
+
+ # Validate custom inputs match expected signature
+ original_inputs = get_inputs()
+ custom_get_inputs = inputs_module.get_inputs
+ custom_inputs = custom_get_inputs()
+
+ is_valid, error_msg = validate_custom_inputs(original_inputs, custom_inputs)
+ if not is_valid:
+ print(f"[KernelBench] Custom inputs validation failed: {error_msg}")
+ results["error"] = f"Custom inputs validation failed: {error_msg}"
+ raise ValueError(error_msg)
+
+ # Override get_inputs (and optionally get_init_inputs)
+ get_inputs = custom_get_inputs
+ if hasattr(inputs_module, 'get_init_inputs'):
+ get_init_inputs = inputs_module.get_init_inputs
+
+ # Show what changed
+ orig_shapes = [tuple(t.shape) if hasattr(t, 'shape') else type(t).__name__ for t in original_inputs]
+ cust_shapes = [tuple(t.shape) if hasattr(t, 'shape') else type(t).__name__ for t in custom_inputs]
+ print(f"[KernelBench] Using custom inputs: {orig_shapes} -> {cust_shapes}")
+
  # Load implementation module
  impl_spec = importlib.util.spec_from_file_location("implementation", args.impl)
  impl_module = importlib.util.module_from_spec(impl_spec)
@@ -2481,12 +2754,19 @@ def main():
  results["compiled"] = True
  print("[KernelBench] Modules loaded successfully")

- # Instantiate models
+ # Instantiate models with synchronized seeds for reproducible weights
+ # (matches upstream KernelBench behavior in src/eval.py)
+ seed = args.seed
  init_inputs = get_init_inputs()
  with torch.no_grad():
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed(seed)
  ref_model = Model(*init_inputs).cuda().eval()
+
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed(seed)
  new_model = ModelNew(*init_inputs).cuda().eval()
- print("[KernelBench] Models instantiated")
+ print(f"[KernelBench] Models instantiated (seed={seed})")

  # Run correctness trials
  all_correct = True
@@ -2502,8 +2782,18 @@ def main():
  if isinstance(ref_output, torch.Tensor):
  if not torch.allclose(ref_output, new_output, rtol=1e-3, atol=1e-3):
  all_correct = False
- max_diff = (ref_output - new_output).abs().max().item()
- results["error"] = f"Correctness failed on trial {trial+1}: max diff = {max_diff}"
+ analysis = analyze_diff(ref_output, new_output)
+ results["error"] = f"Correctness failed on trial {trial+1}: max diff = {analysis['max_diff']}"
+ results["diff_analysis"] = analysis
+ print_diff_analysis(analysis)
+
+ # Save tensors for debugging
+ debug_dir = output_dir / "debug"
+ debug_dir.mkdir(exist_ok=True)
+ torch.save(ref_output.cpu(), debug_dir / "ref_output.pt")
+ torch.save(new_output.cpu(), debug_dir / "impl_output.pt")
+ torch.save(inputs[0].cpu() if inputs else None, debug_dir / "input.pt")
+ print(f"[KernelBench] Debug tensors saved to: {debug_dir}/")
  break
  else:
  # Handle tuple/list outputs
@@ -2511,8 +2801,17 @@ def main():
  if isinstance(r, torch.Tensor):
  if not torch.allclose(r, n, rtol=1e-3, atol=1e-3):
  all_correct = False
- max_diff = (r - n).abs().max().item()
- results["error"] = f"Correctness failed on trial {trial+1}, output {i}: max diff = {max_diff}"
+ analysis = analyze_diff(r, n)
+ results["error"] = f"Correctness failed on trial {trial+1}, output {i}: max diff = {analysis['max_diff']}"
+ results["diff_analysis"] = analysis
+ print_diff_analysis(analysis)
+
+ # Save tensors for debugging
+ debug_dir = output_dir / "debug"
+ debug_dir.mkdir(exist_ok=True)
+ torch.save(r.cpu(), debug_dir / f"ref_output_{i}.pt")
+ torch.save(n.cpu(), debug_dir / f"impl_output_{i}.pt")
+ print(f"[KernelBench] Debug tensors saved to: {debug_dir}/")
  break
  if not all_correct:
  break
@@ -2526,47 +2825,132 @@ def main():
  inputs = get_inputs()
  inputs = [x.cuda() if isinstance(x, torch.Tensor) else x for x in inputs]

- # Warmup
- for _ in range(5):
- with torch.no_grad():
- _ = new_model(*inputs)
- torch.cuda.synchronize()
+ if args.defensive and defense_module is not None:
+ # Use full defense suite
+ print("[KernelBench] Running defense checks on implementation...")
+ run_all_defenses = defense_module.run_all_defenses
+ time_with_defenses = defense_module.time_execution_with_defenses

- # Benchmark new model
- start = torch.cuda.Event(enable_timing=True)
- end = torch.cuda.Event(enable_timing=True)
-
- times = []
- for _ in range(args.num_perf_trials):
- start.record()
- with torch.no_grad():
- _ = new_model(*inputs)
- end.record()
+ # Run defense checks on implementation
+ all_passed, defense_results, _ = run_all_defenses(
+ lambda *x: new_model(*x),
+ *inputs,
+ )
+ results["defense_results"] = {
+ name: {"passed": passed, "message": msg}
+ for name, passed, msg in defense_results
+ }
+ if not all_passed:
+ failed = [name for name, passed, _ in defense_results if not passed]
+ results["error"] = f"Defense checks failed: {failed}"
+ print(f"[KernelBench] Defense checks FAILED: {failed}")
+ for name, passed, msg in defense_results:
+ status = "PASS" if passed else "FAIL"
+ print(f" [{status}] {name}: {msg}")
+ else:
+ print("[KernelBench] All defense checks passed")
+
+ # Time with defensive timing
+ impl_times, _ = time_with_defenses(
+ lambda: new_model(*inputs),
+ [],
+ num_warmup=5,
+ num_trials=args.num_perf_trials,
+ verbose=False,
+ run_defenses=False, # Already ran above
+ )
+ new_time = sum(impl_times) / len(impl_times)
+ results["runtime_ms"] = new_time
+
+ # Reference timing
+ ref_times, _ = time_with_defenses(
+ lambda: ref_model(*inputs),
+ [],
+ num_warmup=5,
+ num_trials=args.num_perf_trials,
+ verbose=False,
+ run_defenses=False,
+ )
+ ref_time = sum(ref_times) / len(ref_times)
+ results["reference_runtime_ms"] = ref_time
+ results["speedup"] = ref_time / new_time if new_time > 0 else 0
+ print(f"[KernelBench] New: {new_time:.3f}ms, Ref: {ref_time:.3f}ms, Speedup: {results['speedup']:.2f}x")
+ else:
+ # Standard timing without full defenses
+ # Warmup
+ for _ in range(5):
+ with torch.no_grad():
+ _ = new_model(*inputs)
  torch.cuda.synchronize()
- times.append(start.elapsed_time(end))

- new_time = sum(times) / len(times)
- results["runtime_ms"] = new_time
-
- # Benchmark reference model
- for _ in range(5):
- with torch.no_grad():
- _ = ref_model(*inputs)
- torch.cuda.synchronize()
-
- times = []
- for _ in range(args.num_perf_trials):
- start.record()
- with torch.no_grad():
- _ = ref_model(*inputs)
- end.record()
+ # Benchmark new model
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ times = []
+ for _ in range(args.num_perf_trials):
+ start.record()
+ with torch.no_grad():
+ _ = new_model(*inputs)
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+
+ new_time = sum(times) / len(times)
+ results["runtime_ms"] = new_time
+
+ # Benchmark reference model
+ for _ in range(5):
+ with torch.no_grad():
+ _ = ref_model(*inputs)
  torch.cuda.synchronize()
- times.append(start.elapsed_time(end))

- ref_time = sum(times) / len(times)
- results["reference_runtime_ms"] = ref_time
- results["speedup"] = ref_time / new_time if new_time > 0 else 0
- print(f"[KernelBench] New: {new_time:.3f}ms, Ref: {ref_time:.3f}ms, Speedup: {results['speedup']:.2f}x")
+ times = []
+ for _ in range(args.num_perf_trials):
+ start.record()
+ with torch.no_grad():
+ _ = ref_model(*inputs)
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+
+ ref_time = sum(times) / len(times)
+ results["reference_runtime_ms"] = ref_time
+ results["speedup"] = ref_time / new_time if new_time > 0 else 0
+ print(f"[KernelBench] New: {new_time:.3f}ms, Ref: {ref_time:.3f}ms, Speedup: {results['speedup']:.2f}x")
+
+ # Run profiling if requested and correctness passed
+ if args.profile and all_correct:
+ print("[KernelBench] Running profiler...")
+ inputs = get_inputs()
+ inputs = [x.cuda() if isinstance(x, torch.Tensor) else x for x in inputs]
+
+ try:
+ # Profile implementation
+ impl_stats = run_profiling(new_model, inputs, "implementation", str(profile_dir))
+ results["profile_impl"] = impl_stats
+ print(f"[KernelBench] Implementation profile:")
+ print(f" Total GPU time: {impl_stats['total_gpu_time_ms']:.3f}ms")
+ print(f" Kernels launched: {impl_stats['num_gpu_kernels']}")
+ if impl_stats['top_kernels']:
+ print(f" Top kernel: {impl_stats['top_kernels'][0]['name'][:60]}...")
+ print(f" {impl_stats['top_kernels'][0]['gpu_time_ms']:.3f}ms")
+
+ # Profile reference
+ ref_stats = run_profiling(ref_model, inputs, "reference", str(profile_dir))
+ results["profile_ref"] = ref_stats
+ print(f"[KernelBench] Reference profile:")
+ print(f" Total GPU time: {ref_stats['total_gpu_time_ms']:.3f}ms")
+ print(f" Kernels launched: {ref_stats['num_gpu_kernels']}")
+ if ref_stats['top_kernels']:
+ print(f" Top kernel: {ref_stats['top_kernels'][0]['name'][:60]}...")
+ print(f" {ref_stats['top_kernels'][0]['gpu_time_ms']:.3f}ms")
+
+ print(f"[KernelBench] Profile traces saved to: {profile_dir}/")
+
+ except Exception as prof_err:
+ print(f"[KernelBench] Profiling failed: {prof_err}")
+ results["profile_error"] = str(prof_err)

  except Exception as e:
  import traceback
@@ -2705,6 +3089,24 @@ async def run_evaluate_kernelbench_docker(
  error_message=f"Failed to write reference: {write_result.stderr}",
  )

+ # Write custom inputs if provided
+ if args.inputs:
+ inputs_code = args.inputs.read_text()
+ inputs_file_path = f"{run_path}/custom_inputs.py"
+ write_result = await client.exec(
+ f"cat > '{inputs_file_path}' << 'INPUTS_EOF'\n{inputs_code}\nINPUTS_EOF"
+ )
+ if write_result.exit_code != 0:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Failed to write custom inputs: {write_result.stderr}",
+ )
+
  # Write eval script
  eval_script_path = f"{run_path}/kernelbench_eval.py"
  write_result = await client.exec(
@@ -2721,14 +3123,40 @@ async def run_evaluate_kernelbench_docker(
  error_message=f"Failed to write eval script: {write_result.stderr}",
  )

+ # Write defense module if defensive mode is enabled
+ defense_module_path = None
+ if args.defensive:
+ defense_path = (
+ Path(__file__).parent.parent.parent.parent
+ / "packages"
+ / "wafer-core"
+ / "wafer_core"
+ / "utils"
+ / "kernel_utils"
+ / "defense.py"
+ )
+ if defense_path.exists():
+ defense_code = defense_path.read_text()
+ defense_module_path = f"{run_path}/defense.py"
+ write_result = await client.exec(
+ f"cat > '{defense_module_path}' << 'DEFENSE_EOF'\n{defense_code}\nDEFENSE_EOF"
+ )
+ if write_result.exit_code != 0:
+ print(f"Warning: Failed to write defense module: {write_result.stderr}")
+ defense_module_path = None
+ else:
+ print(f"Warning: defense.py not found at {defense_path}")
+
  print("Running KernelBench evaluation in Docker container...")

  # Paths inside container
  container_run_path = f"{CONTAINER_WORKSPACE}/{run_dir}"
  container_impl_path = f"{container_run_path}/implementation.py"
  container_ref_path = f"{container_run_path}/reference.py"
+ container_inputs_path = f"{container_run_path}/custom_inputs.py" if args.inputs else None
  container_eval_script = f"{container_run_path}/kernelbench_eval.py"
  container_output = f"{container_run_path}/results.json"
+ container_defense_path = f"{container_run_path}/defense.py" if defense_module_path else None

  # Build eval command
  python_cmd_parts = [
@@ -2740,6 +3168,14 @@ async def run_evaluate_kernelbench_docker(

  if args.benchmark:
  python_cmd_parts.append("--benchmark")
+ if args.profile:
+ python_cmd_parts.append("--profile")
+ if container_inputs_path:
+ python_cmd_parts.append(f"--inputs {container_inputs_path}")
+ if args.defensive and container_defense_path:
+ python_cmd_parts.append("--defensive")
+ python_cmd_parts.append(f"--defense-module {container_defense_path}")
+ python_cmd_parts.append(f"--seed {args.seed}")

  eval_cmd = " ".join(python_cmd_parts)

@@ -2920,6 +3356,24 @@ async def run_evaluate_kernelbench_digitalocean(
  error_message=f"Failed to write reference: {write_result.stderr}",
  )

+ # Write custom inputs if provided
+ if args.inputs:
+ inputs_code = args.inputs.read_text()
+ inputs_file_path = f"{run_path}/custom_inputs.py"
+ write_result = await client.exec(
+ f"cat > '{inputs_file_path}' << 'INPUTS_EOF'\n{inputs_code}\nINPUTS_EOF"
+ )
+ if write_result.exit_code != 0:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Failed to write custom inputs: {write_result.stderr}",
+ )
+
  # Write eval script
  eval_script_path = f"{run_path}/kernelbench_eval.py"
  write_result = await client.exec(
@@ -2936,14 +3390,44 @@ async def run_evaluate_kernelbench_digitalocean(
  error_message=f"Failed to write eval script: {write_result.stderr}",
  )

+ # Write defense module if defensive mode is enabled
+ defense_module_path = None
+ if args.defensive:
+ defense_path = (
+ Path(__file__).parent.parent.parent.parent
+ / "packages"
+ / "wafer-core"
+ / "wafer_core"
+ / "utils"
+ / "kernel_utils"
+ / "defense.py"
+ )
+ if defense_path.exists():
+ defense_code = defense_path.read_text()
+ defense_module_path = f"{run_path}/defense.py"
+ write_result = await client.exec(
+ f"cat > '{defense_module_path}' << 'DEFENSE_EOF'\n{defense_code}\nDEFENSE_EOF"
+ )
+ if write_result.exit_code != 0:
+ print(f"Warning: Failed to write defense module: {write_result.stderr}")
+ defense_module_path = None
+ else:
+ print(f"Warning: defense.py not found at {defense_path}")
+
  print("Running KernelBench evaluation in Docker container (AMD/ROCm)...")

  # Paths inside container
  container_run_path = f"{CONTAINER_WORKSPACE}/{run_dir}"
  container_impl_path = f"{container_run_path}/implementation.py"
  container_ref_path = f"{container_run_path}/reference.py"
+ container_inputs_path = (
+ f"{container_run_path}/custom_inputs.py" if args.inputs else None
+ )
  container_eval_script = f"{container_run_path}/kernelbench_eval.py"
  container_output = f"{container_run_path}/results.json"
+ container_defense_path = (
+ f"{container_run_path}/defense.py" if defense_module_path else None
+ )

  # Build eval command
  python_cmd_parts = [
@@ -2955,6 +3439,14 @@ async def run_evaluate_kernelbench_digitalocean(

  if args.benchmark:
  python_cmd_parts.append("--benchmark")
+ if args.profile:
+ python_cmd_parts.append("--profile")
+ if container_inputs_path:
+ python_cmd_parts.append(f"--inputs {container_inputs_path}")
+ if args.defensive and container_defense_path:
+ python_cmd_parts.append("--defensive")
+ python_cmd_parts.append(f"--defense-module {container_defense_path}")
+ python_cmd_parts.append(f"--seed {args.seed}")

  eval_cmd = " ".join(python_cmd_parts)
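
For reference, the flags this release adds to the inline KernelBench eval script compose like this. A hypothetical container-side invocation; the flag names come from the argparse definitions in the diff above, while the file names are illustrative:

    # Hypothetical invocation of the generated kernelbench_eval.py inside the container.
    python3 kernelbench_eval.py \
        --impl implementation.py \
        --reference reference.py \
        --inputs custom_inputs.py \
        --seed 42 \
        --benchmark \
        --profile \
        --defensive --defense-module defense.py \
        --output results.json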