wafer-cli 0.2.14__py3-none-any.whl → 0.2.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/evaluate.py CHANGED
@@ -354,7 +354,8 @@ def _build_docker_pip_install_cmd(target: BaremetalTarget | VMTarget) -> str:
  )

  # Install uv (fast, reliable) - use pip3 for compatibility
- commands.append("pip3 install uv")
+ # Use --break-system-packages for Python 3.12+ with PEP 668 externally managed environments
+ commands.append("pip3 install --break-system-packages uv")

  # Install torch with custom index if specified (like Modal's two-phase install)
  # Use --system --break-system-packages to install to container's Python
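For context: on images whose interpreter is marked externally managed under PEP 668 (Python 3.12 on recent Debian/Ubuntu bases), a bare pip3 install aborts with an externally-managed-environment error, which is what the added --break-system-packages flag works around. A minimal sketch of the install prefix this hunk builds; the follow-up torch step is hypothetical, shaped after the comments above, not copied from the package:

    # Sketch only -- mirrors the shape of _build_docker_pip_install_cmd, not its full logic.
    commands = []
    # Bootstrap uv past the PEP 668 guard.
    commands.append("pip3 install --break-system-packages uv")
    # Hypothetical follow-up: install into the container's system Python via uv.
    commands.append("uv pip install --system --break-system-packages torch")
    print(" && ".join(commands))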
@@ -378,18 +379,6 @@ def _build_docker_pip_install_cmd(target: BaremetalTarget | VMTarget) -> str:
  return " && ".join(commands)


- def _get_wafer_root() -> Path:
- """Get wafer monorepo root directory.
-
- Walks up from this file to find the wafer repo root (contains apps/, packages/).
- """
- current = Path(__file__).resolve()
- for parent in [current] + list(current.parents):
- if (parent / "apps").is_dir() and (parent / "packages").is_dir():
- return parent
- raise RuntimeError(f"Could not find wafer root from {__file__}")
-
-
  async def run_evaluate_docker(
  args: EvaluateArgs,
  target: BaremetalTarget | VMTarget,
@@ -1167,11 +1156,16 @@ def _build_modal_sandbox_script(
  """
  gpu_type = target.gpu_type

- # Determine PyTorch index based on GPU type
+ # Determine PyTorch index and CUDA arch based on GPU type
  if gpu_type in ("B200", "GB200"):
- torch_index = "https://download.pytorch.org/whl/nightly/cu128"
+ torch_index = "https://download.pytorch.org/whl/cu130"
+ cuda_arch_list = "10.0" # Blackwell (sm_100)
+ elif gpu_type == "H100":
+ torch_index = "https://download.pytorch.org/whl/cu130"
+ cuda_arch_list = "9.0" # Hopper (sm_90)
  else:
  torch_index = "https://download.pytorch.org/whl/cu124"
+ cuda_arch_list = "8.0" # Default to Ampere (sm_80)

  return f'''
  import asyncio
@@ -1189,7 +1183,7 @@ async def run_eval():
  "nvidia/cuda:12.9.0-devel-ubuntu22.04",
  add_python="3.12",
  )
- .apt_install("git", "build-essential", "cmake")
+ .apt_install("git", "build-essential", "cmake", "ripgrep")
  .pip_install(
  "torch",
  index_url="{torch_index}",
@@ -1202,6 +1196,12 @@ async def run_eval():
  )
  .env({{
  "CUDA_HOME": "/usr/local/cuda",
+ # C++ compiler needs explicit include path for cuda_runtime.h
+ "CPLUS_INCLUDE_PATH": "/usr/local/cuda/include",
+ # Linker needs lib path
+ "LIBRARY_PATH": "/usr/local/cuda/lib64",
+ # Force PyTorch to compile for correct GPU architecture
+ "TORCH_CUDA_ARCH_LIST": "{cuda_arch_list}",
  }})
  )

@@ -2021,54 +2021,13 @@ async def run_evaluate_runpod(
  error_message=f"Failed to setup Python environment: {e}",
  )

- # Upload wafer-core to remote
- try:
- wafer_root = _get_wafer_root()
- wafer_core_path = wafer_root / "packages" / "wafer-core"
- print(f"Uploading wafer-core from {wafer_core_path}...")
-
- wafer_core_remote = f"{REMOTE_WORKSPACE}/wafer-core"
- await client.exec(f"mkdir -p {wafer_core_remote}")
- wafer_core_workspace = await client.expand_path(wafer_core_remote)
-
- upload_result = await client.upload_files(
- str(wafer_core_path), wafer_core_workspace, recursive=True
- )
-
- # Wide event logging for upload result
- upload_event = {
- "event": "wafer_core_upload",
- "target": target.name,
- "target_type": "runpod",
- "ssh_host": f"{client.user}@{client.host}:{client.port}",
- "local_path": str(wafer_core_path),
- "remote_path": wafer_core_workspace,
- "success": upload_result.success,
- "files_copied": upload_result.files_copied,
- "duration_seconds": upload_result.duration_seconds,
- "error_message": upload_result.error_message,
- }
- if upload_result.debug_info:
- upload_event["debug_info"] = upload_result.debug_info
- logger.info(json.dumps(upload_event))
-
- # Fail fast if upload failed
- if not upload_result.success:
- print(f"ERROR: Upload failed: {upload_result.error_message}")
- if upload_result.debug_info:
- print(f"Debug info: {json.dumps(upload_result.debug_info, indent=2)}")
- return EvaluateResult(
- success=False,
- all_correct=False,
- correctness_score=0.0,
- geomean_speedup=0.0,
- passed_tests=0,
- total_tests=0,
- error_message=f"Failed to upload wafer-core: {upload_result.error_message}",
- )
-
- print(f"Uploaded {upload_result.files_copied} files")
- except Exception as e:
+ # Install wafer-core in remote venv
+ print("Installing wafer-core...")
+ install_result = await client.exec(
+ f'export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH" && '
+ f"uv pip install --python {python_exe} wafer-core"
+ )
+ if install_result.exit_code != 0:
  return EvaluateResult(
  success=False,
  all_correct=False,
@@ -2076,7 +2035,7 @@ async def run_evaluate_runpod(
  geomean_speedup=0.0,
  passed_tests=0,
  total_tests=0,
- error_message=f"Failed to upload wafer-core: {e}",
+ error_message=f"Failed to install wafer-core: {install_result.stderr}",
  )

  # Select GPU (RunPod pods typically have GPU 0)
@@ -2217,11 +2176,33 @@ async def run_evaluate_runpod(
  error_message=f"Evaluation timed out after {target.eval_timeout}s",
  )

- # Parse output
+ # Show output to user
  stdout = result.stdout
  stderr = result.stderr
+ if stdout:
+ print(stdout)

  if result.exit_code != 0:
+ error_parts = [f"Evaluation failed (exit code {result.exit_code}):"]
+ if stdout:
+ error_parts.append(f"stdout: {stdout}")
+ if stderr:
+ error_parts.append(f"stderr: {stderr}")
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message="\n".join(error_parts),
+ )
+
+ # Read results from results.json file written by evaluate module
+ results_path = f"{run_path}/results.json"
+ cat_result = await client.exec(f"cat {results_path}")
+
+ if cat_result.exit_code != 0:
  return EvaluateResult(
  success=False,
  all_correct=False,
@@ -2229,20 +2210,12 @@ async def run_evaluate_runpod(
  geomean_speedup=0.0,
  passed_tests=0,
  total_tests=0,
- error_message=f"Evaluation failed:\nstdout: {stdout}\nstderr: {stderr}",
+ error_message=f"Failed to read results: {cat_result.stderr}",
  )

- # Find JSON result in output
- result_json = None
- for line in reversed(stdout.strip().split("\n")):
- if line.startswith("{"):
- try:
- result_json = json.loads(line)
- break
- except json.JSONDecodeError:
- continue
-
- if result_json is None:
+ try:
+ results_data = json.loads(cat_result.stdout)
+ except json.JSONDecodeError as e:
  return EvaluateResult(
  success=False,
  all_correct=False,
@@ -2250,10 +2223,12 @@ async def run_evaluate_runpod(
  geomean_speedup=0.0,
  passed_tests=0,
  total_tests=0,
- error_message=f"No JSON result in output:\n{stdout}",
+ error_message=f"Invalid JSON in results: {e}",
  )

- if "error" in result_json:
+ # Extract backend results (same format as DigitalOcean/SSH path)
+ backends = results_data.get("backends", [])
+ if not backends:
  return EvaluateResult(
  success=False,
  all_correct=False,
@@ -2261,18 +2236,20 @@ async def run_evaluate_runpod(
  geomean_speedup=0.0,
  passed_tests=0,
  total_tests=0,
- error_message=result_json["error"],
+ error_message="No backend results found",
  )

- passed = result_json.get("passed", 0)
- total = result_json.get("total", 0)
+ backend = backends[0]
+ correctness_tests = backend.get("correctness_tests", [])
+ passed = sum(1 for t in correctness_tests if t.get("is_correct", False))
+ total = len(correctness_tests)
  correctness = passed / total if total > 0 else 0.0

  return EvaluateResult(
  success=True,
- all_correct=result_json.get("all_correct", False),
+ all_correct=backend.get("all_correct", False),
  correctness_score=correctness,
- geomean_speedup=result_json.get("speedup", 0.0),
+ geomean_speedup=backend.get("geomean_speedup", 0.0),
  passed_tests=passed,
  total_tests=total,
  )
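The RunPod path now reads the same results.json layout as the DigitalOcean/SSH path instead of scraping a JSON line from stdout. A minimal sketch of the shape that parsing assumes; field names beyond those read above are illustrative, not taken from the package:

    # Assumed results.json layout consumed by the parsing in this hunk.
    example_results = {
        "backends": [
            {
                "all_correct": False,
                "geomean_speedup": 1.42,
                "correctness_tests": [
                    {"name": "case_0", "is_correct": True},
                    {"name": "case_1", "is_correct": False},
                ],
            }
        ]
    }
    backend = example_results["backends"][0]
    tests = backend.get("correctness_tests", [])
    passed = sum(1 for t in tests if t.get("is_correct", False))
    print(passed, len(tests), backend.get("geomean_speedup", 0.0))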
@@ -2373,61 +2350,13 @@ async def run_evaluate_digitalocean(
  error_message=f"Failed to setup Python environment: {e}",
  )

- # Upload wafer-core to remote
- try:
- wafer_root = _get_wafer_root()
- wafer_core_path = wafer_root / "packages" / "wafer-core"
- print(f"Uploading wafer-core from {wafer_core_path}...")
-
- wafer_core_remote = f"{REMOTE_WORKSPACE}/wafer-core"
- await client.exec(f"mkdir -p {wafer_core_remote}")
- wafer_core_workspace = await client.expand_path(wafer_core_remote)
-
- # Use SFTP instead of rsync to avoid SSH subprocess timeout issues
- # (DigitalOcean may rate-limit new SSH connections)
- upload_result = await client.upload_files(
- str(wafer_core_path),
- wafer_core_workspace,
- recursive=True,
- use_sftp=True,
- )
-
- # Wide event logging for upload result
- upload_event = {
- "event": "wafer_core_upload",
- "target": target.name,
- "target_type": "digitalocean",
- "ssh_host": f"{client.user}@{client.host}:{client.port}",
- "local_path": str(wafer_core_path),
- "remote_path": wafer_core_workspace,
- "success": upload_result.success,
- "files_copied": upload_result.files_copied,
- "duration_seconds": upload_result.duration_seconds,
- "error_message": upload_result.error_message,
- }
- if upload_result.debug_info:
- upload_event["debug_info"] = upload_result.debug_info
- logger.info(json.dumps(upload_event))
-
- # Fail fast if upload failed
- if not upload_result.success:
- print(f"ERROR: Upload failed: {upload_result.error_message}")
- if upload_result.debug_info:
- print(
- f"Debug info: {json.dumps(upload_result.debug_info, indent=2)}"
- )
- return EvaluateResult(
- success=False,
- all_correct=False,
- correctness_score=0.0,
- geomean_speedup=0.0,
- passed_tests=0,
- total_tests=0,
- error_message=f"Failed to upload wafer-core: {upload_result.error_message}",
- )
-
- print(f"Uploaded {upload_result.files_copied} files")
- except Exception as e:
+ # Install wafer-core in remote venv
+ print("Installing wafer-core...")
+ install_result = await client.exec(
+ f'export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH" && '
+ f"uv pip install --python {python_exe} wafer-core"
+ )
+ if install_result.exit_code != 0:
  return EvaluateResult(
  success=False,
  all_correct=False,
@@ -2435,7 +2364,7 @@ async def run_evaluate_digitalocean(
  geomean_speedup=0.0,
  passed_tests=0,
  total_tests=0,
- error_message=f"Failed to upload wafer-core: {e}",
+ error_message=f"Failed to install wafer-core: {install_result.stderr}",
  )

  # Select GPU (DigitalOcean droplets typically have GPU 0)
@@ -3452,6 +3381,368 @@ def _validate_kernelbench_files(args: KernelBenchEvaluateArgs) -> str | None:
  return None


+ def _build_modal_kernelbench_script(
+ target: ModalTarget,
+ impl_code_b64: str,
+ ref_code_b64: str,
+ eval_script_b64: str,
+ run_benchmarks: bool,
+ run_defensive: bool,
+ defense_code_b64: str | None,
+ seed: int,
+ inputs_code_b64: str | None = None,
+ ) -> str:
+ """Build Python script to create Modal sandbox and run KernelBench evaluation.
+
+ This runs in a subprocess to isolate Modal's asyncio from trio.
+ """
+ gpu_type = target.gpu_type
+
+ # Determine PyTorch index and CUDA arch based on GPU type
+ if gpu_type in ("B200", "GB200"):
+ torch_index = "https://download.pytorch.org/whl/cu130"
+ cuda_arch_list = "10.0" # Blackwell (sm_100)
+ elif gpu_type == "H100":
+ # H100 uses CUDA 13.0 (matches modal_app.py)
+ torch_index = "https://download.pytorch.org/whl/cu130"
+ cuda_arch_list = "9.0" # Hopper (sm_90)
+ else:
+ torch_index = "https://download.pytorch.org/whl/cu124"
+ cuda_arch_list = "8.0" # Default to Ampere (sm_80)
+
+ # Install CUTLASS headers (for cute/tensor.hpp and cutlass/util/*.h) from GitHub
+ # The nvidia-cutlass-dsl pip package doesn't include the C++ headers needed for nvcc
+ # IMPORTANT: symlink to /usr/local/cuda/include because nvcc searches there by default
+ cutlass_install = """
+ .run_commands([
+ # Clone CUTLASS headers from GitHub (shallow clone, full include tree)
+ # Use simple shallow clone - sparse-checkout can be buggy in some environments
+ "git clone --depth 1 https://github.com/NVIDIA/cutlass.git /opt/cutlass",
+ # Verify the util headers exist (for debugging)
+ "ls -la /opt/cutlass/include/cutlass/util/ | head -5",
+ # Symlink headers to CUDA include path (nvcc searches here by default)
+ "ln -sf /opt/cutlass/include/cute /usr/local/cuda/include/cute",
+ "ln -sf /opt/cutlass/include/cutlass /usr/local/cuda/include/cutlass",
+ ])
+ .pip_install(
+ "nvidia-cutlass-dsl",
+ index_url="https://pypi.nvidia.com",
+ extra_index_url="https://pypi.org/simple",
+ )
+ """
+
+ inputs_write = ""
+ if inputs_code_b64:
+ inputs_write = f'''
+ # Write custom inputs
+ proc = sandbox.exec("python", "-c", f"""
+ import base64
+ with open('/workspace/custom_inputs.py', 'w') as f:
+ f.write(base64.b64decode('{inputs_code_b64}').decode())
+ print('Custom inputs written')
+ """)
+ proc.wait()
+ '''
+
+ defense_write = ""
+ if run_defensive and defense_code_b64:
+ defense_write = f'''
+ # Write defense module
+ proc = sandbox.exec("python", "-c", f"""
+ import base64
+ with open('/workspace/defense.py', 'w') as f:
+ f.write(base64.b64decode('{defense_code_b64}').decode())
+ print('Defense module written')
+ """)
+ proc.wait()
+ '''
+
+ # Build eval command
+ eval_cmd_parts = [
+ "python /workspace/kernelbench_eval.py",
+ "--impl /workspace/implementation.py",
+ "--reference /workspace/reference.py",
+ "--output /workspace/results.json",
+ f"--seed {seed}",
+ ]
+ if run_benchmarks:
+ eval_cmd_parts.append("--benchmark")
+ if run_defensive and defense_code_b64:
+ eval_cmd_parts.append("--defensive")
+ eval_cmd_parts.append("--defense-module /workspace/defense.py")
+ if inputs_code_b64:
+ eval_cmd_parts.append("--inputs /workspace/custom_inputs.py")
+
+ eval_cmd = " ".join(eval_cmd_parts)
+
+ return f'''
+ import asyncio
+ import base64
+ import json
+ import sys
+ import modal
+
+ async def run_eval():
+ app = modal.App.lookup("wafer-evaluate", create_if_missing=True)
+
+ # Build image with PyTorch, CUTLASS DSL and dependencies
+ image = (
+ modal.Image.from_registry(
+ "nvidia/cuda:12.9.0-devel-ubuntu22.04",
+ add_python="3.12",
+ )
+ .apt_install("git", "build-essential", "cmake", "ninja-build", "ripgrep")
+ .pip_install(
+ "torch",
+ index_url="{torch_index}",
+ extra_index_url="https://pypi.org/simple",
+ )
+ .pip_install(
+ "numpy",
+ "triton",
+ "ninja",
+ )
+ {cutlass_install}
+ .env({{
+ "CUDA_HOME": "/usr/local/cuda",
+ # C++ compiler needs explicit include path for cuda_runtime.h
+ "CPLUS_INCLUDE_PATH": "/usr/local/cuda/include",
+ # Linker needs lib path
+ "LIBRARY_PATH": "/usr/local/cuda/lib64",
+ # Force PyTorch to compile for correct GPU architecture
+ "TORCH_CUDA_ARCH_LIST": "{cuda_arch_list}",
+ }})
+ )
+
+ # Create sandbox
+ sandbox = modal.Sandbox.create(
+ app=app,
+ image=image,
+ gpu="{gpu_type}",
+ timeout={target.timeout_seconds},
+ )
+
+ try:
+ # Create workspace directory
+ sandbox.exec("mkdir", "-p", "/workspace").wait()
+
+ # Write files to sandbox
+ proc = sandbox.exec("python", "-c", f"""
+ import base64
+ with open('/workspace/implementation.py', 'w') as f:
+ f.write(base64.b64decode('{impl_code_b64}').decode())
+ with open('/workspace/reference.py', 'w') as f:
+ f.write(base64.b64decode('{ref_code_b64}').decode())
+ with open('/workspace/kernelbench_eval.py', 'w') as f:
+ f.write(base64.b64decode('{eval_script_b64}').decode())
+ print('Files written')
+ """)
+ proc.wait()
+ if proc.returncode != 0:
+ print(json.dumps({{"success": False, "error": f"Failed to write files: {{proc.stderr.read()}}"}}))
+ return
+ {inputs_write}
+ {defense_write}
+ # Run evaluation
+ print(f"Running KernelBench evaluation on {{'{gpu_type}'}}...")
+ proc = sandbox.exec("bash", "-c", "{eval_cmd}")
+
+ # Stream output
+ for line in proc.stdout:
+ print(line, end="")
+ for line in proc.stderr:
+ print(line, end="", file=sys.stderr)
+
+ proc.wait()
+
+ if proc.returncode != 0:
+ print(json.dumps({{"success": False, "error": f"Evaluation failed with exit code {{proc.returncode}}"}}))
+ return
+
+ # Read results
+ result_proc = sandbox.exec("cat", "/workspace/results.json")
+ result_data = result_proc.stdout.read()
+ result_proc.wait()
+
+ if result_data:
+ results = json.loads(result_data)
+ print("EVAL_RESULT_JSON:" + json.dumps(results))
+ else:
+ print(json.dumps({{"success": False, "error": "No results.json found"}}))
+
+ finally:
+ sandbox.terminate()
+
+ asyncio.run(run_eval())
+ '''
+
+
+ async def run_evaluate_kernelbench_modal(
+ args: KernelBenchEvaluateArgs,
+ target: ModalTarget,
+ ) -> EvaluateResult:
+ """Run KernelBench format evaluation on Modal sandbox.
+
+ Creates a Modal sandbox, uploads files, runs KernelBench eval, and parses results.
+ Uses subprocess to isolate Modal's asyncio from trio.
+ """
+ import base64
+ import subprocess
+ import sys
+
+ import trio
+
+ print(f"Creating Modal sandbox ({target.gpu_type}) for KernelBench evaluation...")
+
+ # Encode files as base64
+ impl_code_b64 = base64.b64encode(args.implementation.read_bytes()).decode()
+ ref_code_b64 = base64.b64encode(args.reference.read_bytes()).decode()
+ eval_script_b64 = base64.b64encode(KERNELBENCH_EVAL_SCRIPT.encode()).decode()
+
+ # Encode custom inputs if provided
+ inputs_code_b64 = None
+ if args.inputs:
+ inputs_code_b64 = base64.b64encode(args.inputs.read_bytes()).decode()
+
+ # Encode defense module if defensive mode is enabled
+ defense_code_b64 = None
+ if args.defensive:
+ defense_path = (
+ Path(__file__).parent.parent.parent.parent
+ / "packages"
+ / "wafer-core"
+ / "wafer_core"
+ / "utils"
+ / "kernel_utils"
+ / "defense.py"
+ )
+ if defense_path.exists():
+ defense_code_b64 = base64.b64encode(defense_path.read_bytes()).decode()
+ else:
+ print(f"Warning: defense.py not found at {defense_path}, falling back to basic defense")
+
+ # Build the script
+ script = _build_modal_kernelbench_script(
+ target=target,
+ impl_code_b64=impl_code_b64,
+ ref_code_b64=ref_code_b64,
+ eval_script_b64=eval_script_b64,
+ run_benchmarks=args.benchmark,
+ run_defensive=args.defensive,
+ defense_code_b64=defense_code_b64,
+ seed=args.seed,
+ inputs_code_b64=inputs_code_b64,
+ )
+
+ def _run_subprocess() -> tuple[str, str, int]:
+ result = subprocess.run(
+ [sys.executable, "-c", script],
+ capture_output=True,
+ text=True,
+ timeout=target.timeout_seconds + 120, # Extra buffer for sandbox creation + image build
+ )
+ return result.stdout, result.stderr, result.returncode
+
+ try:
+ stdout, stderr, returncode = await trio.to_thread.run_sync(_run_subprocess)
+ except subprocess.TimeoutExpired:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Modal KernelBench evaluation timed out after {target.timeout_seconds}s",
+ )
+ except Exception as e:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Failed to run Modal sandbox: {e}",
+ )
+
+ # Print output for debugging
+ if stdout:
+ for line in stdout.split("\n"):
+ if not line.startswith("EVAL_RESULT_JSON:"):
+ print(line)
+ if stderr:
+ print(stderr, file=sys.stderr)
+
+ if returncode != 0:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Modal sandbox failed (exit {returncode}): {stderr or stdout}",
+ )
+
+ # Parse results from stdout
+ result_json = None
+ for line in stdout.split("\n"):
+ if line.startswith("EVAL_RESULT_JSON:"):
+ result_json = line[len("EVAL_RESULT_JSON:") :]
+ break
+
+ if not result_json:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message="No results found in Modal output",
+ )
+
+ try:
+ results = json.loads(result_json)
+ except json.JSONDecodeError as e:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Failed to parse results JSON: {e}",
+ )
+
+ # Check for error in results
+ if "error" in results and results.get("success") is False:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=results.get("error", "Unknown error"),
+ )
+
+ # Extract metrics from results
+ return EvaluateResult(
+ success=True,
+ all_correct=results.get("all_correct", False),
+ correctness_score=float(results.get("correctness_score", 0.0)),
+ geomean_speedup=float(results.get("geomean_speedup", 0.0)),
+ passed_tests=int(results.get("passed_tests", 0)),
+ total_tests=int(results.get("total_tests", 0)),
+ error_message=results.get("error"),
+ test_results=results.get("test_results", []),
+ compilation_time_s=results.get("compilation_time_s"),
+ profiling_stats=results.get("profiling_stats"),
+ )
+
+
  async def run_evaluate_kernelbench_docker(
  args: KernelBenchEvaluateArgs,
  target: BaremetalTarget | VMTarget,
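The parent process and the generated sandbox script communicate over stdout with a sentinel line, as read back in run_evaluate_kernelbench_modal above. A small, self-contained sketch of that protocol; the payload fields shown are illustrative:

    import json

    # Simulated stdout from the Modal subprocess: build logs plus one sentinel line.
    stdout = (
        "Building image...\n"
        'EVAL_RESULT_JSON:{"all_correct": true, "passed_tests": 3, "total_tests": 3, "geomean_speedup": 1.15}\n'
    )
    payload = None
    for line in stdout.split("\n"):
        if line.startswith("EVAL_RESULT_JSON:"):
            payload = json.loads(line[len("EVAL_RESULT_JSON:"):])
            break
    print(payload)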
@@ -4112,6 +4403,7 @@ async def run_evaluate_kernelbench_runpod(
  # Find Python with PyTorch - check common locations on RunPod
  python_exe = "python3"
  for candidate in [
+ "/opt/venv/bin/python3",
  "/opt/conda/envs/py_3.10/bin/python3",
  "/opt/conda/bin/python3",
  ]:
@@ -4245,6 +4537,22 @@ async def run_evaluate_kernelbench_runpod(
  )


+ async def run_evaluate_kernelbench_baremetal_direct(
+ args: KernelBenchEvaluateArgs,
+ target: BaremetalTarget,
+ ) -> EvaluateResult:
+ """Run KernelBench format evaluation directly on NVIDIA target (no Docker).
+
+ For targets that already have PyTorch/CUDA installed (e.g., workspace containers).
+ Uses CUDA_VISIBLE_DEVICES for GPU selection.
+ """
+ # Reuse the AMD function but with CUDA env vars
+ # The logic is identical, just the GPU env var is different
+ return await _run_evaluate_kernelbench_baremetal_direct_impl(
+ args, target, gpu_env_var="CUDA_VISIBLE_DEVICES"
+ )
+
+
  async def run_evaluate_kernelbench_baremetal_amd(
  args: KernelBenchEvaluateArgs,
  target: BaremetalTarget,
@@ -4254,6 +4562,20 @@ async def run_evaluate_kernelbench_baremetal_amd(
  Runs evaluation script directly on host (no Docker) for AMD GPUs
  that have PyTorch/ROCm installed.
  """
+ return await _run_evaluate_kernelbench_baremetal_direct_impl(
+ args, target, gpu_env_var="HIP_VISIBLE_DEVICES"
+ )
+
+
+ async def _run_evaluate_kernelbench_baremetal_direct_impl(
+ args: KernelBenchEvaluateArgs,
+ target: BaremetalTarget,
+ gpu_env_var: str = "HIP_VISIBLE_DEVICES",
+ ) -> EvaluateResult:
+ """Internal implementation for direct baremetal evaluation.
+
+ Runs evaluation script directly on host (no Docker).
+ """
  from datetime import datetime

  from wafer_core.async_ssh import AsyncSSHClient
@@ -4404,11 +4726,17 @@ async def run_evaluate_kernelbench_baremetal_amd(

  eval_cmd = " ".join(python_cmd_parts)

- # Set environment for AMD GPU and run
- # PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
- rocm_arch = _get_rocm_arch(target.compute_capability)
- arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
- env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
+ # Set environment for GPU and run
+ if gpu_env_var == "HIP_VISIBLE_DEVICES":
+ # AMD: PYTORCH_ROCM_ARCH for faster compile
+ rocm_arch = _get_rocm_arch(target.compute_capability)
+ arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
+ env_vars = (
+ f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
+ )
+ else:
+ # NVIDIA: just set CUDA_VISIBLE_DEVICES
+ env_vars = f"CUDA_VISIBLE_DEVICES={gpu_id} PYTHONUNBUFFERED=1"
  full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"

  # Handle prepare-only mode
@@ -4559,10 +4887,16 @@ async def run_evaluate_kernelbench(args: KernelBenchEvaluateArgs) -> EvaluateRes
  elif isinstance(target, RunPodTarget):
  # RunPod AMD MI300X - uses ROCm Docker with device passthrough
  return await run_evaluate_kernelbench_runpod(args, target)
+ elif isinstance(target, ModalTarget):
+ # Modal serverless - runs in Modal sandbox
+ return await run_evaluate_kernelbench_modal(args, target)
  elif isinstance(target, BaremetalTarget | VMTarget):
  # Check if this is an AMD target (gfx* compute capability) - run directly
  if target.compute_capability and target.compute_capability.startswith("gfx"):
  return await run_evaluate_kernelbench_baremetal_amd(args, target)
+ # Check for direct execution flag (workspace containers that already have everything)
+ if getattr(target, "direct", False):
+ return await run_evaluate_kernelbench_baremetal_direct(args, target)
  # NVIDIA targets - require docker_image to be set
  if not target.docker_image:
  return EvaluateResult(