wafer-cli 0.2.14__py3-none-any.whl → 0.2.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/evaluate.py CHANGED
@@ -354,7 +354,8 @@ def _build_docker_pip_install_cmd(target: BaremetalTarget | VMTarget) -> str:
  )

  # Install uv (fast, reliable) - use pip3 for compatibility
- commands.append("pip3 install uv")
+ # Use --break-system-packages for Python 3.12+ with PEP 668 externally managed environments
+ commands.append("pip3 install --break-system-packages uv")

  # Install torch with custom index if specified (like Modal's two-phase install)
  # Use --system --break-system-packages to install to container's Python
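For context: on images whose interpreter is marked externally managed under PEP 668 (Python 3.12 on recent Debian/Ubuntu bases), a bare pip3 install aborts with an externally-managed-environment error, which is what the added --break-system-packages flag works around. A minimal sketch of the install prefix this hunk builds; the follow-up torch step is hypothetical, shaped after the comments above, not copied from the package:

    # Sketch only -- mirrors the shape of _build_docker_pip_install_cmd, not its full logic.
    commands = []
    # Bootstrap uv past the PEP 668 guard.
    commands.append("pip3 install --break-system-packages uv")
    # Hypothetical follow-up: install into the container's system Python via uv.
    commands.append("uv pip install --system --break-system-packages torch")
    print(" && ".join(commands))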
@@ -378,18 +379,6 @@ def _build_docker_pip_install_cmd(target: BaremetalTarget | VMTarget) -> str:
  return " && ".join(commands)


- def _get_wafer_root() -> Path:
- """Get wafer monorepo root directory.
-
- Walks up from this file to find the wafer repo root (contains apps/, packages/).
- """
- current = Path(__file__).resolve()
- for parent in [current] + list(current.parents):
- if (parent / "apps").is_dir() and (parent / "packages").is_dir():
- return parent
- raise RuntimeError(f"Could not find wafer root from {__file__}")
-
-
  async def run_evaluate_docker(
  args: EvaluateArgs,
  target: BaremetalTarget | VMTarget,
@@ -1167,11 +1156,16 @@ def _build_modal_sandbox_script(
  """
  gpu_type = target.gpu_type

- # Determine PyTorch index based on GPU type
+ # Determine PyTorch index and CUDA arch based on GPU type
  if gpu_type in ("B200", "GB200"):
- torch_index = "https://download.pytorch.org/whl/nightly/cu128"
+ torch_index = "https://download.pytorch.org/whl/cu130"
+ cuda_arch_list = "10.0" # Blackwell (sm_100)
+ elif gpu_type == "H100":
+ torch_index = "https://download.pytorch.org/whl/cu130"
+ cuda_arch_list = "9.0" # Hopper (sm_90)
  else:
  torch_index = "https://download.pytorch.org/whl/cu124"
+ cuda_arch_list = "8.0" # Default to Ampere (sm_80)

  return f'''
  import asyncio
@@ -1189,7 +1183,7 @@ async def run_eval():
  "nvidia/cuda:12.9.0-devel-ubuntu22.04",
  add_python="3.12",
  )
- .apt_install("git", "build-essential", "cmake")
+ .apt_install("git", "build-essential", "cmake", "ripgrep")
  .pip_install(
  "torch",
  index_url="{torch_index}",
@@ -1202,6 +1196,12 @@ async def run_eval():
  )
  .env({{
  "CUDA_HOME": "/usr/local/cuda",
+ # C++ compiler needs explicit include path for cuda_runtime.h
+ "CPLUS_INCLUDE_PATH": "/usr/local/cuda/include",
+ # Linker needs lib path
+ "LIBRARY_PATH": "/usr/local/cuda/lib64",
+ # Force PyTorch to compile for correct GPU architecture
+ "TORCH_CUDA_ARCH_LIST": "{cuda_arch_list}",
  }})
  )

@@ -2021,54 +2021,13 @@ async def run_evaluate_runpod(
  error_message=f"Failed to setup Python environment: {e}",
  )

- # Upload wafer-core to remote
- try:
- wafer_root = _get_wafer_root()
- wafer_core_path = wafer_root / "packages" / "wafer-core"
- print(f"Uploading wafer-core from {wafer_core_path}...")
-
- wafer_core_remote = f"{REMOTE_WORKSPACE}/wafer-core"
- await client.exec(f"mkdir -p {wafer_core_remote}")
- wafer_core_workspace = await client.expand_path(wafer_core_remote)
-
- upload_result = await client.upload_files(
- str(wafer_core_path), wafer_core_workspace, recursive=True
- )
-
- # Wide event logging for upload result
- upload_event = {
- "event": "wafer_core_upload",
- "target": target.name,
- "target_type": "runpod",
- "ssh_host": f"{client.user}@{client.host}:{client.port}",
- "local_path": str(wafer_core_path),
- "remote_path": wafer_core_workspace,
- "success": upload_result.success,
- "files_copied": upload_result.files_copied,
- "duration_seconds": upload_result.duration_seconds,
- "error_message": upload_result.error_message,
- }
- if upload_result.debug_info:
- upload_event["debug_info"] = upload_result.debug_info
- logger.info(json.dumps(upload_event))
-
- # Fail fast if upload failed
- if not upload_result.success:
- print(f"ERROR: Upload failed: {upload_result.error_message}")
- if upload_result.debug_info:
- print(f"Debug info: {json.dumps(upload_result.debug_info, indent=2)}")
- return EvaluateResult(
- success=False,
- all_correct=False,
- correctness_score=0.0,
- geomean_speedup=0.0,
- passed_tests=0,
- total_tests=0,
- error_message=f"Failed to upload wafer-core: {upload_result.error_message}",
- )
-
- print(f"Uploaded {upload_result.files_copied} files")
- except Exception as e:
+ # Install wafer-core in remote venv
+ print("Installing wafer-core...")
+ install_result = await client.exec(
+ f'export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH" && '
+ f"uv pip install --python {python_exe} wafer-core"
+ )
+ if install_result.exit_code != 0:
  return EvaluateResult(
  success=False,
  all_correct=False,
@@ -2076,7 +2035,7 @@ async def run_evaluate_runpod(
  geomean_speedup=0.0,
  passed_tests=0,
  total_tests=0,
- error_message=f"Failed to upload wafer-core: {e}",
+ error_message=f"Failed to install wafer-core: {install_result.stderr}",
  )

  # Select GPU (RunPod pods typically have GPU 0)
@@ -2217,11 +2176,33 @@ async def run_evaluate_runpod(
  error_message=f"Evaluation timed out after {target.eval_timeout}s",
  )

- # Parse output
+ # Show output to user
  stdout = result.stdout
  stderr = result.stderr
+ if stdout:
+ print(stdout)

  if result.exit_code != 0:
+ error_parts = [f"Evaluation failed (exit code {result.exit_code}):"]
+ if stdout:
+ error_parts.append(f"stdout: {stdout}")
+ if stderr:
+ error_parts.append(f"stderr: {stderr}")
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message="\n".join(error_parts),
+ )
+
+ # Read results from results.json file written by evaluate module
+ results_path = f"{run_path}/results.json"
+ cat_result = await client.exec(f"cat {results_path}")
+
+ if cat_result.exit_code != 0:
  return EvaluateResult(
  success=False,
  all_correct=False,
@@ -2229,20 +2210,12 @@ async def run_evaluate_runpod(
  geomean_speedup=0.0,
  passed_tests=0,
  total_tests=0,
- error_message=f"Evaluation failed:\nstdout: {stdout}\nstderr: {stderr}",
+ error_message=f"Failed to read results: {cat_result.stderr}",
  )

- # Find JSON result in output
- result_json = None
- for line in reversed(stdout.strip().split("\n")):
- if line.startswith("{"):
- try:
- result_json = json.loads(line)
- break
- except json.JSONDecodeError:
- continue
-
- if result_json is None:
+ try:
+ results_data = json.loads(cat_result.stdout)
+ except json.JSONDecodeError as e:
  return EvaluateResult(
  success=False,
  all_correct=False,
@@ -2250,10 +2223,12 @@ async def run_evaluate_runpod(
  geomean_speedup=0.0,
  passed_tests=0,
  total_tests=0,
- error_message=f"No JSON result in output:\n{stdout}",
+ error_message=f"Invalid JSON in results: {e}",
  )

- if "error" in result_json:
+ # Extract backend results (same format as DigitalOcean/SSH path)
+ backends = results_data.get("backends", [])
+ if not backends:
  return EvaluateResult(
  success=False,
  all_correct=False,
@@ -2261,18 +2236,20 @@ async def run_evaluate_runpod(
  geomean_speedup=0.0,
  passed_tests=0,
  total_tests=0,
- error_message=result_json["error"],
+ error_message="No backend results found",
  )

- passed = result_json.get("passed", 0)
- total = result_json.get("total", 0)
+ backend = backends[0]
+ correctness_tests = backend.get("correctness_tests", [])
+ passed = sum(1 for t in correctness_tests if t.get("is_correct", False))
+ total = len(correctness_tests)
  correctness = passed / total if total > 0 else 0.0

  return EvaluateResult(
  success=True,
- all_correct=result_json.get("all_correct", False),
+ all_correct=backend.get("all_correct", False),
  correctness_score=correctness,
- geomean_speedup=result_json.get("speedup", 0.0),
+ geomean_speedup=backend.get("geomean_speedup", 0.0),
  passed_tests=passed,
  total_tests=total,
  )
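The RunPod path now reads the same results.json layout as the DigitalOcean/SSH path instead of scraping a JSON line from stdout. A minimal sketch of the shape that parsing assumes; field names beyond those read above are illustrative, not taken from the package:

    # Assumed results.json layout consumed by the parsing in this hunk.
    example_results = {
        "backends": [
            {
                "all_correct": False,
                "geomean_speedup": 1.42,
                "correctness_tests": [
                    {"name": "case_0", "is_correct": True},
                    {"name": "case_1", "is_correct": False},
                ],
            }
        ]
    }
    backend = example_results["backends"][0]
    tests = backend.get("correctness_tests", [])
    passed = sum(1 for t in tests if t.get("is_correct", False))
    print(passed, len(tests), backend.get("geomean_speedup", 0.0))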
@@ -2373,61 +2350,13 @@ async def run_evaluate_digitalocean(
  error_message=f"Failed to setup Python environment: {e}",
  )

- # Upload wafer-core to remote
- try:
- wafer_root = _get_wafer_root()
- wafer_core_path = wafer_root / "packages" / "wafer-core"
- print(f"Uploading wafer-core from {wafer_core_path}...")
-
- wafer_core_remote = f"{REMOTE_WORKSPACE}/wafer-core"
- await client.exec(f"mkdir -p {wafer_core_remote}")
- wafer_core_workspace = await client.expand_path(wafer_core_remote)
-
- # Use SFTP instead of rsync to avoid SSH subprocess timeout issues
- # (DigitalOcean may rate-limit new SSH connections)
- upload_result = await client.upload_files(
- str(wafer_core_path),
- wafer_core_workspace,
- recursive=True,
- use_sftp=True,
- )
-
- # Wide event logging for upload result
- upload_event = {
- "event": "wafer_core_upload",
- "target": target.name,
- "target_type": "digitalocean",
- "ssh_host": f"{client.user}@{client.host}:{client.port}",
- "local_path": str(wafer_core_path),
- "remote_path": wafer_core_workspace,
- "success": upload_result.success,
- "files_copied": upload_result.files_copied,
- "duration_seconds": upload_result.duration_seconds,
- "error_message": upload_result.error_message,
- }
- if upload_result.debug_info:
- upload_event["debug_info"] = upload_result.debug_info
- logger.info(json.dumps(upload_event))
-
- # Fail fast if upload failed
- if not upload_result.success:
- print(f"ERROR: Upload failed: {upload_result.error_message}")
- if upload_result.debug_info:
- print(
- f"Debug info: {json.dumps(upload_result.debug_info, indent=2)}"
- )
- return EvaluateResult(
- success=False,
- all_correct=False,
- correctness_score=0.0,
- geomean_speedup=0.0,
- passed_tests=0,
- total_tests=0,
- error_message=f"Failed to upload wafer-core: {upload_result.error_message}",
- )
-
- print(f"Uploaded {upload_result.files_copied} files")
- except Exception as e:
+ # Install wafer-core in remote venv
+ print("Installing wafer-core...")
+ install_result = await client.exec(
+ f'export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH" && '
+ f"uv pip install --python {python_exe} wafer-core"
+ )
+ if install_result.exit_code != 0:
  return EvaluateResult(
  success=False,
  all_correct=False,
@@ -2435,7 +2364,7 @@ async def run_evaluate_digitalocean(
  geomean_speedup=0.0,
  passed_tests=0,
  total_tests=0,
- error_message=f"Failed to upload wafer-core: {e}",
+ error_message=f"Failed to install wafer-core: {install_result.stderr}",
  )

  # Select GPU (DigitalOcean droplets typically have GPU 0)
@@ -3452,6 +3381,368 @@ def _validate_kernelbench_files(args: KernelBenchEvaluateArgs) -> str | None:
  return None


+ def _build_modal_kernelbench_script(
+ target: ModalTarget,
+ impl_code_b64: str,
+ ref_code_b64: str,
+ eval_script_b64: str,
+ run_benchmarks: bool,
+ run_defensive: bool,
+ defense_code_b64: str | None,
+ seed: int,
+ inputs_code_b64: str | None = None,
+ ) -> str:
+ """Build Python script to create Modal sandbox and run KernelBench evaluation.
+
+ This runs in a subprocess to isolate Modal's asyncio from trio.
+ """
+ gpu_type = target.gpu_type
+
+ # Determine PyTorch index and CUDA arch based on GPU type
+ if gpu_type in ("B200", "GB200"):
+ torch_index = "https://download.pytorch.org/whl/cu130"
+ cuda_arch_list = "10.0" # Blackwell (sm_100)
+ elif gpu_type == "H100":
+ # H100 uses CUDA 13.0 (matches modal_app.py)
+ torch_index = "https://download.pytorch.org/whl/cu130"
+ cuda_arch_list = "9.0" # Hopper (sm_90)
+ else:
+ torch_index = "https://download.pytorch.org/whl/cu124"
+ cuda_arch_list = "8.0" # Default to Ampere (sm_80)
+
+ # Install CUTLASS headers (for cute/tensor.hpp and cutlass/util/*.h) from GitHub
+ # The nvidia-cutlass-dsl pip package doesn't include the C++ headers needed for nvcc
+ # IMPORTANT: symlink to /usr/local/cuda/include because nvcc searches there by default
+ cutlass_install = """
+ .run_commands([
+ # Clone CUTLASS headers from GitHub (shallow clone, full include tree)
+ # Use simple shallow clone - sparse-checkout can be buggy in some environments
+ "git clone --depth 1 https://github.com/NVIDIA/cutlass.git /opt/cutlass",
+ # Verify the util headers exist (for debugging)
+ "ls -la /opt/cutlass/include/cutlass/util/ | head -5",
+ # Symlink headers to CUDA include path (nvcc searches here by default)
+ "ln -sf /opt/cutlass/include/cute /usr/local/cuda/include/cute",
+ "ln -sf /opt/cutlass/include/cutlass /usr/local/cuda/include/cutlass",
+ ])
+ .pip_install(
+ "nvidia-cutlass-dsl",
+ index_url="https://pypi.nvidia.com",
+ extra_index_url="https://pypi.org/simple",
+ )
+ """
+
+ inputs_write = ""
+ if inputs_code_b64:
+ inputs_write = f'''
+ # Write custom inputs
+ proc = sandbox.exec("python", "-c", f"""
+ import base64
+ with open('/workspace/custom_inputs.py', 'w') as f:
+ f.write(base64.b64decode('{inputs_code_b64}').decode())
+ print('Custom inputs written')
+ """)
+ proc.wait()
+ '''
+
+ defense_write = ""
+ if run_defensive and defense_code_b64:
+ defense_write = f'''
+ # Write defense module
+ proc = sandbox.exec("python", "-c", f"""
+ import base64
+ with open('/workspace/defense.py', 'w') as f:
+ f.write(base64.b64decode('{defense_code_b64}').decode())
+ print('Defense module written')
+ """)
+ proc.wait()
+ '''
+
+ # Build eval command
+ eval_cmd_parts = [
+ "python /workspace/kernelbench_eval.py",
+ "--impl /workspace/implementation.py",
+ "--reference /workspace/reference.py",
+ "--output /workspace/results.json",
+ f"--seed {seed}",
+ ]
+ if run_benchmarks:
+ eval_cmd_parts.append("--benchmark")
+ if run_defensive and defense_code_b64:
+ eval_cmd_parts.append("--defensive")
+ eval_cmd_parts.append("--defense-module /workspace/defense.py")
+ if inputs_code_b64:
+ eval_cmd_parts.append("--inputs /workspace/custom_inputs.py")
+
+ eval_cmd = " ".join(eval_cmd_parts)
+
+ return f'''
+ import asyncio
+ import base64
+ import json
+ import sys
+ import modal
+
+ async def run_eval():
+ app = modal.App.lookup("wafer-evaluate", create_if_missing=True)
+
+ # Build image with PyTorch, CUTLASS DSL and dependencies
+ image = (
+ modal.Image.from_registry(
+ "nvidia/cuda:12.9.0-devel-ubuntu22.04",
+ add_python="3.12",
+ )
+ .apt_install("git", "build-essential", "cmake", "ninja-build", "ripgrep")
+ .pip_install(
+ "torch",
+ index_url="{torch_index}",
+ extra_index_url="https://pypi.org/simple",
+ )
+ .pip_install(
+ "numpy",
+ "triton",
+ "ninja",
+ )
+ {cutlass_install}
+ .env({{
+ "CUDA_HOME": "/usr/local/cuda",
+ # C++ compiler needs explicit include path for cuda_runtime.h
+ "CPLUS_INCLUDE_PATH": "/usr/local/cuda/include",
+ # Linker needs lib path
+ "LIBRARY_PATH": "/usr/local/cuda/lib64",
+ # Force PyTorch to compile for correct GPU architecture
+ "TORCH_CUDA_ARCH_LIST": "{cuda_arch_list}",
+ }})
+ )
+
+ # Create sandbox
+ sandbox = modal.Sandbox.create(
+ app=app,
+ image=image,
+ gpu="{gpu_type}",
+ timeout={target.timeout_seconds},
+ )
+
+ try:
+ # Create workspace directory
+ sandbox.exec("mkdir", "-p", "/workspace").wait()
+
+ # Write files to sandbox
+ proc = sandbox.exec("python", "-c", f"""
+ import base64
+ with open('/workspace/implementation.py', 'w') as f:
+ f.write(base64.b64decode('{impl_code_b64}').decode())
+ with open('/workspace/reference.py', 'w') as f:
+ f.write(base64.b64decode('{ref_code_b64}').decode())
+ with open('/workspace/kernelbench_eval.py', 'w') as f:
+ f.write(base64.b64decode('{eval_script_b64}').decode())
+ print('Files written')
+ """)
+ proc.wait()
+ if proc.returncode != 0:
+ print(json.dumps({{"success": False, "error": f"Failed to write files: {{proc.stderr.read()}}"}}))
+ return
+ {inputs_write}
+ {defense_write}
+ # Run evaluation
+ print(f"Running KernelBench evaluation on {{'{gpu_type}'}}...")
+ proc = sandbox.exec("bash", "-c", "{eval_cmd}")
+
+ # Stream output
+ for line in proc.stdout:
+ print(line, end="")
+ for line in proc.stderr:
+ print(line, end="", file=sys.stderr)
+
+ proc.wait()
+
+ if proc.returncode != 0:
+ print(json.dumps({{"success": False, "error": f"Evaluation failed with exit code {{proc.returncode}}"}}))
+ return
+
+ # Read results
+ result_proc = sandbox.exec("cat", "/workspace/results.json")
+ result_data = result_proc.stdout.read()
+ result_proc.wait()
+
+ if result_data:
+ results = json.loads(result_data)
+ print("EVAL_RESULT_JSON:" + json.dumps(results))
+ else:
+ print(json.dumps({{"success": False, "error": "No results.json found"}}))
+
+ finally:
+ sandbox.terminate()
+
+ asyncio.run(run_eval())
+ '''
+
+
+ async def run_evaluate_kernelbench_modal(
+ args: KernelBenchEvaluateArgs,
+ target: ModalTarget,
+ ) -> EvaluateResult:
+ """Run KernelBench format evaluation on Modal sandbox.
+
+ Creates a Modal sandbox, uploads files, runs KernelBench eval, and parses results.
+ Uses subprocess to isolate Modal's asyncio from trio.
+ """
+ import base64
+ import subprocess
+ import sys
+
+ import trio
+
+ print(f"Creating Modal sandbox ({target.gpu_type}) for KernelBench evaluation...")
+
+ # Encode files as base64
+ impl_code_b64 = base64.b64encode(args.implementation.read_bytes()).decode()
+ ref_code_b64 = base64.b64encode(args.reference.read_bytes()).decode()
+ eval_script_b64 = base64.b64encode(KERNELBENCH_EVAL_SCRIPT.encode()).decode()
+
+ # Encode custom inputs if provided
+ inputs_code_b64 = None
+ if args.inputs:
+ inputs_code_b64 = base64.b64encode(args.inputs.read_bytes()).decode()
+
+ # Encode defense module if defensive mode is enabled
+ defense_code_b64 = None
+ if args.defensive:
+ defense_path = (
+ Path(__file__).parent.parent.parent.parent
+ / "packages"
+ / "wafer-core"
+ / "wafer_core"
+ / "utils"
+ / "kernel_utils"
+ / "defense.py"
+ )
+ if defense_path.exists():
+ defense_code_b64 = base64.b64encode(defense_path.read_bytes()).decode()
+ else:
+ print(f"Warning: defense.py not found at {defense_path}, falling back to basic defense")
+
+ # Build the script
+ script = _build_modal_kernelbench_script(
+ target=target,
+ impl_code_b64=impl_code_b64,
+ ref_code_b64=ref_code_b64,
+ eval_script_b64=eval_script_b64,
+ run_benchmarks=args.benchmark,
+ run_defensive=args.defensive,
+ defense_code_b64=defense_code_b64,
+ seed=args.seed,
+ inputs_code_b64=inputs_code_b64,
+ )
+
+ def _run_subprocess() -> tuple[str, str, int]:
+ result = subprocess.run(
+ [sys.executable, "-c", script],
+ capture_output=True,
+ text=True,
+ timeout=target.timeout_seconds + 120, # Extra buffer for sandbox creation + image build
+ )
+ return result.stdout, result.stderr, result.returncode
+
+ try:
+ stdout, stderr, returncode = await trio.to_thread.run_sync(_run_subprocess)
+ except subprocess.TimeoutExpired:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Modal KernelBench evaluation timed out after {target.timeout_seconds}s",
+ )
+ except Exception as e:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Failed to run Modal sandbox: {e}",
+ )
+
+ # Print output for debugging
+ if stdout:
+ for line in stdout.split("\n"):
+ if not line.startswith("EVAL_RESULT_JSON:"):
+ print(line)
+ if stderr:
+ print(stderr, file=sys.stderr)
+
+ if returncode != 0:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Modal sandbox failed (exit {returncode}): {stderr or stdout}",
+ )
+
+ # Parse results from stdout
+ result_json = None
+ for line in stdout.split("\n"):
+ if line.startswith("EVAL_RESULT_JSON:"):
+ result_json = line[len("EVAL_RESULT_JSON:") :]
+ break
+
+ if not result_json:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message="No results found in Modal output",
+ )
+
+ try:
+ results = json.loads(result_json)
+ except json.JSONDecodeError as e:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=f"Failed to parse results JSON: {e}",
+ )
+
+ # Check for error in results
+ if "error" in results and results.get("success") is False:
+ return EvaluateResult(
+ success=False,
+ all_correct=False,
+ correctness_score=0.0,
+ geomean_speedup=0.0,
+ passed_tests=0,
+ total_tests=0,
+ error_message=results.get("error", "Unknown error"),
+ )
+
+ # Extract metrics from results
+ return EvaluateResult(
+ success=True,
+ all_correct=results.get("all_correct", False),
+ correctness_score=float(results.get("correctness_score", 0.0)),
+ geomean_speedup=float(results.get("geomean_speedup", 0.0)),
+ passed_tests=int(results.get("passed_tests", 0)),
+ total_tests=int(results.get("total_tests", 0)),
+ error_message=results.get("error"),
+ test_results=results.get("test_results", []),
+ compilation_time_s=results.get("compilation_time_s"),
+ profiling_stats=results.get("profiling_stats"),
+ )
+
+
  async def run_evaluate_kernelbench_docker(
  args: KernelBenchEvaluateArgs,
  target: BaremetalTarget | VMTarget,
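The parent process and the generated sandbox script communicate over stdout with a sentinel line, as read back in run_evaluate_kernelbench_modal above. A small, self-contained sketch of that protocol; the payload fields shown are illustrative:

    import json

    # Simulated stdout from the Modal subprocess: build logs plus one sentinel line.
    stdout = (
        "Building image...\n"
        'EVAL_RESULT_JSON:{"all_correct": true, "passed_tests": 3, "total_tests": 3, "geomean_speedup": 1.15}\n'
    )
    payload = None
    for line in stdout.split("\n"):
        if line.startswith("EVAL_RESULT_JSON:"):
            payload = json.loads(line[len("EVAL_RESULT_JSON:"):])
            break
    print(payload)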
@@ -4112,6 +4403,7 @@ async def run_evaluate_kernelbench_runpod(
  # Find Python with PyTorch - check common locations on RunPod
  python_exe = "python3"
  for candidate in [
+ "/opt/venv/bin/python3",
  "/opt/conda/envs/py_3.10/bin/python3",
  "/opt/conda/bin/python3",
  ]:
@@ -4245,6 +4537,22 @@ async def run_evaluate_kernelbench_runpod(
  )


+ async def run_evaluate_kernelbench_baremetal_direct(
+ args: KernelBenchEvaluateArgs,
+ target: BaremetalTarget,
+ ) -> EvaluateResult:
+ """Run KernelBench format evaluation directly on NVIDIA target (no Docker).
+
+ For targets that already have PyTorch/CUDA installed (e.g., workspace containers).
+ Uses CUDA_VISIBLE_DEVICES for GPU selection.
+ """
+ # Reuse the AMD function but with CUDA env vars
+ # The logic is identical, just the GPU env var is different
+ return await _run_evaluate_kernelbench_baremetal_direct_impl(
+ args, target, gpu_env_var="CUDA_VISIBLE_DEVICES"
+ )
+
+
  async def run_evaluate_kernelbench_baremetal_amd(
  args: KernelBenchEvaluateArgs,
  target: BaremetalTarget,
@@ -4254,6 +4562,20 @@ async def run_evaluate_kernelbench_baremetal_amd(
  Runs evaluation script directly on host (no Docker) for AMD GPUs
  that have PyTorch/ROCm installed.
  """
+ return await _run_evaluate_kernelbench_baremetal_direct_impl(
+ args, target, gpu_env_var="HIP_VISIBLE_DEVICES"
+ )
+
+
+ async def _run_evaluate_kernelbench_baremetal_direct_impl(
+ args: KernelBenchEvaluateArgs,
+ target: BaremetalTarget,
+ gpu_env_var: str = "HIP_VISIBLE_DEVICES",
+ ) -> EvaluateResult:
+ """Internal implementation for direct baremetal evaluation.
+
+ Runs evaluation script directly on host (no Docker).
+ """
  from datetime import datetime

  from wafer_core.async_ssh import AsyncSSHClient
@@ -4404,11 +4726,17 @@ async def run_evaluate_kernelbench_baremetal_amd(

  eval_cmd = " ".join(python_cmd_parts)

- # Set environment for AMD GPU and run
- # PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
- rocm_arch = _get_rocm_arch(target.compute_capability)
- arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
- env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
+ # Set environment for GPU and run
+ if gpu_env_var == "HIP_VISIBLE_DEVICES":
+ # AMD: PYTORCH_ROCM_ARCH for faster compile
+ rocm_arch = _get_rocm_arch(target.compute_capability)
+ arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
+ env_vars = (
+ f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
+ )
+ else:
+ # NVIDIA: just set CUDA_VISIBLE_DEVICES
+ env_vars = f"CUDA_VISIBLE_DEVICES={gpu_id} PYTHONUNBUFFERED=1"
  full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"

  # Handle prepare-only mode
@@ -4559,10 +4887,16 @@ async def run_evaluate_kernelbench(args: KernelBenchEvaluateArgs) -> EvaluateRes
  elif isinstance(target, RunPodTarget):
  # RunPod AMD MI300X - uses ROCm Docker with device passthrough
  return await run_evaluate_kernelbench_runpod(args, target)
+ elif isinstance(target, ModalTarget):
+ # Modal serverless - runs in Modal sandbox
+ return await run_evaluate_kernelbench_modal(args, target)
  elif isinstance(target, BaremetalTarget | VMTarget):
  # Check if this is an AMD target (gfx* compute capability) - run directly
  if target.compute_capability and target.compute_capability.startswith("gfx"):
  return await run_evaluate_kernelbench_baremetal_amd(args, target)
+ # Check for direct execution flag (workspace containers that already have everything)
+ if getattr(target, "direct", False):
+ return await run_evaluate_kernelbench_baremetal_direct(args, target)
  # NVIDIA targets - require docker_image to be set
  if not target.docker_image:
  return EvaluateResult(