wafer-cli 0.2.14__py3-none-any.whl → 0.2.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/GUIDE.md +1 -1
- wafer/agent_defaults.py +42 -0
- wafer/auth.py +7 -0
- wafer/billing.py +6 -6
- wafer/cli.py +905 -131
- wafer/cli_instructions.py +143 -0
- wafer/corpus.py +313 -15
- wafer/evaluate.py +480 -146
- wafer/global_config.py +13 -0
- wafer/kernel_scope.py +1 -1
- wafer/ncu_analyze.py +1 -1
- wafer/nsys_analyze.py +1 -1
- wafer/skills/wafer-guide/SKILL.md +22 -6
- wafer/specs_cli.py +157 -0
- wafer/ssh_keys.py +6 -6
- wafer/targets_cli.py +472 -0
- wafer/targets_ops.py +29 -2
- wafer/templates/ask_docs.py +1 -1
- wafer/templates/optimize_kernel.py +3 -1
- wafer/templates/optimize_kernelbench.py +17 -62
- wafer/templates/trace_analyze.py +1 -1
- wafer/tests/test_eval_cli_parity.py +199 -0
- wafer/trace_compare.py +274 -0
- wafer/wevin_cli.py +125 -26
- wafer/workspaces.py +163 -16
- wafer_cli-0.2.30.dist-info/METADATA +107 -0
- wafer_cli-0.2.30.dist-info/RECORD +47 -0
- wafer_cli-0.2.14.dist-info/METADATA +0 -16
- wafer_cli-0.2.14.dist-info/RECORD +0 -41
- {wafer_cli-0.2.14.dist-info → wafer_cli-0.2.30.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.14.dist-info → wafer_cli-0.2.30.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.14.dist-info → wafer_cli-0.2.30.dist-info}/top_level.txt +0 -0
wafer/evaluate.py
CHANGED
|
@@ -354,7 +354,8 @@ def _build_docker_pip_install_cmd(target: BaremetalTarget | VMTarget) -> str:
|
|
|
354
354
|
)
|
|
355
355
|
|
|
356
356
|
# Install uv (fast, reliable) - use pip3 for compatibility
|
|
357
|
-
|
|
357
|
+
# Use --break-system-packages for Python 3.12+ with PEP 668 externally managed environments
|
|
358
|
+
commands.append("pip3 install --break-system-packages uv")
|
|
358
359
|
|
|
359
360
|
# Install torch with custom index if specified (like Modal's two-phase install)
|
|
360
361
|
# Use --system --break-system-packages to install to container's Python
|
|
@@ -378,18 +379,6 @@ def _build_docker_pip_install_cmd(target: BaremetalTarget | VMTarget) -> str:
|
|
|
378
379
|
return " && ".join(commands)
|
|
379
380
|
|
|
380
381
|
|
|
381
|
-
def _get_wafer_root() -> Path:
|
|
382
|
-
"""Get wafer monorepo root directory.
|
|
383
|
-
|
|
384
|
-
Walks up from this file to find the wafer repo root (contains apps/, packages/).
|
|
385
|
-
"""
|
|
386
|
-
current = Path(__file__).resolve()
|
|
387
|
-
for parent in [current] + list(current.parents):
|
|
388
|
-
if (parent / "apps").is_dir() and (parent / "packages").is_dir():
|
|
389
|
-
return parent
|
|
390
|
-
raise RuntimeError(f"Could not find wafer root from {__file__}")
|
|
391
|
-
|
|
392
|
-
|
|
393
382
|
async def run_evaluate_docker(
|
|
394
383
|
args: EvaluateArgs,
|
|
395
384
|
target: BaremetalTarget | VMTarget,
|
|
@@ -1167,11 +1156,16 @@ def _build_modal_sandbox_script(
|
|
|
1167
1156
|
"""
|
|
1168
1157
|
gpu_type = target.gpu_type
|
|
1169
1158
|
|
|
1170
|
-
# Determine PyTorch index based on GPU type
|
|
1159
|
+
# Determine PyTorch index and CUDA arch based on GPU type
|
|
1171
1160
|
if gpu_type in ("B200", "GB200"):
|
|
1172
|
-
torch_index = "https://download.pytorch.org/whl/
|
|
1161
|
+
torch_index = "https://download.pytorch.org/whl/cu130"
|
|
1162
|
+
cuda_arch_list = "10.0" # Blackwell (sm_100)
|
|
1163
|
+
elif gpu_type == "H100":
|
|
1164
|
+
torch_index = "https://download.pytorch.org/whl/cu130"
|
|
1165
|
+
cuda_arch_list = "9.0" # Hopper (sm_90)
|
|
1173
1166
|
else:
|
|
1174
1167
|
torch_index = "https://download.pytorch.org/whl/cu124"
|
|
1168
|
+
cuda_arch_list = "8.0" # Default to Ampere (sm_80)
|
|
1175
1169
|
|
|
1176
1170
|
return f'''
|
|
1177
1171
|
import asyncio
|
|
@@ -1189,7 +1183,7 @@ async def run_eval():
|
|
|
1189
1183
|
"nvidia/cuda:12.9.0-devel-ubuntu22.04",
|
|
1190
1184
|
add_python="3.12",
|
|
1191
1185
|
)
|
|
1192
|
-
.apt_install("git", "build-essential", "cmake")
|
|
1186
|
+
.apt_install("git", "build-essential", "cmake", "ripgrep")
|
|
1193
1187
|
.pip_install(
|
|
1194
1188
|
"torch",
|
|
1195
1189
|
index_url="{torch_index}",
|
|
@@ -1202,6 +1196,12 @@ async def run_eval():
|
|
|
1202
1196
|
)
|
|
1203
1197
|
.env({{
|
|
1204
1198
|
"CUDA_HOME": "/usr/local/cuda",
|
|
1199
|
+
# C++ compiler needs explicit include path for cuda_runtime.h
|
|
1200
|
+
"CPLUS_INCLUDE_PATH": "/usr/local/cuda/include",
|
|
1201
|
+
# Linker needs lib path
|
|
1202
|
+
"LIBRARY_PATH": "/usr/local/cuda/lib64",
|
|
1203
|
+
# Force PyTorch to compile for correct GPU architecture
|
|
1204
|
+
"TORCH_CUDA_ARCH_LIST": "{cuda_arch_list}",
|
|
1205
1205
|
}})
|
|
1206
1206
|
)
|
|
1207
1207
|
|
|
@@ -2021,54 +2021,13 @@ async def run_evaluate_runpod(
|
|
|
2021
2021
|
error_message=f"Failed to setup Python environment: {e}",
|
|
2022
2022
|
)
|
|
2023
2023
|
|
|
2024
|
-
#
|
|
2025
|
-
|
|
2026
|
-
|
|
2027
|
-
|
|
2028
|
-
|
|
2029
|
-
|
|
2030
|
-
|
|
2031
|
-
await client.exec(f"mkdir -p {wafer_core_remote}")
|
|
2032
|
-
wafer_core_workspace = await client.expand_path(wafer_core_remote)
|
|
2033
|
-
|
|
2034
|
-
upload_result = await client.upload_files(
|
|
2035
|
-
str(wafer_core_path), wafer_core_workspace, recursive=True
|
|
2036
|
-
)
|
|
2037
|
-
|
|
2038
|
-
# Wide event logging for upload result
|
|
2039
|
-
upload_event = {
|
|
2040
|
-
"event": "wafer_core_upload",
|
|
2041
|
-
"target": target.name,
|
|
2042
|
-
"target_type": "runpod",
|
|
2043
|
-
"ssh_host": f"{client.user}@{client.host}:{client.port}",
|
|
2044
|
-
"local_path": str(wafer_core_path),
|
|
2045
|
-
"remote_path": wafer_core_workspace,
|
|
2046
|
-
"success": upload_result.success,
|
|
2047
|
-
"files_copied": upload_result.files_copied,
|
|
2048
|
-
"duration_seconds": upload_result.duration_seconds,
|
|
2049
|
-
"error_message": upload_result.error_message,
|
|
2050
|
-
}
|
|
2051
|
-
if upload_result.debug_info:
|
|
2052
|
-
upload_event["debug_info"] = upload_result.debug_info
|
|
2053
|
-
logger.info(json.dumps(upload_event))
|
|
2054
|
-
|
|
2055
|
-
# Fail fast if upload failed
|
|
2056
|
-
if not upload_result.success:
|
|
2057
|
-
print(f"ERROR: Upload failed: {upload_result.error_message}")
|
|
2058
|
-
if upload_result.debug_info:
|
|
2059
|
-
print(f"Debug info: {json.dumps(upload_result.debug_info, indent=2)}")
|
|
2060
|
-
return EvaluateResult(
|
|
2061
|
-
success=False,
|
|
2062
|
-
all_correct=False,
|
|
2063
|
-
correctness_score=0.0,
|
|
2064
|
-
geomean_speedup=0.0,
|
|
2065
|
-
passed_tests=0,
|
|
2066
|
-
total_tests=0,
|
|
2067
|
-
error_message=f"Failed to upload wafer-core: {upload_result.error_message}",
|
|
2068
|
-
)
|
|
2069
|
-
|
|
2070
|
-
print(f"Uploaded {upload_result.files_copied} files")
|
|
2071
|
-
except Exception as e:
|
|
2024
|
+
# Install wafer-core in remote venv
|
|
2025
|
+
print("Installing wafer-core...")
|
|
2026
|
+
install_result = await client.exec(
|
|
2027
|
+
f'export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH" && '
|
|
2028
|
+
f"uv pip install --python {python_exe} wafer-core"
|
|
2029
|
+
)
|
|
2030
|
+
if install_result.exit_code != 0:
|
|
2072
2031
|
return EvaluateResult(
|
|
2073
2032
|
success=False,
|
|
2074
2033
|
all_correct=False,
|
|
@@ -2076,7 +2035,7 @@ async def run_evaluate_runpod(
|
|
|
2076
2035
|
geomean_speedup=0.0,
|
|
2077
2036
|
passed_tests=0,
|
|
2078
2037
|
total_tests=0,
|
|
2079
|
-
error_message=f"Failed to
|
|
2038
|
+
error_message=f"Failed to install wafer-core: {install_result.stderr}",
|
|
2080
2039
|
)
|
|
2081
2040
|
|
|
2082
2041
|
# Select GPU (RunPod pods typically have GPU 0)
|
|
@@ -2217,11 +2176,33 @@ async def run_evaluate_runpod(
|
|
|
2217
2176
|
error_message=f"Evaluation timed out after {target.eval_timeout}s",
|
|
2218
2177
|
)
|
|
2219
2178
|
|
|
2220
|
-
#
|
|
2179
|
+
# Show output to user
|
|
2221
2180
|
stdout = result.stdout
|
|
2222
2181
|
stderr = result.stderr
|
|
2182
|
+
if stdout:
|
|
2183
|
+
print(stdout)
|
|
2223
2184
|
|
|
2224
2185
|
if result.exit_code != 0:
|
|
2186
|
+
error_parts = [f"Evaluation failed (exit code {result.exit_code}):"]
|
|
2187
|
+
if stdout:
|
|
2188
|
+
error_parts.append(f"stdout: {stdout}")
|
|
2189
|
+
if stderr:
|
|
2190
|
+
error_parts.append(f"stderr: {stderr}")
|
|
2191
|
+
return EvaluateResult(
|
|
2192
|
+
success=False,
|
|
2193
|
+
all_correct=False,
|
|
2194
|
+
correctness_score=0.0,
|
|
2195
|
+
geomean_speedup=0.0,
|
|
2196
|
+
passed_tests=0,
|
|
2197
|
+
total_tests=0,
|
|
2198
|
+
error_message="\n".join(error_parts),
|
|
2199
|
+
)
|
|
2200
|
+
|
|
2201
|
+
# Read results from results.json file written by evaluate module
|
|
2202
|
+
results_path = f"{run_path}/results.json"
|
|
2203
|
+
cat_result = await client.exec(f"cat {results_path}")
|
|
2204
|
+
|
|
2205
|
+
if cat_result.exit_code != 0:
|
|
2225
2206
|
return EvaluateResult(
|
|
2226
2207
|
success=False,
|
|
2227
2208
|
all_correct=False,
|
|
@@ -2229,20 +2210,12 @@ async def run_evaluate_runpod(
|
|
|
2229
2210
|
geomean_speedup=0.0,
|
|
2230
2211
|
passed_tests=0,
|
|
2231
2212
|
total_tests=0,
|
|
2232
|
-
error_message=f"
|
|
2213
|
+
error_message=f"Failed to read results: {cat_result.stderr}",
|
|
2233
2214
|
)
|
|
2234
2215
|
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
|
|
2238
|
-
if line.startswith("{"):
|
|
2239
|
-
try:
|
|
2240
|
-
result_json = json.loads(line)
|
|
2241
|
-
break
|
|
2242
|
-
except json.JSONDecodeError:
|
|
2243
|
-
continue
|
|
2244
|
-
|
|
2245
|
-
if result_json is None:
|
|
2216
|
+
try:
|
|
2217
|
+
results_data = json.loads(cat_result.stdout)
|
|
2218
|
+
except json.JSONDecodeError as e:
|
|
2246
2219
|
return EvaluateResult(
|
|
2247
2220
|
success=False,
|
|
2248
2221
|
all_correct=False,
|
|
@@ -2250,10 +2223,12 @@ async def run_evaluate_runpod(
|
|
|
2250
2223
|
geomean_speedup=0.0,
|
|
2251
2224
|
passed_tests=0,
|
|
2252
2225
|
total_tests=0,
|
|
2253
|
-
error_message=f"
|
|
2226
|
+
error_message=f"Invalid JSON in results: {e}",
|
|
2254
2227
|
)
|
|
2255
2228
|
|
|
2256
|
-
|
|
2229
|
+
# Extract backend results (same format as DigitalOcean/SSH path)
|
|
2230
|
+
backends = results_data.get("backends", [])
|
|
2231
|
+
if not backends:
|
|
2257
2232
|
return EvaluateResult(
|
|
2258
2233
|
success=False,
|
|
2259
2234
|
all_correct=False,
|
|
@@ -2261,18 +2236,20 @@ async def run_evaluate_runpod(
|
|
|
2261
2236
|
geomean_speedup=0.0,
|
|
2262
2237
|
passed_tests=0,
|
|
2263
2238
|
total_tests=0,
|
|
2264
|
-
error_message=
|
|
2239
|
+
error_message="No backend results found",
|
|
2265
2240
|
)
|
|
2266
2241
|
|
|
2267
|
-
|
|
2268
|
-
|
|
2242
|
+
backend = backends[0]
|
|
2243
|
+
correctness_tests = backend.get("correctness_tests", [])
|
|
2244
|
+
passed = sum(1 for t in correctness_tests if t.get("is_correct", False))
|
|
2245
|
+
total = len(correctness_tests)
|
|
2269
2246
|
correctness = passed / total if total > 0 else 0.0
|
|
2270
2247
|
|
|
2271
2248
|
return EvaluateResult(
|
|
2272
2249
|
success=True,
|
|
2273
|
-
all_correct=
|
|
2250
|
+
all_correct=backend.get("all_correct", False),
|
|
2274
2251
|
correctness_score=correctness,
|
|
2275
|
-
geomean_speedup=
|
|
2252
|
+
geomean_speedup=backend.get("geomean_speedup", 0.0),
|
|
2276
2253
|
passed_tests=passed,
|
|
2277
2254
|
total_tests=total,
|
|
2278
2255
|
)
|
|
@@ -2373,61 +2350,13 @@ async def run_evaluate_digitalocean(
|
|
|
2373
2350
|
error_message=f"Failed to setup Python environment: {e}",
|
|
2374
2351
|
)
|
|
2375
2352
|
|
|
2376
|
-
#
|
|
2377
|
-
|
|
2378
|
-
|
|
2379
|
-
|
|
2380
|
-
|
|
2381
|
-
|
|
2382
|
-
|
|
2383
|
-
await client.exec(f"mkdir -p {wafer_core_remote}")
|
|
2384
|
-
wafer_core_workspace = await client.expand_path(wafer_core_remote)
|
|
2385
|
-
|
|
2386
|
-
# Use SFTP instead of rsync to avoid SSH subprocess timeout issues
|
|
2387
|
-
# (DigitalOcean may rate-limit new SSH connections)
|
|
2388
|
-
upload_result = await client.upload_files(
|
|
2389
|
-
str(wafer_core_path),
|
|
2390
|
-
wafer_core_workspace,
|
|
2391
|
-
recursive=True,
|
|
2392
|
-
use_sftp=True,
|
|
2393
|
-
)
|
|
2394
|
-
|
|
2395
|
-
# Wide event logging for upload result
|
|
2396
|
-
upload_event = {
|
|
2397
|
-
"event": "wafer_core_upload",
|
|
2398
|
-
"target": target.name,
|
|
2399
|
-
"target_type": "digitalocean",
|
|
2400
|
-
"ssh_host": f"{client.user}@{client.host}:{client.port}",
|
|
2401
|
-
"local_path": str(wafer_core_path),
|
|
2402
|
-
"remote_path": wafer_core_workspace,
|
|
2403
|
-
"success": upload_result.success,
|
|
2404
|
-
"files_copied": upload_result.files_copied,
|
|
2405
|
-
"duration_seconds": upload_result.duration_seconds,
|
|
2406
|
-
"error_message": upload_result.error_message,
|
|
2407
|
-
}
|
|
2408
|
-
if upload_result.debug_info:
|
|
2409
|
-
upload_event["debug_info"] = upload_result.debug_info
|
|
2410
|
-
logger.info(json.dumps(upload_event))
|
|
2411
|
-
|
|
2412
|
-
# Fail fast if upload failed
|
|
2413
|
-
if not upload_result.success:
|
|
2414
|
-
print(f"ERROR: Upload failed: {upload_result.error_message}")
|
|
2415
|
-
if upload_result.debug_info:
|
|
2416
|
-
print(
|
|
2417
|
-
f"Debug info: {json.dumps(upload_result.debug_info, indent=2)}"
|
|
2418
|
-
)
|
|
2419
|
-
return EvaluateResult(
|
|
2420
|
-
success=False,
|
|
2421
|
-
all_correct=False,
|
|
2422
|
-
correctness_score=0.0,
|
|
2423
|
-
geomean_speedup=0.0,
|
|
2424
|
-
passed_tests=0,
|
|
2425
|
-
total_tests=0,
|
|
2426
|
-
error_message=f"Failed to upload wafer-core: {upload_result.error_message}",
|
|
2427
|
-
)
|
|
2428
|
-
|
|
2429
|
-
print(f"Uploaded {upload_result.files_copied} files")
|
|
2430
|
-
except Exception as e:
|
|
2353
|
+
# Install wafer-core in remote venv
|
|
2354
|
+
print("Installing wafer-core...")
|
|
2355
|
+
install_result = await client.exec(
|
|
2356
|
+
f'export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH" && '
|
|
2357
|
+
f"uv pip install --python {python_exe} wafer-core"
|
|
2358
|
+
)
|
|
2359
|
+
if install_result.exit_code != 0:
|
|
2431
2360
|
return EvaluateResult(
|
|
2432
2361
|
success=False,
|
|
2433
2362
|
all_correct=False,
|
|
@@ -2435,7 +2364,7 @@ async def run_evaluate_digitalocean(
|
|
|
2435
2364
|
geomean_speedup=0.0,
|
|
2436
2365
|
passed_tests=0,
|
|
2437
2366
|
total_tests=0,
|
|
2438
|
-
error_message=f"Failed to
|
|
2367
|
+
error_message=f"Failed to install wafer-core: {install_result.stderr}",
|
|
2439
2368
|
)
|
|
2440
2369
|
|
|
2441
2370
|
# Select GPU (DigitalOcean droplets typically have GPU 0)
|
|
@@ -3452,6 +3381,368 @@ def _validate_kernelbench_files(args: KernelBenchEvaluateArgs) -> str | None:
|
|
|
3452
3381
|
return None
|
|
3453
3382
|
|
|
3454
3383
|
|
|
3384
|
+
def _build_modal_kernelbench_script(
|
|
3385
|
+
target: ModalTarget,
|
|
3386
|
+
impl_code_b64: str,
|
|
3387
|
+
ref_code_b64: str,
|
|
3388
|
+
eval_script_b64: str,
|
|
3389
|
+
run_benchmarks: bool,
|
|
3390
|
+
run_defensive: bool,
|
|
3391
|
+
defense_code_b64: str | None,
|
|
3392
|
+
seed: int,
|
|
3393
|
+
inputs_code_b64: str | None = None,
|
|
3394
|
+
) -> str:
|
|
3395
|
+
"""Build Python script to create Modal sandbox and run KernelBench evaluation.
|
|
3396
|
+
|
|
3397
|
+
This runs in a subprocess to isolate Modal's asyncio from trio.
|
|
3398
|
+
"""
|
|
3399
|
+
gpu_type = target.gpu_type
|
|
3400
|
+
|
|
3401
|
+
# Determine PyTorch index and CUDA arch based on GPU type
|
|
3402
|
+
if gpu_type in ("B200", "GB200"):
|
|
3403
|
+
torch_index = "https://download.pytorch.org/whl/cu130"
|
|
3404
|
+
cuda_arch_list = "10.0" # Blackwell (sm_100)
|
|
3405
|
+
elif gpu_type == "H100":
|
|
3406
|
+
# H100 uses CUDA 13.0 (matches modal_app.py)
|
|
3407
|
+
torch_index = "https://download.pytorch.org/whl/cu130"
|
|
3408
|
+
cuda_arch_list = "9.0" # Hopper (sm_90)
|
|
3409
|
+
else:
|
|
3410
|
+
torch_index = "https://download.pytorch.org/whl/cu124"
|
|
3411
|
+
cuda_arch_list = "8.0" # Default to Ampere (sm_80)
|
|
3412
|
+
|
|
3413
|
+
# Install CUTLASS headers (for cute/tensor.hpp and cutlass/util/*.h) from GitHub
|
|
3414
|
+
# The nvidia-cutlass-dsl pip package doesn't include the C++ headers needed for nvcc
|
|
3415
|
+
# IMPORTANT: symlink to /usr/local/cuda/include because nvcc searches there by default
|
|
3416
|
+
cutlass_install = """
|
|
3417
|
+
.run_commands([
|
|
3418
|
+
# Clone CUTLASS headers from GitHub (shallow clone, full include tree)
|
|
3419
|
+
# Use simple shallow clone - sparse-checkout can be buggy in some environments
|
|
3420
|
+
"git clone --depth 1 https://github.com/NVIDIA/cutlass.git /opt/cutlass",
|
|
3421
|
+
# Verify the util headers exist (for debugging)
|
|
3422
|
+
"ls -la /opt/cutlass/include/cutlass/util/ | head -5",
|
|
3423
|
+
# Symlink headers to CUDA include path (nvcc searches here by default)
|
|
3424
|
+
"ln -sf /opt/cutlass/include/cute /usr/local/cuda/include/cute",
|
|
3425
|
+
"ln -sf /opt/cutlass/include/cutlass /usr/local/cuda/include/cutlass",
|
|
3426
|
+
])
|
|
3427
|
+
.pip_install(
|
|
3428
|
+
"nvidia-cutlass-dsl",
|
|
3429
|
+
index_url="https://pypi.nvidia.com",
|
|
3430
|
+
extra_index_url="https://pypi.org/simple",
|
|
3431
|
+
)
|
|
3432
|
+
"""
|
|
3433
|
+
|
|
3434
|
+
inputs_write = ""
|
|
3435
|
+
if inputs_code_b64:
|
|
3436
|
+
inputs_write = f'''
|
|
3437
|
+
# Write custom inputs
|
|
3438
|
+
proc = sandbox.exec("python", "-c", f"""
|
|
3439
|
+
import base64
|
|
3440
|
+
with open('/workspace/custom_inputs.py', 'w') as f:
|
|
3441
|
+
f.write(base64.b64decode('{inputs_code_b64}').decode())
|
|
3442
|
+
print('Custom inputs written')
|
|
3443
|
+
""")
|
|
3444
|
+
proc.wait()
|
|
3445
|
+
'''
|
|
3446
|
+
|
|
3447
|
+
defense_write = ""
|
|
3448
|
+
if run_defensive and defense_code_b64:
|
|
3449
|
+
defense_write = f'''
|
|
3450
|
+
# Write defense module
|
|
3451
|
+
proc = sandbox.exec("python", "-c", f"""
|
|
3452
|
+
import base64
|
|
3453
|
+
with open('/workspace/defense.py', 'w') as f:
|
|
3454
|
+
f.write(base64.b64decode('{defense_code_b64}').decode())
|
|
3455
|
+
print('Defense module written')
|
|
3456
|
+
""")
|
|
3457
|
+
proc.wait()
|
|
3458
|
+
'''
|
|
3459
|
+
|
|
3460
|
+
# Build eval command
|
|
3461
|
+
eval_cmd_parts = [
|
|
3462
|
+
"python /workspace/kernelbench_eval.py",
|
|
3463
|
+
"--impl /workspace/implementation.py",
|
|
3464
|
+
"--reference /workspace/reference.py",
|
|
3465
|
+
"--output /workspace/results.json",
|
|
3466
|
+
f"--seed {seed}",
|
|
3467
|
+
]
|
|
3468
|
+
if run_benchmarks:
|
|
3469
|
+
eval_cmd_parts.append("--benchmark")
|
|
3470
|
+
if run_defensive and defense_code_b64:
|
|
3471
|
+
eval_cmd_parts.append("--defensive")
|
|
3472
|
+
eval_cmd_parts.append("--defense-module /workspace/defense.py")
|
|
3473
|
+
if inputs_code_b64:
|
|
3474
|
+
eval_cmd_parts.append("--inputs /workspace/custom_inputs.py")
|
|
3475
|
+
|
|
3476
|
+
eval_cmd = " ".join(eval_cmd_parts)
|
|
3477
|
+
|
|
3478
|
+
return f'''
|
|
3479
|
+
import asyncio
|
|
3480
|
+
import base64
|
|
3481
|
+
import json
|
|
3482
|
+
import sys
|
|
3483
|
+
import modal
|
|
3484
|
+
|
|
3485
|
+
async def run_eval():
|
|
3486
|
+
app = modal.App.lookup("wafer-evaluate", create_if_missing=True)
|
|
3487
|
+
|
|
3488
|
+
# Build image with PyTorch, CUTLASS DSL and dependencies
|
|
3489
|
+
image = (
|
|
3490
|
+
modal.Image.from_registry(
|
|
3491
|
+
"nvidia/cuda:12.9.0-devel-ubuntu22.04",
|
|
3492
|
+
add_python="3.12",
|
|
3493
|
+
)
|
|
3494
|
+
.apt_install("git", "build-essential", "cmake", "ninja-build", "ripgrep")
|
|
3495
|
+
.pip_install(
|
|
3496
|
+
"torch",
|
|
3497
|
+
index_url="{torch_index}",
|
|
3498
|
+
extra_index_url="https://pypi.org/simple",
|
|
3499
|
+
)
|
|
3500
|
+
.pip_install(
|
|
3501
|
+
"numpy",
|
|
3502
|
+
"triton",
|
|
3503
|
+
"ninja",
|
|
3504
|
+
)
|
|
3505
|
+
{cutlass_install}
|
|
3506
|
+
.env({{
|
|
3507
|
+
"CUDA_HOME": "/usr/local/cuda",
|
|
3508
|
+
# C++ compiler needs explicit include path for cuda_runtime.h
|
|
3509
|
+
"CPLUS_INCLUDE_PATH": "/usr/local/cuda/include",
|
|
3510
|
+
# Linker needs lib path
|
|
3511
|
+
"LIBRARY_PATH": "/usr/local/cuda/lib64",
|
|
3512
|
+
# Force PyTorch to compile for correct GPU architecture
|
|
3513
|
+
"TORCH_CUDA_ARCH_LIST": "{cuda_arch_list}",
|
|
3514
|
+
}})
|
|
3515
|
+
)
|
|
3516
|
+
|
|
3517
|
+
# Create sandbox
|
|
3518
|
+
sandbox = modal.Sandbox.create(
|
|
3519
|
+
app=app,
|
|
3520
|
+
image=image,
|
|
3521
|
+
gpu="{gpu_type}",
|
|
3522
|
+
timeout={target.timeout_seconds},
|
|
3523
|
+
)
|
|
3524
|
+
|
|
3525
|
+
try:
|
|
3526
|
+
# Create workspace directory
|
|
3527
|
+
sandbox.exec("mkdir", "-p", "/workspace").wait()
|
|
3528
|
+
|
|
3529
|
+
# Write files to sandbox
|
|
3530
|
+
proc = sandbox.exec("python", "-c", f"""
|
|
3531
|
+
import base64
|
|
3532
|
+
with open('/workspace/implementation.py', 'w') as f:
|
|
3533
|
+
f.write(base64.b64decode('{impl_code_b64}').decode())
|
|
3534
|
+
with open('/workspace/reference.py', 'w') as f:
|
|
3535
|
+
f.write(base64.b64decode('{ref_code_b64}').decode())
|
|
3536
|
+
with open('/workspace/kernelbench_eval.py', 'w') as f:
|
|
3537
|
+
f.write(base64.b64decode('{eval_script_b64}').decode())
|
|
3538
|
+
print('Files written')
|
|
3539
|
+
""")
|
|
3540
|
+
proc.wait()
|
|
3541
|
+
if proc.returncode != 0:
|
|
3542
|
+
print(json.dumps({{"success": False, "error": f"Failed to write files: {{proc.stderr.read()}}"}}))
|
|
3543
|
+
return
|
|
3544
|
+
{inputs_write}
|
|
3545
|
+
{defense_write}
|
|
3546
|
+
# Run evaluation
|
|
3547
|
+
print(f"Running KernelBench evaluation on {{'{gpu_type}'}}...")
|
|
3548
|
+
proc = sandbox.exec("bash", "-c", "{eval_cmd}")
|
|
3549
|
+
|
|
3550
|
+
# Stream output
|
|
3551
|
+
for line in proc.stdout:
|
|
3552
|
+
print(line, end="")
|
|
3553
|
+
for line in proc.stderr:
|
|
3554
|
+
print(line, end="", file=sys.stderr)
|
|
3555
|
+
|
|
3556
|
+
proc.wait()
|
|
3557
|
+
|
|
3558
|
+
if proc.returncode != 0:
|
|
3559
|
+
print(json.dumps({{"success": False, "error": f"Evaluation failed with exit code {{proc.returncode}}"}}))
|
|
3560
|
+
return
|
|
3561
|
+
|
|
3562
|
+
# Read results
|
|
3563
|
+
result_proc = sandbox.exec("cat", "/workspace/results.json")
|
|
3564
|
+
result_data = result_proc.stdout.read()
|
|
3565
|
+
result_proc.wait()
|
|
3566
|
+
|
|
3567
|
+
if result_data:
|
|
3568
|
+
results = json.loads(result_data)
|
|
3569
|
+
print("EVAL_RESULT_JSON:" + json.dumps(results))
|
|
3570
|
+
else:
|
|
3571
|
+
print(json.dumps({{"success": False, "error": "No results.json found"}}))
|
|
3572
|
+
|
|
3573
|
+
finally:
|
|
3574
|
+
sandbox.terminate()
|
|
3575
|
+
|
|
3576
|
+
asyncio.run(run_eval())
|
|
3577
|
+
'''
|
|
3578
|
+
|
|
3579
|
+
|
|
3580
|
+
async def run_evaluate_kernelbench_modal(
|
|
3581
|
+
args: KernelBenchEvaluateArgs,
|
|
3582
|
+
target: ModalTarget,
|
|
3583
|
+
) -> EvaluateResult:
|
|
3584
|
+
"""Run KernelBench format evaluation on Modal sandbox.
|
|
3585
|
+
|
|
3586
|
+
Creates a Modal sandbox, uploads files, runs KernelBench eval, and parses results.
|
|
3587
|
+
Uses subprocess to isolate Modal's asyncio from trio.
|
|
3588
|
+
"""
|
|
3589
|
+
import base64
|
|
3590
|
+
import subprocess
|
|
3591
|
+
import sys
|
|
3592
|
+
|
|
3593
|
+
import trio
|
|
3594
|
+
|
|
3595
|
+
print(f"Creating Modal sandbox ({target.gpu_type}) for KernelBench evaluation...")
|
|
3596
|
+
|
|
3597
|
+
# Encode files as base64
|
|
3598
|
+
impl_code_b64 = base64.b64encode(args.implementation.read_bytes()).decode()
|
|
3599
|
+
ref_code_b64 = base64.b64encode(args.reference.read_bytes()).decode()
|
|
3600
|
+
eval_script_b64 = base64.b64encode(KERNELBENCH_EVAL_SCRIPT.encode()).decode()
|
|
3601
|
+
|
|
3602
|
+
# Encode custom inputs if provided
|
|
3603
|
+
inputs_code_b64 = None
|
|
3604
|
+
if args.inputs:
|
|
3605
|
+
inputs_code_b64 = base64.b64encode(args.inputs.read_bytes()).decode()
|
|
3606
|
+
|
|
3607
|
+
# Encode defense module if defensive mode is enabled
|
|
3608
|
+
defense_code_b64 = None
|
|
3609
|
+
if args.defensive:
|
|
3610
|
+
defense_path = (
|
|
3611
|
+
Path(__file__).parent.parent.parent.parent
|
|
3612
|
+
/ "packages"
|
|
3613
|
+
/ "wafer-core"
|
|
3614
|
+
/ "wafer_core"
|
|
3615
|
+
/ "utils"
|
|
3616
|
+
/ "kernel_utils"
|
|
3617
|
+
/ "defense.py"
|
|
3618
|
+
)
|
|
3619
|
+
if defense_path.exists():
|
|
3620
|
+
defense_code_b64 = base64.b64encode(defense_path.read_bytes()).decode()
|
|
3621
|
+
else:
|
|
3622
|
+
print(f"Warning: defense.py not found at {defense_path}, falling back to basic defense")
|
|
3623
|
+
|
|
3624
|
+
# Build the script
|
|
3625
|
+
script = _build_modal_kernelbench_script(
|
|
3626
|
+
target=target,
|
|
3627
|
+
impl_code_b64=impl_code_b64,
|
|
3628
|
+
ref_code_b64=ref_code_b64,
|
|
3629
|
+
eval_script_b64=eval_script_b64,
|
|
3630
|
+
run_benchmarks=args.benchmark,
|
|
3631
|
+
run_defensive=args.defensive,
|
|
3632
|
+
defense_code_b64=defense_code_b64,
|
|
3633
|
+
seed=args.seed,
|
|
3634
|
+
inputs_code_b64=inputs_code_b64,
|
|
3635
|
+
)
|
|
3636
|
+
|
|
3637
|
+
def _run_subprocess() -> tuple[str, str, int]:
|
|
3638
|
+
result = subprocess.run(
|
|
3639
|
+
[sys.executable, "-c", script],
|
|
3640
|
+
capture_output=True,
|
|
3641
|
+
text=True,
|
|
3642
|
+
timeout=target.timeout_seconds + 120, # Extra buffer for sandbox creation + image build
|
|
3643
|
+
)
|
|
3644
|
+
return result.stdout, result.stderr, result.returncode
|
|
3645
|
+
|
|
3646
|
+
try:
|
|
3647
|
+
stdout, stderr, returncode = await trio.to_thread.run_sync(_run_subprocess)
|
|
3648
|
+
except subprocess.TimeoutExpired:
|
|
3649
|
+
return EvaluateResult(
|
|
3650
|
+
success=False,
|
|
3651
|
+
all_correct=False,
|
|
3652
|
+
correctness_score=0.0,
|
|
3653
|
+
geomean_speedup=0.0,
|
|
3654
|
+
passed_tests=0,
|
|
3655
|
+
total_tests=0,
|
|
3656
|
+
error_message=f"Modal KernelBench evaluation timed out after {target.timeout_seconds}s",
|
|
3657
|
+
)
|
|
3658
|
+
except Exception as e:
|
|
3659
|
+
return EvaluateResult(
|
|
3660
|
+
success=False,
|
|
3661
|
+
all_correct=False,
|
|
3662
|
+
correctness_score=0.0,
|
|
3663
|
+
geomean_speedup=0.0,
|
|
3664
|
+
passed_tests=0,
|
|
3665
|
+
total_tests=0,
|
|
3666
|
+
error_message=f"Failed to run Modal sandbox: {e}",
|
|
3667
|
+
)
|
|
3668
|
+
|
|
3669
|
+
# Print output for debugging
|
|
3670
|
+
if stdout:
|
|
3671
|
+
for line in stdout.split("\n"):
|
|
3672
|
+
if not line.startswith("EVAL_RESULT_JSON:"):
|
|
3673
|
+
print(line)
|
|
3674
|
+
if stderr:
|
|
3675
|
+
print(stderr, file=sys.stderr)
|
|
3676
|
+
|
|
3677
|
+
if returncode != 0:
|
|
3678
|
+
return EvaluateResult(
|
|
3679
|
+
success=False,
|
|
3680
|
+
all_correct=False,
|
|
3681
|
+
correctness_score=0.0,
|
|
3682
|
+
geomean_speedup=0.0,
|
|
3683
|
+
passed_tests=0,
|
|
3684
|
+
total_tests=0,
|
|
3685
|
+
error_message=f"Modal sandbox failed (exit {returncode}): {stderr or stdout}",
|
|
3686
|
+
)
|
|
3687
|
+
|
|
3688
|
+
# Parse results from stdout
|
|
3689
|
+
result_json = None
|
|
3690
|
+
for line in stdout.split("\n"):
|
|
3691
|
+
if line.startswith("EVAL_RESULT_JSON:"):
|
|
3692
|
+
result_json = line[len("EVAL_RESULT_JSON:") :]
|
|
3693
|
+
break
|
|
3694
|
+
|
|
3695
|
+
if not result_json:
|
|
3696
|
+
return EvaluateResult(
|
|
3697
|
+
success=False,
|
|
3698
|
+
all_correct=False,
|
|
3699
|
+
correctness_score=0.0,
|
|
3700
|
+
geomean_speedup=0.0,
|
|
3701
|
+
passed_tests=0,
|
|
3702
|
+
total_tests=0,
|
|
3703
|
+
error_message="No results found in Modal output",
|
|
3704
|
+
)
|
|
3705
|
+
|
|
3706
|
+
try:
|
|
3707
|
+
results = json.loads(result_json)
|
|
3708
|
+
except json.JSONDecodeError as e:
|
|
3709
|
+
return EvaluateResult(
|
|
3710
|
+
success=False,
|
|
3711
|
+
all_correct=False,
|
|
3712
|
+
correctness_score=0.0,
|
|
3713
|
+
geomean_speedup=0.0,
|
|
3714
|
+
passed_tests=0,
|
|
3715
|
+
total_tests=0,
|
|
3716
|
+
error_message=f"Failed to parse results JSON: {e}",
|
|
3717
|
+
)
|
|
3718
|
+
|
|
3719
|
+
# Check for error in results
|
|
3720
|
+
if "error" in results and results.get("success") is False:
|
|
3721
|
+
return EvaluateResult(
|
|
3722
|
+
success=False,
|
|
3723
|
+
all_correct=False,
|
|
3724
|
+
correctness_score=0.0,
|
|
3725
|
+
geomean_speedup=0.0,
|
|
3726
|
+
passed_tests=0,
|
|
3727
|
+
total_tests=0,
|
|
3728
|
+
error_message=results.get("error", "Unknown error"),
|
|
3729
|
+
)
|
|
3730
|
+
|
|
3731
|
+
# Extract metrics from results
|
|
3732
|
+
return EvaluateResult(
|
|
3733
|
+
success=True,
|
|
3734
|
+
all_correct=results.get("all_correct", False),
|
|
3735
|
+
correctness_score=float(results.get("correctness_score", 0.0)),
|
|
3736
|
+
geomean_speedup=float(results.get("geomean_speedup", 0.0)),
|
|
3737
|
+
passed_tests=int(results.get("passed_tests", 0)),
|
|
3738
|
+
total_tests=int(results.get("total_tests", 0)),
|
|
3739
|
+
error_message=results.get("error"),
|
|
3740
|
+
test_results=results.get("test_results", []),
|
|
3741
|
+
compilation_time_s=results.get("compilation_time_s"),
|
|
3742
|
+
profiling_stats=results.get("profiling_stats"),
|
|
3743
|
+
)
|
|
3744
|
+
|
|
3745
|
+
|
|
3455
3746
|
async def run_evaluate_kernelbench_docker(
|
|
3456
3747
|
args: KernelBenchEvaluateArgs,
|
|
3457
3748
|
target: BaremetalTarget | VMTarget,
|
|
@@ -4112,6 +4403,7 @@ async def run_evaluate_kernelbench_runpod(
|
|
|
4112
4403
|
# Find Python with PyTorch - check common locations on RunPod
|
|
4113
4404
|
python_exe = "python3"
|
|
4114
4405
|
for candidate in [
|
|
4406
|
+
"/opt/venv/bin/python3",
|
|
4115
4407
|
"/opt/conda/envs/py_3.10/bin/python3",
|
|
4116
4408
|
"/opt/conda/bin/python3",
|
|
4117
4409
|
]:
|
|
@@ -4245,6 +4537,22 @@ async def run_evaluate_kernelbench_runpod(
|
|
|
4245
4537
|
)
|
|
4246
4538
|
|
|
4247
4539
|
|
|
4540
|
+
async def run_evaluate_kernelbench_baremetal_direct(
|
|
4541
|
+
args: KernelBenchEvaluateArgs,
|
|
4542
|
+
target: BaremetalTarget,
|
|
4543
|
+
) -> EvaluateResult:
|
|
4544
|
+
"""Run KernelBench format evaluation directly on NVIDIA target (no Docker).
|
|
4545
|
+
|
|
4546
|
+
For targets that already have PyTorch/CUDA installed (e.g., workspace containers).
|
|
4547
|
+
Uses CUDA_VISIBLE_DEVICES for GPU selection.
|
|
4548
|
+
"""
|
|
4549
|
+
# Reuse the AMD function but with CUDA env vars
|
|
4550
|
+
# The logic is identical, just the GPU env var is different
|
|
4551
|
+
return await _run_evaluate_kernelbench_baremetal_direct_impl(
|
|
4552
|
+
args, target, gpu_env_var="CUDA_VISIBLE_DEVICES"
|
|
4553
|
+
)
|
|
4554
|
+
|
|
4555
|
+
|
|
4248
4556
|
async def run_evaluate_kernelbench_baremetal_amd(
|
|
4249
4557
|
args: KernelBenchEvaluateArgs,
|
|
4250
4558
|
target: BaremetalTarget,
|
|
@@ -4254,6 +4562,20 @@ async def run_evaluate_kernelbench_baremetal_amd(
|
|
|
4254
4562
|
Runs evaluation script directly on host (no Docker) for AMD GPUs
|
|
4255
4563
|
that have PyTorch/ROCm installed.
|
|
4256
4564
|
"""
|
|
4565
|
+
return await _run_evaluate_kernelbench_baremetal_direct_impl(
|
|
4566
|
+
args, target, gpu_env_var="HIP_VISIBLE_DEVICES"
|
|
4567
|
+
)
|
|
4568
|
+
|
|
4569
|
+
|
|
4570
|
+
async def _run_evaluate_kernelbench_baremetal_direct_impl(
|
|
4571
|
+
args: KernelBenchEvaluateArgs,
|
|
4572
|
+
target: BaremetalTarget,
|
|
4573
|
+
gpu_env_var: str = "HIP_VISIBLE_DEVICES",
|
|
4574
|
+
) -> EvaluateResult:
|
|
4575
|
+
"""Internal implementation for direct baremetal evaluation.
|
|
4576
|
+
|
|
4577
|
+
Runs evaluation script directly on host (no Docker).
|
|
4578
|
+
"""
|
|
4257
4579
|
from datetime import datetime
|
|
4258
4580
|
|
|
4259
4581
|
from wafer_core.async_ssh import AsyncSSHClient
|
|
@@ -4404,11 +4726,17 @@ async def run_evaluate_kernelbench_baremetal_amd(
|
|
|
4404
4726
|
|
|
4405
4727
|
eval_cmd = " ".join(python_cmd_parts)
|
|
4406
4728
|
|
|
4407
|
-
# Set environment for
|
|
4408
|
-
|
|
4409
|
-
|
|
4410
|
-
|
|
4411
|
-
|
|
4729
|
+
# Set environment for GPU and run
|
|
4730
|
+
if gpu_env_var == "HIP_VISIBLE_DEVICES":
|
|
4731
|
+
# AMD: PYTORCH_ROCM_ARCH for faster compile
|
|
4732
|
+
rocm_arch = _get_rocm_arch(target.compute_capability)
|
|
4733
|
+
arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
|
|
4734
|
+
env_vars = (
|
|
4735
|
+
f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
|
|
4736
|
+
)
|
|
4737
|
+
else:
|
|
4738
|
+
# NVIDIA: just set CUDA_VISIBLE_DEVICES
|
|
4739
|
+
env_vars = f"CUDA_VISIBLE_DEVICES={gpu_id} PYTHONUNBUFFERED=1"
|
|
4412
4740
|
full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"
|
|
4413
4741
|
|
|
4414
4742
|
# Handle prepare-only mode
|
|
@@ -4559,10 +4887,16 @@ async def run_evaluate_kernelbench(args: KernelBenchEvaluateArgs) -> EvaluateRes
|
|
|
4559
4887
|
elif isinstance(target, RunPodTarget):
|
|
4560
4888
|
# RunPod AMD MI300X - uses ROCm Docker with device passthrough
|
|
4561
4889
|
return await run_evaluate_kernelbench_runpod(args, target)
|
|
4890
|
+
elif isinstance(target, ModalTarget):
|
|
4891
|
+
# Modal serverless - runs in Modal sandbox
|
|
4892
|
+
return await run_evaluate_kernelbench_modal(args, target)
|
|
4562
4893
|
elif isinstance(target, BaremetalTarget | VMTarget):
|
|
4563
4894
|
# Check if this is an AMD target (gfx* compute capability) - run directly
|
|
4564
4895
|
if target.compute_capability and target.compute_capability.startswith("gfx"):
|
|
4565
4896
|
return await run_evaluate_kernelbench_baremetal_amd(args, target)
|
|
4897
|
+
# Check for direct execution flag (workspace containers that already have everything)
|
|
4898
|
+
if getattr(target, "direct", False):
|
|
4899
|
+
return await run_evaluate_kernelbench_baremetal_direct(args, target)
|
|
4566
4900
|
# NVIDIA targets - require docker_image to be set
|
|
4567
4901
|
if not target.docker_image:
|
|
4568
4902
|
return EvaluateResult(
|