wafer-core 0.1.39__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff shows the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in their public registries.
@@ -0,0 +1,636 @@
+ """Benchmark script for CUDA compilation performance.
+
+ This script measures compilation time for kernels of different sizes
+ to track performance improvements from optimizations.
+
+ Usage:
+     python -m wafer_core.tools.compile.benchmark
+
+     # Or with specific test:
+     python -m wafer_core.tools.compile.benchmark --kernel simple
+     python -m wafer_core.tools.compile.benchmark --kernel medium
+     python -m wafer_core.tools.compile.benchmark --kernel complex
+ """
+
+ import argparse
+ import statistics
+ import time
+ from typing import NamedTuple
+
+ # ============================================================================
+ # Test Kernels
+ # ============================================================================
+
+ SIMPLE_KERNEL = """\
+ // Simple vector addition kernel (~20 lines)
+ __global__ void vector_add(float* a, float* b, float* c, int n) {
+     int idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (idx < n) {
+         c[idx] = a[idx] + b[idx];
+     }
+ }
+ """
+
+ MEDIUM_KERNEL = """\
+ // Medium complexity kernel with shared memory (~100 lines)
+ #include <cuda_runtime.h>
+
+ #define TILE_SIZE 16
+
+ __global__ void tiled_matmul(
+     const float* __restrict__ A,
+     const float* __restrict__ B,
+     float* __restrict__ C,
+     int M, int N, int K
+ ) {
+     __shared__ float As[TILE_SIZE][TILE_SIZE];
+     __shared__ float Bs[TILE_SIZE][TILE_SIZE];
+
+     int bx = blockIdx.x, by = blockIdx.y;
+     int tx = threadIdx.x, ty = threadIdx.y;
+
+     int row = by * TILE_SIZE + ty;
+     int col = bx * TILE_SIZE + tx;
+
+     float sum = 0.0f;
+
+     for (int t = 0; t < (K + TILE_SIZE - 1) / TILE_SIZE; t++) {
+         // Load tile from A
+         if (row < M && t * TILE_SIZE + tx < K) {
+             As[ty][tx] = A[row * K + t * TILE_SIZE + tx];
+         } else {
+             As[ty][tx] = 0.0f;
+         }
+
+         // Load tile from B
+         if (t * TILE_SIZE + ty < K && col < N) {
+             Bs[ty][tx] = B[(t * TILE_SIZE + ty) * N + col];
+         } else {
+             Bs[ty][tx] = 0.0f;
+         }
+
+         __syncthreads();
+
+         // Compute partial dot product
+         #pragma unroll
+         for (int k = 0; k < TILE_SIZE; k++) {
+             sum = fmaf(As[ty][k], Bs[k][tx], sum);
+         }
+
+         __syncthreads();
+     }
+
+     if (row < M && col < N) {
+         C[row * N + col] = sum;
+     }
+ }
+
+ // Reduction kernel
+ __global__ void reduce_sum(const float* input, float* output, int n) {
+     extern __shared__ float sdata[];
+
+     unsigned int tid = threadIdx.x;
+     unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
+
+     float mySum = (i < n) ? input[i] : 0.0f;
+     if (i + blockDim.x < n) {
+         mySum += input[i + blockDim.x];
+     }
+     sdata[tid] = mySum;
+     __syncthreads();
+
+     for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
+         if (tid < s) {
+             sdata[tid] = mySum = mySum + sdata[tid + s];
+         }
+         __syncthreads();
+     }
+
+     if (tid < 32) {
+         volatile float* smem = sdata;
+         smem[tid] = mySum = mySum + smem[tid + 32];
+         smem[tid] = mySum = mySum + smem[tid + 16];
+         smem[tid] = mySum = mySum + smem[tid + 8];
+         smem[tid] = mySum = mySum + smem[tid + 4];
+         smem[tid] = mySum = mySum + smem[tid + 2];
+         smem[tid] = mySum = mySum + smem[tid + 1];
+     }
+
+     if (tid == 0) {
+         output[blockIdx.x] = sdata[0];
+     }
+ }
+ """
+
+ COMPLEX_KERNEL = """\
+ // Complex kernel with multiple features (~500 lines)
+ #include <cuda_runtime.h>
+ #include <cooperative_groups.h>
+
+ namespace cg = cooperative_groups;
+
+ // Constants
+ constexpr int BLOCK_SIZE = 256;
+ constexpr int TILE_SIZE = 16;
+
+ // ============================================================================
+ // Kernel 1: Vector operations with shared memory and reduction
+ // ============================================================================
+ template <typename T, int BlockSize>
+ __global__ void reduceSum(const T* __restrict__ input, T* __restrict__ output, int N) {
+     __shared__ T sdata[BlockSize];
+
+     unsigned int tid = threadIdx.x;
+     unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
+
+     T mySum = (i < N) ? input[i] : T(0);
+     if (i + blockDim.x < N) {
+         mySum += input[i + blockDim.x];
+     }
+     sdata[tid] = mySum;
+     __syncthreads();
+
+     #pragma unroll
+     for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
+         if (tid < s) {
+             sdata[tid] = mySum = mySum + sdata[tid + s];
+         }
+         __syncthreads();
+     }
+
+     if (tid < 32) {
+         volatile T* smem = sdata;
+         if (BlockSize >= 64) mySum += smem[tid + 32];
+         smem[tid] = mySum;
+         if (BlockSize >= 32) mySum += smem[tid + 16];
+         smem[tid] = mySum;
+         if (BlockSize >= 16) mySum += smem[tid + 8];
+         smem[tid] = mySum;
+         if (BlockSize >= 8) mySum += smem[tid + 4];
+         smem[tid] = mySum;
+         if (BlockSize >= 4) mySum += smem[tid + 2];
+         smem[tid] = mySum;
+         if (BlockSize >= 2) mySum += smem[tid + 1];
+         smem[tid] = mySum;
+     }
+
+     if (tid == 0) {
+         output[blockIdx.x] = sdata[0];
+     }
+ }
+
+ // ============================================================================
+ // Kernel 2: Matrix transpose with shared memory
+ // ============================================================================
+ __global__ void matrixTranspose(const float* __restrict__ input,
+                                 float* __restrict__ output,
+                                 int width, int height) {
+     __shared__ float tile[TILE_SIZE][TILE_SIZE + 1];
+
+     int xIndex = blockIdx.x * TILE_SIZE + threadIdx.x;
+     int yIndex = blockIdx.y * TILE_SIZE + threadIdx.y;
+
+     if (xIndex < width && yIndex < height) {
+         tile[threadIdx.y][threadIdx.x] = input[yIndex * width + xIndex];
+     }
+
+     __syncthreads();
+
+     xIndex = blockIdx.y * TILE_SIZE + threadIdx.x;
+     yIndex = blockIdx.x * TILE_SIZE + threadIdx.y;
+
+     if (xIndex < height && yIndex < width) {
+         output[yIndex * height + xIndex] = tile[threadIdx.x][threadIdx.y];
+     }
+ }
+
+ // ============================================================================
+ // Kernel 3: Softmax with cooperative groups
+ // ============================================================================
+ __global__ void softmaxKernel(const float* __restrict__ input,
+                               float* __restrict__ output,
+                               int N, int stride) {
+     cg::thread_block block = cg::this_thread_block();
+     cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
+
+     extern __shared__ float shared[];
+
+     int row = blockIdx.x;
+     const float* rowInput = input + row * stride;
+     float* rowOutput = output + row * stride;
+
+     float maxVal = -INFINITY;
+     for (int i = threadIdx.x; i < N; i += blockDim.x) {
+         maxVal = fmaxf(maxVal, rowInput[i]);
+     }
+
+     for (int offset = warp.size() / 2; offset > 0; offset /= 2) {
+         maxVal = fmaxf(maxVal, warp.shfl_down(maxVal, offset));
+     }
+
+     if (warp.thread_rank() == 0) {
+         shared[threadIdx.x / 32] = maxVal;
+     }
+     block.sync();
+
+     if (threadIdx.x < blockDim.x / 32) {
+         maxVal = shared[threadIdx.x];
+     } else {
+         maxVal = -INFINITY;
+     }
+
+     for (int offset = 16; offset > 0; offset /= 2) {
+         maxVal = fmaxf(maxVal, __shfl_down_sync(0xffffffff, maxVal, offset));
+     }
+     maxVal = __shfl_sync(0xffffffff, maxVal, 0);
+
+     float sum = 0.0f;
+     for (int i = threadIdx.x; i < N; i += blockDim.x) {
+         float val = expf(rowInput[i] - maxVal);
+         rowOutput[i] = val;
+         sum += val;
+     }
+
+     for (int offset = warp.size() / 2; offset > 0; offset /= 2) {
+         sum += warp.shfl_down(sum, offset);
+     }
+
+     if (warp.thread_rank() == 0) {
+         shared[threadIdx.x / 32] = sum;
+     }
+     block.sync();
+
+     if (threadIdx.x < blockDim.x / 32) {
+         sum = shared[threadIdx.x];
+     } else {
+         sum = 0.0f;
+     }
+
+     for (int offset = 16; offset > 0; offset /= 2) {
+         sum += __shfl_down_sync(0xffffffff, sum, offset);
+     }
+     sum = __shfl_sync(0xffffffff, sum, 0);
+
+     float invSum = 1.0f / sum;
+     for (int i = threadIdx.x; i < N; i += blockDim.x) {
+         rowOutput[i] *= invSum;
+     }
+ }
+
+ // ============================================================================
+ // Kernel 4: Fused multiply-add with vectorized loads
+ // ============================================================================
+ __global__ void fusedMulAddVec4(const float4* __restrict__ A,
+                                 const float4* __restrict__ B,
+                                 float4* __restrict__ C,
+                                 float alpha, float beta, int N) {
+     int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+     if (idx < N) {
+         float4 a = A[idx];
+         float4 b = B[idx];
+         float4 c;
+
+         c.x = fmaf(alpha, a.x, beta * b.x);
+         c.y = fmaf(alpha, a.y, beta * b.y);
+         c.z = fmaf(alpha, a.z, beta * b.z);
+         c.w = fmaf(alpha, a.w, beta * b.w);
+
+         C[idx] = c;
+     }
+ }
+
+ // ============================================================================
+ // Kernel 5: Simple GEMM with shared memory tiling
+ // ============================================================================
+ __global__ void matmulTiled(const float* __restrict__ A,
+                             const float* __restrict__ B,
+                             float* __restrict__ C,
+                             int M, int N, int K) {
+     __shared__ float As[TILE_SIZE][TILE_SIZE];
+     __shared__ float Bs[TILE_SIZE][TILE_SIZE];
+
+     int row = blockIdx.y * TILE_SIZE + threadIdx.y;
+     int col = blockIdx.x * TILE_SIZE + threadIdx.x;
+
+     float sum = 0.0f;
+
+     for (int t = 0; t < (K + TILE_SIZE - 1) / TILE_SIZE; t++) {
+         int tiledCol = t * TILE_SIZE + threadIdx.x;
+         int tiledRow = t * TILE_SIZE + threadIdx.y;
+
+         As[threadIdx.y][threadIdx.x] = (row < M && tiledCol < K) ?
+             A[row * K + tiledCol] : 0.0f;
+         Bs[threadIdx.y][threadIdx.x] = (tiledRow < K && col < N) ?
+             B[tiledRow * N + col] : 0.0f;
+
+         __syncthreads();
+
+         #pragma unroll
+         for (int k = 0; k < TILE_SIZE; k++) {
+             sum = fmaf(As[threadIdx.y][k], Bs[k][threadIdx.x], sum);
+         }
+
+         __syncthreads();
+     }
+
+     if (row < M && col < N) {
+         C[row * N + col] = sum;
+     }
+ }
+
+ // ============================================================================
+ // Device helper functions
+ // ============================================================================
+ __device__ __forceinline__ float warpReduceSum(float val) {
+     for (int offset = 16; offset > 0; offset /= 2) {
+         val += __shfl_down_sync(0xffffffff, val, offset);
+     }
+     return val;
+ }
+
+ __device__ __forceinline__ float blockReduceSum(float val) {
+     __shared__ float shared[32];
+
+     int lane = threadIdx.x % 32;
+     int wid = threadIdx.x / 32;
+
+     val = warpReduceSum(val);
+
+     if (lane == 0) shared[wid] = val;
+     __syncthreads();
+
+     val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0f;
+
+     if (wid == 0) val = warpReduceSum(val);
+
+     return val;
+ }
+
+ // ============================================================================
+ // Kernel 6: Layer normalization
+ // ============================================================================
+ __global__ void layerNorm(const float* __restrict__ input,
+                           const float* __restrict__ gamma,
+                           const float* __restrict__ beta,
+                           float* __restrict__ output,
+                           int N, float eps) {
+     int row = blockIdx.x;
+     const float* rowInput = input + row * N;
+     float* rowOutput = output + row * N;
+
+     float sum = 0.0f;
+     for (int i = threadIdx.x; i < N; i += blockDim.x) {
+         sum += rowInput[i];
+     }
+     sum = blockReduceSum(sum);
+
+     __shared__ float s_mean, s_var;
+     if (threadIdx.x == 0) {
+         s_mean = sum / N;
+     }
+     __syncthreads();
+
+     float var = 0.0f;
+     for (int i = threadIdx.x; i < N; i += blockDim.x) {
+         float diff = rowInput[i] - s_mean;
+         var += diff * diff;
+     }
+     var = blockReduceSum(var);
+
+     if (threadIdx.x == 0) {
+         s_var = rsqrtf(var / N + eps);
+     }
+     __syncthreads();
+
+     for (int i = threadIdx.x; i < N; i += blockDim.x) {
+         float normalized = (rowInput[i] - s_mean) * s_var;
+         rowOutput[i] = fmaf(normalized, gamma[i], beta[i]);
+     }
+ }
+ """
+
+
+ # ============================================================================
+ # Benchmark Results
+ # ============================================================================
+
+ class BenchmarkResult(NamedTuple):
+     """Result of a single benchmark run."""
+     kernel_name: str
+     kernel_lines: int
+     compile_time_ms: int
+     success: bool
+     ptx_lines: int | None
+     sass_lines: int | None
+     error: str | None
+
+
+ def count_lines(code: str) -> int:
+     """Count non-empty lines in code."""
+     return len([line for line in code.split('\n') if line.strip()])
+
+
+ def run_benchmark(
+     kernel_name: str,
+     kernel_code: str,
+     arch: str = "sm_90a",
+     output_formats: list[str] | None = None,
+     num_runs: int = 3,
+ ) -> list[BenchmarkResult]:
+     """Run benchmark for a kernel.
+
+     Args:
+         kernel_name: Name of the kernel for reporting
+         kernel_code: CUDA source code
+         arch: Target architecture
+         output_formats: Output formats to request (default: ["ptx", "sass"])
+         num_runs: Number of benchmark runs
+
+     Returns:
+         List of BenchmarkResult for each run
+     """
+     import modal
+
+     if output_formats is None:
+         output_formats = ["ptx", "sass"]
+
+     # Get the deployed function
+     compile_fn = modal.Function.from_name("cuda-compile", "compile_cuda")
+
+     kernel_lines = count_lines(kernel_code)
+     results: list[BenchmarkResult] = []
+
+     for run in range(num_runs):
+         print(f" Run {run + 1}/{num_runs}...", end=" ", flush=True)
+
+         start_time = time.time()
+
+         try:
+             result = compile_fn.remote({
+                 "files": {"kernel.cu": kernel_code},
+                 "arch": arch,
+                 "flags": ["-O3", "-lineinfo"],
+                 "output": output_formats,
+             })
+
+             elapsed_ms = int((time.time() - start_time) * 1000)
+
+             if result["success"]:
+                 ptx_lines = count_lines(result["ptx"]) if result.get("ptx") else None
+                 sass_lines = count_lines(result["sass"]) if result.get("sass") else None
+
+                 results.append(BenchmarkResult(
+                     kernel_name=kernel_name,
+                     kernel_lines=kernel_lines,
+                     compile_time_ms=elapsed_ms,
+                     success=True,
+                     ptx_lines=ptx_lines,
+                     sass_lines=sass_lines,
+                     error=None,
+                 ))
+                 print(f"{elapsed_ms}ms")
+             else:
+                 results.append(BenchmarkResult(
+                     kernel_name=kernel_name,
+                     kernel_lines=kernel_lines,
+                     compile_time_ms=elapsed_ms,
+                     success=False,
+                     ptx_lines=None,
+                     sass_lines=None,
+                     error=result.get("stderr", "Unknown error"),
+                 ))
+                 print(f"FAILED ({elapsed_ms}ms)")
+
+         except Exception as e:
+             elapsed_ms = int((time.time() - start_time) * 1000)
+             results.append(BenchmarkResult(
+                 kernel_name=kernel_name,
+                 kernel_lines=kernel_lines,
+                 compile_time_ms=elapsed_ms,
+                 success=False,
+                 ptx_lines=None,
+                 sass_lines=None,
+                 error=str(e),
+             ))
+             print(f"ERROR: {e}")
+
+     return results
+
+
+ def print_summary(results: list[BenchmarkResult]) -> None:
+     """Print benchmark summary."""
+     successful = [r for r in results if r.success]
+
+     if not successful:
+         print("\n No successful runs!")
+         if results:
+             print(f" Error: {results[0].error}")
+         return
+
+     times = [r.compile_time_ms for r in successful]
+     mean_time = statistics.mean(times)
+
+     if len(times) > 1:
+         stdev = statistics.stdev(times)
+         min_time = min(times)
+         max_time = max(times)
+         print(f"\n Results: {mean_time:.0f}ms avg (min: {min_time}ms, max: {max_time}ms, stdev: {stdev:.0f}ms)")
+     else:
+         print(f"\n Results: {mean_time:.0f}ms")
+
+     # Show output sizes
+     if successful[0].ptx_lines:
+         print(f" PTX output: {successful[0].ptx_lines} lines")
+     if successful[0].sass_lines:
+         print(f" SASS output: {successful[0].sass_lines} lines")
+
+
+ def run_all_benchmarks(num_runs: int = 3) -> dict[str, list[BenchmarkResult]]:
+     """Run benchmarks for all kernel sizes."""
+     print("=" * 60)
+     print("CUDA Compilation Benchmark")
+     print("=" * 60)
+
+     kernels = [
+         ("simple", SIMPLE_KERNEL),
+         ("medium", MEDIUM_KERNEL),
+         ("complex", COMPLEX_KERNEL),
+     ]
+
+     all_results: dict[str, list[BenchmarkResult]] = {}
+
+     for name, code in kernels:
+         lines = count_lines(code)
+         print(f"\n{name.upper()} KERNEL ({lines} lines)")
+         print("-" * 40)
+
+         results = run_benchmark(name, code, num_runs=num_runs)
+         all_results[name] = results
+         print_summary(results)
+
+     # Print final summary
+     print("\n" + "=" * 60)
+     print("SUMMARY")
+     print("=" * 60)
+
+     for name in ["simple", "medium", "complex"]:
+         results = all_results.get(name, [])
+         successful = [r for r in results if r.success]
+         if successful:
+             avg_time = statistics.mean([r.compile_time_ms for r in successful])
+             print(f" {name:10s}: {avg_time:6.0f}ms ({results[0].kernel_lines} lines)")
+         else:
+             print(f" {name:10s}: FAILED")
+
+     return all_results
+
+
+ def main() -> None:
+     """Main entry point."""
+     parser = argparse.ArgumentParser(description="Benchmark CUDA compilation")
+     parser.add_argument(
+         "--kernel",
+         choices=["simple", "medium", "complex", "all"],
+         default="all",
+         help="Which kernel to benchmark",
+     )
+     parser.add_argument(
+         "--runs",
+         type=int,
+         default=3,
+         help="Number of benchmark runs per kernel",
+     )
+     parser.add_argument(
+         "--arch",
+         default="sm_90a",
+         help="Target GPU architecture",
+     )
+
+     args = parser.parse_args()
+
+     if args.kernel == "all":
+         run_all_benchmarks(num_runs=args.runs)
+     else:
+         kernel_map = {
+             "simple": SIMPLE_KERNEL,
+             "medium": MEDIUM_KERNEL,
+             "complex": COMPLEX_KERNEL,
+         }
+         code = kernel_map[args.kernel]
+         lines = count_lines(code)
+
+         print(f"\n{args.kernel.upper()} KERNEL ({lines} lines)")
+         print("-" * 40)
+
+         results = run_benchmark(
+             args.kernel,
+             code,
+             arch=args.arch,
+             num_runs=args.runs,
+         )
+         print_summary(results)
+
+
+ if __name__ == "__main__":
+     main()
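
Note: the new benchmark module can also be driven programmatically. A minimal sketch, assuming the "cuda-compile" Modal app is deployed and Modal credentials are configured in the environment (all names come from the file above):

    # Run a single benchmark pass against the deployed compiler.
    from wafer_core.tools.compile.benchmark import SIMPLE_KERNEL, run_benchmark

    results = run_benchmark("simple", SIMPLE_KERNEL, arch="sm_90a", num_runs=1)
    for r in results:
        # Each entry is a BenchmarkResult NamedTuple defined in the module.
        status = f"{r.compile_time_ms}ms" if r.success else f"failed: {r.error}"
        print(f"{r.kernel_name} ({r.kernel_lines} lines): {status}")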
@@ -58,8 +58,8 @@ async def compile_cuda_remote(
  ) -> CompileResponse:
      """Compile CUDA code using Modal (remote execution).

-     This function spawns a subprocess to call Modal, avoiding event loop
-     conflicts between the caller's event loop and Modal's asyncio.
+     This function calls the deployed Modal function directly using asyncio.to_thread
+     to avoid blocking the event loop.

      Args:
          request: The compile request
@@ -70,92 +70,74 @@ async def compile_cuda_remote(
          CompileResponse with PTX/SASS or error
      """
      import asyncio
-     import json
      import os
-     import tempfile
      import time
-     from pathlib import Path
+     from contextlib import contextmanager

-     start_time = time.time()
+     @contextmanager
+     def temporary_env_vars(env_updates: dict[str, str]):
+         """Context manager to temporarily set environment variables.
+
+         Saves original values, sets new values, yields, then restores originals.
+         This ensures we don't leak credentials between concurrent requests.
+         """
+         original_values: dict[str, str | None] = {}
+         for key, value in env_updates.items():
+             original_values[key] = os.environ.get(key)
+             os.environ[key] = value

-     # Write request to temp file
+         try:
+             yield
+         finally:
+             for key, original in original_values.items():
+                 if original is None:
+                     os.environ.pop(key, None)
+                 else:
+                     os.environ[key] = original
+
+     start_time = time.time()
      request_dict = request_to_dict(request)

-     with tempfile.NamedTemporaryFile(
-         mode="w", suffix=".json", delete=False
-     ) as request_file:
-         json.dump(request_dict, request_file)
-         request_path = request_file.name
+     # Build env updates for credentials (only if provided)
+     env_updates: dict[str, str] = {}
+     if modal_token_id:
+         env_updates["MODAL_TOKEN_ID"] = modal_token_id
+     if modal_token_secret:
+         env_updates["MODAL_TOKEN_SECRET"] = modal_token_secret
+
+     def call_modal() -> dict:
+         """Call Modal function synchronously (runs in thread pool)."""
+         import modal
+
+         # Look up the deployed function
+         compile_fn = modal.Function.from_name("cuda-compile", "compile_cuda")
+
+         # Call the function remotely
+         return compile_fn.remote(request_dict)

      try:
-         # Create a Python script that calls Modal using Function.lookup
-         # This calls the deployed function without needing to rebuild the image
-         script = f'''
- import json
- import modal
-
- # Load request
- with open("{request_path}") as f:
-     request = json.load(f)
-
- # Look up the deployed function
- compile_fn = modal.Function.from_name("cuda-compile", "compile_cuda")
-
- # Call the function remotely
- result = compile_fn.remote(request)
-
- # Output result as JSON
- print(json.dumps(result))
- '''
-
-         # Run in subprocess to avoid event loop conflicts
-         env = os.environ.copy()
-         if modal_token_id:
-             env["MODAL_TOKEN_ID"] = modal_token_id
-         if modal_token_secret:
-             env["MODAL_TOKEN_SECRET"] = modal_token_secret
-
-         # Use the same Python interpreter that's running this code
-         import sys
-         python_executable = sys.executable
-
-         # Use asyncio.create_subprocess_exec for async subprocess execution
-         proc = await asyncio.create_subprocess_exec(
-             python_executable, "-c", script,
-             stdout=asyncio.subprocess.PIPE,
-             stderr=asyncio.subprocess.PIPE,
-             env=env,
-         )
-         stdout_bytes, stderr_bytes = await proc.communicate()
+         # Run Modal call in thread pool with temporary credentials
+         # The context manager ensures env vars are restored after the call
+         def call_modal_with_env() -> dict:
+             with temporary_env_vars(env_updates):
+                 return call_modal()

-         if proc.returncode != 0:
-             stderr = stderr_bytes.decode() if stderr_bytes else "Unknown error"
-             # Check for common Modal auth errors
-             if "MODAL_TOKEN" in stderr or "AuthError" in stderr or "not authenticated" in stderr.lower():
-                 return CompileResponse.error(
-                     "Modal not configured. Set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET environment variables, "
-                     "or run 'modal token new' to authenticate.",
-                     compilation_time_ms=int((time.time() - start_time) * 1000),
-                 )
-             return CompileResponse.error(
-                 f"Compilation failed: {stderr}",
-                 compilation_time_ms=int((time.time() - start_time) * 1000),
-             )
+         result = await asyncio.to_thread(call_modal_with_env)
+         return response_from_dict(result)

-         # Parse result
-         stdout = stdout_bytes.decode() if stdout_bytes else "{}"
-         try:
-             response_dict = json.loads(stdout)
-             return response_from_dict(response_dict)
-         except json.JSONDecodeError as e:
+     except Exception as e:
+         error_str = str(e)
+         # Check for common Modal auth errors
+         if "MODAL_TOKEN" in error_str or "AuthError" in error_str or "not authenticated" in error_str.lower():
              return CompileResponse.error(
-                 f"Failed to parse Modal response: {e}\nOutput: {stdout[:500]}",
+                 "Modal not configured. Set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET environment variables, "
+                 "or run 'modal token new' to authenticate.",
                  compilation_time_ms=int((time.time() - start_time) * 1000),
              )
-
-     finally:
-         # Clean up temp file
-         Path(request_path).unlink(missing_ok=True)
+         return CompileResponse.error(
+             f"Compilation failed: {error_str}",
+             compilation_time_ms=int((time.time() - start_time) * 1000),
+         )


  def compile_cuda_local(request: CompileRequest) -> CompileResponse:
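
Note: the replacement above trades the subprocess hop for asyncio.to_thread plus a save-and-restore wrapper around os.environ. A self-contained sketch of that pattern; fetch_remote and EXAMPLE_TOKEN are illustrative stand-ins, not wafer-core APIs:

    import asyncio
    import os
    from contextlib import contextmanager
    from typing import Iterator

    @contextmanager
    def temporary_env_vars(env_updates: dict[str, str]) -> Iterator[None]:
        # Save originals, apply overrides, restore on exit.
        originals = {key: os.environ.get(key) for key in env_updates}
        os.environ.update(env_updates)
        try:
            yield
        finally:
            for key, original in originals.items():
                if original is None:
                    os.environ.pop(key, None)
                else:
                    os.environ[key] = original

    def fetch_remote() -> str:
        # Stand-in for a blocking SDK call that reads credentials from the env.
        return os.environ["EXAMPLE_TOKEN"]

    async def main() -> None:
        def call_with_env() -> str:
            with temporary_env_vars({"EXAMPLE_TOKEN": "abc123"}):
                return fetch_remote()

        # to_thread runs the blocking call on a worker thread, keeping the
        # event loop responsive.
        print(await asyncio.to_thread(call_with_env))

    asyncio.run(main())

One caveat on the design: os.environ is process-wide, so the context manager bounds the lifetime of an override but does not isolate overlapping calls that touch the same variables.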
@@ -197,7 +179,12 @@ def compile_cuda_local(request: CompileRequest) -> CompileResponse:

          # Write all files to temp directory
          for filename, content in request.files.items():
-             file_path = tmp_path / filename
+             file_path = (tmp_path / filename).resolve()
+             if not file_path.is_relative_to(tmp_path):
+                 return CompileResponse.error(
+                     f"Invalid filename: {filename}",
+                     compilation_time_ms=int((time.time() - start_time) * 1000),
+                 )
              file_path.parent.mkdir(parents=True, exist_ok=True)
              file_path.write_text(content)

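Note: the resolve-then-check guard added here (and mirrored in the Modal function further down) rejects path traversal in user-supplied filenames. A quick illustration of the check; Path.is_relative_to requires Python 3.9+:

    import tempfile
    from pathlib import Path

    with tempfile.TemporaryDirectory() as tmpdir:
        # Resolve the base too, in case the temp dir itself sits behind a symlink.
        tmp_path = Path(tmpdir).resolve()
        for filename in ("kernel.cu", "sub/helper.cuh", "../escape.cu"):
            file_path = (tmp_path / filename).resolve()
            print(f"{filename!r} allowed: {file_path.is_relative_to(tmp_path)}")

The first two names stay inside the sandbox directory and print True; the traversal attempt resolves outside it and prints False.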
@@ -79,7 +79,10 @@ app = modal.App(name="cuda-compile", image=compile_image)
      cpu=4,
      memory=8192,  # 8GB RAM
      timeout=120,  # 2 minute timeout
+     # Keep one container warm to avoid cold starts (~5-10s savings)
+     min_containers=1,
  )
+ @modal.concurrent(max_inputs=4)  # Allow concurrent compilations for better throughput
  def compile_cuda(request: dict) -> dict:
      """Compile CUDA code and return PTX/SASS.

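Note: assembled for readability, the decorator stack on the Modal function now reads roughly as below. compile_image stands in for the package's image definition, which lies outside this hunk:

    import modal

    compile_image = modal.Image.debian_slim()  # stand-in; real image elided
    app = modal.App(name="cuda-compile", image=compile_image)

    @app.function(
        cpu=4,
        memory=8192,       # 8GB RAM
        timeout=120,       # 2 minute timeout
        min_containers=1,  # keep one container warm to avoid cold starts
    )
    @modal.concurrent(max_inputs=4)  # concurrent compilations per container
    def compile_cuda(request: dict) -> dict:
        ...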
@@ -105,6 +108,7 @@ def compile_cuda(request: dict) -> dict:
      import subprocess
      import tempfile
      import time
+     from concurrent.futures import ThreadPoolExecutor, as_completed
      from pathlib import Path

      start_time = time.time()
@@ -138,13 +142,92 @@ def compile_cuda(request: dict) -> dict:

      main_cu_file = cu_files[0]

+     # Build environment for nvcc
+     nvcc_env = {
+         **os.environ,
+         "CUDA_HOME": "/usr/local/cuda",
+         "PATH": f"/usr/local/cuda/bin:{os.environ.get('PATH', '')}",
+     }
+
+     def compile_ptx(tmpdir: str, base_cmd: list[str], main_cu_path: Path) -> tuple[str | None, str | None]:
+         """Compile to PTX. Returns (ptx_content, error_message)."""
+         ptx_output = Path(tmpdir) / "output.ptx"
+         ptx_cmd = base_cmd + [
+             "--ptx",
+             "-o",
+             str(ptx_output),
+             str(main_cu_path),
+         ]
+
+         ptx_result = subprocess.run(
+             ptx_cmd,
+             capture_output=True,
+             text=True,
+             timeout=60,
+             cwd=tmpdir,
+             env=nvcc_env,
+         )
+
+         if ptx_result.returncode != 0:
+             return None, ptx_result.stderr or ptx_result.stdout
+
+         if ptx_output.exists():
+             return ptx_output.read_text(), None
+         return None, "PTX output file not created"
+
+     def compile_sass(tmpdir: str, base_cmd: list[str], main_cu_path: Path) -> tuple[str | None, str | None]:
+         """Compile to SASS (via cubin). Returns (sass_content, error_message)."""
+         cubin_output = Path(tmpdir) / "output.cubin"
+         cubin_cmd = base_cmd + [
+             "--cubin",
+             "-o",
+             str(cubin_output),
+             str(main_cu_path),
+         ]
+
+         cubin_result = subprocess.run(
+             cubin_cmd,
+             capture_output=True,
+             text=True,
+             timeout=60,
+             cwd=tmpdir,
+             env=nvcc_env,
+         )
+
+         if cubin_result.returncode != 0:
+             return None, cubin_result.stderr or cubin_result.stdout
+
+         if not cubin_output.exists():
+             return None, "cubin output file not created"
+
+         # Disassemble cubin to SASS
+         sass_result = subprocess.run(
+             ["cuobjdump", "--dump-sass", str(cubin_output)],
+             capture_output=True,
+             text=True,
+             timeout=30,
+             cwd=tmpdir,
+         )
+
+         if sass_result.returncode == 0:
+             return sass_result.stdout, None
+         return None, f"SASS disassembly failed: {sass_result.stderr}"
+
      try:
          with tempfile.TemporaryDirectory() as tmpdir:
              tmp_path = Path(tmpdir)

              # Write all files to temp directory, preserving subdirectory structure
              for filename, content in files.items():
-                 file_path = tmp_path / filename
+                 file_path = (tmp_path / filename).resolve()
+                 if not file_path.is_relative_to(tmp_path):
+                     return {
+                         "success": False,
+                         "ptx": None,
+                         "sass": None,
+                         "stderr": f"Invalid filename: {filename}",
+                         "compilation_time_ms": int((time.time() - start_time) * 1000),
+                     }
                  file_path.parent.mkdir(parents=True, exist_ok=True)
                  file_path.write_text(content)

@@ -152,151 +235,78 @@ def compile_cuda(request: dict) -> dict:
              main_cu_path = tmp_path / main_cu_file
              include_dir = main_cu_path.parent

-             results: dict[str, str | None] = {"ptx": None, "sass": None}
-
              # Build base nvcc command with common flags
              base_cmd = [
                  "nvcc",
                  "-arch",
                  arch,
-                 # Include the temp directory for user headers
                  f"-I{include_dir}",
-                 # Include PyTorch headers
                  "-I/usr/local/lib/python3.12/site-packages/torch/include",
                  "-I/usr/local/lib/python3.12/site-packages/torch/include/torch/csrc/api/include",
-                 # Include CUTLASS headers
                  "-I/usr/local/cutlass/include",
-                 # Standard CUDA headers are already in the default path
              ]
-
-             # Add user-specified flags
              base_cmd.extend(flags)

-             # Generate PTX if requested
-             if OutputFormat.PTX.value in output_formats:
-                 ptx_output = tmp_path / "output.ptx"
-                 ptx_cmd = base_cmd + [
-                     "--ptx",  # Generate PTX
-                     "-o",
-                     str(ptx_output),
-                     str(main_cu_path),
-                 ]
-
-                 ptx_result = subprocess.run(
-                     ptx_cmd,
-                     capture_output=True,
-                     text=True,
-                     timeout=60,
-                     cwd=tmpdir,
-                     env={
-                         **os.environ,
-                         "CUDA_HOME": "/usr/local/cuda",
-                         "PATH": f"/usr/local/cuda/bin:{os.environ.get('PATH', '')}",
-                     },
-                 )
-
-                 if ptx_result.returncode != 0:
-                     return {
-                         "success": False,
-                         "ptx": None,
-                         "sass": None,
-                         "stderr": ptx_result.stderr or ptx_result.stdout,
-                         "compilation_time_ms": int((time.time() - start_time) * 1000),
-                     }
+             # Determine what to compile
+             want_ptx = OutputFormat.PTX.value in output_formats
+             want_sass = OutputFormat.SASS.value in output_formats

-                 if ptx_output.exists():
-                     results["ptx"] = ptx_output.read_text()
-
-             # Generate SASS if requested
-             if OutputFormat.SASS.value in output_formats:
-                 # First compile to cubin, then disassemble to SASS
-                 cubin_output = tmp_path / "output.cubin"
-                 cubin_cmd = base_cmd + [
-                     "--cubin",  # Generate cubin (binary)
-                     "-o",
-                     str(cubin_output),
-                     str(main_cu_path),
-                 ]
-
-                 cubin_result = subprocess.run(
-                     cubin_cmd,
-                     capture_output=True,
-                     text=True,
-                     timeout=60,
-                     cwd=tmpdir,
-                     env={
-                         **os.environ,
-                         "CUDA_HOME": "/usr/local/cuda",
-                         "PATH": f"/usr/local/cuda/bin:{os.environ.get('PATH', '')}",
-                     },
-                 )
-
-                 if cubin_result.returncode != 0:
-                     # If we already have PTX, that's a partial success
-                     if results["ptx"]:
-                         return {
-                             "success": True,
-                             "ptx": results["ptx"],
-                             "sass": None,
-                             "stderr": f"SASS generation failed: {cubin_result.stderr}",
-                             "compilation_time_ms": int(
-                                 (time.time() - start_time) * 1000
-                             ),
-                         }
-                     return {
-                         "success": False,
-                         "ptx": None,
-                         "sass": None,
-                         "stderr": cubin_result.stderr or cubin_result.stdout,
-                         "compilation_time_ms": int((time.time() - start_time) * 1000),
+             results: dict[str, str | None] = {"ptx": None, "sass": None}
+             errors: list[str] = []
+
+             # Run compilations in parallel if both are requested
+             if want_ptx and want_sass:
+                 with ThreadPoolExecutor(max_workers=2) as executor:
+                     futures = {
+                         executor.submit(compile_ptx, tmpdir, base_cmd, main_cu_path): "ptx",
+                         executor.submit(compile_sass, tmpdir, base_cmd, main_cu_path): "sass",
                      }

-                 # Disassemble cubin to SASS using cuobjdump
-                 if cubin_output.exists():
-                     sass_cmd = [
-                         "cuobjdump",
-                         "--dump-sass",
-                         str(cubin_output),
-                     ]
-
-                     sass_result = subprocess.run(
-                         sass_cmd,
-                         capture_output=True,
-                         text=True,
-                         timeout=30,
-                         cwd=tmpdir,
-                     )
-
-                     if sass_result.returncode == 0:
-                         results["sass"] = sass_result.stdout
-                     else:
-                         # SASS generation failed but we might have PTX
-                         if results["ptx"]:
-                             return {
-                                 "success": True,
-                                 "ptx": results["ptx"],
-                                 "sass": None,
-                                 "stderr": f"SASS disassembly failed: {sass_result.stderr}",
-                                 "compilation_time_ms": int(
-                                     (time.time() - start_time) * 1000
-                                 ),
-                             }
-
-             # Check if we got any output
+                     for future in as_completed(futures):
+                         output_type = futures[future]
+                         try:
+                             content, error = future.result()
+                             if content:
+                                 results[output_type] = content
+                             if error:
+                                 errors.append(f"{output_type.upper()}: {error}")
+                         except Exception as e:
+                             errors.append(f"{output_type.upper()} compilation error: {e}")
+
+             elif want_ptx:
+                 content, error = compile_ptx(tmpdir, base_cmd, main_cu_path)
+                 if content:
+                     results["ptx"] = content
+                 if error:
+                     errors.append(error)
+
+             elif want_sass:
+                 content, error = compile_sass(tmpdir, base_cmd, main_cu_path)
+                 if content:
+                     results["sass"] = content
+                 if error:
+                     errors.append(error)
+
+             # Check results
              if not results["ptx"] and not results["sass"]:
                  return {
                      "success": False,
                      "ptx": None,
                      "sass": None,
-                     "stderr": "No output generated",
+                     "stderr": "\n".join(errors) if errors else "No output generated",
                      "compilation_time_ms": int((time.time() - start_time) * 1000),
                  }

+             # Partial success if we got at least one output
+             stderr = ""
+             if errors and (results["ptx"] or results["sass"]):
+                 stderr = "\n".join(errors)
+
              return {
                  "success": True,
                  "ptx": results["ptx"],
                  "sass": results["sass"],
-                 "stderr": "",
+                 "stderr": stderr,
                  "compilation_time_ms": int((time.time() - start_time) * 1000),
              }

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: wafer-core
- Version: 0.1.39
+ Version: 0.1.40
  Summary: Core utilities and environments for Wafer GPU kernel optimization
  Requires-Python: >=3.10
  Requires-Dist: aiohttp>=3.9.0
@@ -645,8 +645,9 @@ wafer_core/tools/capture_tool/dtypes.py,sha256=1Vm5obOCYc-Njuwkp7uqh_W4lqtYurT3b
  wafer_core/tools/capture_tool/executor.py,sha256=n1DVfbsP60yJAazx9C9Kwed9LB7AcKXJcoDnhno7ydU,1495
  wafer_core/tools/capture_tool/metrics.py,sha256=BFZNmdE-kh3LneYdWXTNZmlLuo-DCrP5aEBHxEQYJDU,10890
  wafer_core/tools/compile/__init__.py,sha256=8VyaMDDPxg4DcT-rwMf9lcNhAanWnmsqijUJYsuzJNg,615
- wafer_core/tools/compile/compiler.py,sha256=rGPvfqLTg-7y3hyFEihF6lxiEOfbIsRwfvOZSaVJ2_A,10192
- wafer_core/tools/compile/modal_compile.py,sha256=zYrkAtGYkDiM6tJfH_hD-mJ0LqCW5HCSsf_6fADJIbI,13310
+ wafer_core/tools/compile/benchmark.py,sha256=6_nfhl24vTWt59EwGievbyMHZK2l4wfslP77BHWsoQ4,19408
+ wafer_core/tools/compile/compiler.py,sha256=Y7iwfQkSBc4fmKXpv97ce1grw5L4tJ_VqWFFyYolRAg,10054
+ wafer_core/tools/compile/modal_compile.py,sha256=lYMxdrvEQctA1Om6yESetjUAsSyv0W0evNVb8WOY2Ps,13384
  wafer_core/tools/compile/types.py,sha256=8Hjh6Mz2a7s2JjtKYQq-l3X41gmywnbKk3tc1wvbMLM,3277
  wafer_core/tools/compile/tests/__init__.py,sha256=gSuBMN-7VayQ9HgyNuUXRumenwk7jtq86ZxdCgFjeYE,41
  wafer_core/tools/compile/tests/test_compiler.py,sha256=kQ-YTLY8ETnS83nQ8xVSygKY532epxqRTsGx311SG7w,20795
@@ -722,6 +723,6 @@ wafer_core/utils/modal_execution/modal_app.py,sha256=VfS2cX8gHtnlPXemmMcEwDPeQdh
  wafer_core/utils/modal_execution/modal_config.py,sha256=7cGX9TGqilQ3qxI3OFGXV5orjtyRU-PEDOJ4vP2oxno,4421
  wafer_core/utils/modal_execution/modal_execution.py,sha256=gChjnV6jqA3A7IRP3DfvV5cSfm_MN0X4f7JZufXgdZE,24594
  wafer_core/utils/modal_execution/test_modal.py,sha256=_jqou_hrLs1Daf1590Pnb0a_lXMMa2rczAPpW9HpoNQ,8153
- wafer_core-0.1.39.dist-info/METADATA,sha256=OcMn8TZzsUvPT2JBa0xYK_sAT_og1PAZd-DpDcLG1XA,1477
- wafer_core-0.1.39.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- wafer_core-0.1.39.dist-info/RECORD,,
+ wafer_core-0.1.40.dist-info/METADATA,sha256=yCfawhvfbqAmwkjDxe7GaIRD8LB6L37DR6-XlGGzevs,1477
+ wafer_core-0.1.40.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ wafer_core-0.1.40.dist-info/RECORD,,