PyPI - wafer-core - Versions diffs - 0.1.38__py3-none-any.whl → 0.1.40__py3-none-any.whl - Mend

wafer-core 0.1.38py3-none-any.whl → 0.1.40py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

wafer_core/lib/trace_compare/fusion_analyzer.py +2 -0
wafer_core/rollouts/_logging/__init__.py +5 -1
wafer_core/rollouts/_logging/logging_config.py +95 -3
wafer_core/rollouts/_logging/sample_handler.py +66 -0
wafer_core/rollouts/_pytui/__init__.py +114 -0
wafer_core/rollouts/_pytui/app.py +809 -0
wafer_core/rollouts/_pytui/console.py +291 -0
wafer_core/rollouts/_pytui/renderer.py +210 -0
wafer_core/rollouts/_pytui/spinner.py +73 -0
wafer_core/rollouts/_pytui/terminal.py +489 -0
wafer_core/rollouts/_pytui/text.py +470 -0
wafer_core/rollouts/_pytui/theme.py +241 -0
wafer_core/rollouts/evaluation.py +142 -177
wafer_core/rollouts/progress_app.py +395 -0
wafer_core/rollouts/tui/DESIGN.md +251 -115
wafer_core/rollouts/tui/monitor.py +64 -20
wafer_core/tools/compile/__init__.py +30 -0
wafer_core/tools/compile/benchmark.py +636 -0
wafer_core/tools/compile/compiler.py +301 -0
wafer_core/tools/compile/modal_compile.py +369 -0
wafer_core/tools/compile/tests/__init__.py +1 -0
wafer_core/tools/compile/tests/test_compiler.py +675 -0
wafer_core/tools/compile/tests/test_data/utils.cuh +10 -0
wafer_core/tools/compile/tests/test_data/vector_add.cu +7 -0
wafer_core/tools/compile/tests/test_data/with_header.cu +9 -0
wafer_core/tools/compile/tests/test_modal_integration.py +326 -0
wafer_core/tools/compile/types.py +117 -0
{wafer_core-0.1.38.dist-info → wafer_core-0.1.40.dist-info}/METADATA +1 -1
{wafer_core-0.1.38.dist-info → wafer_core-0.1.40.dist-info}/RECORD +30 -12
wafer_core/rollouts/events.py +0 -240
wafer_core/rollouts/progress_display.py +0 -476
wafer_core/utils/event_streaming.py +0 -63
{wafer_core-0.1.38.dist-info → wafer_core-0.1.40.dist-info}/WHEEL +0 -0

wafer_core/tools/compile/benchmark.py ADDED Viewed

@@ -0,0 +1,636 @@
+"""Benchmark script for CUDA compilation performance.
+This script measures compilation time for kernels of different sizes
+to track performance improvements from optimizations.
+Usage:
+    python -m wafer_core.tools.compile.benchmark
+    # Or with specific test:
+    python -m wafer_core.tools.compile.benchmark --kernel simple
+    python -m wafer_core.tools.compile.benchmark --kernel medium
+    python -m wafer_core.tools.compile.benchmark --kernel complex
+"""
+import argparse
+import statistics
+import time
+from typing import NamedTuple
+# ============================================================================
+# Test Kernels
+# ============================================================================
+SIMPLE_KERNEL = """\
+// Simple vector addition kernel (~20 lines)
+__global__ void vector_add(float* a, float* b, float* c, int n) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < n) {
+        c[idx] = a[idx] + b[idx];
+    }
+}
+"""
+MEDIUM_KERNEL = """\
+// Medium complexity kernel with shared memory (~100 lines)
+#include <cuda_runtime.h>
+#define TILE_SIZE 16
+__global__ void tiled_matmul(
+    const float* __restrict__ A,
+    const float* __restrict__ B,
+    float* __restrict__ C,
+    int M, int N, int K
+) {
+    __shared__ float As[TILE_SIZE][TILE_SIZE];
+    __shared__ float Bs[TILE_SIZE][TILE_SIZE];
+    int bx = blockIdx.x, by = blockIdx.y;
+    int tx = threadIdx.x, ty = threadIdx.y;
+    int row = by * TILE_SIZE + ty;
+    int col = bx * TILE_SIZE + tx;
+    float sum = 0.0f;
+    for (int t = 0; t < (K + TILE_SIZE - 1) / TILE_SIZE; t++) {
+        // Load tile from A
+        if (row < M && t * TILE_SIZE + tx < K) {
+            As[ty][tx] = A[row * K + t * TILE_SIZE + tx];
+        } else {
+            As[ty][tx] = 0.0f;
+        }
+        // Load tile from B
+        if (t * TILE_SIZE + ty < K && col < N) {
+            Bs[ty][tx] = B[(t * TILE_SIZE + ty) * N + col];
+        } else {
+            Bs[ty][tx] = 0.0f;
+        }
+        __syncthreads();
+        // Compute partial dot product
+        #pragma unroll
+        for (int k = 0; k < TILE_SIZE; k++) {
+            sum = fmaf(As[ty][k], Bs[k][tx], sum);
+        }
+        __syncthreads();
+    }
+    if (row < M && col < N) {
+        C[row * N + col] = sum;
+    }
+}
+// Reduction kernel
+__global__ void reduce_sum(const float* input, float* output, int n) {
+    extern __shared__ float sdata[];
+    unsigned int tid = threadIdx.x;
+    unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
+    float mySum = (i < n) ? input[i] : 0.0f;
+    if (i + blockDim.x < n) {
+        mySum += input[i + blockDim.x];
+    }
+    sdata[tid] = mySum;
+    __syncthreads();
+    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
+        if (tid < s) {
+            sdata[tid] = mySum = mySum + sdata[tid + s];
+        }
+        __syncthreads();
+    }
+    if (tid < 32) {
+        volatile float* smem = sdata;
+        smem[tid] = mySum = mySum + smem[tid + 32];
+        smem[tid] = mySum = mySum + smem[tid + 16];
+        smem[tid] = mySum = mySum + smem[tid + 8];
+        smem[tid] = mySum = mySum + smem[tid + 4];
+        smem[tid] = mySum = mySum + smem[tid + 2];
+        smem[tid] = mySum = mySum + smem[tid + 1];
+    }
+    if (tid == 0) {
+        output[blockIdx.x] = sdata[0];
+    }
+}
+"""
+COMPLEX_KERNEL = """\
+// Complex kernel with multiple features (~500 lines)
+#include <cuda_runtime.h>
+#include <cooperative_groups.h>
+namespace cg = cooperative_groups;
+// Constants
+constexpr int BLOCK_SIZE = 256;
+constexpr int TILE_SIZE = 16;
+// ============================================================================
+// Kernel 1: Vector operations with shared memory and reduction
+// ============================================================================
+template <typename T, int BlockSize>
+__global__ void reduceSum(const T* __restrict__ input, T* __restrict__ output, int N) {
+    __shared__ T sdata[BlockSize];
+    unsigned int tid = threadIdx.x;
+    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
+    T mySum = (i < N) ? input[i] : T(0);
+    if (i + blockDim.x < N) {
+        mySum += input[i + blockDim.x];
+    }
+    sdata[tid] = mySum;
+    __syncthreads();
+    #pragma unroll
+    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
+        if (tid < s) {
+            sdata[tid] = mySum = mySum + sdata[tid + s];
+        }
+        __syncthreads();
+    }
+    if (tid < 32) {
+        volatile T* smem = sdata;
+        if (BlockSize >= 64) mySum += smem[tid + 32];
+        smem[tid] = mySum;
+        if (BlockSize >= 32) mySum += smem[tid + 16];
+        smem[tid] = mySum;
+        if (BlockSize >= 16) mySum += smem[tid + 8];
+        smem[tid] = mySum;
+        if (BlockSize >= 8) mySum += smem[tid + 4];
+        smem[tid] = mySum;
+        if (BlockSize >= 4) mySum += smem[tid + 2];
+        smem[tid] = mySum;
+        if (BlockSize >= 2) mySum += smem[tid + 1];
+        smem[tid] = mySum;
+    }
+    if (tid == 0) {
+        output[blockIdx.x] = sdata[0];
+    }
+}
+// ============================================================================
+// Kernel 2: Matrix transpose with shared memory
+// ============================================================================
+__global__ void matrixTranspose(const float* __restrict__ input,
+                                float* __restrict__ output,
+                                int width, int height) {
+    __shared__ float tile[TILE_SIZE][TILE_SIZE + 1];
+    int xIndex = blockIdx.x * TILE_SIZE + threadIdx.x;
+    int yIndex = blockIdx.y * TILE_SIZE + threadIdx.y;
+    if (xIndex < width && yIndex < height) {
+        tile[threadIdx.y][threadIdx.x] = input[yIndex * width + xIndex];
+    }
+    __syncthreads();
+    xIndex = blockIdx.y * TILE_SIZE + threadIdx.x;
+    yIndex = blockIdx.x * TILE_SIZE + threadIdx.y;
+    if (xIndex < height && yIndex < width) {
+        output[yIndex * height + xIndex] = tile[threadIdx.x][threadIdx.y];
+    }
+}
+// ============================================================================
+// Kernel 3: Softmax with cooperative groups
+// ============================================================================
+__global__ void softmaxKernel(const float* __restrict__ input,
+                              float* __restrict__ output,
+                              int N, int stride) {
+    cg::thread_block block = cg::this_thread_block();
+    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
+    extern __shared__ float shared[];
+    int row = blockIdx.x;
+    const float* rowInput = input + row * stride;
+    float* rowOutput = output + row * stride;
+    float maxVal = -INFINITY;
+    for (int i = threadIdx.x; i < N; i += blockDim.x) {
+        maxVal = fmaxf(maxVal, rowInput[i]);
+    }
+    for (int offset = warp.size() / 2; offset > 0; offset /= 2) {
+        maxVal = fmaxf(maxVal, warp.shfl_down(maxVal, offset));
+    }
+    if (warp.thread_rank() == 0) {
+        shared[threadIdx.x / 32] = maxVal;
+    }
+    block.sync();
+    if (threadIdx.x < blockDim.x / 32) {
+        maxVal = shared[threadIdx.x];
+    } else {
+        maxVal = -INFINITY;
+    }
+    for (int offset = 16; offset > 0; offset /= 2) {
+        maxVal = fmaxf(maxVal, __shfl_down_sync(0xffffffff, maxVal, offset));
+    }
+    maxVal = __shfl_sync(0xffffffff, maxVal, 0);
+    float sum = 0.0f;
+    for (int i = threadIdx.x; i < N; i += blockDim.x) {
+        float val = expf(rowInput[i] - maxVal);
+        rowOutput[i] = val;
+        sum += val;
+    }
+    for (int offset = warp.size() / 2; offset > 0; offset /= 2) {
+        sum += warp.shfl_down(sum, offset);
+    }
+    if (warp.thread_rank() == 0) {
+        shared[threadIdx.x / 32] = sum;
+    }
+    block.sync();
+    if (threadIdx.x < blockDim.x / 32) {
+        sum = shared[threadIdx.x];
+    } else {
+        sum = 0.0f;
+    }
+    for (int offset = 16; offset > 0; offset /= 2) {
+        sum += __shfl_down_sync(0xffffffff, sum, offset);
+    }
+    sum = __shfl_sync(0xffffffff, sum, 0);
+    float invSum = 1.0f / sum;
+    for (int i = threadIdx.x; i < N; i += blockDim.x) {
+        rowOutput[i] *= invSum;
+    }
+}
+// ============================================================================
+// Kernel 4: Fused multiply-add with vectorized loads
+// ============================================================================
+__global__ void fusedMulAddVec4(const float4* __restrict__ A,
+                                 const float4* __restrict__ B,
+                                 float4* __restrict__ C,
+                                 float alpha, float beta, int N) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < N) {
+        float4 a = A[idx];
+        float4 b = B[idx];
+        float4 c;
+        c.x = fmaf(alpha, a.x, beta * b.x);
+        c.y = fmaf(alpha, a.y, beta * b.y);
+        c.z = fmaf(alpha, a.z, beta * b.z);
+        c.w = fmaf(alpha, a.w, beta * b.w);
+        C[idx] = c;
+    }
+}
+// ============================================================================
+// Kernel 5: Simple GEMM with shared memory tiling
+// ============================================================================
+__global__ void matmulTiled(const float* __restrict__ A,
+                            const float* __restrict__ B,
+                            float* __restrict__ C,
+                            int M, int N, int K) {
+    __shared__ float As[TILE_SIZE][TILE_SIZE];
+    __shared__ float Bs[TILE_SIZE][TILE_SIZE];
+    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
+    int col = blockIdx.x * TILE_SIZE + threadIdx.x;
+    float sum = 0.0f;
+    for (int t = 0; t < (K + TILE_SIZE - 1) / TILE_SIZE; t++) {
+        int tiledCol = t * TILE_SIZE + threadIdx.x;
+        int tiledRow = t * TILE_SIZE + threadIdx.y;
+        As[threadIdx.y][threadIdx.x] = (row < M && tiledCol < K) ?
+                                        A[row * K + tiledCol] : 0.0f;
+        Bs[threadIdx.y][threadIdx.x] = (tiledRow < K && col < N) ?
+                                        B[tiledRow * N + col] : 0.0f;
+        __syncthreads();
+        #pragma unroll
+        for (int k = 0; k < TILE_SIZE; k++) {
+            sum = fmaf(As[threadIdx.y][k], Bs[k][threadIdx.x], sum);
+        }
+        __syncthreads();
+    }
+    if (row < M && col < N) {
+        C[row * N + col] = sum;
+    }
+}
+// ============================================================================
+// Device helper functions
+// ============================================================================
+__device__ __forceinline__ float warpReduceSum(float val) {
+    for (int offset = 16; offset > 0; offset /= 2) {
+        val += __shfl_down_sync(0xffffffff, val, offset);
+    }
+    return val;
+}
+__device__ __forceinline__ float blockReduceSum(float val) {
+    __shared__ float shared[32];
+    int lane = threadIdx.x % 32;
+    int wid = threadIdx.x / 32;
+    val = warpReduceSum(val);
+    if (lane == 0) shared[wid] = val;
+    __syncthreads();
+    val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0f;
+    if (wid == 0) val = warpReduceSum(val);
+    return val;
+}
+// ============================================================================
+// Kernel 6: Layer normalization
+// ============================================================================
+__global__ void layerNorm(const float* __restrict__ input,
+                          const float* __restrict__ gamma,
+                          const float* __restrict__ beta,
+                          float* __restrict__ output,
+                          int N, float eps) {
+    int row = blockIdx.x;
+    const float* rowInput = input + row * N;
+    float* rowOutput = output + row * N;
+    float sum = 0.0f;
+    for (int i = threadIdx.x; i < N; i += blockDim.x) {
+        sum += rowInput[i];
+    }
+    sum = blockReduceSum(sum);
+    __shared__ float s_mean, s_var;
+    if (threadIdx.x == 0) {
+        s_mean = sum / N;
+    }
+    __syncthreads();
+    float var = 0.0f;
+    for (int i = threadIdx.x; i < N; i += blockDim.x) {
+        float diff = rowInput[i] - s_mean;
+        var += diff * diff;
+    }
+    var = blockReduceSum(var);
+    if (threadIdx.x == 0) {
+        s_var = rsqrtf(var / N + eps);
+    }
+    __syncthreads();
+    for (int i = threadIdx.x; i < N; i += blockDim.x) {
+        float normalized = (rowInput[i] - s_mean) * s_var;
+        rowOutput[i] = fmaf(normalized, gamma[i], beta[i]);
+    }
+}
+"""
+# ============================================================================
+# Benchmark Results
+# ============================================================================
+class BenchmarkResult(NamedTuple):
+    """Result of a single benchmark run."""
+    kernel_name: str
+    kernel_lines: int
+    compile_time_ms: int
+    success: bool
+    ptx_lines: int | None
+    sass_lines: int | None
+    error: str | None
+def count_lines(code: str) -> int:
+    """Count non-empty lines in code."""
+    return len([line for line in code.split('\n') if line.strip()])
+def run_benchmark(
+    kernel_name: str,
+    kernel_code: str,
+    arch: str = "sm_90a",
+    output_formats: list[str] | None = None,
+    num_runs: int = 3,
+) -> list[BenchmarkResult]:
+    """Run benchmark for a kernel.
+    Args:
+        kernel_name: Name of the kernel for reporting
+        kernel_code: CUDA source code
+        arch: Target architecture
+        output_formats: Output formats to request (default: ["ptx", "sass"])
+        num_runs: Number of benchmark runs
+    Returns:
+        List of BenchmarkResult for each run
+    """
+    import modal
+    if output_formats is None:
+        output_formats = ["ptx", "sass"]
+    # Get the deployed function
+    compile_fn = modal.Function.from_name("cuda-compile", "compile_cuda")
+    kernel_lines = count_lines(kernel_code)
+    results: list[BenchmarkResult] = []
+    for run in range(num_runs):
+        print(f"  Run {run + 1}/{num_runs}...", end=" ", flush=True)
+        start_time = time.time()
+        try:
+            result = compile_fn.remote({
+                "files": {"kernel.cu": kernel_code},
+                "arch": arch,
+                "flags": ["-O3", "-lineinfo"],
+                "output": output_formats,
+            })
+            elapsed_ms = int((time.time() - start_time) * 1000)
+            if result["success"]:
+                ptx_lines = count_lines(result["ptx"]) if result.get("ptx") else None
+                sass_lines = count_lines(result["sass"]) if result.get("sass") else None
+                results.append(BenchmarkResult(
+                    kernel_name=kernel_name,
+                    kernel_lines=kernel_lines,
+                    compile_time_ms=elapsed_ms,
+                    success=True,
+                    ptx_lines=ptx_lines,
+                    sass_lines=sass_lines,
+                    error=None,
+                ))
+                print(f"{elapsed_ms}ms")
+            else:
+                results.append(BenchmarkResult(
+                    kernel_name=kernel_name,
+                    kernel_lines=kernel_lines,
+                    compile_time_ms=elapsed_ms,
+                    success=False,
+                    ptx_lines=None,
+                    sass_lines=None,
+                    error=result.get("stderr", "Unknown error"),
+                ))
+                print(f"FAILED ({elapsed_ms}ms)")
+        except Exception as e:
+            elapsed_ms = int((time.time() - start_time) * 1000)
+            results.append(BenchmarkResult(
+                kernel_name=kernel_name,
+                kernel_lines=kernel_lines,
+                compile_time_ms=elapsed_ms,
+                success=False,
+                ptx_lines=None,
+                sass_lines=None,
+                error=str(e),
+            ))
+            print(f"ERROR: {e}")
+    return results
+def print_summary(results: list[BenchmarkResult]) -> None:
+    """Print benchmark summary."""
+    successful = [r for r in results if r.success]
+    if not successful:
+        print("\n  No successful runs!")
+        if results:
+            print(f"  Error: {results[0].error}")
+        return
+    times = [r.compile_time_ms for r in successful]
+    mean_time = statistics.mean(times)
+    if len(times) > 1:
+        stdev = statistics.stdev(times)
+        min_time = min(times)
+        max_time = max(times)
+        print(f"\n  Results: {mean_time:.0f}ms avg (min: {min_time}ms, max: {max_time}ms, stdev: {stdev:.0f}ms)")
+    else:
+        print(f"\n  Results: {mean_time:.0f}ms")
+    # Show output sizes
+    if successful[0].ptx_lines:
+        print(f"  PTX output: {successful[0].ptx_lines} lines")
+    if successful[0].sass_lines:
+        print(f"  SASS output: {successful[0].sass_lines} lines")
+def run_all_benchmarks(num_runs: int = 3) -> dict[str, list[BenchmarkResult]]:
+    """Run benchmarks for all kernel sizes."""
+    print("=" * 60)
+    print("CUDA Compilation Benchmark")
+    print("=" * 60)
+    kernels = [
+        ("simple", SIMPLE_KERNEL),
+        ("medium", MEDIUM_KERNEL),
+        ("complex", COMPLEX_KERNEL),
+    ]
+    all_results: dict[str, list[BenchmarkResult]] = {}
+    for name, code in kernels:
+        lines = count_lines(code)
+        print(f"\n{name.upper()} KERNEL ({lines} lines)")
+        print("-" * 40)
+        results = run_benchmark(name, code, num_runs=num_runs)
+        all_results[name] = results
+        print_summary(results)
+    # Print final summary
+    print("\n" + "=" * 60)
+    print("SUMMARY")
+    print("=" * 60)
+    for name in ["simple", "medium", "complex"]:
+        results = all_results.get(name, [])
+        successful = [r for r in results if r.success]
+        if successful:
+            avg_time = statistics.mean([r.compile_time_ms for r in successful])
+            print(f"  {name:10s}: {avg_time:6.0f}ms ({results[0].kernel_lines} lines)")
+        else:
+            print(f"  {name:10s}: FAILED")
+    return all_results
+def main() -> None:
+    """Main entry point."""
+    parser = argparse.ArgumentParser(description="Benchmark CUDA compilation")
+    parser.add_argument(
+        "--kernel",
+        choices=["simple", "medium", "complex", "all"],
+        default="all",
+        help="Which kernel to benchmark",
+    )
+    parser.add_argument(
+        "--runs",
+        type=int,
+        default=3,
+        help="Number of benchmark runs per kernel",
+    )
+    parser.add_argument(
+        "--arch",
+        default="sm_90a",
+        help="Target GPU architecture",
+    )
+    args = parser.parse_args()
+    if args.kernel == "all":
+        run_all_benchmarks(num_runs=args.runs)
+    else:
+        kernel_map = {
+            "simple": SIMPLE_KERNEL,
+            "medium": MEDIUM_KERNEL,
+            "complex": COMPLEX_KERNEL,
+        }
+        code = kernel_map[args.kernel]
+        lines = count_lines(code)
+        print(f"\n{args.kernel.upper()} KERNEL ({lines} lines)")
+        print("-" * 40)
+        results = run_benchmark(
+            args.kernel,
+            code,
+            arch=args.arch,
+            num_runs=args.runs,
+        )
+        print_summary(results)
+if __name__ == "__main__":
+    main()

wafer-core 0.1.38__py3-none-any.whl → 0.1.40__py3-none-any.whl

wafer-core 0.1.38py3-none-any.whl → 0.1.40py3-none-any.whl