wafer-core 0.1.26__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer_core/lib/trace_compare/PERFORMANCE.md +148 -0
- wafer_core/lib/trace_compare/__init__.py +22 -9
- wafer_core/lib/trace_compare/aligner.py +369 -0
- wafer_core/lib/trace_compare/analyzer.py +549 -159
- wafer_core/lib/trace_compare/api.py +225 -0
- wafer_core/lib/trace_compare/architecture.py +77 -0
- wafer_core/lib/trace_compare/classifier.py +307 -13
- wafer_core/lib/trace_compare/fusion_analyzer.py +311 -845
- wafer_core/lib/trace_compare/kernel_registry.yaml +349 -0
- wafer_core/lib/trace_compare/layer_segmentation.py +114 -0
- wafer_core/lib/trace_compare/loader.py +526 -227
- wafer_core/lib/trace_compare/same_kernel_analyzer.py +119 -0
- wafer_core/lib/trace_compare/warnings.py +99 -0
- {wafer_core-0.1.26.dist-info → wafer_core-0.1.27.dist-info}/METADATA +3 -1
- {wafer_core-0.1.26.dist-info → wafer_core-0.1.27.dist-info}/RECORD +16 -8
- {wafer_core-0.1.26.dist-info → wafer_core-0.1.27.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
# Kernel Pattern Registry
# Version: 2025-01
# Last updated: 2025-01-28
# Update when: New GPU architecture, new library version, new model architecture

version: "2025-01"

# ============================================================================
# SUPPORTED HARDWARE
# ============================================================================
# NVIDIA:
# - SM100 (Blackwell): B200, B100
# - SM90 (Hopper): H100, H200
# - SM89 (Ada Lovelace): L40, RTX 4090
# - SM80 (Ampere): A100, A10, A30
#
# AMD:
# - CDNA 4 (gfx950): MI355X
# - CDNA 3 (gfx942): MI300X, MI300A, MI325X
# - CDNA 2 (gfx90a): MI250X, MI210
#
# Note: MI325X uses same gfx942 ISA as MI300X but with 256GB HBM3e memory
# ============================================================================
#
# NOTE(review): entries appear to be glob-style patterns ('*' wildcard)
# matched against kernel names, with more specific patterns listed before
# generic fallbacks inside each group — confirm against the consumer
# (classifier.py) before relying on ordering.

attention:
  nvidia:
    # SM100 (Blackwell B200/B100) - 'a' suffix = prefill/context, 'f' suffix = decode/forgen
    - pattern: "fmhaSm100a*"
      hardware: "SM100 (Blackwell)"
      library: "Flash Attention 3"
      phase: prefill
    - pattern: "fmhaSm100f*"
      hardware: "SM100 (Blackwell)"
      library: "Flash Attention 3"
      phase: decode
    # SM90 (Hopper H100/H200) - Flash Attention 2/3
    - pattern: "fmhaSm90*"
      hardware: "SM90 (Hopper)"
      library: "Flash Attention 3"
    - pattern: "flash::flash_fwd_kernel*"
      hardware: "SM90 (Hopper)"
      library: "Flash Attention 2"
      phase: prefill
    - pattern: "flash_fwd_*"
      hardware: "SM90 (Hopper)"
      library: "Flash Attention 2"
    - pattern: "fmha_v2_*flash_attention_forward*"
      hardware: "SM90 (Hopper)"
      library: "Flash Attention 2"
      phase: prefill
    - pattern: "fmha_v2_*"
      hardware: "SM90 (Hopper)"
      library: "Flash Attention 2"
    # SM89 (Ada Lovelace L40/RTX 4090)
    - pattern: "fmhaSm89*"
      hardware: "SM89 (Ada Lovelace)"
      library: "Flash Attention"
    # SM80 (Ampere A100/A10)
    - pattern: "fmhaSm80*"
      hardware: "SM80 (Ampere)"
      library: "Flash Attention"
    - pattern: "fmha_*"
      hardware: "SM80 (Ampere)"
      library: "Flash Attention"
    # Generic phase patterns (fallback)
    - pattern: "*Context*"
      phase: prefill
    - pattern: "*context*"
      phase: prefill
    - pattern: "*ForGen*"
      phase: decode
    - pattern: "*forgen*"
      phase: decode
  amd:
    # CDNA 4 (MI355X - gfx950) - Composable Kernel v2
    - pattern: "*ck_fmha_*"
      hardware: "CDNA 4 (MI355X)"
      library: "Composable Kernel"
    - pattern: "*flash_attn_ck*"
      hardware: "CDNA 4 (MI355X)"
      library: "Composable Kernel"
    # CDNA 3 (MI300X/MI325X - gfx942) - Composable Kernel unified attention
    - pattern: "*unified_attention_2d*"
      hardware: "CDNA 3 (MI300X/MI325X)"
      phase: prefill
      library: "Composable Kernel"
    - pattern: "*unified_attention_3d*"
      hardware: "CDNA 3 (MI300X/MI325X)"
      phase: decode
      library: "Composable Kernel"
    - pattern: "kernel_unified_attention_2d*"
      hardware: "CDNA 3 (MI300X/MI325X)"
      phase: prefill
      library: "Composable Kernel"
    - pattern: "kernel_unified_attention_3d*"
      hardware: "CDNA 3 (MI300X/MI325X)"
      phase: decode
      library: "Composable Kernel"
    - pattern: "attention_2d*"
      phase: prefill
      library: "Composable Kernel"
    - pattern: "attention_3d*"
      phase: decode
      library: "Composable Kernel"
    # Triton Flash Attention (works on all AMD GPUs)
    - pattern: "triton_*flash*"
      library: "Triton Flash Attention"
    - pattern: "triton_*attention*"
      library: "Triton"

gemm:
  nvidia:
    # cuBLASLt (H100/H200 optimized)
    - pattern: "nvjet_*"
      library: "cuBLASLt"
      hardware: "SM90+ (Hopper/Blackwell)"
    - pattern: "void cublasLt*"
      library: "cuBLASLt"
    # CUTLASS (all architectures)
    - pattern: "cutlass*gemm*"
      library: "CUTLASS 3.x"
    - pattern: "cutlass_*"
      library: "CUTLASS"
    # cuBLAS legacy
    - pattern: "cublas*"
      library: "cuBLAS"
    # FP8 GEMM (H100+ specific)
    - pattern: "*fp8*gemm*"
      library: "cuBLASLt FP8"
      hardware: "SM90+ (Hopper)"
    - pattern: "*e4m3*"
      library: "cuBLASLt FP8"
      hardware: "SM90+ (Hopper)"
  amd:
    # Tensile (all CDNA architectures)
    - pattern: "Cijk_*"
      library: "Tensile"
    - pattern: "Custom_Cijk_*"
      library: "Tensile"
    # hipBLASLt (MI300X/MI325X/MI355X optimized)
    - pattern: "wvSplitK*"
      library: "hipBLASLt"
      hardware: "CDNA 3/4 (MI300X/MI325X/MI355X)"
    - pattern: "hipblaslt*"
      library: "hipBLASLt"
    - pattern: "hipblas*"
      library: "hipBLAS"
    # FP8 GEMM (MI300X+ specific)
    - pattern: "*fp8*"
      library: "hipBLASLt FP8"
      hardware: "CDNA 3+ (MI300X/MI325X/MI355X)"
    # CDNA 4 specific (MI355X - gfx950)
    - pattern: "*gfx950*"
      library: "Tensile"
      hardware: "CDNA 4 (MI355X)"
    # ISA-specific patterns (gfx942 = MI300X/MI325X, gfx950 = MI355X)
    - pattern: "*ISA942*"
      library: "Tensile"
      hardware: "CDNA 3 (MI300X/MI325X)"
    - pattern: "*ISA950*"
      library: "Tensile"
      hardware: "CDNA 4 (MI355X)"

ssm:
  both:
    - pattern: "selective_scan*"
      model: "Mamba"
    - pattern: "ssd_*"
      model: "Mamba-2"
    - pattern: "causal_conv1d*"
      model: "Mamba"
    - pattern: "mamba_*"
      model: "Mamba"

rmsnorm:
  both:
    # Fused RMSNorm+GEMM patterns (AMD Triton fuses these)
    # Key indicator: *rocm_unquantized_gemm* in kernel name
    - pattern: "triton_*rocm_unquantized_gemm*rsqrt*"
      library: "Triton"
      fused_with: "GEMM"
    - pattern: "triton_*rsqrt*rocm_unquantized_gemm*"
      library: "Triton"
      fused_with: "GEMM"
    - pattern: "triton_*rsqrt*gemm*"
      library: "Triton"
      fused_with: "GEMM"
    - pattern: "triton_*gemm*rsqrt*"
      library: "Triton"
      fused_with: "GEMM"
    # Non-fused RMSNorm (no gemm in name)
    - pattern: "triton_*rsqrt*"
      library: "Triton"
    - pattern: "*rmsnorm*"
      library: "Various"

moe:
  both:
    - pattern: "_matmul_ogs_*"
      library: "Triton"
    - pattern: "bmm_*dynbatch*"
      library: "Triton"
    - pattern: "*routing*"
      library: "Various"
    - pattern: "*topk*"
      library: "Various"
    - pattern: "fused_moe_kernel*"
      library: "vLLM"
    - pattern: "*vllm::moe::*"
      library: "vLLM"
    - pattern: "*moe_align_block_size*"
      library: "vLLM"
    - pattern: "*count_and_sort_expert*"
      library: "vLLM"
    - pattern: "*topkGatingSoftmax*"
      library: "vLLM"

# Activation functions (SwiGLU, SiLU, etc.)
activation:
  both:
    # Fused SwiGLU+GEMM (AMD Triton fuses these)
    - pattern: "triton_*rocm_unquantized_gemm*silu*"
      operation: "SwiGLU+GEMM"
      library: "Triton"
      fused_with: "GEMM"
    - pattern: "triton_*silu*rocm_unquantized_gemm*"
      operation: "SwiGLU+GEMM"
      library: "Triton"
      fused_with: "GEMM"
    - pattern: "triton_*gemm*silu*"
      operation: "SwiGLU+GEMM"
      library: "Triton"
      fused_with: "GEMM"
    - pattern: "triton_*silu*gemm*"
      operation: "SwiGLU+GEMM"
      library: "Triton"
      fused_with: "GEMM"
    # Non-fused activation
    - pattern: "*act_and_mul_kernel*"
      operation: "SwiGLU"
      library: "vLLM"
    - pattern: "triton_*silu*"
      operation: "SiLU"
      library: "Triton"
    - pattern: "*silu_kernel*"
      operation: "SiLU"
      library: "vLLM"
    - pattern: "*gelu*"
      operation: "GELU"
      library: "Various"

# KV Cache operations
kv_cache:
  both:
    - pattern: "*reshape_and_cache*"
      library: "vLLM"
    - pattern: "*concat_and_cache*"
      library: "vLLM"
    - pattern: "*cache_mla*"
      library: "vLLM"

# Softmax operations
softmax:
  both:
    - pattern: "*SoftMax*"
      library: "PyTorch"
    - pattern: "*softmax*"
      library: "PyTorch"

# Triton fused operations (more specific patterns)
triton:
  both:
    - pattern: "triton_poi_fused_mul*silu*"
      operation: "SwiGLU"
      library: "Triton"
    - pattern: "triton_poi_fused*"
      operation: "Pointwise"
      library: "Triton"
    - pattern: "triton_red_fused*"
      operation: "Reduction"
      library: "Triton"
    - pattern: "triton_per_fused*"
      operation: "Persistent"
      library: "Triton"

# Reduce/Scan operations
reduce:
  nvidia:
    - pattern: "*cub::*Reduce*"
      library: "CUB"
    - pattern: "*cub::*Scan*"
      library: "CUB"
    - pattern: "*splitKreduce*"
      library: "cuBLASLt"
      note: "GEMM epilogue reduction"
  amd:
    - pattern: "*rocprim::*reduce*"
      library: "rocPRIM"
    - pattern: "*rocprim::*scan*"
      library: "rocPRIM"
    - pattern: "reduce_segments*"
      library: "vLLM"

# Sorting operations
sorting:
  nvidia:
    - pattern: "*RadixSort*"
      library: "CUB"
    - pattern: "*DeviceSort*"
      library: "CUB"
  amd:
    - pattern: "*rocprim::*sort*"
      library: "rocPRIM"
    - pattern: "*rocprim::*merge*"
      library: "rocPRIM"

# Memory/Copy operations
memory:
  both:
    - pattern: "*memcpy*"
      library: "CUDA/HIP Runtime"
    - pattern: "*direct_copy*"
      library: "PyTorch"
    - pattern: "*copy_page_indices*"
      library: "vLLM"
    - pattern: "*rocclr_copyBuffer*"
      library: "AMD ROCclr"
    - pattern: "*rocprim::*transform*"
      library: "rocPRIM"

# Indexing/Scatter-Gather operations
indexing:
  both:
    - pattern: "*scatter_gather*"
      library: "PyTorch"
    - pattern: "*index_elementwise*"
      library: "PyTorch"
    - pattern: "*fill_reverse_indices*"
      library: "PyTorch"

# Elementwise operations (fallback patterns)
elementwise:
  both:
    - pattern: "at::native::*elementwise*"
      library: "PyTorch"
    - pattern: "at::native::*vectorized*"
      library: "PyTorch"
    - pattern: "*distribution_elementwise*"
      library: "PyTorch"
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Layer segmentation based on architecture type.
|
|
2
|
+
|
|
3
|
+
Segments kernels into transformer layers based on architecture-specific markers
|
|
4
|
+
(e.g., attention kernels for transformers, SSM scan kernels for Mamba).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import bisect
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from .architecture import ArchitectureType
|
|
11
|
+
from .warnings import TraceWarning
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def segment_layers_by_architecture(
    kernels: list[dict[str, Any]],
    architecture: ArchitectureType,
) -> tuple[dict[int, list[dict[str, Any]]], list[TraceWarning]]:
    """Group kernel events into per-layer buckets using architecture markers.

    A "marker" kernel signals the start of a new layer: prefill/context
    attention kernels for transformers, scan/conv kernels for SSM models.
    Every kernel whose timestamp falls between two consecutive markers is
    assigned to the earlier marker's layer.

    Args:
        kernels: List of kernel events with 'name', 'ts', and other fields
        architecture: Detected architecture type

    Returns:
        Tuple of (layer_mapping, warnings)
            layer_mapping: Dict mapping layer_num -> list of kernel events
            warnings: List of warnings if segmentation fails
    """
    collected: list[TraceWarning] = []

    # Hybrid and unknown architectures have no single marker kernel to anchor
    # layer boundaries on, so bail out early with an explanatory warning.
    if architecture == ArchitectureType.HYBRID:
        collected.append(
            TraceWarning(
                code="HYBRID_ARCHITECTURE",
                severity="info",
                message="Hybrid architecture detected (both attention and SSM kernels). Layer segmentation unavailable.",
                suggestion="Hybrid models require custom segmentation logic. Layer analysis will be skipped.",
            )
        )
        return {}, collected

    if architecture == ArchitectureType.UNKNOWN:
        collected.append(
            TraceWarning(
                code="UNKNOWN_ARCHITECTURE",
                severity="warning",
                message="Cannot determine model architecture. Layer segmentation unavailable.",
                suggestion="Ensure trace contains recognizable kernel patterns (attention, SSM, etc.).",
            )
        )
        return {}, collected

    # Collect (timestamp, name) pairs for every layer-start marker kernel.
    markers: list[tuple[int, str]] = []
    for event in kernels:
        lowered = event.get("name", "").lower()
        if architecture == ArchitectureType.TRANSFORMER:
            looks_like_attention = any(
                tag in lowered for tag in ("fmha", "attention", "flash")
            )
            # Prefill/context variants mark the start of a layer; decode
            # kernels repeat per token and would over-segment.
            starts_layer = (
                "context" in lowered
                or "2d" in lowered
                or "fmhasm100a" in lowered
            )
            if looks_like_attention and starts_layer:
                markers.append((event.get("ts", 0), event.get("name", "")))
        elif architecture == ArchitectureType.SSM:
            if any(tag in lowered for tag in ("selective_scan", "mamba", "ssd")):
                markers.append((event.get("ts", 0), event.get("name", "")))

    if not markers:
        collected.append(
            TraceWarning(
                code="NO_LAYER_MARKERS",
                severity="warning",
                message=f"No layer marker kernels found for {architecture.value} architecture.",
                suggestion="Ensure trace contains expected kernel patterns for this architecture type.",
            )
        )
        return {}, collected

    markers.sort(key=lambda marker: marker[0])

    # Timestamp-ordered view of all kernels so each layer window resolves
    # with binary search instead of a linear scan per marker.
    ordered = sorted(kernels, key=lambda event: event.get("ts", 0))
    timestamps = [event.get("ts", 0) for event in ordered]

    layers: dict[int, list[dict[str, Any]]] = {}
    for index, (window_start, _) in enumerate(markers):
        start = bisect.bisect_left(timestamps, window_start)
        if index + 1 < len(markers):
            # Window ends where the next marker begins (exclusive).
            stop = bisect.bisect_left(timestamps, markers[index + 1][0])
        else:
            # Last marker owns everything to the end of the trace.
            stop = len(ordered)
        members = ordered[start:stop]
        if members:
            layers[index] = members

    # A large spread in per-layer kernel counts usually means the marker
    # heuristic mis-fired somewhere; surface that as an informational note.
    if layers:
        counts = [len(group) for group in layers.values()]
        if counts:
            mean = sum(counts) / len(counts)
            if any(abs(count - mean) / mean > 0.3 for count in counts):
                collected.append(
                    TraceWarning(
                        code="LAYER_SIZE_VARIANCE",
                        severity="info",
                        message="Layer kernel counts vary significantly. Segmentation may be inaccurate.",
                        suggestion="This is normal for models with varying layer sizes or non-uniform workloads.",
                    )
                )

    return layers, collected
|