wafer-core 0.1.26__py3-none-any.whl → 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -2,9 +2,21 @@
 
 Classifies GPU kernels into operation categories (attention, GEMM, normalization, etc.)
 based on kernel name patterns and platform-specific conventions.
+
+Can optionally load patterns from kernel_registry.yaml, but falls back to hardcoded patterns
+for comprehensive coverage.
 """
 
+import fnmatch
 from enum import Enum
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+
+try:
+    import yaml
+except ImportError:
+    yaml = None
 
 
 class Op(Enum):
@@ -20,16 +32,211 @@ class Op(Enum):
     DENSE_GEMM = "Dense GEMM"
     RMSNORM = "RMSNorm"
     RMSNORM_GEMM = "RMSNorm+GEMM"
+    SWIGLU = "SwiGLU"
+    SWIGLU_GEMM = "SwiGLU+GEMM"
+    EMBEDDING_RMSNORM_GEMM = "Embedding+RMSNorm+GEMM"
+    SOFTMAX = "SoftMax"
     TRITON_FUSED = "Triton Fused"
     ELEMENTWISE = "Elementwise"
     SORTING = "Sorting"
     REDUCE = "Reduce"
+    INDEXING = "Indexing"
     COPY_MEMORY = "Copy/Memory"
+    FUSED_UNKNOWN = "Fused (Unknown)"  # Heuristically detected fusion
     OTHER = "Other"
 
 
+# Keywords that indicate specific operations - used for heuristic fusion detection
+FUSION_KEYWORDS: dict[str, str] = {
+    # Normalization
+    "rsqrt": "Norm",
+    "rmsnorm": "RMSNorm",
+    "layernorm": "LayerNorm",
+    # GEMM
+    "gemm": "GEMM",
+    "matmul": "GEMM",
+    "mm_": "GEMM",
+    # Activations
+    "silu": "SiLU",
+    "swiglu": "SwiGLU",
+    "gelu": "GELU",
+    "relu": "ReLU",
+    # Other ops
+    "softmax": "Softmax",
+    "attention": "Attention",
+    "embedding": "Embedding",
+    "reduce": "Reduce",
+}
+
+
+# Load kernel registry YAML if available
+_KERNEL_REGISTRY: dict[str, Any] | None = None
+
+
+def _load_kernel_registry() -> dict[str, Any] | None:
+    """Load kernel pattern registry from YAML file."""
+    global _KERNEL_REGISTRY
+
+    if _KERNEL_REGISTRY is not None:
+        return _KERNEL_REGISTRY
+
+    if yaml is None:
+        return None
+
+    registry_path = Path(__file__).parent / "kernel_registry.yaml"
+    if not registry_path.exists():
+        return None
+
+    try:
+        with open(registry_path) as f:
+            _KERNEL_REGISTRY = yaml.safe_load(f)
+        return _KERNEL_REGISTRY
+    except Exception:
+        return None
+
+
+def _match_registry_pattern(name: str, category: str, platform: str) -> tuple[Op, str] | None:
+    """Try to match kernel name against registry patterns.
+
+    Returns (Op, pattern_name) if match found, None otherwise.
+    """
+    registry = _load_kernel_registry()
+    if not registry:
+        return None
+
+    nl = name.lower()
+    category_data = registry.get(category, {})
+
+    platform_key = "amd" if platform.lower() == "amd" else "nvidia"
+    patterns = category_data.get(platform_key, [])
+    both_patterns = category_data.get("both", [])
+    patterns = patterns + both_patterns
+
+    for pattern_entry in patterns:
+        pattern = pattern_entry.get("pattern", "")
+        if not pattern:
+            continue
+
+        if fnmatch.fnmatch(name, pattern) or fnmatch.fnmatch(nl, pattern.lower()):
+            if category == "attention":
+                phase = pattern_entry.get("phase")
+                if phase == "prefill":
+                    return Op.ATTN_PREFILL, pattern
+                elif phase == "decode":
+                    return Op.ATTN_DECODE, pattern
+                return Op.ATTN_PREFILL, pattern
+            elif category == "gemm":
+                return Op.DENSE_GEMM, pattern
+            elif category == "rmsnorm":
+                # Detect fused operations (AMD Triton often fuses RMSNorm with GEMM)
+                has_gemm = "gemm" in nl or "unquantized_gemm" in nl
+                has_embedding = "embedding" in nl
+                if has_gemm and has_embedding:
+                    # Embedding + RMSNorm + GEMM all fused together
+                    return Op.EMBEDDING_RMSNORM_GEMM, pattern
+                elif has_gemm:
+                    # RMSNorm + GEMM fused
+                    return Op.RMSNORM_GEMM, pattern
+                return Op.RMSNORM, pattern
+            elif category == "moe":
+                # Distinguish between MoE sub-operations
+                if "swiglu" in nl:
+                    return Op.MOE_GEMM_SWIGLU, pattern
+                if any(x in nl for x in ["routing", "topk", "align_block", "count_and_sort", "gating"]):
+                    return Op.MOE_ROUTING, pattern
+                if "finalize" in nl or "scatter" in nl:
+                    return Op.MOE_FINALIZE, pattern
+                return Op.MOE_GEMM, pattern
+            elif category == "kv_cache":
+                return Op.KV_CACHE, pattern
+            elif category == "softmax":
+                return Op.SOFTMAX, pattern
+            elif category == "reduce":
+                return Op.REDUCE, pattern
+            elif category == "sorting":
+                return Op.SORTING, pattern
+            elif category == "memory":
+                return Op.COPY_MEMORY, pattern
+            elif category == "indexing":
+                return Op.INDEXING, pattern
+            elif category == "elementwise":
+                return Op.ELEMENTWISE, pattern
+            elif category == "triton":
+                return Op.TRITON_FUSED, pattern
+            elif category == "activation":
+                # Check for fused SwiGLU+GEMM (AMD Triton)
+                has_gemm = "gemm" in nl or "unquantized_gemm" in nl
+                has_silu = "silu" in nl or "swiglu" in nl
+                if has_gemm and has_silu:
+                    return Op.SWIGLU_GEMM, pattern
+                elif has_silu:
+                    return Op.SWIGLU, pattern
+                # Other activations (GELU, etc.)
+                return Op.TRITON_FUSED, pattern
+
+    return None
+
+
+def _detect_heuristic_fusion(name: str) -> tuple[Op, str] | None:
+    """Heuristically detect potential fusions based on multiple operation keywords.
+
+    This is a fallback for kernels we haven't explicitly classified.
+    If a kernel name contains 2+ distinct operation keywords, it's likely fused.
+
+    Returns (Op.FUSED_UNKNOWN, "Component1+Component2+...") if suspected fusion.
+    The pattern name contains the fused components for display.
+    """
+    nl = name.lower()
+
+    # Only check Triton kernels - these are most likely to be fused
+    if "triton" not in nl:
+        return None
+
+    # Find all operation keywords present in the name
+    # Use ordered list to maintain consistent ordering
+    found_ops: list[str] = []
+    keyword_priority = [
+        # Order matters - more specific first
+        ("embedding", "Embedding"),
+        ("rmsnorm", "RMSNorm"),
+        ("layernorm", "LayerNorm"),
+        ("rsqrt", "Norm"),  # Generic norm indicator
+        ("swiglu", "SwiGLU"),
+        ("silu", "SiLU"),
+        ("gelu", "GELU"),
+        ("relu", "ReLU"),
+        ("gemm", "GEMM"),
+        ("matmul", "GEMM"),
+        ("mm_", "GEMM"),
+        ("softmax", "Softmax"),
+        ("attention", "Attention"),
+        ("reduce", "Reduce"),
+    ]
+
+    for keyword, op_name in keyword_priority:
+        if keyword in nl and op_name not in found_ops:
+            # Avoid duplicates like "RMSNorm" and "Norm"
+            if op_name == "Norm" and any(n in found_ops for n in ["RMSNorm", "LayerNorm"]):
+                continue
+            # Avoid duplicates like "SwiGLU" and "SiLU"
+            if op_name == "SiLU" and "SwiGLU" in found_ops:
+                continue
+            found_ops.append(op_name)
+
+    # If 2+ operations detected, it's likely a fusion
+    if len(found_ops) >= 2:
+        fused_name = "+".join(found_ops)
+        # The pattern name IS the fused operation name for display
+        return Op.FUSED_UNKNOWN, fused_name
+
+    return None
+
+
+@lru_cache(maxsize=4096)
 def classify(name: str, platform: str) -> tuple[Op, str]:
     """Classify kernel by operation type.
+
+    Cached because PyTorch traces have ~48 unique kernel names repeated 810k times.
 
     Args:
         name: Kernel name from trace
@@ -39,8 +246,27 @@ def classify(name: str, platform: str) -> tuple[Op, str]:
         Tuple of (operation type, pattern name)
     """
     nl = name.lower()
-
-    # Attention
+
+    # Check registry patterns first (order matters - more specific categories first)
+    registry_categories = [
+        "attention",  # Attention ops (prefill/decode)
+        "gemm",  # Dense GEMM
+        "rmsnorm",  # RMSNorm
+        "moe",  # MoE operations
+        "kv_cache",  # KV Cache
+        "activation",  # SwiGLU, SiLU, GELU
+        "softmax",  # Softmax
+        "reduce",  # Reduce/Scan
+        "sorting",  # Sorting
+        "memory",  # Memory/Copy
+        "indexing",  # Index/Scatter-Gather
+        "triton",  # Triton fused ops
+        "elementwise",  # Elementwise (last - most generic)
+    ]
+    for category in registry_categories:
+        result = _match_registry_pattern(name, category, platform)
+        if result:
+            return result
     if "attention" in nl or "fmha" in nl:
         if platform == "AMD":
             if "2d" in nl:
@@ -55,10 +281,35 @@ def classify(name: str, platform: str) -> tuple[Op, str]:
             return Op.ATTN_DECODE, "fmhaSm100f*_ForGen"
         return Op.ATTN_PREFILL, name[:40]
 
+    # Flash Attention variants (vLLM)
+    if "flash::flash_fwd_kernel" in name or "flash_fwd" in nl:
+        # Could distinguish prefill/decode if needed, defaulting to prefill
+        return Op.ATTN_PREFILL, "flash::flash_fwd_kernel"
+
     if "reshape_and_cache" in nl:
         return Op.KV_CACHE, "reshape_and_cache_*"
 
+    # KV Cache variants (vLLM)
+    if "concat_and_cache" in nl or "cache_mla" in nl:
+        return Op.KV_CACHE, "vllm::concat_and_cache_*"
+
     # MoE
+    # vLLM MoE kernels - these are very common in MoE models
+    if "fused_moe_kernel" in nl:
+        return Op.MOE_GEMM, "fused_moe_kernel"
+
+    if "vllm::moe::" in name:
+        if "moe_align_block_size" in nl:
+            if "small_batch" in nl:
+                return Op.MOE_ROUTING, "vllm::moe::moe_align_block_size_small_batch_*"
+            return Op.MOE_ROUTING, "vllm::moe::moe_align_block_size_*"
+        if "moe_sum" in nl:
+            return Op.MOE_FINALIZE, "vllm::moe::moe_sum_*"
+
+    # vLLM act_and_mul (can be mangled C++ name)
+    if "vllm::act_and_mul_kernel" in name or ("act_and_mul_kernel" in nl and "vllm" in nl):
+        return Op.MOE_GEMM_SWIGLU, "vllm::act_and_mul_kernel"
+
     if "_matmul_ogs_" in nl:
         if "swiglu" in nl:
             return Op.MOE_GEMM_SWIGLU, "_matmul_ogs_*_swiglu"
@@ -69,8 +320,13 @@ def classify(name: str, platform: str) -> tuple[Op, str]:
             return Op.MOE_GEMM_SWIGLU, "bmm_*_swiGlu_dynBatch"
         return Op.MOE_GEMM, "bmm_*_dynBatch"
 
+    # Generic MoE routing patterns (check before finalize)
     if any(x in nl for x in ["topk", "routing", "bitmatrix", "moe_forward", "_combined_routing"]):
+        if "moe::dev::routing::" in name or "moe::" in name:
+            return Op.MOE_ROUTING, "moe::dev::routing::*"
         return Op.MOE_ROUTING, "moe_routing_*"
+
+    # MoE finalize patterns
     if "finalize" in nl or ("scatter" in nl and "moe" in nl):
         return Op.MOE_FINALIZE, "moe_finalize_*"
 
@@ -87,6 +343,18 @@ def classify(name: str, platform: str) -> tuple[Op, str]:
         return Op.DENSE_GEMM, "nvjet_* (cuBLASLt)"
     if "wvsplitk" in nl or name.startswith("void wvSplitK"):
         return Op.DENSE_GEMM, "wvSplitK_* (hipBLASLt)"
+
+    # CUTLASS GEMM variants
+    if "cutlass" in nl and ("sgemm" in nl or "gemm" in nl or "cutlass3x" in name.lower()):
+        return Op.DENSE_GEMM, "cutlass*_gemm"
+
+    # GEMV (matrix-vector) operations - treat as GEMM
+    if "gemv" in nl or "gemvx" in nl:
+        return Op.DENSE_GEMM, "gemv*"
+
+    # Generic GEMM patterns
+    if "gemmsn" in nl or name.startswith("void gemmSN"):
+        return Op.DENSE_GEMM, "gemmSN_*"
 
     # Triton fused operations - very common
     if "triton_poi" in nl or "triton_red" in nl or "triton_per" in nl:
@@ -95,9 +363,21 @@ def classify(name: str, platform: str) -> tuple[Op, str]:
             return Op.TRITON_FUSED, "triton_*_silu"
         return Op.TRITON_FUSED, "triton_*"
 
-    # PyTorch native operations
-    if "at::native::" in name:
-        return Op.ELEMENTWISE, "at::native::*"
+    # SoftMax operations
+    if "softmax" in nl:
+        return Op.SOFTMAX, "softmax_*"
+
+    # Reduce operations - catch more patterns
+    if "reduce" in nl:
+        if "reduce_segments" in nl or "devicereduce" in nl:
+            return Op.REDUCE, "reduce_segments"
+        if "reduce_kernel" in nl:
+            return Op.REDUCE, "reduce_kernel"
+        return Op.REDUCE, "reduce_*"
+
+    # Scan operations (library internals - similar to reduce)
+    if "scan" in nl and ("cub::" in name or "rocprim::" in name or "at_cuda_detail::cub::" in name):
+        return Op.REDUCE, "cub/rocprim::scan_*"
 
     # Sorting operations (common in sampling/topk)
     if "sort" in nl or "radixsort" in nl or "merge" in nl:
@@ -106,21 +386,31 @@ def classify(name: str, platform: str) -> tuple[Op, str]:
         else:
             return Op.SORTING, "cub::DeviceRadixSort*"
 
-    # Reduce operations
-    if "reduce" in nl and ("reduce_segments" in nl or "devicereduce" in nl or "devicescan" in nl):
-        if platform == "AMD":
-            return Op.REDUCE, "reduce_segments"
-        else:
-            return Op.REDUCE, "cub::DeviceReduce*"
-
+    # Indexing/Scatter/Gather operations
+    if any(x in nl for x in ["indices", "scatter", "gather", "index_select", "embedding"]):
+        return Op.INDEXING, "index/scatter_*"
+
     # Memory copy operations
     if "copy" in nl or "memcpy" in nl or "_copy_page_indices" in nl:
        return Op.COPY_MEMORY, "copy_*"
 
+    # PyTorch native operations (catch-all for at::native)
+    if "at::native::" in name:
+        # Try to be more specific
+        if "fill" in nl:
+            return Op.ELEMENTWISE, "at::native::fill_*"
+        return Op.ELEMENTWISE, "at::native::*"
+
 
     # ROCm/CUDA library kernels (other)
     if "rocprim::" in name or "cub::" in name:
         return Op.OTHER, "rocprim/cub_*"
+    # Fallback: Heuristic fusion detection for unknown Triton kernels
+    # If a kernel has multiple operation keywords, it's likely fused
+    heuristic_result = _detect_heuristic_fusion(name)
+    if heuristic_result:
+        return heuristic_result
+
     return Op.OTHER, name[:40]
 
 
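Note on the heuristic fallback added above: the snippet below is a simplified, standalone reproduction of the two-keyword rule for illustration only; it is not the package's API, and the keyword list is truncated relative to the real keyword_priority table.

    # Standalone sketch of the two-keyword fusion heuristic (illustrative only).
    KEYWORDS = [("rmsnorm", "RMSNorm"), ("silu", "SiLU"), ("gemm", "GEMM"), ("softmax", "Softmax")]

    def sketch_fusion_label(kernel_name: str) -> str | None:
        nl = kernel_name.lower()
        if "triton" not in nl:  # heuristic only applies to Triton kernels
            return None
        found: list[str] = []
        for keyword, label in KEYWORDS:
            if keyword in nl and label not in found:
                found.append(label)
        # Two or more distinct op keywords -> treat the kernel as fused
        return "+".join(found) if len(found) >= 2 else None

    print(sketch_fusion_label("triton_poi_fused_rmsnorm_gemm_0"))  # -> "RMSNorm+GEMM"

In the real module, such a name would map to Op.FUSED_UNKNOWN with the joined component string as its display pattern.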
@@ -170,7 +460,7 @@ def classify_kernel(name: str) -> str:
         return "Triton_Persistent"
 
     # Reduce operations
-    if "reduce_segments" in nl or "devicereduce" in nl:
+    if "reduce" in nl:
         return "Reduce"
 
     # Sort operations
@@ -180,6 +470,10 @@ def classify_kernel(name: str) -> str:
     # Softmax
     if "softmax" in nl:
         return "Softmax"
+
+    # Indexing/Scatter/Gather
+    if any(x in nl for x in ["indices", "scatter", "gather", "index_select", "embedding"]):
+        return "Indexing"
 
     # Elementwise operations
     if any(x in nl for x in ["elementwise", "unrolled_elementwise"]):
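For context, a minimal usage sketch of the updated classify() follows. The import path is an assumption (the wheel's module layout is not shown in this diff), and the expected result assumes no kernel_registry.yaml is present, so classification falls through to the hardcoded rules.

    # Hypothetical import path; adjust to wherever kernel classification lives in wafer-core.
    from wafer_core.kernel_classify import Op, classify  # assumed module name

    # With no registry YAML loaded, this hits the hardcoded vLLM MoE rule.
    op, pattern = classify("fused_moe_kernel_0d1d2d", platform="NVIDIA")
    print(op, pattern)  # expected: Op.MOE_GEMM fused_moe_kernel

    # Repeated calls with the same (name, platform) are served from the
    # lru_cache(maxsize=4096) added in 0.1.27.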