wafer-core 0.1.27 → 0.1.29 (py3-none-any.whl)

@@ -214,21 +214,28 @@ def align_kernels_within_layer(
     # The platform that HAS the kernel IS fusing; the other runs components separately
     is_fused_op = "+" in op_str
 
+    # Operations that can't be "fused away" - absence means alignment issue, not fusion
+    non_fusable_ops = {
+        "Attention (Prefill)", "Attention (Decode)", "Dense GEMM",
+        "KV Cache", "MoE GEMM", "MoE Routing"
+    }
+    is_non_fusable = op_str in non_fusable_ops
+
     fusion_note = None
     if amd_count > 0 and nvidia_count == 0:
         if is_fused_op:
             # AMD has a fused kernel like "RMSNorm+GEMM" → AMD IS fusing
             fusion_note = f"AMD fuses {op_str} into {amd_kernel_name}"
-        else:
-            # AMD has a regular kernel that NVIDIA doesn't need → NVIDIA fuses it elsewhere
-            fusion_note = f"AMD runs {amd_kernel_name}, NVIDIA fuses into another kernel"
+        elif not is_non_fusable:
+            # Only mark as fusion for ops that can legitimately be fused
+            fusion_note = f"AMD runs {amd_kernel_name}, NVIDIA may fuse into another kernel"
     elif amd_count == 0 and nvidia_count > 0:
         if is_fused_op:
             # NVIDIA has a fused kernel → NVIDIA IS fusing
             fusion_note = f"NVIDIA fuses {op_str} into {nvidia_kernel_name}"
-        else:
-            # NVIDIA has a regular kernel that AMD doesn't need → AMD fuses it elsewhere
-            fusion_note = f"NVIDIA runs {nvidia_kernel_name}, AMD fuses into another kernel"
+        elif not is_non_fusable:
+            # Only mark as fusion for ops that can legitimately be fused
+            fusion_note = f"NVIDIA runs {nvidia_kernel_name}, AMD may fuse into another kernel"
     elif amd_count > nvidia_count * 1.5 and nvidia_count > 0:
         # AMD runs more kernels = NVIDIA is fusing some
         fusion_note = f"AMD runs {amd_kernel_name} {amd_count / nvidia_count:.1f}x more → NVIDIA fuses"
@@ -429,13 +429,22 @@ def analyze_traces_aligned(
         "kernel_pairs": kernel_pairs,
     })
 
-    fusion_result = analyze_fusion_from_alignment(alignment.layer_alignments)
-    same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)
-
+    # Determine which trace is AMD vs NVIDIA for fusion analysis
     if trace1.platform == "AMD":
         amd_trace, nvidia_trace = trace1, trace2
+        fusion_amd_kernels = amd_kernels
+        fusion_nvidia_kernels = nvidia_kernels
     else:
         amd_trace, nvidia_trace = trace2, trace1
+        fusion_amd_kernels = nvidia_kernels
+        fusion_nvidia_kernels = amd_kernels
+
+    fusion_result = analyze_fusion_from_alignment(
+        alignment.layer_alignments,
+        amd_kernels=fusion_amd_kernels,
+        nvidia_kernels=fusion_nvidia_kernels,
+    )
+    same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)
 
     return {
         "metadata": {
@@ -24,12 +24,16 @@ class Op(Enum):
 
     ATTN_PREFILL = "Attention (Prefill)"
     ATTN_DECODE = "Attention (Decode)"
+    # NVIDIA Flash Attention fuses QKV projection + Softmax + Attention
+    FLASH_ATTN_FUSED = "FlashAttention (QKV+Softmax+Attn)"
     KV_CACHE = "KV Cache"
     MOE_ROUTING = "MoE Routing"
     MOE_GEMM = "MoE GEMM"
     MOE_GEMM_SWIGLU = "MoE GEMM+SwiGLU"
     MOE_FINALIZE = "MoE Finalize"
     DENSE_GEMM = "Dense GEMM"
+    # NVIDIA cuBLASLt/CUTLASS can fuse GEMM with epilogue (bias + activation)
+    GEMM_BIAS_ACT = "GEMM+Bias+Activation"
     RMSNORM = "RMSNorm"
     RMSNORM_GEMM = "RMSNorm+GEMM"
     SWIGLU = "SwiGLU"
@@ -274,16 +278,20 @@ def classify(name: str, platform: str) -> tuple[Op, str]:
         if "3d" in nl:
             return Op.ATTN_DECODE, "kernel_unified_attention_3d"
         else:
-            # NVIDIA uses fmhaSm100 with 'a' (prefill/context) and 'f' (decode/forgen)
-            if "fmhasm100a" in nl or "context" in nl:
-                return Op.ATTN_PREFILL, "fmhaSm100a*_Context"
-            if "fmhasm100f" in nl or "forgen" in nl:
-                return Op.ATTN_DECODE, "fmhaSm100f*_ForGen"
+            # NVIDIA Flash Attention (fmhaSm100*) is a fused kernel
+            # It fuses QKV projection + Softmax + Attention into one kernel
+            if "fmhasm100" in nl:
+                if "fmhasm100a" in nl or "context" in nl:
+                    return Op.FLASH_ATTN_FUSED, "fmhaSm100a*_Context (QKV+Softmax+Attn)"
+                if "fmhasm100f" in nl or "forgen" in nl:
+                    return Op.FLASH_ATTN_FUSED, "fmhaSm100f*_ForGen (QKV+Softmax+Attn)"
+                return Op.FLASH_ATTN_FUSED, "fmhaSm100* (QKV+Softmax+Attn)"
             return Op.ATTN_PREFILL, name[:40]
 
-    # Flash Attention variants (vLLM)
+    # Flash Attention variants (vLLM) - these are fused on NVIDIA
     if "flash::flash_fwd_kernel" in name or "flash_fwd" in nl:
-        # Could distinguish prefill/decode if needed, defaulting to prefill
+        if platform != "AMD":
+            return Op.FLASH_ATTN_FUSED, "flash::flash_fwd_kernel (QKV+Softmax+Attn)"
         return Op.ATTN_PREFILL, "flash::flash_fwd_kernel"
 
     if "reshape_and_cache" in nl:
@@ -306,9 +314,10 @@ def classify(name: str, platform: str) -> tuple[Op, str]:
     if "moe_sum" in nl:
         return Op.MOE_FINALIZE, "vllm::moe::moe_sum_*"
 
-    # vLLM act_and_mul (can be mangled C++ name)
+    # vLLM act_and_mul - fuses activation with element-wise multiply (SiLU * x)
+    # This is a fused operation used in SwiGLU/MoE
    if "vllm::act_and_mul_kernel" in name or ("act_and_mul_kernel" in nl and "vllm" in nl):
-        return Op.MOE_GEMM_SWIGLU, "vllm::act_and_mul_kernel"
+        return Op.SWIGLU_GEMM, "vllm::act_and_mul_kernel (SwiGLU+Mul)"
 
     if "_matmul_ogs_" in nl:
         if "swiglu" in nl: