wafer-core 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -145,7 +145,17 @@ def analyze_traces_from_loaded(
  trace2_total = trace2_agg["total_us"] / 1000
  trace1_count = int(trace1_agg["count"])
  trace2_count = int(trace2_agg["count"])
- ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
+ # Speedup: ratio of total times (not per-call averages)
+ # Shows how many times faster/slower trace1 is compared to trace2
+ # > 1.0 means trace1 is slower, < 1.0 means trace1 is faster
+ # Using total time instead of avg time per call because operations may have
+ # vastly different call counts (e.g., fused vs unfused operations)
+ if trace2_total > 0:
+     ratio = trace1_total / trace2_total
+ elif trace1_total > 0:
+     ratio = float("inf") # trace2 has no time, trace1 is infinitely slower
+ else:
+     ratio = 1.0 # Both are zero
  gap_ms = trace1_total - trace2_total

  trace1_pattern = list(
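The replaced one-liner compared per-call averages; the new branches compare totals and handle zero denominators explicitly. As a self-contained sketch (the standalone function name is mine, not the package's), the added logic, which reappears below in analyze_traces_aligned, behaves like:

def total_time_ratio(trace1_total: float, trace2_total: float) -> float:
    """Ratio of trace1's total time to trace2's; > 1.0 means trace1 is slower."""
    if trace2_total > 0:
        return trace1_total / trace2_total
    if trace1_total > 0:
        return float("inf")  # trace2 recorded no time at all
    return 1.0  # both totals are zero: treat as parity

# total_time_ratio(12.0, 4.0) == 3.0   (trace1 is 3x slower)
# total_time_ratio(0.0, 0.0) == 1.0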
@@ -446,6 +456,11 @@ def analyze_traces_aligned(
  )
  same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)

+ # Note: amd_kernels = trace1's kernels (filtered if phase_filter != "all")
+ # nvidia_kernels = trace2's kernels (filtered if phase_filter != "all")
+ # The variable names are misleading but trace1_* should use amd_kernels,
+ # and trace2_* should use nvidia_kernels to match the filtered kernel counts/totals.
+
  return {
      "metadata": {
          "amd_gpu": amd_trace.gpu_name,
@@ -462,10 +477,10 @@ def analyze_traces_aligned(
  "trace2_platform": trace2.platform,
  "trace2_gpu": trace2.gpu_name,
  "trace2_device": trace2.device_props,
- "trace1_kernels": len(amd_trace.kernel_events),
- "trace2_kernels": len(nvidia_trace.kernel_events),
- "trace1_total_ms": sum(k.get("dur", 0) for k in amd_trace.kernel_events) / 1000,
- "trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_trace.kernel_events) / 1000,
+ "trace1_kernels": len(amd_kernels),
+ "trace2_kernels": len(nvidia_kernels),
+ "trace1_total_ms": sum(k.get("dur", 0) for k in amd_kernels) / 1000,
+ "trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_kernels) / 1000,
  "phase": phase_filter,
  "trace1_layers": alignment.num_layers,
  "trace2_layers": alignment.num_layers,
@@ -579,7 +594,17 @@ def analyze_traces_aligned(
  trace2_total = trace2_agg["total_us"] / 1000
  trace1_count = int(trace1_agg["count"])
  trace2_count = int(trace2_agg["count"])
- ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
+ # Speedup: ratio of total times (not per-call averages)
+ # Shows how many times faster/slower trace1 is compared to trace2
+ # > 1.0 means trace1 is slower, < 1.0 means trace1 is faster
+ # Using total time instead of avg time per call because operations may have
+ # vastly different call counts (e.g., fused vs unfused operations)
+ if trace2_total > 0:
+     ratio = trace1_total / trace2_total
+ elif trace1_total > 0:
+     ratio = float("inf") # trace2 has no time, trace1 is infinitely slower
+ else:
+     ratio = 1.0 # Both are zero
  gap_ms = trace1_total - trace2_total

  trace1_pattern = list(
@@ -24,12 +24,16 @@ class Op(Enum):

  ATTN_PREFILL = "Attention (Prefill)"
  ATTN_DECODE = "Attention (Decode)"
+ # NVIDIA Flash Attention fuses QKV projection + Softmax + Attention
+ FLASH_ATTN_FUSED = "FlashAttention (QKV+Softmax+Attn)"
  KV_CACHE = "KV Cache"
  MOE_ROUTING = "MoE Routing"
  MOE_GEMM = "MoE GEMM"
  MOE_GEMM_SWIGLU = "MoE GEMM+SwiGLU"
  MOE_FINALIZE = "MoE Finalize"
  DENSE_GEMM = "Dense GEMM"
+ # NVIDIA cuBLASLt/CUTLASS can fuse GEMM with epilogue (bias + activation)
+ GEMM_BIAS_ACT = "GEMM+Bias+Activation"
  RMSNORM = "RMSNorm"
  RMSNORM_GEMM = "RMSNorm+GEMM"
  SWIGLU = "SwiGLU"
@@ -274,16 +278,20 @@ def classify(name: str, platform: str) -> tuple[Op, str]:
  if "3d" in nl:
      return Op.ATTN_DECODE, "kernel_unified_attention_3d"
  else:
-     # NVIDIA uses fmhaSm100 with 'a' (prefill/context) and 'f' (decode/forgen)
-     if "fmhasm100a" in nl or "context" in nl:
-         return Op.ATTN_PREFILL, "fmhaSm100a*_Context"
-     if "fmhasm100f" in nl or "forgen" in nl:
-         return Op.ATTN_DECODE, "fmhaSm100f*_ForGen"
+     # NVIDIA Flash Attention (fmhaSm100*) is a fused kernel
+     # It fuses QKV projection + Softmax + Attention into one kernel
+     if "fmhasm100" in nl:
+         if "fmhasm100a" in nl or "context" in nl:
+             return Op.FLASH_ATTN_FUSED, "fmhaSm100a*_Context (QKV+Softmax+Attn)"
+         if "fmhasm100f" in nl or "forgen" in nl:
+             return Op.FLASH_ATTN_FUSED, "fmhaSm100f*_ForGen (QKV+Softmax+Attn)"
+         return Op.FLASH_ATTN_FUSED, "fmhaSm100* (QKV+Softmax+Attn)"
      return Op.ATTN_PREFILL, name[:40]

- # Flash Attention variants (vLLM)
+ # Flash Attention variants (vLLM) - these are fused on NVIDIA
  if "flash::flash_fwd_kernel" in name or "flash_fwd" in nl:
-     # Could distinguish prefill/decode if needed, defaulting to prefill
+     if platform != "AMD":
+         return Op.FLASH_ATTN_FUSED, "flash::flash_fwd_kernel (QKV+Softmax+Attn)"
      return Op.ATTN_PREFILL, "flash::flash_fwd_kernel"

  if "reshape_and_cache" in nl:
@@ -306,9 +314,10 @@ def classify(name: str, platform: str) -> tuple[Op, str]:
  if "moe_sum" in nl:
      return Op.MOE_FINALIZE, "vllm::moe::moe_sum_*"

- # vLLM act_and_mul (can be mangled C++ name)
+ # vLLM act_and_mul - fuses activation with element-wise multiply (SiLU * x)
+ # This is a fused operation used in SwiGLU/MoE
  if "vllm::act_and_mul_kernel" in name or ("act_and_mul_kernel" in nl and "vllm" in nl):
-     return Op.MOE_GEMM_SWIGLU, "vllm::act_and_mul_kernel"
+     return Op.SWIGLU_GEMM, "vllm::act_and_mul_kernel (SwiGLU+Mul)"

  if "_matmul_ogs_" in nl:
      if "swiglu" in nl:
@@ -82,13 +82,37 @@ def _find_fusion_mappings(
  trace1_only = trace1_type_set - trace2_type_set
  trace2_only = trace2_type_set - trace1_type_set

- # For each unique type in trace1, find common sequence patterns
+ # For each unique type in trace1, check if it's a fused operation
+ # If trace1 has a unique kernel type that trace2 doesn't have, trace1 is likely fusing
  for unique_type in trace1_only:
      # Skip "Other" since it's too generic
      if unique_type == "Other":
          continue

-     # Find all occurrences of this type
+     # If the unique type contains '+', it's explicitly a fused kernel
+     # This means trace1 (which has it) is fusing, not trace2
+     if "+" in unique_type:
+         # Parse components from the fused op name
+         components = [c.strip() for c in unique_type.split("+")]
+         indices = [i for i, t in enumerate(trace1_types) if t == unique_type]
+
+         if len(indices) < 5:
+             continue
+
+         mappings.append({
+             "fused_platform": trace1_name,
+             "fused_kernel_type": unique_type,
+             "fused_count": len(indices),
+             "unfused_platform": trace2_name,
+             "unfused_sequence": components,
+             "unfused_count_per_type": {c: trace2_types.count(c) for c in components},
+             "pattern_count": len(indices),
+             "pattern_confidence": 1.0,
+             "evidence": f"{trace1_name} fuses {' + '.join(components)} into single kernel ({len(indices)} calls), {trace2_name} runs separately",
+         })
+         continue
+
+     # For non-fused unique types, find all occurrences
      indices = [i for i, t in enumerate(trace1_types) if t == unique_type]

      if len(indices) < 5: # Need enough samples to be meaningful
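The new branch keys off the '+' that the classifier puts in fused op names (e.g. 'RMSNorm+GEMM'). A toy, self-contained walk-through of that branch with made-up type lists, showing the mapping dict it emits:

trace1_name, trace2_name = "AMD", "NVIDIA"       # as passed from detect_fusion_patterns
trace1_types = ["RMSNorm+GEMM"] * 6              # trace1 runs the fused kernel
trace2_types = ["RMSNorm"] * 6 + ["GEMM"] * 6    # trace2 runs the pieces separately

unique_type = "RMSNorm+GEMM"                     # present only in trace1
components = [c.strip() for c in unique_type.split("+")]            # ["RMSNorm", "GEMM"]
indices = [i for i, t in enumerate(trace1_types) if t == unique_type]

if len(indices) >= 5:  # same minimum-sample guard as the `< 5: continue` above
    mapping = {
        "fused_platform": trace1_name,
        "fused_kernel_type": unique_type,
        "fused_count": len(indices),                                 # 6
        "unfused_platform": trace2_name,
        "unfused_sequence": components,
        "unfused_count_per_type": {c: trace2_types.count(c) for c in components},  # {"RMSNorm": 6, "GEMM": 6}
        "pattern_confidence": 1.0,
    }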
@@ -106,11 +130,12 @@ def _find_fusion_mappings(
      continue
  most_common_before = max(before_types.items(), key=lambda x: x[1])

- # If there's a strong pattern (>80% of occurrences)
+ # If there's a strong pattern (>80% of occurrences) and trace2 has the preceding type,
+ # trace1 runs them separately while trace2 might fuse
  if most_common_before[1] / len(indices) > 0.8:
      fusion_candidate = most_common_before[0]

-     # Verify trace2 has this type
+     # Verify trace2 has this type but NOT the unique_type
      if fusion_candidate in trace2_type_set:
          trace1_fusion_count = trace1_types.count(fusion_candidate)
          trace2_fusion_count = trace2_types.count(fusion_candidate)
@@ -127,7 +152,7 @@ def _find_fusion_mappings(
          },
          "pattern_count": len(indices),
          "pattern_confidence": most_common_before[1] / len(indices),
-         "evidence": f"{trace1_name} runs {fusion_candidate}+{unique_type} separately, {trace2_name} fuses into {fusion_candidate}",
+         "evidence": f"{trace1_name} runs {fusion_candidate} + {unique_type} separately, {trace2_name} fuses into {fusion_candidate}",
      })

  # Also check trace2-only types
@@ -135,6 +160,28 @@ def _find_fusion_mappings(
  if unique_type == "Other":
      continue

+ # If the unique type contains '+', it's explicitly a fused kernel
+ # This means trace2 (which has it) is fusing, not trace1
+ if "+" in unique_type:
+     components = [c.strip() for c in unique_type.split("+")]
+     indices = [i for i, t in enumerate(trace2_types) if t == unique_type]
+
+     if len(indices) < 5:
+         continue
+
+     mappings.append({
+         "fused_platform": trace2_name,
+         "fused_kernel_type": unique_type,
+         "fused_count": len(indices),
+         "unfused_platform": trace1_name,
+         "unfused_sequence": components,
+         "unfused_count_per_type": {c: trace1_types.count(c) for c in components},
+         "pattern_count": len(indices),
+         "pattern_confidence": 1.0,
+         "evidence": f"{trace2_name} fuses {' + '.join(components)} into single kernel ({len(indices)} calls), {trace1_name} runs separately",
+     })
+     continue
+
  indices = [i for i, t in enumerate(trace2_types) if t == unique_type]

  if len(indices) < 5:
@@ -169,7 +216,7 @@ def _find_fusion_mappings(
          },
          "pattern_count": len(indices),
          "pattern_confidence": most_common_before[1] / len(indices),
-         "evidence": f"{trace2_name} runs {fusion_candidate}+{unique_type} separately, {trace1_name} fuses into {fusion_candidate}",
+         "evidence": f"{trace2_name} runs {fusion_candidate} + {unique_type} separately, {trace1_name} fuses into {fusion_candidate}",
      })

  return mappings
@@ -184,8 +231,15 @@ def _find_count_imbalance_fusions(
  ) -> list[dict]:
      """Find fusions by looking for significant count imbalances.

-     When one platform has significantly more kernel calls of a type (>1.5x),
-     it suggests the other platform fuses those operations.
+     When one platform has significantly more kernel calls of a type (>3x),
+     it MAY suggest the other platform fuses those operations.
+
+     NOTE: This is speculative - count differences can also indicate:
+     - Different algorithmic implementations
+     - Different library choices (cuBLAS vs hipBLAS)
+     - Different optimization strategies
+
+     Only very large imbalances (>3x) with high counts are flagged.
      """
      mappings = []
      trace2_platform = "NVIDIA" if trace1_platform == "AMD" else "AMD"
@@ -201,24 +255,28 @@ def _find_count_imbalance_fusions(
  # Find common types with significant differences
  common_types = set(trace1_counts.keys()) & set(trace2_counts.keys())

+ # Skip types that are likely just implementation differences, not fusion
+ skip_types = {"Reduce", "Copy/Memory", "Sync", "Other", "Elementwise"}
+
  for ktype in common_types:
-     if ktype == "Other":
+     if ktype in skip_types:
          continue

      trace1_count = trace1_counts[ktype]
      trace2_count = trace2_counts[ktype]

-     # Skip trivial counts
-     if trace1_count + trace2_count < 50:
+     # Skip low counts - need significant samples
+     if trace1_count + trace2_count < 200:
          continue

-     # Check if there's a significant imbalance (>1.5x)
+     # Check if there's a very significant imbalance (>3x)
+     # Lower ratios are likely implementation differences, not fusion
      if trace1_count == 0 or trace2_count == 0:
          continue

      ratio = max(trace1_count, trace2_count) / min(trace1_count, trace2_count)

-     if ratio < 1.5:
+     if ratio < 3.0:
          continue

      # Determine which platform has more (unfused) and which has fewer (fused)
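Taken together, the tightened thresholds mean a common kernel type is only flagged when both traces have substantial counts and the ratio between them is at least 3x. A minimal sketch of that filter (the helper name and example counts are mine, not the package's):

def flag_possible_fusion(trace1_count: int, trace2_count: int) -> bool:
    if trace1_count + trace2_count < 200:  # skip low counts - need significant samples
        return False
    if trace1_count == 0 or trace2_count == 0:
        return False
    ratio = max(trace1_count, trace2_count) / min(trace1_count, trace2_count)
    return ratio >= 3.0                    # below 3x is likely an implementation difference

# flag_possible_fusion(600, 150) -> True  (4.0x imbalance, enough samples)
# flag_possible_fusion(300, 250) -> False (only 1.2x: not flagged)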
@@ -242,7 +300,7 @@ def _find_count_imbalance_fusions(
          "unfused_count_per_type": {ktype: unfused_count},
          "pattern_count": unfused_count - fused_count,
          "pattern_confidence": (unfused_count - fused_count) / unfused_count,
-         "evidence": f"{unfused_platform} calls {ktype} {ratio:.1f}x more ({unfused_count} vs {fused_count}), {fused_platform} likely fuses",
+         "evidence": f"{unfused_platform} calls {ktype} {ratio:.1f}x more ({unfused_count} vs {fused_count}) - possible fusion",
      })

  return mappings
@@ -325,12 +383,14 @@ def detect_fusion_patterns(
      amd_kernels: list[dict],
      nvidia_kernels: list[dict],
  ) -> FusionAnalysis:
-     """Detect fusion patterns using pattern-based analysis.
+     """Detect fusion patterns using explicit fused operation detection only.

-     This is the main entry point for fusion detection. It combines:
-     1. Explicit fused operations (kernels classified with '+' in name)
-     2. Sequence pattern analysis (unique kernel types with consistent patterns)
-     3. Count imbalance analysis (one platform has significantly more calls)
+     This approach only reports high-confidence fusions where kernels are
+     explicitly classified as fused (e.g., 'RMSNorm+GEMM', 'SwiGLU+GEMM').
+
+     We intentionally avoid speculative detection (sequence patterns, count
+     imbalances) because these produce too many false positives - count
+     differences are usually due to different implementations, not fusion.

      Args:
          amd_kernels: List of AMD kernel events
@@ -341,38 +401,19 @@ def detect_fusion_patterns(
  """
  all_mappings: list[dict] = []

- # 1. Find explicit fused operations (highest confidence)
+ # Only use explicit fused operations (highest confidence, no false positives)
+ # These are kernels explicitly classified with '+' in their operation type
  explicit_fusions = _find_explicit_fused_operations(
      amd_kernels, nvidia_kernels,
      trace1_name="AMD", trace2_name="NVIDIA",
      trace1_platform="AMD",
  )
  all_mappings.extend(explicit_fusions)
-
- # 2. Find sequence-based fusions
- sequence_fusions = _find_fusion_mappings(
-     amd_kernels, nvidia_kernels,
-     trace1_name="AMD", trace2_name="NVIDIA",
-     trace1_platform="AMD",
- )
- # Deduplicate: skip if same fused_kernel_type already found
- existing_types = {m["fused_kernel_type"] for m in all_mappings}
- for fusion in sequence_fusions:
-     if fusion["fused_kernel_type"] not in existing_types:
-         all_mappings.append(fusion)
-         existing_types.add(fusion["fused_kernel_type"])
-
- # 3. Find count-imbalance fusions
- count_fusions = _find_count_imbalance_fusions(
-     amd_kernels, nvidia_kernels,
-     trace1_name="AMD", trace2_name="NVIDIA",
-     trace1_platform="AMD",
- )
- # Deduplicate
- for fusion in count_fusions:
-     if fusion["fused_kernel_type"] not in existing_types:
-         all_mappings.append(fusion)
-         existing_types.add(fusion["fused_kernel_type"])
+
+ # NOTE: We intentionally skip sequence-based and count-imbalance detection
+ # because they produce false positives. Count differences between platforms
+ # are usually due to different library implementations (cuBLAS vs hipBLAS),
+ # not actual kernel fusion.

  # Convert to FusionPattern objects
  patterns: list[FusionPattern] = []
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: wafer-core
- Version: 0.1.28
+ Version: 0.1.30
  Summary: Core utilities and environments for Wafer GPU kernel optimization
  Requires-Python: >=3.10
  Requires-Dist: aiohttp>=3.9.0
@@ -321,12 +321,12 @@ wafer_core/lib/rocprofiler/systems/sample/profiler.py,sha256=CYZPTzNXd48LoCfmY6h
  wafer_core/lib/trace_compare/PERFORMANCE.md,sha256=jkJh7ApZi8H7NKTcz8v0LNtwSFtIUqY88e3QbL749ww,3823
  wafer_core/lib/trace_compare/__init__.py,sha256=CyUPbPQDYhVLCFFA7S_jNSilG3OgqYjmHSKfR5X11go,1377
  wafer_core/lib/trace_compare/aligner.py,sha256=1S8Ob3RaEsIjN0HdqEx0yGsW5uf_lMrJVSH_MnZhKok,13788
- wafer_core/lib/trace_compare/analyzer.py,sha256=YkuOPA3HFX_7mNUEhE9CMOtEMGLQd12lvUkvqqeQF14,29698
+ wafer_core/lib/trace_compare/analyzer.py,sha256=Ou_gooG027YVuYVF5oddAkMsObXrrPQLBPHUzSMA4Vg,31078
  wafer_core/lib/trace_compare/api.py,sha256=JSRTcd7eZK1Z8l18TFEiA5A8ENJS1TMz7oIiw1KBbAs,8796
  wafer_core/lib/trace_compare/architecture.py,sha256=8bqlAJQeJLBHblyXvFV-w55PIKiVQDPjDQZ8Jx4tuGg,2110
- wafer_core/lib/trace_compare/classifier.py,sha256=CDGzY9TY-I5wRuEGsu4mTCdljqVTOnLWyFLyNgmkGXI,16864
+ wafer_core/lib/trace_compare/classifier.py,sha256=cYAmDW8S75N6cE3mJNZM-UKCJSX7rFP-8klVrukBvNQ,17504
  wafer_core/lib/trace_compare/formatter.py,sha256=GNrCZ45ueBN05CEXjOtTuKvTI8z-g-ZZFil-ni3sWVY,37962
- wafer_core/lib/trace_compare/fusion_analyzer.py,sha256=ZbFXUuPOt8ezT08WfjlDx7XaUNoUgg9hlFTJb68-eo0,17433
+ wafer_core/lib/trace_compare/fusion_analyzer.py,sha256=ga0sfxx8OCQu9Hq7uJSAMfXhnCvBaAmzVofBN7_gdV8,19843
  wafer_core/lib/trace_compare/kernel_registry.yaml,sha256=0-knXwsF3pR1x1JdIz-aWaH-5xDgTylh53E47Kf6nHo,9808
  wafer_core/lib/trace_compare/layer_segmentation.py,sha256=kI_Y1e9nrKZfdwfcrGo4h7gpMxqXI_xkgXk46zuFen4,4642
  wafer_core/lib/trace_compare/loader.py,sha256=zBHI0r7CX_wJ2mz0_-s0lm9KGSdaVaq7OKyxUL6KIlw,23997
@@ -697,6 +697,6 @@ wafer_core/utils/modal_execution/modal_app.py,sha256=VfS2cX8gHtnlPXemmMcEwDPeQdh
  wafer_core/utils/modal_execution/modal_config.py,sha256=7cGX9TGqilQ3qxI3OFGXV5orjtyRU-PEDOJ4vP2oxno,4421
  wafer_core/utils/modal_execution/modal_execution.py,sha256=gChjnV6jqA3A7IRP3DfvV5cSfm_MN0X4f7JZufXgdZE,24594
  wafer_core/utils/modal_execution/test_modal.py,sha256=_jqou_hrLs1Daf1590Pnb0a_lXMMa2rczAPpW9HpoNQ,8153
- wafer_core-0.1.28.dist-info/METADATA,sha256=0x6opc3zOlxGhlZNJDVDY2LPnBZHYP5K4U0I6ZDl0Os,1477
- wafer_core-0.1.28.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- wafer_core-0.1.28.dist-info/RECORD,,
+ wafer_core-0.1.30.dist-info/METADATA,sha256=YuF3VyyP3tvmv2S-7E8epi1J2_1e2yXJfapS1uGQ0Zs,1477
+ wafer_core-0.1.30.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ wafer_core-0.1.30.dist-info/RECORD,,