PyPI - wafer-core - Versions diffs - 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl - Mend

wafer-core 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

wafer_core/lib/trace_compare/__init__.py +9 -22
wafer_core/lib/trace_compare/analyzer.py +160 -584
wafer_core/lib/trace_compare/classifier.py +18 -321
wafer_core/lib/trace_compare/fusion_analyzer.py +753 -329
wafer_core/lib/trace_compare/loader.py +220 -413
wafer_core/targets/__init__.py +21 -47
wafer_core/utils/kernel_utils/defense.py +1 -813
wafer_core/utils/kernel_utils/targets/config.py +24 -8
{wafer_core-0.1.33.dist-info → wafer_core-0.1.35.dist-info}/METADATA +1 -1
{wafer_core-0.1.33.dist-info → wafer_core-0.1.35.dist-info}/RECORD +11 -11
{wafer_core-0.1.33.dist-info → wafer_core-0.1.35.dist-info}/WHEEL +0 -0

wafer_core/lib/trace_compare/analyzer.py CHANGED Viewed

@@ -4,313 +4,13 @@ Compares GPU traces from AMD and NVIDIA platforms, identifying performance diffe
 at the operation level and layer level.
 """
-import sys
 from collections import defaultdict
-from concurrent.futures import ProcessPoolExecutor
 from pathlib import Path
 from typing import Any
 import pandas as pd
-from .aligner import align_traces, TraceAlignment
-from .fusion_analyzer import analyze_fusion_from_alignment
-from .same_kernel_analyzer import analyze_same_kernels_from_alignment
-from .loader import load_trace_full, LoadedTrace
-def analyze_traces_from_loaded(
-    trace1: LoadedTrace,
-    trace2: LoadedTrace,
-    phase_filter: str = "all",
-    max_stacks: int = 3,
-) -> dict[str, Any]:
-    """Analyze two loaded traces and return comparison data.
-    Args:
-        trace1: First loaded trace
-        trace2: Second loaded trace
-        phase_filter: Filter by phase ('all', 'prefill', or 'decode')
-        max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)
-    Returns:
-        Dictionary containing:
-            - metadata: trace info (GPUs, kernel counts, total times, etc.)
-            - operations: per-operation comparison data
-            - layers: per-layer comparison data (if layers detected)
-    """
-    df1 = trace1.df
-    df2 = trace2.df
-    # Apply phase filter
-    if phase_filter != "all":
-        df1_filtered = df1[df1["phase"] == phase_filter]
-        df2_filtered = df2[df2["phase"] == phase_filter]
-        if len(df1_filtered) == 0 and len(df2_filtered) == 0:
-            trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
-            trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
-            raise ValueError(
-                f"No {phase_filter} phase found. "
-                f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
-            )
-        df1, df2 = df1_filtered, df2_filtered
-    # Pre-compute aggregations for both operations and layers in single pass
-    trace1_by_op = df1.groupby("op").agg({
-        "dur_us": ["sum", "mean", "count"],
-        "phase": lambda x: set(x.dropna().unique()),
-        "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
-    })
-    trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
-    trace2_by_op = df2.groupby("op").agg({
-        "dur_us": ["sum", "mean", "count"],
-        "phase": lambda x: set(x.dropna().unique()),
-        "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
-    })
-    trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
-    # Group by layer for layer-level analysis
-    df1_layered = df1[df1["layer"].notna()]
-    df2_layered = df2[df2["layer"].notna()]
-    trace1_by_layer = df1_layered.groupby("layer").agg({
-        "dur_us": ["sum", "count"],
-    }) if len(df1_layered) > 0 else pd.DataFrame()
-    if len(trace1_by_layer) > 0:
-        trace1_by_layer.columns = ["total_us", "count"]
-    trace2_by_layer = df2_layered.groupby("layer").agg({
-        "dur_us": ["sum", "count"],
-    }) if len(df2_layered) > 0 else pd.DataFrame()
-    if len(trace2_by_layer) > 0:
-        trace2_by_layer.columns = ["total_us", "count"]
-    results: dict[str, Any] = {
-        "metadata": {
-            "trace1_name": f"{trace1.platform} {trace1.gpu_name}",
-            "trace2_name": f"{trace2.platform} {trace2.gpu_name}",
-            "trace1_platform": trace1.platform,
-            "trace1_gpu": trace1.gpu_name,
-            "trace1_device": trace1.device_props,
-            "trace2_platform": trace2.platform,
-            "trace2_gpu": trace2.gpu_name,
-            "trace2_device": trace2.device_props,
-            "trace1_kernels": len(df1),
-            "trace2_kernels": len(df2),
-            "trace1_total_ms": df1["dur_us"].sum() / 1000,
-            "trace2_total_ms": df2["dur_us"].sum() / 1000,
-            "phase": phase_filter,
-            "trace1_layers": len(trace1.layers),
-            "trace2_layers": len(trace2.layers),
-        },
-        "operations": [],
-        "layers": [],
-    }
-    # Per-operation comparison
-    all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
-    rmsnorm_compared = False
-    for op in sorted(all_ops):
-        has_trace1 = op in trace1_by_op.index
-        has_trace2 = op in trace2_by_op.index
-        trace1_op_for_pattern = op
-        trace2_op_for_pattern = op
-        skip_comparison = False
-        if op == "RMSNorm+GEMM" and not has_trace2:
-            has_trace2 = "RMSNorm" in trace2_by_op.index
-            trace2_op_for_pattern = "RMSNorm"
-            rmsnorm_compared = True
-        elif op == "RMSNorm" and not has_trace1:
-            if rmsnorm_compared:
-                skip_comparison = True
-            else:
-                has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
-                trace1_op_for_pattern = "RMSNorm+GEMM"
-                rmsnorm_compared = True
-        if skip_comparison or not (has_trace1 and has_trace2):
-            continue
-        trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
-        trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]
-        trace1_avg = trace1_agg["avg_us"]
-        trace2_avg = trace2_agg["avg_us"]
-        trace1_total = trace1_agg["total_us"] / 1000
-        trace2_total = trace2_agg["total_us"] / 1000
-        trace1_count = int(trace1_agg["count"])
-        trace2_count = int(trace2_agg["count"])
-        # Speedup: ratio of total times (not per-call averages)
-        # Shows how many times faster/slower trace1 is compared to trace2
-        # > 1.0 means trace1 is slower, < 1.0 means trace1 is faster
-        # Using total time instead of avg time per call because operations may have
-        # vastly different call counts (e.g., fused vs unfused operations)
-        if trace2_total > 0:
-            ratio = trace1_total / trace2_total
-        elif trace1_total > 0:
-            ratio = float("inf")  # trace2 has no time, trace1 is infinitely slower
-        else:
-            ratio = 1.0  # Both are zero
-        gap_ms = trace1_total - trace2_total
-        trace1_pattern = list(
-            trace1.patterns.get(
-                (trace1_op_for_pattern, "decode"),
-                trace1.patterns.get((trace1_op_for_pattern, "prefill"), {"unknown"}),
-            )
-        )[0]
-        trace2_pattern = list(
-            trace2.patterns.get(
-                (trace2_op_for_pattern, "decode"),
-                trace2.patterns.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
-            )
-        )[0]
-        trace1_cpu_op = trace1_agg["cpu_op"]
-        trace2_cpu_op = trace2_agg["cpu_op"]
-        # Get detailed kernel data and stacks only when needed
-        trace1_data = df1[df1["op"] == trace1_op_for_pattern]
-        trace2_data = df2[df2["op"] == trace2_op_for_pattern]
-        # Collect Python stacks if available
-        trace1_python_stacks = []
-        trace2_python_stacks = []
-        if max_stacks != 0:
-            stack_limit = None if max_stacks == 0 else max_stacks
-            for stack_list in trace1_data["python_stack"].head(stack_limit):
-                if stack_list and len(stack_list) > 0:
-                    trace1_python_stacks.append(stack_list)
-            for stack_list in trace2_data["python_stack"].head(stack_limit):
-                if stack_list and len(stack_list) > 0:
-                    trace2_python_stacks.append(stack_list)
-        # Aggregate individual kernels
-        trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
-        trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
-        trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
-        trace1_kernels_list = trace1_kernels.to_dict("records")
-        trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
-        trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
-        trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
-        trace2_kernels_list = trace2_kernels.to_dict("records")
-        if gap_ms > 5.0:
-            status = "slower"
-        elif gap_ms < -5.0:
-            status = "faster"
-        else:
-            status = "similar"
-        phases = trace1_agg["phases"] | trace2_agg["phases"]
-        results["operations"].append({
-            "operation": op,
-            "trace1_count": trace1_count,
-            "trace2_count": trace2_count,
-            "trace1_avg_us": trace1_avg,
-            "trace2_avg_us": trace2_avg,
-            "trace1_total_ms": trace1_total,
-            "trace2_total_ms": trace2_total,
-            "ratio": ratio,
-            "gap_ms": gap_ms,
-            "status": status,
-            "trace1_kernel": trace1_pattern,
-            "trace2_kernel": trace2_pattern,
-            "trace1_cpu_op": trace1_cpu_op,
-            "trace2_cpu_op": trace2_cpu_op,
-            "trace1_python_stacks": trace1_python_stacks,
-            "trace2_python_stacks": trace2_python_stacks,
-            "trace1_kernels": trace1_kernels_list,
-            "trace2_kernels": trace2_kernels_list,
-            "phases": sorted(list(phases)) if phases else ["all"],
-        })
-    results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)
-    # Layer-wise analysis
-    if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
-        all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))
-        for layer_num in all_layers:
-            has_trace1 = layer_num in trace1_by_layer.index
-            has_trace2 = layer_num in trace2_by_layer.index
-            if has_trace1 and has_trace2:
-                trace1_agg = trace1_by_layer.loc[layer_num]
-                trace2_agg = trace2_by_layer.loc[layer_num]
-                trace1_total = trace1_agg["total_us"] / 1000
-                trace2_total = trace2_agg["total_us"] / 1000
-                trace1_count = int(trace1_agg["count"])
-                trace2_count = int(trace2_agg["count"])
-                ratio = trace1_total / trace2_total if trace2_total > 0 else 1
-                gap_ms = trace1_total - trace2_total
-                threshold_ms = 0.1
-                threshold_ratio = 1.2
-                if gap_ms > threshold_ms and ratio > threshold_ratio:
-                    status = "slower"
-                elif gap_ms < -threshold_ms and ratio < (1.0 / threshold_ratio):
-                    status = "faster"
-                else:
-                    status = "similar"
-                results["layers"].append({
-                    "layer": int(layer_num),
-                    "trace1_kernels": trace1_count,
-                    "trace2_kernels": trace2_count,
-                    "trace1_total_ms": trace1_total,
-                    "trace2_total_ms": trace2_total,
-                    "ratio": ratio,
-                    "gap_ms": gap_ms,
-                    "status": status,
-                    "in_both": True,
-                })
-            elif has_trace1:
-                trace1_agg = trace1_by_layer.loc[layer_num]
-                trace1_total = trace1_agg["total_us"] / 1000
-                trace1_count = int(trace1_agg["count"])
-                results["layers"].append({
-                    "layer": int(layer_num),
-                    "trace1_kernels": trace1_count,
-                    "trace2_kernels": 0,
-                    "trace1_total_ms": trace1_total,
-                    "trace2_total_ms": 0.0,
-                    "ratio": 0.0,
-                    "gap_ms": trace1_total,
-                    "status": "trace1_only",
-                    "in_both": False,
-                })
-            elif has_trace2:
-                trace2_agg = trace2_by_layer.loc[layer_num]
-                trace2_total = trace2_agg["total_us"] / 1000
-                trace2_count = int(trace2_agg["count"])
-                results["layers"].append({
-                    "layer": int(layer_num),
-                    "trace1_kernels": 0,
-                    "trace2_kernels": trace2_count,
-                    "trace1_total_ms": 0.0,
-                    "trace2_total_ms": trace2_total,
-                    "ratio": 0.0,
-                    "gap_ms": -trace2_total,
-                    "status": "trace2_only",
-                    "in_both": False,
-                })
-        results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)
-    return results
+from .loader import load_trace
 def analyze_traces(
@@ -318,224 +18,76 @@ def analyze_traces(
     trace2_path: str | Path,
     phase_filter: str = "all",
     max_stacks: int = 3,
-    include_stacks: bool = True,
 ) -> dict[str, Any]:
     """Analyze two traces and return comparison data.
     Args:
         trace1_path: Path to first trace file
         trace2_path: Path to second trace file
         phase_filter: Filter by phase ('all', 'prefill', or 'decode')
         max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)
-        include_stacks: Whether to include Python stack traces (disable for faster analysis)
     Returns:
         Dictionary containing:
             - metadata: trace info (GPUs, kernel counts, total times, etc.)
             - operations: per-operation comparison data
             - layers: per-layer comparison data (if layers detected)
     """
-    # Load both traces in parallel using separate processes
-    # This provides ~1.7x speedup over sequential loading
-    print("Loading traces in parallel...", file=sys.stderr)
-    with ProcessPoolExecutor(max_workers=2) as executor:
-        future1 = executor.submit(load_trace_full, str(trace1_path), include_stacks)
-        future2 = executor.submit(load_trace_full, str(trace2_path), include_stacks)
-        trace1 = future1.result()
-        trace2 = future2.result()
-    print("Analyzing operations...", file=sys.stderr)
-    result = analyze_traces_from_loaded(trace1, trace2, phase_filter, max_stacks)
-    # Update metadata with file paths for backward compatibility
-    result["metadata"]["trace1_name"] = str(trace1_path)
-    result["metadata"]["trace2_name"] = str(trace2_path)
-    return result
-def analyze_traces_aligned(
-    trace1: LoadedTrace,
-    trace2: LoadedTrace,
-    phase_filter: str = "all",
-) -> dict[str, Any]:
-    """Analyze traces using kernel-to-kernel alignment.
-    Args:
-        trace1: First loaded trace
-        trace2: Second loaded trace
-        phase_filter: Filter by phase ('all', 'prefill', or 'decode')
-    Returns:
-        Dictionary with alignment-based comparison data
-    """
-    amd_phases = trace1.phases
-    nvidia_phases = trace2.phases
-    if phase_filter != "all":
-        amd_phases = [p for p in amd_phases if p.get("type") == phase_filter]
-        nvidia_phases = [p for p in nvidia_phases if p.get("type") == phase_filter]
-    amd_kernels = trace1.kernel_events
-    nvidia_kernels = trace2.kernel_events
-    if phase_filter != "all" and amd_phases:
-        phase_starts = [p["ts_start"] for p in amd_phases]
-        phase_ends = [p["ts_end"] for p in amd_phases]
-        amd_kernels = [
-            k for k in amd_kernels
-            if any(phase_starts[i] <= k.get("ts", 0) <= phase_ends[i]
-                   for i in range(len(phase_starts)))
-        ]
-    if phase_filter != "all" and nvidia_phases:
-        phase_starts = [p["ts_start"] for p in nvidia_phases]
-        phase_ends = [p["ts_end"] for p in nvidia_phases]
-        nvidia_kernels = [
-            k for k in nvidia_kernels
-            if any(phase_starts[i] <= k.get("ts", 0) <= phase_ends[i]
-                   for i in range(len(phase_starts)))
-        ]
-    alignment = align_traces(
-        amd_kernels,
-        nvidia_kernels,
-        amd_phases,
-        nvidia_phases,
-        trace1.platform,
-        trace2.platform,
-    )
-    layer_alignments = []
-    for layer_align in alignment.layer_alignments:
-        kernel_pairs = []
-        for pair in layer_align.kernel_pairs:
-            kernel_pairs.append({
-                "position": pair.position,
-                "operation": pair.operation,
-                "operation_detail": pair.operation_detail,
-                "amd_kernel": pair.amd_kernel,
-                "amd_avg_us": pair.amd_avg_us,
-                "amd_count": pair.amd_count,
-                "amd_total_us": pair.amd_total_us,
-                "nvidia_kernel": pair.nvidia_kernel,
-                "nvidia_avg_us": pair.nvidia_avg_us,
-                "nvidia_count": pair.nvidia_count,
-                "nvidia_total_us": pair.nvidia_total_us,
-                "ratio": pair.ratio,
-                "gap_us": pair.gap_us,
-                "fusion_note": pair.fusion_note,
-                "is_same_kernel": pair.is_same_kernel,
-            })
-        layer_alignments.append({
-            "layer": layer_align.layer,
-            "amd_total_us": layer_align.amd_total_us,
-            "nvidia_total_us": layer_align.nvidia_total_us,
-            "ratio": layer_align.ratio,
-            "gap_us": layer_align.gap_us,
-            "kernel_pairs": kernel_pairs,
-        })
-    # Determine which trace is AMD vs NVIDIA for fusion analysis
-    if trace1.platform == "AMD":
-        amd_trace, nvidia_trace = trace1, trace2
-        fusion_amd_kernels = amd_kernels
-        fusion_nvidia_kernels = nvidia_kernels
-    else:
-        amd_trace, nvidia_trace = trace2, trace1
-        fusion_amd_kernels = nvidia_kernels
-        fusion_nvidia_kernels = amd_kernels
-    fusion_result = analyze_fusion_from_alignment(
-        alignment.layer_alignments,
-        amd_kernels=fusion_amd_kernels,
-        nvidia_kernels=fusion_nvidia_kernels,
-    )
-    same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)
-    # Note: amd_kernels = trace1's kernels (filtered if phase_filter != "all")
-    #       nvidia_kernels = trace2's kernels (filtered if phase_filter != "all")
-    # The variable names are misleading but trace1_* should use amd_kernels,
-    # and trace2_* should use nvidia_kernels to match the filtered kernel counts/totals.
-    return {
-        "metadata": {
-            "amd_gpu": amd_trace.gpu_name,
-            "nvidia_gpu": nvidia_trace.gpu_name,
-            "amd_platform": amd_trace.platform,
-            "nvidia_platform": nvidia_trace.platform,
-            "model_layers": alignment.num_layers,
-            "forward_passes": alignment.num_forward_passes,
-            "phase_breakdown": alignment.phase_breakdown,
-            "phase_filter": phase_filter,
-            "trace1_platform": trace1.platform,
-            "trace1_gpu": trace1.gpu_name,
-            "trace1_device": trace1.device_props,
-            "trace2_platform": trace2.platform,
-            "trace2_gpu": trace2.gpu_name,
-            "trace2_device": trace2.device_props,
-            "trace1_kernels": len(amd_kernels),
-            "trace2_kernels": len(nvidia_kernels),
-            "trace1_total_ms": sum(k.get("dur", 0) for k in amd_kernels) / 1000,
-            "trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_kernels) / 1000,
-            "phase": phase_filter,
-            "trace1_layers": alignment.num_layers,
-            "trace2_layers": alignment.num_layers,
-        },
-        "layer_alignments": layer_alignments,
-        "fusion_analysis": fusion_result,
-        "same_kernel_analysis": same_kernel_result,
-    }
+    # Load traces
+    p1, gpu1, dev1, df1, patterns1, layers1 = load_trace(trace1_path)
+    p2, gpu2, dev2, df2, patterns2, layers2 = load_trace(trace2_path)
     # Apply phase filter
     if phase_filter != "all":
         df1_filtered = df1[df1["phase"] == phase_filter]
         df2_filtered = df2[df2["phase"] == phase_filter]
         if len(df1_filtered) == 0 and len(df2_filtered) == 0:
+            # No data in requested phase - return early with error info
             trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
             trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
             raise ValueError(
                 f"No {phase_filter} phase found. "
                 f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
             )
         df1, df2 = df1_filtered, df2_filtered
     # Pre-compute aggregations for both operations and layers in single pass
+    # This is much faster than iterating through filtered dataframes multiple times
+    # Group by operation for operation-level analysis
     trace1_by_op = df1.groupby("op").agg({
         "dur_us": ["sum", "mean", "count"],
         "phase": lambda x: set(x.dropna().unique()),
         "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
     })
     trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
     trace2_by_op = df2.groupby("op").agg({
         "dur_us": ["sum", "mean", "count"],
         "phase": lambda x: set(x.dropna().unique()),
         "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
     })
     trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
-    # Group by layer for layer-level analysis
+    # Group by layer for layer-level analysis (only for kernels with layer info)
     df1_layered = df1[df1["layer"].notna()]
     df2_layered = df2[df2["layer"].notna()]
     trace1_by_layer = df1_layered.groupby("layer").agg({
         "dur_us": ["sum", "count"],
     }) if len(df1_layered) > 0 else pd.DataFrame()
     if len(trace1_by_layer) > 0:
         trace1_by_layer.columns = ["total_us", "count"]
     trace2_by_layer = df2_layered.groupby("layer").agg({
         "dur_us": ["sum", "count"],
     }) if len(df2_layered) > 0 else pd.DataFrame()
     if len(trace2_by_layer) > 0:
         trace2_by_layer.columns = ["total_us", "count"]
+    # Calculate per-operation statistics
     results: dict[str, Any] = {
         "metadata": {
             "trace1_name": str(trace1_path),
@@ -557,56 +109,57 @@ def analyze_traces_aligned(
         "operations": [],
         "layers": [],
     }
-    # Per-operation comparison
+    # Per-operation comparison using pre-computed aggregations
     all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
+    # Track if we've already compared RMSNorm variants to avoid duplicate comparisons
     rmsnorm_compared = False
     for op in sorted(all_ops):
+        # Use pre-computed aggregations instead of filtering entire dataframes
         has_trace1 = op in trace1_by_op.index
         has_trace2 = op in trace2_by_op.index
-        trace1_op_for_pattern = op
-        trace2_op_for_pattern = op
+        # Handle RMSNorm fusion differences: AMD does RMSNorm+GEMM, NVIDIA does separate RMSNorm
+        trace1_op_for_pattern = op  # Operation name to use for AMD pattern lookup
+        trace2_op_for_pattern = op  # Operation name to use for NVIDIA pattern lookup
         skip_comparison = False
         if op == "RMSNorm+GEMM" and not has_trace2:
+            # Compare AMD's fused version to NVIDIA's separate RMSNorm
             has_trace2 = "RMSNorm" in trace2_by_op.index
-            trace2_op_for_pattern = "RMSNorm"
-            rmsnorm_compared = True
+            trace2_op_for_pattern = "RMSNorm"  # NVIDIA kernels are stored under 'RMSNorm'
+            rmsnorm_compared = True  # Mark that we've compared RMSNorm
         elif op == "RMSNorm" and not has_trace1:
+            # Skip this comparison if we already handled it in RMSNorm+GEMM
             if rmsnorm_compared:
                 skip_comparison = True
             else:
+                # Compare NVIDIA's RMSNorm to AMD's fused version
                 has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
-                trace1_op_for_pattern = "RMSNorm+GEMM"
+                trace1_op_for_pattern = (
+                    "RMSNorm+GEMM"  # AMD kernels are stored under 'RMSNorm+GEMM'
+                )
                 rmsnorm_compared = True
         if skip_comparison or not (has_trace1 and has_trace2):
             continue
+        # Get pre-computed aggregations
         trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
         trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]
         trace1_avg = trace1_agg["avg_us"]
         trace2_avg = trace2_agg["avg_us"]
         trace1_total = trace1_agg["total_us"] / 1000
         trace2_total = trace2_agg["total_us"] / 1000
         trace1_count = int(trace1_agg["count"])
         trace2_count = int(trace2_agg["count"])
-        # Speedup: ratio of total times (not per-call averages)
-        # Shows how many times faster/slower trace1 is compared to trace2
-        # > 1.0 means trace1 is slower, < 1.0 means trace1 is faster
-        # Using total time instead of avg time per call because operations may have
-        # vastly different call counts (e.g., fused vs unfused operations)
-        if trace2_total > 0:
-            ratio = trace1_total / trace2_total
-        elif trace1_total > 0:
-            ratio = float("inf")  # trace2 has no time, trace1 is infinitely slower
-        else:
-            ratio = 1.0  # Both are zero
+        ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
         gap_ms = trace1_total - trace2_total
+        # Get kernel patterns using the correct operation names for each platform
         trace1_pattern = list(
             patterns1.get(
                 (trace1_op_for_pattern, "decode"),
@@ -619,91 +172,106 @@ def analyze_traces_aligned(
                 patterns2.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
             )
         )[0]
+        # Get CPU operators from pre-computed aggregations
         trace1_cpu_op = trace1_agg["cpu_op"]
         trace2_cpu_op = trace2_agg["cpu_op"]
-        # Get detailed kernel data and stacks only when needed
+        # For detailed kernel data and python stacks, we still need to filter (but only when needed)
         trace1_data = df1[df1["op"] == trace1_op_for_pattern]
         trace2_data = df2[df2["op"] == trace2_op_for_pattern]
-        # Collect Python stacks if available
+        # Collect example Python stacks for this operation (for JSON output)
         trace1_python_stacks = []
+        stack_limit = None if max_stacks == 0 else max_stacks
+        for stack_list in trace1_data["python_stack"].head(stack_limit):
+            if stack_list and len(stack_list) > 0:
+                trace1_python_stacks.append(stack_list)
         trace2_python_stacks = []
-        if include_stacks:
-            stack_limit = None if max_stacks == 0 else max_stacks
-            for stack_list in trace1_data["python_stack"].head(stack_limit):
-                if stack_list and len(stack_list) > 0:
-                    trace1_python_stacks.append(stack_list)
-            for stack_list in trace2_data["python_stack"].head(stack_limit):
-                if stack_list and len(stack_list) > 0:
-                    trace2_python_stacks.append(stack_list)
-        # Aggregate individual kernels
+        for stack_list in trace2_data["python_stack"].head(stack_limit):
+            if stack_list and len(stack_list) > 0:
+                trace2_python_stacks.append(stack_list)
+        # Aggregate individual kernels by name for detailed view
+        # Group by kernel name and calculate sum/count/avg
         trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
         trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
         trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
         trace1_kernels_list = trace1_kernels.to_dict("records")
         trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
         trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
         trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
         trace2_kernels_list = trace2_kernels.to_dict("records")
-        if gap_ms > 5.0:
+        # Determine status based on TOTAL TIME (gap), not per-call ratio
+        # This handles cases where AMD runs fewer operations via fusion.
+        # 5ms threshold chosen because:
+        # - Filters out measurement noise and minor variations
+        # - Represents meaningful performance impact (0.5% of typical 1s inference)
+        # - Aligns with human perception of "noticeable" difference
+        # - Too small (1ms) creates false positives from variance
+        # - Too large (20ms) misses real optimization opportunities
+        if gap_ms > 5.0:  # AMD spends >5ms more total time
             status = "slower"
-        elif gap_ms < -5.0:
+        elif gap_ms < -5.0:  # AMD spends >5ms less total time
             status = "faster"
         else:
             status = "similar"
+        # Get phases from pre-computed aggregations
         phases = trace1_agg["phases"] | trace2_agg["phases"]
-        results["operations"].append({
-            "operation": op,
-            "trace1_count": trace1_count,
-            "trace2_count": trace2_count,
-            "trace1_avg_us": trace1_avg,
-            "trace2_avg_us": trace2_avg,
-            "trace1_total_ms": trace1_total,
-            "trace2_total_ms": trace2_total,
-            "ratio": ratio,
-            "gap_ms": gap_ms,
-            "status": status,
-            "trace1_kernel": trace1_pattern,
-            "trace2_kernel": trace2_pattern,
-            "trace1_cpu_op": trace1_cpu_op,
-            "trace2_cpu_op": trace2_cpu_op,
-            "trace1_python_stacks": trace1_python_stacks,
-            "trace2_python_stacks": trace2_python_stacks,
-            "trace1_kernels": trace1_kernels_list,
-            "trace2_kernels": trace2_kernels_list,
-            "phases": sorted(list(phases)) if phases else ["all"],
-        })
+        results["operations"].append(
+            {
+                "operation": op,
+                "trace1_count": trace1_count,
+                "trace2_count": trace2_count,
+                "trace1_avg_us": trace1_avg,
+                "trace2_avg_us": trace2_avg,
+                "trace1_total_ms": trace1_total,
+                "trace2_total_ms": trace2_total,
+                "ratio": ratio,
+                "gap_ms": gap_ms,
+                "status": status,
+                "trace1_kernel": trace1_pattern,
+                "trace2_kernel": trace2_pattern,
+                "trace1_cpu_op": trace1_cpu_op,
+                "trace2_cpu_op": trace2_cpu_op,
+                "trace1_python_stacks": trace1_python_stacks,  # Full stacks for JSON
+                "trace2_python_stacks": trace2_python_stacks,
+                "trace1_kernels": trace1_kernels_list,  # All individual kernels for JSON
+                "trace2_kernels": trace2_kernels_list,  # All individual kernels for JSON
+                "phases": sorted(list(phases)) if phases else ["all"],  # For client-side filtering
+            }
+        )
+    # Sort by absolute gap
     results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)
-    # Layer-wise analysis
+    # Layer-wise analysis using pre-computed aggregations
     if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
+        # Get all unique layers present in either trace
         all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))
         for layer_num in all_layers:
             has_trace1 = layer_num in trace1_by_layer.index
             has_trace2 = layer_num in trace2_by_layer.index
             if has_trace1 and has_trace2:
+                # Layer present in both traces - compare them
                 trace1_agg = trace1_by_layer.loc[layer_num]
                 trace2_agg = trace2_by_layer.loc[layer_num]
                 trace1_total = trace1_agg["total_us"] / 1000
                 trace2_total = trace2_agg["total_us"] / 1000
                 trace1_count = int(trace1_agg["count"])
                 trace2_count = int(trace2_agg["count"])
                 ratio = trace1_total / trace2_total if trace2_total > 0 else 1
                 gap_ms = trace1_total - trace2_total
+                # Determine status (use smaller threshold for layers: 0.1ms or 20% difference)
                 threshold_ms = 0.1
                 threshold_ratio = 1.2
                 if gap_ms > threshold_ms and ratio > threshold_ratio:
@@ -712,52 +280,60 @@ def analyze_traces_aligned(
                     status = "faster"
                 else:
                     status = "similar"
-                results["layers"].append({
-                    "layer": int(layer_num),
-                    "trace1_kernels": trace1_count,
-                    "trace2_kernels": trace2_count,
-                    "trace1_total_ms": trace1_total,
-                    "trace2_total_ms": trace2_total,
-                    "ratio": ratio,
-                    "gap_ms": gap_ms,
-                    "status": status,
-                    "in_both": True,
-                })
+                results["layers"].append(
+                    {
+                        "layer": int(layer_num),
+                        "trace1_kernels": trace1_count,
+                        "trace2_kernels": trace2_count,
+                        "trace1_total_ms": trace1_total,
+                        "trace2_total_ms": trace2_total,
+                        "ratio": ratio,
+                        "gap_ms": gap_ms,
+                        "status": status,
+                        "in_both": True,
+                    }
+                )
             elif has_trace1:
+                # Layer only in trace1
                 trace1_agg = trace1_by_layer.loc[layer_num]
                 trace1_total = trace1_agg["total_us"] / 1000
                 trace1_count = int(trace1_agg["count"])
-                results["layers"].append({
-                    "layer": int(layer_num),
-                    "trace1_kernels": trace1_count,
-                    "trace2_kernels": 0,
-                    "trace1_total_ms": trace1_total,
-                    "trace2_total_ms": 0.0,
-                    "ratio": 0.0,
-                    "gap_ms": trace1_total,
-                    "status": "trace1_only",
-                    "in_both": False,
-                })
+                results["layers"].append(
+                    {
+                        "layer": int(layer_num),
+                        "trace1_kernels": trace1_count,
+                        "trace2_kernels": 0,
+                        "trace1_total_ms": trace1_total,
+                        "trace2_total_ms": 0.0,
+                        "ratio": 0.0,
+                        "gap_ms": trace1_total,
+                        "status": "trace1_only",
+                        "in_both": False,
+                    }
+                )
             elif has_trace2:
+                # Layer only in trace2
                 trace2_agg = trace2_by_layer.loc[layer_num]
                 trace2_total = trace2_agg["total_us"] / 1000
                 trace2_count = int(trace2_agg["count"])
-                results["layers"].append({
-                    "layer": int(layer_num),
-                    "trace1_kernels": 0,
-                    "trace2_kernels": trace2_count,
-                    "trace1_total_ms": 0.0,
-                    "trace2_total_ms": trace2_total,
-                    "ratio": 0.0,
-                    "gap_ms": -trace2_total,
-                    "status": "trace2_only",
-                    "in_both": False,
-                })
+                results["layers"].append(
+                    {
+                        "layer": int(layer_num),
+                        "trace1_kernels": 0,
+                        "trace2_kernels": trace2_count,
+                        "trace1_total_ms": 0.0,
+                        "trace2_total_ms": trace2_total,
+                        "ratio": 0.0,
+                        "gap_ms": -trace2_total,
+                        "status": "trace2_only",
+                        "in_both": False,
+                    }
+                )
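+        # Sort: intended order is comparable layers first (by absolute gap), then trace-unique layers. NOTE(review): reverse=True also flips the in_both flag, so trace-unique layers actually sort first -- confirm.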
         results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)
-    print("Analysis complete.", file=sys.stderr)
     return results

wafer-core 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

wafer-core 0.1.33py3-none-any.whl → 0.1.35py3-none-any.whl