wafer-core 0.1.26__py3-none-any.whl → 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,13 +4,303 @@ Compares GPU traces from AMD and NVIDIA platforms, identifying performance diffe
4
4
  at the operation level and layer level.
5
5
  """
6
6
 
7
+ import sys
7
8
  from collections import defaultdict
9
+ from concurrent.futures import ProcessPoolExecutor
8
10
  from pathlib import Path
9
11
  from typing import Any
10
12
 
11
13
  import pandas as pd
12
14
 
13
- from .loader import load_trace
15
+ from .aligner import align_traces, TraceAlignment
16
+ from .fusion_analyzer import analyze_fusion_from_alignment
17
+ from .same_kernel_analyzer import analyze_same_kernels_from_alignment
18
+ from .loader import load_trace_full, LoadedTrace
19
+
20
+
21
def analyze_traces_from_loaded(
    trace1: LoadedTrace,
    trace2: LoadedTrace,
    phase_filter: str = "all",
    max_stacks: int = 3,
) -> dict[str, Any]:
    """Analyze two loaded traces and return comparison data.

    Args:
        trace1: First loaded trace
        trace2: Second loaded trace
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')
        max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)

    Returns:
        Dictionary containing:
        - metadata: trace info (GPUs, kernel counts, total times, etc.)
        - operations: per-operation comparison data
        - layers: per-layer comparison data (if layers detected)

    Raises:
        ValueError: If phase_filter selects a phase present in neither trace.
    """
    df1 = trace1.df
    df2 = trace2.df

    # Apply phase filter
    if phase_filter != "all":
        df1_filtered = df1[df1["phase"] == phase_filter]
        df2_filtered = df2[df2["phase"] == phase_filter]

        if len(df1_filtered) == 0 and len(df2_filtered) == 0:
            trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
            trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
            raise ValueError(
                f"No {phase_filter} phase found. "
                f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
            )

        df1, df2 = df1_filtered, df2_filtered

    # Pre-compute aggregations for both operations and layers in single pass
    trace1_by_op = df1.groupby("op").agg({
        "dur_us": ["sum", "mean", "count"],
        "phase": lambda x: set(x.dropna().unique()),
        "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
    })
    trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]

    trace2_by_op = df2.groupby("op").agg({
        "dur_us": ["sum", "mean", "count"],
        "phase": lambda x: set(x.dropna().unique()),
        "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
    })
    trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]

    # Group by layer for layer-level analysis (kernels without layer info are excluded)
    df1_layered = df1[df1["layer"].notna()]
    df2_layered = df2[df2["layer"].notna()]

    trace1_by_layer = df1_layered.groupby("layer").agg({
        "dur_us": ["sum", "count"],
    }) if len(df1_layered) > 0 else pd.DataFrame()
    if len(trace1_by_layer) > 0:
        trace1_by_layer.columns = ["total_us", "count"]

    trace2_by_layer = df2_layered.groupby("layer").agg({
        "dur_us": ["sum", "count"],
    }) if len(df2_layered) > 0 else pd.DataFrame()
    if len(trace2_by_layer) > 0:
        trace2_by_layer.columns = ["total_us", "count"]

    results: dict[str, Any] = {
        "metadata": {
            "trace1_name": f"{trace1.platform} {trace1.gpu_name}",
            "trace2_name": f"{trace2.platform} {trace2.gpu_name}",
            "trace1_platform": trace1.platform,
            "trace1_gpu": trace1.gpu_name,
            "trace1_device": trace1.device_props,
            "trace2_platform": trace2.platform,
            "trace2_gpu": trace2.gpu_name,
            "trace2_device": trace2.device_props,
            "trace1_kernels": len(df1),
            "trace2_kernels": len(df2),
            "trace1_total_ms": df1["dur_us"].sum() / 1000,
            "trace2_total_ms": df2["dur_us"].sum() / 1000,
            "phase": phase_filter,
            "trace1_layers": len(trace1.layers),
            "trace2_layers": len(trace2.layers),
        },
        "operations": [],
        "layers": [],
    }

    # Per-operation comparison
    all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
    # Track whether the RMSNorm / RMSNorm+GEMM cross-comparison has already
    # been emitted, so the pair is not reported twice.
    rmsnorm_compared = False

    for op in sorted(all_ops):
        has_trace1 = op in trace1_by_op.index
        has_trace2 = op in trace2_by_op.index

        trace1_op_for_pattern = op
        trace2_op_for_pattern = op
        skip_comparison = False

        # Handle fusion differences: one platform may fuse RMSNorm with the
        # following GEMM while the other runs them separately.
        if op == "RMSNorm+GEMM" and not has_trace2:
            has_trace2 = "RMSNorm" in trace2_by_op.index
            trace2_op_for_pattern = "RMSNorm"
            rmsnorm_compared = True
        elif op == "RMSNorm" and not has_trace1:
            if rmsnorm_compared:
                skip_comparison = True
            else:
                has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
                trace1_op_for_pattern = "RMSNorm+GEMM"
                rmsnorm_compared = True

        if skip_comparison or not (has_trace1 and has_trace2):
            continue

        trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
        trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]

        trace1_avg = trace1_agg["avg_us"]
        trace2_avg = trace2_agg["avg_us"]
        trace1_total = trace1_agg["total_us"] / 1000
        trace2_total = trace2_agg["total_us"] / 1000
        trace1_count = int(trace1_agg["count"])
        trace2_count = int(trace2_agg["count"])
        ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
        gap_ms = trace1_total - trace2_total

        # Prefer the decode-phase kernel pattern; fall back to prefill, then "unknown".
        trace1_pattern = list(
            trace1.patterns.get(
                (trace1_op_for_pattern, "decode"),
                trace1.patterns.get((trace1_op_for_pattern, "prefill"), {"unknown"}),
            )
        )[0]
        trace2_pattern = list(
            trace2.patterns.get(
                (trace2_op_for_pattern, "decode"),
                trace2.patterns.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
            )
        )[0]

        trace1_cpu_op = trace1_agg["cpu_op"]
        trace2_cpu_op = trace2_agg["cpu_op"]

        # Get detailed kernel data and stacks only when needed
        trace1_data = df1[df1["op"] == trace1_op_for_pattern]
        trace2_data = df2[df2["op"] == trace2_op_for_pattern]

        # Collect Python stacks if available.
        # BUG FIX: the previous `if max_stacks != 0:` guard made max_stacks=0
        # collect NO stacks, contradicting the documented "0 for unlimited"
        # contract (and making the inner `None if max_stacks == 0` dead code).
        # head(None) returns all rows, i.e. unlimited.
        trace1_python_stacks = []
        trace2_python_stacks = []

        stack_limit = None if max_stacks == 0 else max_stacks
        for stack_list in trace1_data["python_stack"].head(stack_limit):
            if stack_list and len(stack_list) > 0:
                trace1_python_stacks.append(stack_list)

        for stack_list in trace2_data["python_stack"].head(stack_limit):
            if stack_list and len(stack_list) > 0:
                trace2_python_stacks.append(stack_list)

        # Aggregate individual kernels
        trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
        trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
        trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
        trace1_kernels_list = trace1_kernels.to_dict("records")

        trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
        trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
        trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
        trace2_kernels_list = trace2_kernels.to_dict("records")

        # Status is based on total-time gap; 5 ms filters measurement noise.
        if gap_ms > 5.0:
            status = "slower"
        elif gap_ms < -5.0:
            status = "faster"
        else:
            status = "similar"

        phases = trace1_agg["phases"] | trace2_agg["phases"]

        results["operations"].append({
            "operation": op,
            "trace1_count": trace1_count,
            "trace2_count": trace2_count,
            "trace1_avg_us": trace1_avg,
            "trace2_avg_us": trace2_avg,
            "trace1_total_ms": trace1_total,
            "trace2_total_ms": trace2_total,
            "ratio": ratio,
            "gap_ms": gap_ms,
            "status": status,
            "trace1_kernel": trace1_pattern,
            "trace2_kernel": trace2_pattern,
            "trace1_cpu_op": trace1_cpu_op,
            "trace2_cpu_op": trace2_cpu_op,
            "trace1_python_stacks": trace1_python_stacks,
            "trace2_python_stacks": trace2_python_stacks,
            "trace1_kernels": trace1_kernels_list,
            "trace2_kernels": trace2_kernels_list,
            "phases": sorted(list(phases)) if phases else ["all"],
        })

    results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)

    # Layer-wise analysis
    if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
        all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))

        for layer_num in all_layers:
            has_trace1 = layer_num in trace1_by_layer.index
            has_trace2 = layer_num in trace2_by_layer.index

            if has_trace1 and has_trace2:
                trace1_agg = trace1_by_layer.loc[layer_num]
                trace2_agg = trace2_by_layer.loc[layer_num]

                trace1_total = trace1_agg["total_us"] / 1000
                trace2_total = trace2_agg["total_us"] / 1000
                trace1_count = int(trace1_agg["count"])
                trace2_count = int(trace2_agg["count"])
                ratio = trace1_total / trace2_total if trace2_total > 0 else 1
                gap_ms = trace1_total - trace2_total

                # Layers use a tighter threshold than operations: 0.1 ms AND 20%.
                threshold_ms = 0.1
                threshold_ratio = 1.2
                if gap_ms > threshold_ms and ratio > threshold_ratio:
                    status = "slower"
                elif gap_ms < -threshold_ms and ratio < (1.0 / threshold_ratio):
                    status = "faster"
                else:
                    status = "similar"

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": trace1_count,
                    "trace2_kernels": trace2_count,
                    "trace1_total_ms": trace1_total,
                    "trace2_total_ms": trace2_total,
                    "ratio": ratio,
                    "gap_ms": gap_ms,
                    "status": status,
                    "in_both": True,
                })
            elif has_trace1:
                trace1_agg = trace1_by_layer.loc[layer_num]
                trace1_total = trace1_agg["total_us"] / 1000
                trace1_count = int(trace1_agg["count"])

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": trace1_count,
                    "trace2_kernels": 0,
                    "trace1_total_ms": trace1_total,
                    "trace2_total_ms": 0.0,
                    "ratio": 0.0,
                    "gap_ms": trace1_total,
                    "status": "trace1_only",
                    "in_both": False,
                })
            elif has_trace2:
                trace2_agg = trace2_by_layer.loc[layer_num]
                trace2_total = trace2_agg["total_us"] / 1000
                trace2_count = int(trace2_agg["count"])

                results["layers"].append({
                    "layer": int(layer_num),
                    "trace1_kernels": 0,
                    "trace2_kernels": trace2_count,
                    "trace1_total_ms": 0.0,
                    "trace2_total_ms": trace2_total,
                    "ratio": 0.0,
                    "gap_ms": -trace2_total,
                    "status": "trace2_only",
                    "in_both": False,
                })

        # Comparable layers first, largest absolute gap first.
        results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)

    return results
14
304
 
15
305
 
16
306
def analyze_traces(
    trace1_path: str | Path,
    trace2_path: str | Path,
    phase_filter: str = "all",
    max_stacks: int = 3,
    include_stacks: bool = True,
) -> dict[str, Any]:
    """Analyze two traces and return comparison data.

    Args:
        trace1_path: Path to first trace file
        trace2_path: Path to second trace file
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')
        max_stacks: Maximum number of Python stack traces to collect per operation (0 for unlimited)
        include_stacks: Whether to include Python stack traces (disable for faster analysis)

    Returns:
        Dictionary containing:
        - metadata: trace info (GPUs, kernel counts, total times, etc.)
        - operations: per-operation comparison data
        - layers: per-layer comparison data (if layers detected)
    """
    # Parse both trace files concurrently: loading is CPU-bound, so two worker
    # processes overlap the work instead of paying for it twice sequentially.
    print("Loading traces in parallel...", file=sys.stderr)

    with ProcessPoolExecutor(max_workers=2) as pool:
        pending = [
            pool.submit(load_trace_full, str(path), include_stacks)
            for path in (trace1_path, trace2_path)
        ]
        loaded_first = pending[0].result()
        loaded_second = pending[1].result()

    print("Analyzing operations...", file=sys.stderr)

    comparison = analyze_traces_from_loaded(
        loaded_first, loaded_second, phase_filter, max_stacks
    )

    # Callers expect the original file paths (not platform/GPU labels) in the
    # name fields, so overwrite them for backward compatibility.
    comparison["metadata"]["trace1_name"] = str(trace1_path)
    comparison["metadata"]["trace2_name"] = str(trace2_path)

    return comparison
347
+
348
+
349
def analyze_traces_aligned(
    trace1: LoadedTrace,
    trace2: LoadedTrace,
    phase_filter: str = "all",
) -> dict[str, Any]:
    """Analyze traces using kernel-to-kernel alignment.

    Args:
        trace1: First loaded trace
        trace2: Second loaded trace
        phase_filter: Filter by phase ('all', 'prefill', or 'decode')

    Returns:
        Dictionary with alignment-based comparison data
    """
    def _select_phases(phases):
        # Keep only phases of the requested type; 'all' keeps everything.
        if phase_filter == "all":
            return phases
        return [p for p in phases if p.get("type") == phase_filter]

    def _restrict_kernels(kernels, phases):
        # Keep kernels whose timestamp falls inside any selected phase window.
        if phase_filter == "all" or not phases:
            return kernels
        windows = [(p["ts_start"], p["ts_end"]) for p in phases]
        return [
            k for k in kernels
            if any(lo <= k.get("ts", 0) <= hi for lo, hi in windows)
        ]

    amd_phases = _select_phases(trace1.phases)
    nvidia_phases = _select_phases(trace2.phases)
    amd_kernels = _restrict_kernels(trace1.kernel_events, amd_phases)
    nvidia_kernels = _restrict_kernels(trace2.kernel_events, nvidia_phases)

    alignment = align_traces(
        amd_kernels,
        nvidia_kernels,
        amd_phases,
        nvidia_phases,
        trace1.platform,
        trace2.platform,
    )

    # Serialize each aligned kernel pair; field order here fixes the key
    # order of the emitted dicts.
    pair_fields = (
        "position",
        "operation",
        "operation_detail",
        "amd_kernel",
        "amd_avg_us",
        "amd_count",
        "amd_total_us",
        "nvidia_kernel",
        "nvidia_avg_us",
        "nvidia_count",
        "nvidia_total_us",
        "ratio",
        "gap_us",
        "fusion_note",
        "is_same_kernel",
    )
    layer_alignments = [
        {
            "layer": layer_align.layer,
            "amd_total_us": layer_align.amd_total_us,
            "nvidia_total_us": layer_align.nvidia_total_us,
            "ratio": layer_align.ratio,
            "gap_us": layer_align.gap_us,
            "kernel_pairs": [
                {field: getattr(pair, field) for field in pair_fields}
                for pair in layer_align.kernel_pairs
            ],
        }
        for layer_align in alignment.layer_alignments
    ]

    fusion_result = analyze_fusion_from_alignment(alignment.layer_alignments)
    same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)

    # Orient the pair so AMD-specific metadata keys refer to the AMD trace.
    if trace1.platform == "AMD":
        amd_trace, nvidia_trace = trace1, trace2
    else:
        amd_trace, nvidia_trace = trace2, trace1

    return {
        "metadata": {
            "amd_gpu": amd_trace.gpu_name,
            "nvidia_gpu": nvidia_trace.gpu_name,
            "amd_platform": amd_trace.platform,
            "nvidia_platform": nvidia_trace.platform,
            "model_layers": alignment.num_layers,
            "forward_passes": alignment.num_forward_passes,
            "phase_breakdown": alignment.phase_breakdown,
            "phase_filter": phase_filter,
            "trace1_platform": trace1.platform,
            "trace1_gpu": trace1.gpu_name,
            "trace1_device": trace1.device_props,
            "trace2_platform": trace2.platform,
            "trace2_gpu": trace2.gpu_name,
            "trace2_device": trace2.device_props,
            "trace1_kernels": len(amd_trace.kernel_events),
            "trace2_kernels": len(nvidia_trace.kernel_events),
            "trace1_total_ms": sum(k.get("dur", 0) for k in amd_trace.kernel_events) / 1000,
            "trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_trace.kernel_events) / 1000,
            "phase": phase_filter,
            "trace1_layers": alignment.num_layers,
            "trace2_layers": alignment.num_layers,
        },
        "layer_alignments": layer_alignments,
        "fusion_analysis": fusion_result,
        "same_kernel_analysis": same_kernel_result,
    }
468
+
40
469
  # Apply phase filter
41
470
  if phase_filter != "all":
42
471
  df1_filtered = df1[df1["phase"] == phase_filter]
43
472
  df2_filtered = df2[df2["phase"] == phase_filter]
44
-
473
+
45
474
  if len(df1_filtered) == 0 and len(df2_filtered) == 0:
46
- # No data in requested phase - return early with error info
47
475
  trace1_phases = {k: int(v) for k, v in df1["phase"].value_counts().items()}
48
476
  trace2_phases = {k: int(v) for k, v in df2["phase"].value_counts().items()}
49
477
  raise ValueError(
50
478
  f"No {phase_filter} phase found. "
51
479
  f"Trace1 phases: {trace1_phases}, Trace2 phases: {trace2_phases}"
52
480
  )
53
-
481
+
54
482
  df1, df2 = df1_filtered, df2_filtered
55
-
483
+
56
484
  # Pre-compute aggregations for both operations and layers in single pass
57
- # This is much faster than iterating through filtered dataframes multiple times
58
-
59
- # Group by operation for operation-level analysis
60
485
  trace1_by_op = df1.groupby("op").agg({
61
486
  "dur_us": ["sum", "mean", "count"],
62
487
  "phase": lambda x: set(x.dropna().unique()),
63
488
  "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
64
489
  })
65
490
  trace1_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
66
-
491
+
67
492
  trace2_by_op = df2.groupby("op").agg({
68
493
  "dur_us": ["sum", "mean", "count"],
69
494
  "phase": lambda x: set(x.dropna().unique()),
70
495
  "cpu_op": lambda x: x.dropna().mode()[0] if len(x.dropna()) > 0 else None,
71
496
  })
72
497
  trace2_by_op.columns = ["total_us", "avg_us", "count", "phases", "cpu_op"]
73
-
74
- # Group by layer for layer-level analysis (only for kernels with layer info)
498
+
499
+ # Group by layer for layer-level analysis
75
500
  df1_layered = df1[df1["layer"].notna()]
76
501
  df2_layered = df2[df2["layer"].notna()]
77
-
502
+
78
503
  trace1_by_layer = df1_layered.groupby("layer").agg({
79
504
  "dur_us": ["sum", "count"],
80
505
  }) if len(df1_layered) > 0 else pd.DataFrame()
81
506
  if len(trace1_by_layer) > 0:
82
507
  trace1_by_layer.columns = ["total_us", "count"]
83
-
508
+
84
509
  trace2_by_layer = df2_layered.groupby("layer").agg({
85
510
  "dur_us": ["sum", "count"],
86
511
  }) if len(df2_layered) > 0 else pd.DataFrame()
87
512
  if len(trace2_by_layer) > 0:
88
513
  trace2_by_layer.columns = ["total_us", "count"]
89
-
90
- # Calculate per-operation statistics
514
+
91
515
  results: dict[str, Any] = {
92
516
  "metadata": {
93
517
  "trace1_name": str(trace1_path),
@@ -109,47 +533,37 @@ def analyze_traces(
109
533
  "operations": [],
110
534
  "layers": [],
111
535
  }
112
-
113
- # Per-operation comparison using pre-computed aggregations
536
+
537
+ # Per-operation comparison
114
538
  all_ops = set(trace1_by_op.index) | set(trace2_by_op.index)
115
-
116
- # Track if we've already compared RMSNorm variants to avoid duplicate comparisons
117
539
  rmsnorm_compared = False
118
-
540
+
119
541
  for op in sorted(all_ops):
120
- # Use pre-computed aggregations instead of filtering entire dataframes
121
542
  has_trace1 = op in trace1_by_op.index
122
543
  has_trace2 = op in trace2_by_op.index
123
-
124
- # Handle RMSNorm fusion differences: AMD does RMSNorm+GEMM, NVIDIA does separate RMSNorm
125
- trace1_op_for_pattern = op # Operation name to use for AMD pattern lookup
126
- trace2_op_for_pattern = op # Operation name to use for NVIDIA pattern lookup
544
+
545
+ trace1_op_for_pattern = op
546
+ trace2_op_for_pattern = op
127
547
  skip_comparison = False
128
-
548
+
129
549
  if op == "RMSNorm+GEMM" and not has_trace2:
130
- # Compare AMD's fused version to NVIDIA's separate RMSNorm
131
550
  has_trace2 = "RMSNorm" in trace2_by_op.index
132
- trace2_op_for_pattern = "RMSNorm" # NVIDIA kernels are stored under 'RMSNorm'
133
- rmsnorm_compared = True # Mark that we've compared RMSNorm
551
+ trace2_op_for_pattern = "RMSNorm"
552
+ rmsnorm_compared = True
134
553
  elif op == "RMSNorm" and not has_trace1:
135
- # Skip this comparison if we already handled it in RMSNorm+GEMM
136
554
  if rmsnorm_compared:
137
555
  skip_comparison = True
138
556
  else:
139
- # Compare NVIDIA's RMSNorm to AMD's fused version
140
557
  has_trace1 = "RMSNorm+GEMM" in trace1_by_op.index
141
- trace1_op_for_pattern = (
142
- "RMSNorm+GEMM" # AMD kernels are stored under 'RMSNorm+GEMM'
143
- )
558
+ trace1_op_for_pattern = "RMSNorm+GEMM"
144
559
  rmsnorm_compared = True
145
-
560
+
146
561
  if skip_comparison or not (has_trace1 and has_trace2):
147
562
  continue
148
-
149
- # Get pre-computed aggregations
563
+
150
564
  trace1_agg = trace1_by_op.loc[trace1_op_for_pattern]
151
565
  trace2_agg = trace2_by_op.loc[trace2_op_for_pattern]
152
-
566
+
153
567
  trace1_avg = trace1_agg["avg_us"]
154
568
  trace2_avg = trace2_agg["avg_us"]
155
569
  trace1_total = trace1_agg["total_us"] / 1000
@@ -158,8 +572,7 @@ def analyze_traces(
158
572
  trace2_count = int(trace2_agg["count"])
159
573
  ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
160
574
  gap_ms = trace1_total - trace2_total
161
-
162
- # Get kernel patterns using the correct operation names for each platform
575
+
163
576
  trace1_pattern = list(
164
577
  patterns1.get(
165
578
  (trace1_op_for_pattern, "decode"),
@@ -172,106 +585,91 @@ def analyze_traces(
172
585
  patterns2.get((trace2_op_for_pattern, "prefill"), {"unknown"}),
173
586
  )
174
587
  )[0]
175
-
176
- # Get CPU operators from pre-computed aggregations
588
+
177
589
  trace1_cpu_op = trace1_agg["cpu_op"]
178
590
  trace2_cpu_op = trace2_agg["cpu_op"]
179
-
180
- # For detailed kernel data and python stacks, we still need to filter (but only when needed)
591
+
592
+ # Get detailed kernel data and stacks only when needed
181
593
  trace1_data = df1[df1["op"] == trace1_op_for_pattern]
182
594
  trace2_data = df2[df2["op"] == trace2_op_for_pattern]
183
-
184
- # Collect example Python stacks for this operation (for JSON output)
595
+
596
+ # Collect Python stacks if available
185
597
  trace1_python_stacks = []
186
- stack_limit = None if max_stacks == 0 else max_stacks
187
- for stack_list in trace1_data["python_stack"].head(stack_limit):
188
- if stack_list and len(stack_list) > 0:
189
- trace1_python_stacks.append(stack_list)
190
-
191
598
  trace2_python_stacks = []
192
- for stack_list in trace2_data["python_stack"].head(stack_limit):
193
- if stack_list and len(stack_list) > 0:
194
- trace2_python_stacks.append(stack_list)
195
-
196
- # Aggregate individual kernels by name for detailed view
197
- # Group by kernel name and calculate sum/count/avg
599
+
600
+ if include_stacks:
601
+ stack_limit = None if max_stacks == 0 else max_stacks
602
+ for stack_list in trace1_data["python_stack"].head(stack_limit):
603
+ if stack_list and len(stack_list) > 0:
604
+ trace1_python_stacks.append(stack_list)
605
+
606
+ for stack_list in trace2_data["python_stack"].head(stack_limit):
607
+ if stack_list and len(stack_list) > 0:
608
+ trace2_python_stacks.append(stack_list)
609
+
610
+ # Aggregate individual kernels
198
611
  trace1_kernels = trace1_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
199
612
  trace1_kernels.columns = ["name", "total_us", "count", "avg_us"]
200
613
  trace1_kernels = trace1_kernels.sort_values("total_us", ascending=False)
201
614
  trace1_kernels_list = trace1_kernels.to_dict("records")
202
-
615
+
203
616
  trace2_kernels = trace2_data.groupby("name").agg({"dur_us": ["sum", "count", "mean"]}).reset_index()
204
617
  trace2_kernels.columns = ["name", "total_us", "count", "avg_us"]
205
618
  trace2_kernels = trace2_kernels.sort_values("total_us", ascending=False)
206
619
  trace2_kernels_list = trace2_kernels.to_dict("records")
207
-
208
- # Determine status based on TOTAL TIME (gap), not per-call ratio
209
- # This handles cases where AMD runs fewer operations via fusion.
210
- # 5ms threshold chosen because:
211
- # - Filters out measurement noise and minor variations
212
- # - Represents meaningful performance impact (0.5% of typical 1s inference)
213
- # - Aligns with human perception of "noticeable" difference
214
- # - Too small (1ms) creates false positives from variance
215
- # - Too large (20ms) misses real optimization opportunities
216
- if gap_ms > 5.0: # AMD spends >5ms more total time
620
+
621
+ if gap_ms > 5.0:
217
622
  status = "slower"
218
- elif gap_ms < -5.0: # AMD spends >5ms less total time
623
+ elif gap_ms < -5.0:
219
624
  status = "faster"
220
625
  else:
221
626
  status = "similar"
222
-
223
- # Get phases from pre-computed aggregations
627
+
224
628
  phases = trace1_agg["phases"] | trace2_agg["phases"]
225
-
226
- results["operations"].append(
227
- {
228
- "operation": op,
229
- "trace1_count": trace1_count,
230
- "trace2_count": trace2_count,
231
- "trace1_avg_us": trace1_avg,
232
- "trace2_avg_us": trace2_avg,
233
- "trace1_total_ms": trace1_total,
234
- "trace2_total_ms": trace2_total,
235
- "ratio": ratio,
236
- "gap_ms": gap_ms,
237
- "status": status,
238
- "trace1_kernel": trace1_pattern,
239
- "trace2_kernel": trace2_pattern,
240
- "trace1_cpu_op": trace1_cpu_op,
241
- "trace2_cpu_op": trace2_cpu_op,
242
- "trace1_python_stacks": trace1_python_stacks, # Full stacks for JSON
243
- "trace2_python_stacks": trace2_python_stacks,
244
- "trace1_kernels": trace1_kernels_list, # All individual kernels for JSON
245
- "trace2_kernels": trace2_kernels_list, # All individual kernels for JSON
246
- "phases": sorted(list(phases)) if phases else ["all"], # For client-side filtering
247
- }
248
- )
249
-
250
- # Sort by absolute gap
629
+
630
+ results["operations"].append({
631
+ "operation": op,
632
+ "trace1_count": trace1_count,
633
+ "trace2_count": trace2_count,
634
+ "trace1_avg_us": trace1_avg,
635
+ "trace2_avg_us": trace2_avg,
636
+ "trace1_total_ms": trace1_total,
637
+ "trace2_total_ms": trace2_total,
638
+ "ratio": ratio,
639
+ "gap_ms": gap_ms,
640
+ "status": status,
641
+ "trace1_kernel": trace1_pattern,
642
+ "trace2_kernel": trace2_pattern,
643
+ "trace1_cpu_op": trace1_cpu_op,
644
+ "trace2_cpu_op": trace2_cpu_op,
645
+ "trace1_python_stacks": trace1_python_stacks,
646
+ "trace2_python_stacks": trace2_python_stacks,
647
+ "trace1_kernels": trace1_kernels_list,
648
+ "trace2_kernels": trace2_kernels_list,
649
+ "phases": sorted(list(phases)) if phases else ["all"],
650
+ })
651
+
251
652
  results["operations"].sort(key=lambda x: abs(x["gap_ms"]), reverse=True)
252
-
253
- # Layer-wise analysis using pre-computed aggregations
653
+
654
+ # Layer-wise analysis
254
655
  if len(trace1_by_layer) > 0 or len(trace2_by_layer) > 0:
255
- # Get all unique layers present in either trace
256
656
  all_layers = sorted(set(trace1_by_layer.index) | set(trace2_by_layer.index))
257
-
657
+
258
658
  for layer_num in all_layers:
259
659
  has_trace1 = layer_num in trace1_by_layer.index
260
660
  has_trace2 = layer_num in trace2_by_layer.index
261
-
661
+
262
662
  if has_trace1 and has_trace2:
263
- # Layer present in both traces - compare them
264
663
  trace1_agg = trace1_by_layer.loc[layer_num]
265
664
  trace2_agg = trace2_by_layer.loc[layer_num]
266
-
665
+
267
666
  trace1_total = trace1_agg["total_us"] / 1000
268
667
  trace2_total = trace2_agg["total_us"] / 1000
269
668
  trace1_count = int(trace1_agg["count"])
270
669
  trace2_count = int(trace2_agg["count"])
271
670
  ratio = trace1_total / trace2_total if trace2_total > 0 else 1
272
671
  gap_ms = trace1_total - trace2_total
273
-
274
- # Determine status (use smaller threshold for layers: 0.1ms or 20% difference)
672
+
275
673
  threshold_ms = 0.1
276
674
  threshold_ratio = 1.2
277
675
  if gap_ms > threshold_ms and ratio > threshold_ratio:
@@ -280,60 +678,52 @@ def analyze_traces(
280
678
  status = "faster"
281
679
  else:
282
680
  status = "similar"
283
-
284
- results["layers"].append(
285
- {
286
- "layer": int(layer_num),
287
- "trace1_kernels": trace1_count,
288
- "trace2_kernels": trace2_count,
289
- "trace1_total_ms": trace1_total,
290
- "trace2_total_ms": trace2_total,
291
- "ratio": ratio,
292
- "gap_ms": gap_ms,
293
- "status": status,
294
- "in_both": True,
295
- }
296
- )
681
+
682
+ results["layers"].append({
683
+ "layer": int(layer_num),
684
+ "trace1_kernels": trace1_count,
685
+ "trace2_kernels": trace2_count,
686
+ "trace1_total_ms": trace1_total,
687
+ "trace2_total_ms": trace2_total,
688
+ "ratio": ratio,
689
+ "gap_ms": gap_ms,
690
+ "status": status,
691
+ "in_both": True,
692
+ })
297
693
  elif has_trace1:
298
- # Layer only in trace1
299
694
  trace1_agg = trace1_by_layer.loc[layer_num]
300
695
  trace1_total = trace1_agg["total_us"] / 1000
301
696
  trace1_count = int(trace1_agg["count"])
302
-
303
- results["layers"].append(
304
- {
305
- "layer": int(layer_num),
306
- "trace1_kernels": trace1_count,
307
- "trace2_kernels": 0,
308
- "trace1_total_ms": trace1_total,
309
- "trace2_total_ms": 0.0,
310
- "ratio": 0.0,
311
- "gap_ms": trace1_total,
312
- "status": "trace1_only",
313
- "in_both": False,
314
- }
315
- )
697
+
698
+ results["layers"].append({
699
+ "layer": int(layer_num),
700
+ "trace1_kernels": trace1_count,
701
+ "trace2_kernels": 0,
702
+ "trace1_total_ms": trace1_total,
703
+ "trace2_total_ms": 0.0,
704
+ "ratio": 0.0,
705
+ "gap_ms": trace1_total,
706
+ "status": "trace1_only",
707
+ "in_both": False,
708
+ })
316
709
  elif has_trace2:
317
- # Layer only in trace2
318
710
  trace2_agg = trace2_by_layer.loc[layer_num]
319
711
  trace2_total = trace2_agg["total_us"] / 1000
320
712
  trace2_count = int(trace2_agg["count"])
321
-
322
- results["layers"].append(
323
- {
324
- "layer": int(layer_num),
325
- "trace1_kernels": 0,
326
- "trace2_kernels": trace2_count,
327
- "trace1_total_ms": 0.0,
328
- "trace2_total_ms": trace2_total,
329
- "ratio": 0.0,
330
- "gap_ms": -trace2_total,
331
- "status": "trace2_only",
332
- "in_both": False,
333
- }
334
- )
335
-
336
- # Sort: comparable layers first (by absolute gap), then trace-unique layers
713
+
714
+ results["layers"].append({
715
+ "layer": int(layer_num),
716
+ "trace1_kernels": 0,
717
+ "trace2_kernels": trace2_count,
718
+ "trace1_total_ms": 0.0,
719
+ "trace2_total_ms": trace2_total,
720
+ "ratio": 0.0,
721
+ "gap_ms": -trace2_total,
722
+ "status": "trace2_only",
723
+ "in_both": False,
724
+ })
725
+
337
726
  results["layers"].sort(key=lambda x: (not x["in_both"], abs(x["gap_ms"])), reverse=True)
338
-
727
+
728
+ print("Analysis complete.", file=sys.stderr)
339
729
  return results