wafer-core 0.1.45__py3-none-any.whl → 0.1.47__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer_core/lib/trace_compare/__init__.py +12 -0
- wafer_core/lib/trace_compare/graph_formatter.py +263 -0
- wafer_core/lib/trace_compare/graph_formatter_detailed.py +225 -0
- wafer_core/lib/trace_compare/graph_matcher.py +315 -0
- wafer_core/lib/trace_compare/graph_matcher_v2.py +332 -0
- wafer_core/rollouts/_pytui/app.py +8 -0
- wafer_core/rollouts/_pytui/viewport.py +186 -0
- wafer_core/rollouts/agents.py +12 -0
- wafer_core/rollouts/progress_app.py +434 -148
- wafer_core/rollouts/scoring.py +19 -2
- {wafer_core-0.1.45.dist-info → wafer_core-0.1.47.dist-info}/METADATA +1 -1
- {wafer_core-0.1.45.dist-info → wafer_core-0.1.47.dist-info}/RECORD +13 -8
- {wafer_core-0.1.45.dist-info → wafer_core-0.1.47.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
"""Deterministic kernel matching using CUDA graph execution order.
|
|
2
|
+
|
|
3
|
+
This module provides 98-99% deterministic matching by leveraging the fact that
|
|
4
|
+
both AMD and NVIDIA traces execute CUDA graphs in identical order, and kernels
|
|
5
|
+
within each graph execute in deterministic timestamp order.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import orjson
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class KernelMatch:
    """A matched pair of kernels from AMD and NVIDIA traces."""

    # Which graph execution this belongs to (0-184)
    graph_index: int
    # Position within the graph (0-based)
    position_in_graph: int

    # Raw trace events for the two matched kernels (name/ts/dur/correlation/args).
    amd_kernel: dict[str, Any]
    nvidia_kernel: dict[str, Any]

    # Coarse operation class: GEMM, ATTN, RMS, etc.
    operation_type: str
    # 1.0 = perfect match, <1.0 = potential fusion difference
    confidence: float

    # Correlation IDs of the enclosing graph launches, for debugging/validation.
    amd_correlation: int
    nvidia_correlation: int
31
|
+
|
|
32
|
+
@dataclass
class GraphPair:
    """A pair of matched CUDA graphs from AMD and NVIDIA traces."""

    # Index of this graph in execution order.
    graph_index: int
    amd_correlation: int
    nvidia_correlation: int

    # Kernel events belonging to each graph, sorted by timestamp.
    amd_kernels: list[dict[str, Any]]
    nvidia_kernels: list[dict[str, Any]]

    # True if this is a transformer layer (>100 kernels)
    is_layer: bool
|
46
|
+
def classify_kernel(name: str) -> str:
    """Return a coarse operation-type tag for a kernel name.

    This is a coarse classification for matching purposes; order of the
    checks matters (e.g. the silu test must precede the generic triton_poi
    test).
    """
    lowered = name.lower()

    if 'cijk_' in lowered or 'nvjet' in lowered:
        return 'GEMM'
    if 'attention' in lowered or 'fmha' in lowered:
        return 'ATTN'
    if 'reshape_and_cache' in lowered:
        return 'KV'
    if 'triton_per' in lowered and 'rsqrt' in lowered:
        return 'RMS'
    if 'triton_poi' in lowered and 'silu' in lowered:
        return 'SILU'
    if 'triton_poi' in lowered:
        return 'POI'
    if 'triton_red' in lowered:
        return 'RED'
    if 'reduce_segments' in lowered:
        return 'RSEG'
    return 'OTH'
|
73
|
+
def is_platform_specific_kernel(name: str, platform: str) -> bool:
    """Return True when this kernel is platform-only and must not be matched.

    AMD runs reduce_segments after attention operations, but NVIDIA fuses this
    into adjacent kernels, so such kernels have no counterpart on the other
    side and would skew positional matching.
    """
    if platform != "AMD":
        # Add NVIDIA-specific exclusions here if discovered
        return False
    return 'reduce_segments' in name.lower()
|
85
|
+
|
|
86
|
+
def load_graph_execution_order(trace_path: str) -> list[tuple[int, int]]:
    """Load CUDA graph execution order from trace.

    Returns:
        List of (timestamp, correlation_id) tuples in execution order
    """
    with open(trace_path, "rb") as f:
        trace = orjson.loads(f.read())

    graph_launches = []
    for event in trace.get("traceEvents", []):
        if event.get("cat") == "cuda_runtime":
            name = event.get("name", "")
            # Case-insensitive graph-launch check. The previous fallback
            # `"graphLaunch" in name.lower()` could never match: a lowercased
            # string cannot contain the uppercase "L" of the literal.
            if "graphlaunch" in name.lower():
                ts = event.get("ts")
                corr_id = event.get("args", {}).get("correlation")
                if ts is not None and corr_id is not None:
                    graph_launches.append((ts, corr_id))

    # Sort by timestamp to get execution order
    graph_launches.sort()
    return graph_launches
|
|
110
|
+
def load_kernels_for_correlation(trace_path: str, correlation_id: int, platform: str) -> list[dict[str, Any]]:
    """Load all kernels for a given correlation ID in timestamp order.

    Args:
        trace_path: Path to trace JSON
        correlation_id: Correlation ID to filter by
        platform: "AMD" or "NVIDIA" for platform-specific filtering

    Returns:
        List of kernel events sorted by timestamp, with platform-specific kernels removed
    """
    with open(trace_path, "rb") as f:
        trace = orjson.loads(f.read())

    matched: list[dict[str, Any]] = []
    for event in trace.get("traceEvents", []):
        if event.get("cat") != "kernel":
            continue
        args = event.get("args", {})
        if args.get("correlation") != correlation_id:
            continue

        name = event.get("name", "")
        # Skip kernels that have no counterpart on the other platform.
        if is_platform_specific_kernel(name, platform):
            continue

        matched.append({
            "name": name,
            "ts": event.get("ts"),
            "dur": event.get("dur", 0),
            "correlation": correlation_id,
            "args": args,
        })

    # Timestamp order makes the per-graph sequence deterministic.
    matched.sort(key=lambda k: k["ts"])
    return matched
|
148
|
+
def _kernels_by_correlation(trace_path: str, platform: str) -> dict[int, list[dict[str, Any]]]:
    """Parse the trace once and group kernel events by correlation ID.

    Platform-specific kernels (see is_platform_specific_kernel) are dropped,
    and each group is sorted by timestamp for deterministic ordering.
    """
    with open(trace_path, "rb") as f:
        trace = orjson.loads(f.read())

    grouped: dict[int, list[dict[str, Any]]] = {}
    for event in trace.get("traceEvents", []):
        if event.get("cat") != "kernel":
            continue
        args = event.get("args", {})
        corr_id = args.get("correlation")
        if corr_id is None:
            continue
        name = event.get("name", "")
        if is_platform_specific_kernel(name, platform):
            continue
        grouped.setdefault(corr_id, []).append({
            "name": name,
            "ts": event.get("ts"),
            "dur": event.get("dur", 0),
            "correlation": corr_id,
            "args": args,
        })

    for kernels in grouped.values():
        kernels.sort(key=lambda k: k["ts"])
    return grouped


def match_traces(
    amd_trace_path: str,
    nvidia_trace_path: str
) -> tuple[list[GraphPair], list[KernelMatch]]:
    """Match kernels between AMD and NVIDIA traces using graph execution order.

    This provides 98-99% deterministic matching by:
    1. Matching graphs by execution order (100% deterministic)
    2. Matching kernels by position within graphs (98-99% deterministic)
    3. Filtering platform-specific operations (e.g., AMD's reduce_segments)

    Args:
        amd_trace_path: Path to AMD trace JSON
        nvidia_trace_path: Path to NVIDIA trace JSON

    Returns:
        Tuple of (graph_pairs, kernel_matches)
        - graph_pairs: List of matched CUDA graph pairs
        - kernel_matches: List of all kernel matches across all graphs

    Raises:
        ValueError: If the traces contain different numbers of graph launches
            (likely recorded from different workloads).
    """
    # Step 1: Get graph execution order from both traces
    amd_graphs = load_graph_execution_order(amd_trace_path)
    nvidia_graphs = load_graph_execution_order(nvidia_trace_path)

    if len(amd_graphs) != len(nvidia_graphs):
        raise ValueError(
            f"Graph count mismatch: AMD has {len(amd_graphs)} graphs, "
            f"NVIDIA has {len(nvidia_graphs)} graphs. "
            "Traces may be from different workloads."
        )

    # Step 2: Parse each trace once and bucket kernels by correlation ID.
    # (Previously each graph triggered a full re-read and re-parse of the
    # trace file, i.e. O(num_graphs) parses per trace.)
    amd_by_corr = _kernels_by_correlation(amd_trace_path, "AMD")
    nvidia_by_corr = _kernels_by_correlation(nvidia_trace_path, "NVIDIA")

    graph_pairs: list[GraphPair] = []
    kernel_matches: list[KernelMatch] = []

    # Step 3: Match graphs by execution order
    for graph_idx, ((_amd_ts, amd_corr), (_nv_ts, nv_corr)) in enumerate(
        zip(amd_graphs, nvidia_graphs)
    ):
        amd_kernels = amd_by_corr.get(amd_corr, [])
        nvidia_kernels = nvidia_by_corr.get(nv_corr, [])

        # Heuristic used throughout this module: a transformer layer
        # launches more than 100 kernels.
        is_layer = len(amd_kernels) > 100 or len(nvidia_kernels) > 100

        graph_pairs.append(GraphPair(
            graph_index=graph_idx,
            amd_correlation=amd_corr,
            nvidia_correlation=nv_corr,
            amd_kernels=amd_kernels,
            nvidia_kernels=nvidia_kernels,
            is_layer=is_layer,
        ))

        # Step 4: Match kernels within this graph by position
        kernel_matches.extend(match_kernels_in_graph(
            graph_idx=graph_idx,
            amd_corr=amd_corr,
            nvidia_corr=nv_corr,
            amd_kernels=amd_kernels,
            nvidia_kernels=nvidia_kernels,
        ))

    return graph_pairs, kernel_matches
|
|
211
|
+
|
|
212
|
+
def match_kernels_in_graph(
    graph_idx: int,
    amd_corr: int,
    nvidia_corr: int,
    amd_kernels: list[dict[str, Any]],
    nvidia_kernels: list[dict[str, Any]],
) -> list[KernelMatch]:
    """Match kernels within a single CUDA graph by position.

    Args:
        graph_idx: Index of this graph in execution order
        amd_corr: AMD correlation ID
        nvidia_corr: NVIDIA correlation ID
        amd_kernels: AMD kernels (already sorted by timestamp, filtered for platform-specific ops)
        nvidia_kernels: NVIDIA kernels (already sorted by timestamp, filtered for platform-specific ops)

    Returns:
        List of kernel matches with confidence scores
    """
    matches: list[KernelMatch] = []

    # If the counts disagree after filtering, something unexpected happened,
    # so a type mismatch is penalized harder (0.5 instead of 0.8: positional
    # matching is still fairly reliable when the counts line up).
    counts_match = len(amd_kernels) == len(nvidia_kernels)
    mismatch_confidence = 0.8 if counts_match else 0.5

    # zip() truncates to the shorter list, which handles both the perfect
    # length match and the graceful "match what we can" case in one loop.
    for i, (amd_k, nv_k) in enumerate(zip(amd_kernels, nvidia_kernels)):
        amd_type = classify_kernel(amd_k["name"])
        nv_type = classify_kernel(nv_k["name"])
        same_type = amd_type == nv_type

        matches.append(KernelMatch(
            graph_index=graph_idx,
            position_in_graph=i,
            amd_kernel=amd_k,
            nvidia_kernel=nv_k,
            # Record both types on a mismatch; previously only the
            # equal-length branch did this while the truncated branch
            # silently kept the AMD type.
            operation_type=amd_type if same_type else f"{amd_type}→{nv_type}",
            confidence=1.0 if same_type else mismatch_confidence,
            amd_correlation=amd_corr,
            nvidia_correlation=nvidia_corr,
        ))

    # Note: when counts differ, trailing unmatched kernels are implicitly
    # dropped. Could add logging here for debugging.
    return matches
|
|
284
|
+
|
|
285
|
+
def get_matching_statistics(kernel_matches: list[KernelMatch]) -> dict[str, Any]:
    """Calculate statistics about matching quality.

    Returns:
        Dict with:
        - total_matches: Total kernel pairs matched
        - perfect_matches: Matches with confidence=1.0
        - fuzzy_matches: Matches with confidence<1.0
        - match_rate: Percentage of perfect matches
        - by_operation: Breakdown by operation type
    """
    from collections import defaultdict

    total = len(kernel_matches)
    by_operation: dict[str, dict[str, int]] = defaultdict(
        lambda: {"total": 0, "perfect": 0}
    )

    # Single pass: count perfect matches overall and per operation type.
    perfect = 0
    for match in kernel_matches:
        bucket = by_operation[match.operation_type]
        bucket["total"] += 1
        if match.confidence == 1.0:
            perfect += 1
            bucket["perfect"] += 1

    return {
        "total_matches": total,
        "perfect_matches": perfect,
        "fuzzy_matches": total - perfect,
        "match_rate": perfect / total if total > 0 else 0.0,
        "by_operation": dict(by_operation),
    }
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
"""Improved kernel matching that preserves fusion information.
|
|
2
|
+
|
|
3
|
+
Key improvements over v1:
|
|
4
|
+
1. Uses existing classifier.py instead of reimplementing
|
|
5
|
+
2. Marks fusion differences instead of filtering them out
|
|
6
|
+
3. Provides detailed fusion analysis
|
|
7
|
+
4. Handles sequence alignment when platforms have different kernel counts
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
import orjson
|
|
14
|
+
|
|
15
|
+
from .classifier import classify
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class FusionDifference:
    """A fusion difference between platforms."""

    # "AMD" or "NVIDIA" — the platform that launched the extra kernel.
    platform_with_kernel: str
    kernel_name: str
    operation_type: str
    # Index of this kernel within its graph's kernel list.
    position: int
    # Best guess of where this work went on the other platform.
    likely_fused_into: str | None = None
|
|
29
|
+
@dataclass
class KernelMatch:
    """A matched pair of kernels, or a fusion difference."""

    graph_index: int
    # Index into the graph's AMD kernel list; None if this is NVIDIA-only.
    amd_position: int | None
    # Index into the graph's NVIDIA kernel list; None if this is AMD-only.
    nvidia_position: int | None

    # Raw trace events; one side is None for a fusion difference.
    amd_kernel: dict[str, Any] | None
    nvidia_kernel: dict[str, Any] | None

    operation_type: str
    # 1.0 = perfect, 0.5 = fusion difference
    confidence: float

    # Set when one platform launched a kernel the other fused away.
    is_fusion_difference: bool = False
    fusion_info: FusionDifference | None = None

    amd_correlation: int | None = None
    nvidia_correlation: int | None = None
|
|
51
|
+
@dataclass
class GraphPair:
    """Matched CUDA graph pair with fusion analysis."""

    graph_index: int
    amd_correlation: int
    nvidia_correlation: int

    # Kernel events belonging to each graph, sorted by timestamp.
    amd_kernels: list[dict[str, Any]]
    nvidia_kernels: list[dict[str, Any]]

    # True when either side launched >100 kernels (a transformer layer).
    is_layer: bool
    # One entry per kernel that exists on only one platform in this graph.
    fusion_differences: list[FusionDifference] = field(default_factory=list)
+
|
|
66
|
+
def load_graph_execution_order(trace_path: str) -> list[tuple[int, int]]:
    """Load CUDA graph execution order.

    Returns:
        (timestamp, correlation_id) tuples sorted by timestamp,
        i.e. in execution order.
    """
    with open(trace_path, "rb") as f:
        trace = orjson.loads(f.read())

    graph_launches = []
    for event in trace.get("traceEvents", []):
        if event.get("cat") == "cuda_runtime":
            name = event.get("name", "")
            # Case-insensitive graph-launch check. The previous fallback
            # `"graphLaunch" in name.lower()` could never match: a lowercased
            # string cannot contain the uppercase "L" of the literal.
            if "graphlaunch" in name.lower():
                ts = event.get("ts")
                corr_id = event.get("args", {}).get("correlation")
                if ts is not None and corr_id is not None:
                    graph_launches.append((ts, corr_id))

    graph_launches.sort()
    return graph_launches
|
|
85
|
+
def load_kernels_for_correlation(
    trace_path: str, correlation_id: int, platform: str
) -> list[dict[str, Any]]:
    """Load all kernels for a correlation, keeping ALL kernels including fusion differences.

    Unlike the v1 matcher, nothing is filtered out here: one-sided kernels
    are retained so fusion differences can be detected downstream.

    NOTE(review): ``platform`` is currently unused; presumably kept for
    signature parity with the v1 loader — confirm before removing.
    """
    with open(trace_path, "rb") as f:
        trace = orjson.loads(f.read())

    collected: list[dict[str, Any]] = []
    for event in trace.get("traceEvents", []):
        if event.get("cat") != "kernel":
            continue
        args = event.get("args", {})
        if args.get("correlation") != correlation_id:
            continue
        collected.append({
            "name": event.get("name", ""),
            "ts": event.get("ts"),
            "dur": event.get("dur", 0),
            "correlation": correlation_id,
            "args": args,
        })

    collected.sort(key=lambda k: k["ts"])
    return collected
|
|
109
|
+
def align_sequences_with_fusion(
    amd_kernels: list[dict[str, Any]],
    nvidia_kernels: list[dict[str, Any]],
    platform_amd: str = "AMD",
    platform_nvidia: str = "NVIDIA",
) -> list[tuple[int | None, int | None, str]]:
    """Align two kernel sequences, identifying fusion differences.

    Returns:
        List of (amd_index, nvidia_index, alignment_type) where:
        - alignment_type is "match", "amd_only", or "nvidia_only"
    """
    # Classify every kernel up front.
    amd_ops = [classify(k["name"], platform_amd)[0].value for k in amd_kernels]
    nvidia_ops = [classify(k["name"], platform_nvidia)[0].value for k in nvidia_kernels]

    # Simple greedy alignment; for now it handles the common case of AMD
    # emitting extra "reduce_segments" operations that NVIDIA fuses away.
    aligned: list[tuple[int | None, int | None, str]] = []
    a = 0
    n = 0
    amd_len = len(amd_ops)
    nv_len = len(nvidia_ops)

    while a < amd_len or n < nv_len:
        if a >= amd_len:
            # AMD exhausted — everything left is NVIDIA-only.
            aligned.append((None, n, "nvidia_only"))
            n += 1
            continue
        if n >= nv_len:
            # NVIDIA exhausted — everything left is AMD-only.
            aligned.append((a, None, "amd_only"))
            a += 1
            continue
        if amd_ops[a] == nvidia_ops[n]:
            aligned.append((a, n, "match"))
            a += 1
            n += 1
            continue

        # Op types disagree. Known fusion difference: AMD has a
        # reduce_segments kernel that NVIDIA folds into its neighbours.
        if "reduce_segments" in amd_kernels[a]["name"].lower():
            aligned.append((a, None, "amd_only"))
            a += 1
        else:
            # Unknown mismatch — pair them positionally anyway.
            aligned.append((a, n, "match"))
            a += 1
            n += 1

    return aligned
|
|
163
|
+
def _parse_trace(trace_path: str) -> dict[str, Any]:
    """Read and parse a trace JSON file exactly once."""
    with open(trace_path, "rb") as f:
        return orjson.loads(f.read())


def _detect_platform(trace: dict[str, Any]) -> str:
    """Heuristically detect the GPU vendor of a parsed trace.

    AMD/ROCm traces carry a ``roctracer_version`` field and report a warp
    size of 64; everything else is treated as NVIDIA.
    """
    props = trace.get("deviceProperties", [{}])[0]
    if trace.get("roctracer_version") or props.get("warpSize") == 64:
        return "AMD"
    return "NVIDIA"


def _graph_launch_order(trace: dict[str, Any]) -> list[tuple[int, int]]:
    """Extract (timestamp, correlation) graph launches in execution order."""
    launches = []
    for event in trace.get("traceEvents", []):
        if event.get("cat") != "cuda_runtime":
            continue
        # Case-insensitive graph-launch check; the previous fallback compared
        # the mixed-case literal "graphLaunch" against a lowercased name,
        # which could never match.
        if "graphlaunch" not in event.get("name", "").lower():
            continue
        ts = event.get("ts")
        corr_id = event.get("args", {}).get("correlation")
        if ts is not None and corr_id is not None:
            launches.append((ts, corr_id))
    return sorted(launches)


def _group_kernels_by_correlation(trace: dict[str, Any]) -> dict[int, list[dict[str, Any]]]:
    """Group ALL kernel events by correlation ID, each group timestamp-sorted.

    Nothing is filtered: one-sided kernels must survive so fusion
    differences can be detected.
    """
    grouped: dict[int, list[dict[str, Any]]] = {}
    for event in trace.get("traceEvents", []):
        if event.get("cat") != "kernel":
            continue
        args = event.get("args", {})
        corr_id = args.get("correlation")
        if corr_id is None:
            continue
        grouped.setdefault(corr_id, []).append({
            "name": event.get("name", ""),
            "ts": event.get("ts"),
            "dur": event.get("dur", 0),
            "correlation": corr_id,
            "args": args,
        })
    for kernels in grouped.values():
        kernels.sort(key=lambda k: k["ts"])
    return grouped


def _one_sided_match(
    graph_idx: int,
    kernel: dict[str, Any],
    position: int,
    platform_with_kernel: str,
    fusing_platform: str,
    operation_type: str,
    amd_corr: int,
    nv_corr: int,
) -> tuple[FusionDifference, KernelMatch]:
    """Build the FusionDifference + KernelMatch for a one-platform-only kernel."""
    fusion_diff = FusionDifference(
        platform_with_kernel=platform_with_kernel,
        kernel_name=kernel["name"],
        operation_type=operation_type,
        position=position,
        likely_fused_into=f"adjacent operation ({fusing_platform} fuses)",
    )
    on_amd = platform_with_kernel == "AMD"
    match = KernelMatch(
        graph_index=graph_idx,
        amd_position=position if on_amd else None,
        nvidia_position=None if on_amd else position,
        amd_kernel=kernel if on_amd else None,
        nvidia_kernel=None if on_amd else kernel,
        operation_type=operation_type,
        confidence=0.5,
        is_fusion_difference=True,
        fusion_info=fusion_diff,
        amd_correlation=amd_corr,
        nvidia_correlation=nv_corr,
    )
    return fusion_diff, match


def match_traces(
    amd_trace_path: str,
    nvidia_trace_path: str,
) -> tuple[list[GraphPair], list[KernelMatch]]:
    """Match traces with fusion difference detection.

    Args:
        amd_trace_path: Path to the (expected) AMD trace JSON.
        nvidia_trace_path: Path to the (expected) NVIDIA trace JSON.

    Returns:
        Tuple of (graph_pairs, kernel_matches); kernels present on only one
        platform are kept as fusion-difference matches rather than dropped.

    Raises:
        ValueError: If the traces contain different numbers of graph launches.
    """
    # Parse each trace exactly once. (Previously each trace was re-read and
    # re-parsed for the graph order, again for platform detection, and once
    # more per graph for its kernels — O(num_graphs) full parses per trace.)
    amd_trace = _parse_trace(amd_trace_path)
    nvidia_trace = _parse_trace(nvidia_trace_path)

    amd_graphs = _graph_launch_order(amd_trace)
    nvidia_graphs = _graph_launch_order(nvidia_trace)

    if len(amd_graphs) != len(nvidia_graphs):
        raise ValueError(
            f"Graph count mismatch: AMD={len(amd_graphs)}, NVIDIA={len(nvidia_graphs)}"
        )

    platform_amd = _detect_platform(amd_trace)
    platform_nvidia = _detect_platform(nvidia_trace)

    amd_by_corr = _group_kernels_by_correlation(amd_trace)
    nvidia_by_corr = _group_kernels_by_correlation(nvidia_trace)

    graph_pairs: list[GraphPair] = []
    kernel_matches: list[KernelMatch] = []

    for graph_idx, ((_amd_ts, amd_corr), (_nv_ts, nv_corr)) in enumerate(
        zip(amd_graphs, nvidia_graphs)
    ):
        amd_kernels = amd_by_corr.get(amd_corr, [])
        nvidia_kernels = nvidia_by_corr.get(nv_corr, [])

        # Heuristic: a transformer layer launches >100 kernels.
        is_layer = len(amd_kernels) > 100 or len(nvidia_kernels) > 100

        alignments = align_sequences_with_fusion(
            amd_kernels, nvidia_kernels, platform_amd, platform_nvidia
        )

        fusion_diffs: list[FusionDifference] = []

        for amd_i, nv_i, align_type in alignments:
            if align_type == "match":
                amd_k = amd_kernels[amd_i]
                nv_k = nvidia_kernels[nv_i]
                amd_op, _ = classify(amd_k["name"], platform_amd)
                nv_op, _ = classify(nv_k["name"], platform_nvidia)
                kernel_matches.append(KernelMatch(
                    graph_index=graph_idx,
                    amd_position=amd_i,
                    nvidia_position=nv_i,
                    amd_kernel=amd_k,
                    nvidia_kernel=nv_k,
                    operation_type=amd_op.value,
                    # 0.8: a positional match is still trusted even when the
                    # classified op types disagree.
                    confidence=1.0 if amd_op == nv_op else 0.8,
                    amd_correlation=amd_corr,
                    nvidia_correlation=nv_corr,
                ))
            elif align_type == "amd_only":
                # AMD launched this kernel; NVIDIA fused it away.
                amd_k = amd_kernels[amd_i]
                amd_op, _ = classify(amd_k["name"], platform_amd)
                fusion_diff, match = _one_sided_match(
                    graph_idx, amd_k, amd_i, "AMD", "NVIDIA",
                    amd_op.value, amd_corr, nv_corr,
                )
                fusion_diffs.append(fusion_diff)
                kernel_matches.append(match)
            elif align_type == "nvidia_only":
                # NVIDIA launched this kernel; AMD fused it away.
                nv_k = nvidia_kernels[nv_i]
                nv_op, _ = classify(nv_k["name"], platform_nvidia)
                fusion_diff, match = _one_sided_match(
                    graph_idx, nv_k, nv_i, "NVIDIA", "AMD",
                    nv_op.value, amd_corr, nv_corr,
                )
                fusion_diffs.append(fusion_diff)
                kernel_matches.append(match)

        graph_pairs.append(GraphPair(
            graph_index=graph_idx,
            amd_correlation=amd_corr,
            nvidia_correlation=nv_corr,
            amd_kernels=amd_kernels,
            nvidia_kernels=nvidia_kernels,
            is_layer=is_layer,
            fusion_differences=fusion_diffs,
        ))

    return graph_pairs, kernel_matches
|
|
306
|
+
def get_matching_statistics(kernel_matches: list[KernelMatch]) -> dict[str, Any]:
    """Calculate statistics including fusion analysis.

    Returns a dict with total/perfect/fuzzy counts, the number of fusion
    differences, the perfect-match rate, and a per-operation breakdown.
    """
    from collections import defaultdict

    total = len(kernel_matches)
    by_operation: dict[str, dict[str, int]] = defaultdict(
        lambda: {"total": 0, "perfect": 0, "fusion": 0}
    )

    # Single pass: overall counts and per-operation breakdown together.
    perfect = 0
    fusion_diffs = 0
    for match in kernel_matches:
        bucket = by_operation[match.operation_type]
        bucket["total"] += 1
        if match.confidence == 1.0:
            perfect += 1
            bucket["perfect"] += 1
        if match.is_fusion_difference:
            fusion_diffs += 1
            bucket["fusion"] += 1

    return {
        "total_matches": total,
        "perfect_matches": perfect,
        "fuzzy_matches": total - perfect - fusion_diffs,
        "fusion_differences": fusion_diffs,
        "match_rate": perfect / total if total > 0 else 0.0,
        "by_operation": dict(by_operation),
    }
|
@@ -722,7 +722,15 @@ class App:
|
|
|
722
722
|
"""Send message through update, execute resulting command."""
|
|
723
723
|
if not self._running:
|
|
724
724
|
return
|
|
725
|
+
old_model = self._model
|
|
725
726
|
self._model, cmd = self._update_fn(self._model, msg)
|
|
727
|
+
_log(
|
|
728
|
+
"dispatch",
|
|
729
|
+
msg_type=type(msg).__name__,
|
|
730
|
+
msg_data=repr(msg)[:200],
|
|
731
|
+
cmd_kind=cmd._kind,
|
|
732
|
+
model_changed=self._model is not old_model,
|
|
733
|
+
)
|
|
726
734
|
self._execute_cmd(cmd)
|
|
727
735
|
|
|
728
736
|
def _execute_cmd(self, cmd: Cmd) -> None:
|