PyPI - wafer-core - Versions diffs - 0.1.45__py3-none-any.whl → 0.1.47__py3-none-any.whl - Mend

wafer-core 0.1.45__py3-none-any.whl → 0.1.47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

wafer_core/lib/trace_compare/__init__.py +12 -0
wafer_core/lib/trace_compare/graph_formatter.py +263 -0
wafer_core/lib/trace_compare/graph_formatter_detailed.py +225 -0
wafer_core/lib/trace_compare/graph_matcher.py +315 -0
wafer_core/lib/trace_compare/graph_matcher_v2.py +332 -0
wafer_core/rollouts/_pytui/app.py +8 -0
wafer_core/rollouts/_pytui/viewport.py +186 -0
wafer_core/rollouts/agents.py +12 -0
wafer_core/rollouts/progress_app.py +434 -148
wafer_core/rollouts/scoring.py +19 -2
{wafer_core-0.1.45.dist-info → wafer_core-0.1.47.dist-info}/METADATA +1 -1
{wafer_core-0.1.45.dist-info → wafer_core-0.1.47.dist-info}/RECORD +13 -8
{wafer_core-0.1.45.dist-info → wafer_core-0.1.47.dist-info}/WHEEL +0 -0

wafer_core/lib/trace_compare/__init__.py CHANGED Viewed

@@ -5,6 +5,8 @@ identifying kernel-level performance differences and fusion opportunities.
 """
 from .analyzer import analyze_traces
+# from .api import analyze_trace_pair  # TODO: api.py has unimplemented dependencies
+from .architecture import ArchitectureType, detect_architecture
 from .classifier import Op, classify
 from .formatter import (
     format_csv,
@@ -15,6 +17,9 @@ from .formatter import (
     format_text,
 )
 from .fusion_analyzer import analyze_fusion_differences
+from .graph_formatter import format_graph_comparison_json, format_graph_comparison_text
+from .graph_formatter_detailed import format_graph_comparison_detailed
+from .graph_matcher import match_traces
 from .loader import load_trace
 __all__ = [
@@ -22,6 +27,9 @@ __all__ = [
     "classify",
     "load_trace",
     "analyze_traces",
+    # "analyze_trace_pair",  # TODO: not yet implemented
+    "detect_architecture",
+    "ArchitectureType",
     "analyze_fusion_differences",
     "format_text",
     "format_csv",
@@ -29,4 +37,8 @@ __all__ = [
     "format_fusion_text",
     "format_fusion_csv",
     "format_fusion_json",
+    "match_traces",
+    "format_graph_comparison_text",
+    "format_graph_comparison_json",
+    "format_graph_comparison_detailed",
 ]

wafer_core/lib/trace_compare/graph_formatter.py ADDED Viewed

@@ -0,0 +1,263 @@
+"""Graph-based pattern-focused formatter for trace comparison.
+Presents results grouped by CUDA graph execution patterns, reducing cognitive load
+by showing 1-7 unique patterns instead of thousands of individual kernel executions.
+"""
+from collections import Counter, defaultdict
+from typing import Any
+def format_graph_comparison_text(result: dict[str, Any], show_all: bool = False) -> str:
+    """Format graph matching results as pattern-focused text report.
+    Builds five sections from ``result['summary']`` and ``result['graph_pairs']``:
+    overview, CUDA graph pattern counts, per-platform pattern details, a kernel
+    type comparison for the most frequent pattern, and aggregate statistics.
+    Args:
+        result: Results from graph_matcher.match_traces()
+        show_all: Show all patterns without truncation
+    Returns:
+        Formatted text report with pattern-focused UX
+    Raises:
+        KeyError: If ``result``, its summary, or a graph pair lacks a key read here.
+    """
+    lines = []
+    summary = result['summary']
+    graph_pairs = result['graph_pairs']
+    # Header
+    lines.append("=" * 80)
+    lines.append("TRACE COMPARISON - GRAPH-BASED ANALYSIS")
+    lines.append("=" * 80)
+    lines.append("")
+    # Overview section
+    lines.append("┌" + "─" * 78 + "┐")
+    lines.append("│ OVERVIEW" + " " * 69 + "│")
+    lines.append("├" + "─" * 78 + "┤")
+    # Calculate total time from graph pairs (summed 'dur' divided by 1000 for the ms figure)
+    amd_total_ms = sum(
+        sum(k.get('dur', 0) for k in pair['amd_kernels'])
+        for pair in graph_pairs
+    ) / 1000
+    nv_total_ms = sum(
+        sum(k.get('dur', 0) for k in pair['nv_kernels'])
+        for pair in graph_pairs
+    ) / 1000
+    amd_kernels = summary['total_kernel_pairs']  # Actually shows unique positions
+    nv_kernels = summary['total_kernel_pairs']
+    lines.append(f"│ AMD:     {amd_kernels:>6,} kernel positions  {amd_total_ms:>8.1f}ms" + " " * 25 + "│")
+    lines.append(f"│ NVIDIA:  {nv_kernels:>6,} kernel positions  {nv_total_ms:>8.1f}ms" + " " * 25 + "│")
+    lines.append("│" + " " * 78 + "│")
+    lines.append(f"│ Match rate: {summary['match_rate']:.1f}%  "
+                 f"({summary['matched']:,} matched, "
+                 f"{summary['amd_only']:,} AMD-only, "
+                 f"{summary['nv_only']:,} NV-only)" + " " * 5 + "│")
+    lines.append("└" + "─" * 78 + "┘")
+    lines.append("")
+    # CUDA Graph Patterns section
+    lines.append("┌" + "─" * 78 + "┐")
+    lines.append("│ CUDA GRAPH PATTERNS (Transformer Layers)" + " " * 36 + "│")
+    lines.append("├" + "─" * 78 + "┤")
+    lines.append(f"│ Total graph executions:  {summary['num_graph_pairs']:,}" + " " * 50 + "│")
+    # Group patterns by kernel sequence
+    amd_patterns = _group_by_pattern(graph_pairs, 'amd')
+    nv_patterns = _group_by_pattern(graph_pairs, 'nv')
+    lines.append(f"│ Unique AMD patterns:     {len(amd_patterns)}" + " " * 50 + "│")
+    lines.append(f"│ Unique NVIDIA patterns:  {len(nv_patterns)}" + " " * 50 + "│")
+    lines.append("└" + "─" * 78 + "┘")
+    lines.append("")
+    # Pattern Details
+    lines.append("=" * 80)
+    lines.append("PATTERN DETAILS")
+    lines.append("=" * 80)
+    lines.append("")
+    lines.append("AMD Patterns:")
+    lines.extend(_describe_patterns(amd_patterns, 'amd_kernels', show_all))
+    lines.append("")
+    lines.append("NVIDIA Patterns:")
+    lines.extend(_describe_patterns(nv_patterns, 'nv_kernels', show_all))
+    lines.append("")
+    # Both main-pattern sections are skipped when either platform has no patterns
+    have_main = bool(amd_patterns and nv_patterns)
+    if have_main:
+        # Patterns are ordered most-frequent first, so the first entry is the main one
+        amd_main_executions = next(iter(amd_patterns.values()))
+        nv_main_executions = next(iter(nv_patterns.values()))
+    # Drilling down into main pattern
+    lines.append("=" * 80)
+    lines.append("MAIN PATTERN COMPARISON (Pattern 1)")
+    lines.append("=" * 80)
+    lines.append("")
+    if have_main:
+        # Kernel type distribution for both platforms is read from the match
+        # list of the first execution of the main AMD pattern
+        amd_types = Counter()
+        nv_types = Counter()
+        for match in amd_main_executions[0]['matches']:
+            if match['status'] in ['MATCH', 'AMD_ONLY']:
+                amd_types[match['amd_type']] += 1
+            if match['status'] in ['MATCH', 'NV_ONLY']:
+                nv_types[match['nv_type']] += 1
+        lines.append("Kernel Type Distribution (per execution):")
+        lines.append(f"{'Type':<20} {'AMD':>8} {'NVIDIA':>8} {'Diff':>8}")
+        lines.append("-" * 50)
+        all_types = sorted(set(amd_types.keys()) | set(nv_types.keys()))
+        differences = []
+        for ktype in all_types:
+            amd_count = amd_types.get(ktype, 0)
+            nv_count = nv_types.get(ktype, 0)
+            diff = amd_count - nv_count
+            diff_str = f"+{diff}" if diff > 0 else str(diff) if diff < 0 else "="
+            lines.append(f"{ktype:<20} {amd_count:>8} {nv_count:>8} {diff_str:>8}")
+            if diff != 0:
+                differences.append((ktype, diff))
+        lines.append("")
+        lines.append("-" * 80)
+        lines.append("Key Findings:")
+        if differences:
+            # Sort by absolute difference and report the three largest
+            differences.sort(key=lambda x: abs(x[1]), reverse=True)
+            for ktype, diff in differences[:3]:
+                total_extra = abs(diff) * len(amd_main_executions)
+                if diff > 0:
+                    lines.append(f"  • AMD runs {diff:+d} extra {ktype} per execution")
+                else:
+                    lines.append(f"  • NVIDIA runs {abs(diff)} extra {ktype} per execution")
+                lines.append(f"    → {total_extra:,} extra operations across all executions")
+        else:
+            lines.append("  • Perfect match - kernel types align exactly!")
+    lines.append("")
+    # Aggregate Statistics
+    lines.append("=" * 80)
+    lines.append("AGGREGATE STATISTICS")
+    lines.append("=" * 80)
+    lines.append("")
+    if have_main:
+        amd_main_count = len(amd_main_executions)
+        nv_main_count = len(nv_main_executions)
+        lines.append(f"Main Pattern (appears {min(amd_main_count, nv_main_count)}x on both platforms):")
+        amd_kernels_per = len(amd_main_executions[0]['amd_kernels'])
+        nv_kernels_per = len(nv_main_executions[0]['nv_kernels'])
+        lines.append(f"  AMD:    {amd_kernels_per} kernels × {amd_main_count} executions = {amd_kernels_per * amd_main_count:,} total kernels")
+        lines.append(f"  NVIDIA: {nv_kernels_per} kernels × {nv_main_count} executions = {nv_kernels_per * nv_main_count:,} total kernels")
+        # Calculate time for main pattern (same units as the raw 'dur' values)
+        amd_time = sum(
+            sum(k.get('dur', 0) for k in execution['amd_kernels'])
+            for execution in amd_main_executions
+        )
+        nv_time = sum(
+            sum(k.get('dur', 0) for k in execution['nv_kernels'])
+            for execution in nv_main_executions
+        )
+        lines.append("")
+        lines.append("  Total time in main pattern:")
+        lines.append(f"    AMD:    {amd_time/1000:.1f}ms ({_percent_of_total(amd_time, amd_total_ms):.1f}% of total)")
+        lines.append(f"    NVIDIA: {nv_time/1000:.1f}ms ({_percent_of_total(nv_time, nv_total_ms):.1f}% of total)")
+    lines.append("")
+    return "\n".join(lines)
+def _describe_patterns(
+    patterns: dict[tuple, list[dict[str, Any]]],
+    kernels_key: str,
+    show_all: bool,
+) -> list[str]:
+    """Return the listing lines for one platform's patterns (first 5 unless show_all)."""
+    out = []
+    shown = len(patterns) if show_all else min(5, len(patterns))
+    for i, executions in enumerate(list(patterns.values())[:shown], 1):
+        representative = executions[0][kernels_key]
+        out.append(f"  Pattern {i}: {len(executions):>4} executions ({len(representative)} kernels each)")
+        if i == 1:  # Show detail for main pattern
+            out.append("    First few kernels:")
+            for k in sorted(representative, key=lambda x: x.get('ts', 0))[:5]:
+                out.append(f"      - {k.get('name', '')[:60]}")
+    if not show_all and len(patterns) > 5:
+        out.append(f"  ... ({len(patterns) - 5} more patterns)")
+    return out
+def _percent_of_total(part: float, total_ms: float) -> float:
+    """Return part (raw 'dur' units) as a percentage of total_ms, or 0.0 when the total is zero."""
+    if not total_ms:
+        # Kernels without 'dur' make the total zero; avoid ZeroDivisionError
+        return 0.0
+    return part / total_ms / 10
+def _group_by_pattern(
+    graph_pairs: list[dict[str, Any]],
+    platform: str
+) -> dict[tuple, list[dict[str, Any]]]:
+    """Bucket graph pairs by the ordered kernel names of one platform.
+    Args:
+        graph_pairs: List of graph pair dictionaries
+        platform: 'amd' or 'nv'; selects the '<platform>_kernels' entry of each pair
+    Returns:
+        Plain dict from name-tuple signature to the pairs sharing it,
+        most populated signature first
+    """
+    field = f'{platform}_kernels'
+    buckets: dict[tuple, list[dict[str, Any]]] = defaultdict(list)
+    for entry in graph_pairs:
+        # Order kernels by timestamp (missing 'ts' counts as 0) before naming them
+        ordered = sorted(entry[field], key=lambda kernel: kernel.get('ts', 0))
+        buckets[tuple(kernel.get('name', '') for kernel in ordered)].append(entry)
+    # Stable sort: signatures with equal counts keep first-seen order
+    ranked = sorted(buckets.items(), key=lambda item: len(item[1]), reverse=True)
+    return dict(ranked)
+def format_graph_comparison_json(result: dict[str, Any]) -> str:
+    """Serialize graph matching results to an indented JSON document.
+    Args:
+        result: Results from graph_matcher.match_traces()
+    Returns:
+        JSON string (2-space indentation)
+    """
+    import json
+    serialized = json.dumps(result, indent=2)
+    return serialized

wafer_core/lib/trace_compare/graph_formatter_detailed.py ADDED Viewed

@@ -0,0 +1,225 @@
+"""Detailed kernel-to-kernel formatter for graph matching results.
+Shows individual kernel pairs in position order (default) or grouped by operation type.
+"""
+from collections import Counter, defaultdict
+from typing import Any
+def format_graph_comparison_detailed(
+    result: dict[str, Any],
+    show_all: bool = False,
+    group_by_op: bool = False,
+    max_graphs: int = 3,
+) -> str:
+    """Render a kernel-by-kernel text report from graph matching results.
+    Args:
+        result: Results from graph_matcher.match_traces()
+        show_all: Show every pattern and kernel pair without truncation
+        group_by_op: Group kernels by operation type instead of position order
+        max_graphs: Maximum number of patterns to show in detail (default: 3)
+    Returns:
+        Formatted text report with kernel-to-kernel matching
+    """
+    summary = result['summary']
+    graph_pairs = result['graph_pairs']
+    banner = "=" * 80
+    heavy_rule = "━" * 78
+    report = [banner, "TRACE COMPARISON - AMD vs NVIDIA", banner, ""]
+    # Section 1: overview counts
+    total_graphs = summary['num_graph_pairs']
+    amd_kernel_total = sum(len(pair['amd_kernels']) for pair in graph_pairs)
+    nv_kernel_total = sum(len(pair['nv_kernels']) for pair in graph_pairs)
+    report += [
+        "┏" + heavy_rule + "┓",
+        "┃ SECTION 1: OVERVIEW" + " " * 58 + "┃",
+        "┣" + heavy_rule + "┫",
+        f"┃ Transformer layer graphs:  AMD: {total_graphs}  NVIDIA: {total_graphs}" + " " * 27 + "┃",
+        f"┃ Graph pairs to compare:    {total_graphs}" + " " * 50 + "┃",
+        f"┃ Total kernels in graphs:   AMD: {amd_kernel_total}  NVIDIA: {nv_kernel_total}" + " " * 23 + "┃",
+        "┗" + heavy_rule + "┛",
+        "",
+    ]
+    # Section 2: non-graph kernels (fixed placeholder text)
+    report += [
+        "┏" + heavy_rule + "┓",
+        "┃ SECTION 2: NON-GRAPH KERNELS" + " " * 48 + "┃",
+        "┣" + heavy_rule + "┫",
+        "┃ ✓ All kernels are in CUDA graphs" + " " * 44 + "┃",
+        "┗" + heavy_rule + "┛",
+        "",
+    ]
+    # Section 3: unique pattern counts per platform
+    amd_patterns = _group_by_pattern(graph_pairs, 'amd')
+    nv_patterns = _group_by_pattern(graph_pairs, 'nv')
+    report += [
+        "┏" + heavy_rule + "┓",
+        "┃ SECTION 3: CUDA GRAPH PATTERNS (Transformer Layers)" + " " * 25 + "┃",
+        "┣" + heavy_rule + "┫",
+        f"┃ Unique patterns:  AMD: {len(amd_patterns)}, NVIDIA: {len(nv_patterns)}" + " " * 42 + "┃",
+        f"┃ Total executions: AMD: {total_graphs}, NVIDIA: {total_graphs}" + " " * 32 + "┃",
+        "┗" + heavy_rule + "┛",
+        "",
+    ]
+    # Representative patterns
+    report += [banner, f"UNIQUE GRAPH PATTERNS (showing up to {max_graphs})", banner, ""]
+    amd_groups = list(amd_patterns.values())
+    nv_groups = list(nv_patterns.values())
+    limit = len(amd_groups) if show_all else min(max_graphs, len(amd_groups))
+    for number, amd_runs in enumerate(amd_groups[:limit], 1):
+        title = f"Pattern {number}"
+        report.append(f"┌─ {title} " + "─" * (73 - len(title)) + "┐")
+        # NVIDIA pattern is paired by rank, clamped to the last NVIDIA pattern
+        nv_runs = nv_groups[min(number - 1, len(nv_groups) - 1)]
+        amd_per_run = len(amd_runs[0]['amd_kernels'])
+        nv_per_run = len(nv_runs[0]['nv_kernels'])
+        report.append(f"│ AMD:    {len(amd_runs)} executions × {amd_per_run} kernels each" + " " * 35 + "│")
+        report.append(f"│ NVIDIA: {len(nv_runs)} executions × {nv_per_run} kernels each" + " " * 35 + "│")
+        report.append("└" + "─" * 78 + "┘")
+        report.append("")
+        # Match quality comes from the first execution of the AMD pattern
+        matches = amd_runs[0]['matches']
+        statuses = [m['status'] for m in matches]
+        matched = statuses.count('MATCH')
+        amd_only = statuses.count('AMD_ONLY')
+        nv_only = statuses.count('NV_ONLY')
+        mismatch = statuses.count('MISMATCH')
+        if matches:
+            match_rate = matched / len(matches) * 100
+        else:
+            match_rate = 0
+        report.append(f"Match Quality: {match_rate:.1f}%")
+        report.append(f"  ✓ Matched:     {matched}")
+        report.append(f"  ⚠ AMD only:    {amd_only} (fusion differences)")
+        report.append(f"  ⚠ NVIDIA only: {nv_only} (fusion differences)")
+        if mismatch > 0:
+            report.append(f"  ✗ Mismatched:  {mismatch}")
+        report.append("")
+        # Kernel details, either grouped by op type or in position order
+        render = _format_kernels_grouped_by_op if group_by_op else _format_kernels_in_order
+        report.extend(render(matches, show_all))
+        report.append("")
+    return "\n".join(report)
+def _format_kernels_in_order(matches: list[dict[str, Any]], show_all: bool) -> list[str]:
+    """Format kernels in position order.
+    Emits a title, a column header, and one row per match (first 20 unless
+    ``show_all``). A ``[op type]`` label line is inserted before the first row
+    and whenever the effective operation type changes; the effective type is
+    ``amd_type``, or ``nv_type`` when ``amd_type`` is ``'-'``.
+    Args:
+        matches: Match dicts with 'status', 'amd_type', 'nv_type', 'amd_name', 'nv_name'
+        show_all: Show every kernel pair instead of the first 20
+    Returns:
+        Report lines (without trailing newlines)
+    """
+    status_icons = {
+        'MATCH': '✓',
+        'AMD_ONLY': '⚠ AMD',
+        'NV_ONLY': '⚠ NV',
+        'MISMATCH': '✗',
+    }
+    lines = [
+        "Kernel-to-Kernel Comparison (representative execution):",
+        "",
+        f"{'Pos':<4} {'AMD Kernel':<45} {'NVIDIA Kernel':<45} {'Status':<8}",
+        "-" * 110,
+    ]
+    max_pairs = len(matches) if show_all else min(20, len(matches))
+    previous_op = None
+    for idx, match in enumerate(matches[:max_pairs], 1):
+        status_icon = status_icons.get(match['status'], '?')
+        # Compare the same effective type that is printed in the label, so
+        # NV-only rows neither repeat the current label nor hide a type change
+        op_type = match['amd_type'] if match['amd_type'] != '-' else match['nv_type']
+        if idx == 1 or op_type != previous_op:
+            lines.append("")
+            lines.append(f"[{op_type}]")
+        previous_op = op_type
+        # Names are cut to 44 characters to fit the 45-wide columns
+        amd_name = match['amd_name'][:44] if match['amd_name'] != '-' else '-'
+        nv_name = match['nv_name'][:44] if match['nv_name'] != '-' else '-'
+        lines.append(f"{idx:<4} {amd_name:<45} {nv_name:<45} {status_icon:<8}")
+    if not show_all and len(matches) > 20:
+        lines.append(f"     ... ({len(matches) - 20} more kernel pairs)")
+    return lines
+def _format_kernels_grouped_by_op(matches: list[dict[str, Any]], show_all: bool) -> list[str]:
+    """Render kernel pairs bucketed by operation type.
+    Each bucket gets a ruled heading, a column header, and up to 3 rows
+    (all rows when ``show_all``). Rows keep their 1-based position from ``matches``.
+    """
+    icons = {
+        'MATCH': '✓',
+        'AMD_ONLY': '⚠ AMD',
+        'NV_ONLY': '⚠ NV',
+        'MISMATCH': '✗',
+    }
+    column_header = f"{'Pos':<4} {'AMD Kernel':<45} {'NVIDIA Kernel':<45} {'Status':<8}"
+    out = ["Kernel-to-Kernel Comparison (representative execution):", ""]
+    # Bucket by effective op type: amd_type, or nv_type when amd_type is '-'
+    buckets: dict[str, list[tuple[int, dict[str, Any]]]] = defaultdict(list)
+    for position, entry in enumerate(matches, 1):
+        key = entry['nv_type'] if entry['amd_type'] == '-' else entry['amd_type']
+        buckets[key].append((position, entry))
+    # Order buckets by the position of their first member
+    for op_type, members in sorted(buckets.items(), key=lambda item: item[1][0][0]):
+        heading = f"── {op_type} ({len(members)} kernel pairs) "
+        out.append(heading + "─" * (80 - len(heading)))
+        out.append(column_header)
+        out.append("-" * 110)
+        visible = members if show_all else members[:3]
+        for position, entry in visible:
+            icon = icons.get(entry['status'], '?')
+            amd_name = '-' if entry['amd_name'] == '-' else entry['amd_name'][:44]
+            nv_name = '-' if entry['nv_name'] == '-' else entry['nv_name'][:44]
+            out.append(f"{position:<4} {amd_name:<45} {nv_name:<45} {icon:<8}")
+        if not show_all and len(members) > 3:
+            out.append(f"     ... ({len(members) - 3} more {op_type} pairs)")
+        out.append("")
+    return out
+def _group_by_pattern(
+    graph_pairs: list[dict[str, Any]],
+    platform: str
+) -> dict[tuple, list[dict[str, Any]]]:
+    """Collect graph pairs under the ordered kernel-name tuple of one platform.
+    Kernels come from each pair's '<platform>_kernels' entry and are ordered by
+    'ts' (0 when absent). The returned plain dict lists the most common
+    signature first.
+    """
+    source_key = f'{platform}_kernels'
+    by_signature: dict[tuple, list[dict[str, Any]]] = defaultdict(list)
+    for graph_pair in graph_pairs:
+        in_time_order = sorted(graph_pair[source_key], key=lambda k: k.get('ts', 0))
+        names = tuple(k.get('name', '') for k in in_time_order)
+        by_signature[names].append(graph_pair)
+    # Stable descending sort by group size; equal sizes stay in first-seen order
+    most_common_first = sorted(
+        by_signature.items(),
+        key=lambda kv: len(kv[1]),
+        reverse=True,
+    )
+    return dict(most_common_first)

wafer-core 0.1.45__py3-none-any.whl → 0.1.47__py3-none-any.whl

wafer-core 0.1.45__py3-none-any.whl → 0.1.47__py3-none-any.whl