PyPI - wafer-core - Versions diffs - 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl - Mend

wafer-core 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

wafer_core/lib/trace_compare/PERFORMANCE.md +148 -0
wafer_core/lib/trace_compare/__init__.py +22 -9
wafer_core/lib/trace_compare/aligner.py +376 -0
wafer_core/lib/trace_compare/analyzer.py +558 -159
wafer_core/lib/trace_compare/api.py +225 -0
wafer_core/lib/trace_compare/architecture.py +77 -0
wafer_core/lib/trace_compare/classifier.py +307 -13
wafer_core/lib/trace_compare/fusion_analyzer.py +280 -706
wafer_core/lib/trace_compare/kernel_registry.yaml +349 -0
wafer_core/lib/trace_compare/layer_segmentation.py +114 -0
wafer_core/lib/trace_compare/loader.py +526 -227
wafer_core/lib/trace_compare/same_kernel_analyzer.py +119 -0
wafer_core/lib/trace_compare/warnings.py +99 -0
wafer_core/targets/__init__.py +47 -21
wafer_core/targets/pool.py +181 -0
wafer_core/targets/probe.py +113 -0
wafer_core/targets/providers/__init__.py +46 -0
wafer_core/targets/providers/baremetal.py +72 -0
wafer_core/targets/providers/digitalocean.py +164 -0
wafer_core/targets/providers/runpod.py +250 -0
wafer_core/targets/reconcile.py +90 -0
wafer_core/targets/spec_store.py +200 -0
wafer_core/targets/state_cache.py +150 -0
wafer_core/targets/types.py +141 -0
wafer_core/utils/kernel_utils/targets/config.py +8 -24
{wafer_core-0.1.26.dist-info → wafer_core-0.1.28.dist-info}/METADATA +3 -1
{wafer_core-0.1.26.dist-info → wafer_core-0.1.28.dist-info}/RECORD +28 -10
{wafer_core-0.1.26.dist-info → wafer_core-0.1.28.dist-info}/WHEEL +0 -0

wafer_core/lib/trace_compare/loader.py CHANGED Viewed

@@ -5,233 +5,268 @@ Python call stacks, CPU operator mappings, and layer correlations.
 """
 import bisect
-import json
+import sys
 from collections import defaultdict
+from collections.abc import Callable
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
+try:
+    import ijson
+except ImportError:
+    ijson = None
+import orjson
 import pandas as pd
+from .architecture import detect_architecture
 from .classifier import classify
+from .layer_segmentation import segment_layers_by_architecture
+ProgressCallback = Callable[[str, float], None]
-def extract_layer_mapping(events: list[dict[str, Any]], platform: str) -> dict[int, int]:
-    """Extract correlation ID to layer number mapping.
-    vLLM's execution graph creates large correlation groups for full transformer layers.
-    Each layer's forward pass (norm + attention + FFN) gets grouped under one correlation ID,
-    containing 200-400 kernels depending on batch size and sequence length.
-    We identify layers as correlation groups with many kernels (70+), which filters out
-    individual operations like sampling, logit processing, etc.
+@dataclass
+class SinglePassResult:
+    """Collected data from single-pass event processing."""
+    cpu_op_mapping: dict[int, str] = field(default_factory=dict)
+    python_intervals: list[tuple[int, int, int, int | None, str]] = field(default_factory=list)
+    # Raw python events for lazy python_by_id construction (built on-demand in _get_python_stack_full)
+    python_events_raw: list[dict[str, Any]] = field(default_factory=list)
+    phases: list[dict[str, Any]] = field(default_factory=list)
+    correlation_groups: dict[int, dict[str, Any]] = field(default_factory=lambda: defaultdict(
+        lambda: {"count": 0, "has_attention": False, "has_ffn": False}
+    ))
+    kernel_events: list[dict[str, Any]] = field(default_factory=list)
+    # Lazily built when needed for stack resolution
+    _python_by_id: dict[int, dict[str, Any]] | None = field(default=None)
+    @property
+    def python_by_id(self) -> dict[int, dict[str, Any]]:
+        """Lazily build python_by_id from raw events on first access."""
+        if self._python_by_id is None:
+            self._python_by_id = {}
+            for ev in self.python_events_raw:
+                args = ev.get("args")
+                py_id = args.get("Python id") if args else None
+                if py_id is not None:
+                    self._python_by_id[py_id] = {
+                        "name": ev["name"],
+                        "parent_id": args.get("Python parent id") if args else None,
+                    }
+        return self._python_by_id
+@dataclass
+class LoadedTrace:
+    """Complete trace data loaded once and reused by all analysis functions."""
+    platform: str
+    gpu_name: str
+    device_props: dict[str, Any]
+    df: pd.DataFrame
+    patterns: dict[tuple[str, str], set[str]]
+    layers: dict[int, int]
+    # For fusion/warnings (kept from raw JSON)
+    kernel_events: list[dict[str, Any]]
+    all_events: list[dict[str, Any]]
+    correlation_groups: dict[int, list[dict[str, Any]]]
+    phases: list[dict[str, Any]]  # Phase annotations for alignment
+def _process_events_single_pass(
+    events: list[dict[str, Any]],
+    include_stacks: bool = True,
+) -> SinglePassResult:
+    """Process all events in a single iteration.
+    Optimizations applied:
+    - Cache list.append methods for 2-3x speedup on hot paths
+    - Store raw python events, build python_by_id lazily (only ~48 lookups due to caching)
+    - Local variable caching for frequently accessed attributes
     Args:
         events: List of trace events
-        platform: 'AMD' or 'NVIDIA'
-    Returns:
-        Dict mapping correlation ID to layer number
+        include_stacks: Whether to collect Python stack info (expensive operation).
+                       When False, skips python_function processing entirely.
     """
-    # Group kernels by correlation ID
-    correlation_groups = defaultdict(
+    result = SinglePassResult()
+    correlation_groups: dict[int, dict[str, Any]] = defaultdict(
         lambda: {"count": 0, "has_attention": False, "has_ffn": False}
     )
+    # Cache list.append methods for faster appending (measured 2-3x speedup)
+    kernel_append = result.kernel_events.append
+    python_interval_append = result.python_intervals.append
+    python_raw_append = result.python_events_raw.append
+    phase_append = result.phases.append
+    cpu_op_mapping = result.cpu_op_mapping
     for ev in events:
-        if ev.get("cat") != "kernel":
-            continue
-        corr_id = ev.get("args", {}).get("correlation")
-        if corr_id is None:
-            continue
-        kernel_name = ev.get("name", "").lower()
-        # Track what operations this correlation contains
-        correlation_groups[corr_id]["count"] += 1
-        if "attention" in kernel_name or "fmha" in kernel_name:
-            correlation_groups[corr_id]["has_attention"] = True
-        if any(x in kernel_name for x in ["cijk_", "nvjet", "wvsplitk", "gemm"]):
-            correlation_groups[corr_id]["has_ffn"] = True
-    # Map correlation IDs to layer numbers
-    # Transformer layers have many kernels AND contain both attention and FFN ops
+        cat = ev.get("cat")
+        if cat == "kernel":
+            args = ev.get("args")
+            corr_id = args.get("correlation") if args else None
+            if corr_id is not None:
+                kernel_name = ev.get("name", "").lower()
+                grp = correlation_groups[corr_id]
+                grp["count"] += 1
+                if "attention" in kernel_name or "fmha" in kernel_name:
+                    grp["has_attention"] = True
+                if "cijk_" in kernel_name or "nvjet" in kernel_name or "wvsplitk" in kernel_name or "gemm" in kernel_name:
+                    grp["has_ffn"] = True
+            kernel_append(ev)
+        elif cat == "cpu_op":
+            args = ev.get("args")
+            ext_id = args.get("External id") if args else None
+            if ext_id is not None:
+                cpu_op_mapping[ext_id] = ev.get("name", "")
+        elif cat == "python_function" and include_stacks:
+            # Store raw event for lazy python_by_id construction
+            python_raw_append(ev)
+            # Build interval tuple for binary search
+            args = ev.get("args")
+            py_id = args.get("Python id") if args else None
+            ts = ev["ts"]
+            dur = ev.get("dur", 0)
+            python_interval_append((ts, ts + dur, dur, py_id, ev["name"]))
+        elif cat == "user_annotation":
+            name = ev.get("name", "")
+            if name.startswith("execute_context"):
+                tokens = 0
+                parts = name.split("_")
+                for i, p in enumerate(parts):
+                    if i > 0 and parts[i-1] == "context" and "(" in p and ")" in p:
+                        try:
+                            tokens = int(p.split("(")[1].split(")")[0])
+                            break
+                        except Exception:
+                            pass
+                is_prefill = "generation_0" in name and tokens > 0
+                phase_append({
+                    "type": "prefill" if is_prefill else "decode",
+                    "ts_start": ev["ts"],
+                    "ts_end": ev["ts"] + ev["dur"],
+                })
+    if include_stacks and result.python_intervals:
+        result.python_intervals.sort()
+    result.correlation_groups = dict(correlation_groups)
+    return result
+def _build_layer_mapping(correlation_groups: dict[int, dict[str, Any]]) -> dict[int, int]:
+    """Build layer mapping from correlation groups."""
     correlation_to_layer = {}
     layer_num = 0
     for corr_id in sorted(correlation_groups.keys()):
         group = correlation_groups[corr_id]
-        # Identify complete transformer layers by their characteristics:
-        # - Has attention operations (self-attention or cross-attention)
-        # - Has FFN operations (feed-forward network)
-        # - Has sufficient kernel count (70+): typical transformer block has ~80-100 kernels
-        #   including attention QKV projections, softmax, output projection, FFN layers,
-        #   normalization, and elementwise ops. This threshold filters out:
-        #   - Individual operations (1-10 kernels)
-        #   - Sampling/generation steps (20-40 kernels)
-        #   - Partial layer executions
         is_layer = (
             group["count"] >= 70 and group["has_attention"] and group["has_ffn"]
         )
         if is_layer:
             correlation_to_layer[corr_id] = layer_num
             layer_num += 1
     return correlation_to_layer
-def _build_python_stack_index(
-    events: list[dict[str, Any]],
-) -> tuple[list[tuple[int, int, int, int | None, str]], dict[int, dict[str, Any]]]:
-    """Build Python call stack index for kernels.
-    Args:
-        events: List of trace events
-    Returns:
-        Tuple of (python_intervals, python_by_id)
-    """
-    python_by_id: dict[int, dict[str, Any]] = {}
-    python_intervals: list[tuple[int, int, int, int | None, str]] = []
-    for ev in events:
-        if ev.get("cat") == "python_function":
-            py_id = ev.get("args", {}).get("Python id")
-            name = ev["name"]
-            ts_start = ev["ts"]
-            ts_end = ts_start + ev.get("dur", 0)
-            duration = ev.get("dur", 0)
-            parent_id = ev.get("args", {}).get("Python parent id")
-            python_intervals.append((ts_start, ts_end, duration, py_id, name))
-            if py_id is not None:
-                python_by_id[py_id] = {
-                    "name": name,
-                    "parent_id": parent_id,
-                    "ts_start": ts_start,
-                    "ts_end": ts_end,
-                    "duration": duration,
-                }
-    # Sort by start time for efficient binary search
-    python_intervals.sort()
-    return python_intervals, python_by_id
 def _get_python_stack_full(
     timestamp: int,
     python_intervals: list[tuple[int, int, int, int | None, str]],
     python_by_id: dict[int, dict[str, Any]],
 ) -> tuple[str | None, list[str]]:
     """Get full Python call stack for a kernel launch.
-    Args:
-        timestamp: Kernel launch timestamp
-        python_intervals: Sorted list of Python function intervals
-        python_by_id: Mapping of Python ID to function info
-    Returns:
-        Tuple of (summary_string, full_stack_list)
+    Computes the chain on-demand by walking parent pointers.
+    This is fast because we only call this ~48 times due to cpu_op caching.
     """
-    # Binary search for Python functions active at this timestamp
     idx = bisect.bisect_right(
         python_intervals, (timestamp, float("inf"), float("inf"), None, "")
     )
-    # Find active functions
     active_funcs = []
     for i in range(idx - 1, max(0, idx - 1000), -1):
         ts_start, ts_end, duration, py_id, name = python_intervals[i]
         if ts_start <= timestamp <= ts_end:
             active_funcs.append((duration, py_id, name))
-        if ts_end < timestamp - 1000000:  # 1 second before
+        if ts_end < timestamp - 1000000:
             break
     if not active_funcs:
         return None, []
-    # Get the innermost (most specific) function
     active_funcs.sort()
     leaf_duration, leaf_id, leaf_name = active_funcs[0]
-    # Walk up parent chain to get FULL stack
     full_stack = []
     current_id = leaf_id
-    visited = set()
-    while (
-        current_id is not None
-        and current_id not in visited
-        and current_id in python_by_id
-    ):
+    visited: set[int] = set()
+    while current_id is not None and current_id not in visited and current_id in python_by_id:
         func = python_by_id[current_id]
-        name = func["name"]
-        full_stack.append(name)
+        full_stack.append(func["name"])
         visited.add(current_id)
         current_id = func["parent_id"]
-        # Safety limit: prevent infinite loops from circular parent references
-        # and bound memory usage. 50 frames is deeper than typical Python stacks.
         if len(full_stack) >= 50:
             break
-    # Reverse so it's outermost -> innermost
     full_stack.reverse()
-    # Create summary for text output: show the most informative vLLM/model function
     summary = None
-    vllm_funcs = [
-        f
-        for f in full_stack
-        if any(x in f.lower() for x in ["vllm/", "model", "<eval_with_key>"])
-    ]
+    vllm_funcs = [f for f in full_stack if any(x in f.lower() for x in ["vllm/", "model", "<eval_with_key>"])]
     if vllm_funcs:
-        # Get innermost vLLM function (most specific)
         summary = vllm_funcs[-1]
-        # Check if it's a CUDA graph - add annotation
         if any("torch/cuda/graphs" in f for f in full_stack):
-            # Shorten if too long
             if len(summary) > 45:
-                parts = summary.split("/")[-1]
-                summary = "vllm/..." + parts
+                summary = "vllm/..." + summary.split("/")[-1]
             summary = f"{summary} [CUDA graph]"
         elif len(summary) > 53:
-            parts = summary.split("/")[-1]
-            summary = "vllm/..." + parts
+            summary = "vllm/..." + summary.split("/")[-1]
     else:
-        # Fallback to innermost function
         summary = leaf_name
     return summary, full_stack
 def load_trace(
     file_path: str | Path,
+    include_stacks: bool = True,
+    on_progress: ProgressCallback | None = None,
 ) -> tuple[str, str, dict[str, Any], pd.DataFrame, dict[tuple[str, str], set[str]], dict[int, int]]:
     """Load trace and return platform info, device properties, kernels, patterns, and layer mapping.
     Args:
-        file_path: Path to JSON trace file
-    Returns:
-        Tuple of (platform, gpu_name, device_props, kernel_df, kernel_patterns, layer_mapping)
+        file_path: Path to the trace JSON file
+        include_stacks: Whether to resolve Python call stacks (slower but more info)
+        on_progress: Optional callback for progress updates: (stage_name, progress_fraction)
     """
+    def _progress(stage: str, pct: float) -> None:
+        if on_progress:
+            on_progress(stage, pct)
+    _progress("Reading file", 0.0)
     with open(file_path, "rb") as f:
-        trace = json.load(f)
+        raw = f.read()
+    _progress("Parsing JSON", 0.1)
+    trace = orjson.loads(raw)
     props = trace.get("deviceProperties", [{}])[0]
     is_amd = trace.get("roctracer_version") or props.get("warpSize") == 64
     platform = "AMD" if is_amd else "NVIDIA"
     gpu_name = props.get("name", "MI300X" if is_amd else "Unknown GPU")
-    # Extract relevant device properties
     device_props = {
         "name": gpu_name,
         "compute_capability": f"{props.get('computeMajor', 0)}.{props.get('computeMinor', 0)}",
@@ -241,96 +276,360 @@ def load_trace(
         "max_threads_per_block": props.get("maxThreadsPerBlock", 0),
         "shared_mem_per_block_kb": props.get("sharedMemPerBlock", 0) / 1024,
     }
+    _progress("Processing events", 0.4)
     events = trace.get("traceEvents", [])
-    # Build mapping: external_id -> CPU operator name
-    external_to_cpu = {}
-    for ev in events:
-        if ev.get("cat") == "cpu_op":
-            ext_id = ev.get("args", {}).get("External id")
-            cpu_op_name = ev.get("name", "")
-            if ext_id is not None:
-                external_to_cpu[ext_id] = cpu_op_name
-    # Build Python call stack index for kernels without External IDs
-    python_intervals, python_by_id = _build_python_stack_index(events)
-    # Extract phases
-    phases = []
-    for ev in events:
-        if ev.get("cat") == "user_annotation" and ev.get("name", "").startswith(
-            "execute_context"
-        ):
-            name = ev["name"]
-            # Parse execute_context_X(TOKENS)_generation_Y(Y)
-            # We want the TOKENS from execute_context, not the generation number
-            tokens = 0
-            parts = name.split("_")
-            for i, p in enumerate(parts):
-                # Look for execute_context_X(TOKENS) specifically
-                if i > 0 and parts[i-1] == "context" and "(" in p and ")" in p:
-                    try:
-                        tokens = int(p.split("(")[1].split(")")[0])
-                        break  # Stop after finding context tokens
-                    except Exception:
-                        pass
-            is_prefill = tokens >= 1024 and "generation_0" in name
-            phases.append(
-                {
-                    "type": "prefill" if is_prefill else "decode",
-                    "ts_start": ev["ts"],
-                    "ts_end": ev["ts"] + ev["dur"],
-                }
-            )
-    # Extract layer mapping from correlation IDs
-    layer_mapping = extract_layer_mapping(events, platform)
+    pass_result = _process_events_single_pass(events, include_stacks=include_stacks)
+    _progress("Detecting architecture", 0.6)
+    kernel_names = [ev.get("name", "") for ev in pass_result.kernel_events]
+    architecture, _ = detect_architecture(kernel_names)
+    layer_kernels_dict, layer_warnings = segment_layers_by_architecture(
+        pass_result.kernel_events,
+        architecture,
+    )
+    # Convert layer_kernels_dict (layer_num -> kernels) to correlation_id -> layer_num mapping
+    # Fall back to old method if architecture-based segmentation fails
+    if layer_kernels_dict:
+        layer_mapping: dict[int, int] = {}
+        for layer_num, kernels in layer_kernels_dict.items():
+            for kernel in kernels:
+                corr_id = kernel.get("args", {}).get("correlation")
+                if corr_id is not None:
+                    layer_mapping[corr_id] = layer_num
+    else:
+        # Fallback to correlation-based method if architecture segmentation failed
+        layer_mapping = _build_layer_mapping(pass_result.correlation_groups)
     kernel_data = []
     kernel_patterns: dict[tuple[str, str], set[str]] = defaultdict(set)
-    for ev in events:
-        if ev.get("cat") != "kernel":
-            continue
-        name, dur, ts = ev["name"], ev.get("dur", 0), ev["ts"]
+    sorted_phases = sorted(pass_result.phases, key=lambda p: p["ts_start"])
+    phase_starts = [p["ts_start"] for p in sorted_phases]
+    phase_types = [p["type"] for p in sorted_phases]
+    phase_ends = [p["ts_end"] for p in sorted_phases]
+    def _get_phase_for_timestamp(ts: int) -> str:
+        """Get phase for a timestamp using binary search. O(log n)."""
+        if not phase_starts:
+            return "decode"
+        idx = bisect.bisect_right(phase_starts, ts) - 1
+        if idx >= 0 and phase_starts[idx] <= ts <= phase_ends[idx]:
+            return phase_types[idx]
+        return "decode"
+    cpu_op_cache: dict[str, str | None] = {}
+    _progress("Classifying kernels", 0.7)
+    for ev in pass_result.kernel_events:
+        name_raw = ev["name"]
+        name = sys.intern(name_raw)
+        dur, ts = ev.get("dur", 0), ev["ts"]
         corr_id = ev.get("args", {}).get("correlation")
         ext_id = ev.get("args", {}).get("External id")
-        phase = "decode"
-        for p in phases:
-            if p["ts_start"] <= ts <= p["ts_end"]:
-                phase = p["type"]
-                break
+        phase = _get_phase_for_timestamp(ts)
         op, pattern = classify(name, platform)
         kernel_patterns[(op.value, phase)].add(pattern)
-        # Assign layer number from correlation ID
         layer = layer_mapping.get(corr_id) if corr_id is not None else None
-        # Get CPU operator name from external ID, or fallback to Python stack
-        cpu_op = external_to_cpu.get(ext_id) if ext_id is not None else None
+        cpu_op = pass_result.cpu_op_mapping.get(ext_id) if ext_id is not None else None
+        python_stack: list[str] = []
+        if cpu_op is None and include_stacks:
+            if name in cpu_op_cache:
+                cpu_op = cpu_op_cache[name]
+            else:
+                cpu_op, python_stack = _get_python_stack_full(
+                    ts, pass_result.python_intervals, pass_result.python_by_id
+                )
+                cpu_op_cache[name] = cpu_op
+        kernel_data.append({
+            "name": name,
+            "dur_us": dur,
+            "phase": phase,
+            "op": op.value,
+            "pattern": pattern,
+            "layer": layer,
+            "correlation": corr_id,
+            "cpu_op": cpu_op,
+            "python_stack": python_stack,
+        })
+    _progress("Building DataFrame", 0.95)
+    df = pd.DataFrame(kernel_data)
+    _progress("Complete", 1.0)
+    return platform, gpu_name, device_props, df, dict(kernel_patterns), layer_mapping
+def load_trace_full(
+    file_path: str | Path,
+    include_stacks: bool = True,
+    on_progress: ProgressCallback | None = None,
+) -> LoadedTrace:
+    """Load trace once with all data needed by downstream analysis functions.
+    Args:
+        file_path: Path to the trace JSON file
+        include_stacks: Whether to resolve Python call stacks
+        on_progress: Optional callback for progress updates
+    Returns:
+        LoadedTrace with all trace data
+    """
+    def _progress(stage: str, pct: float) -> None:
+        if on_progress:
+            on_progress(stage, pct)
+    _progress("Reading file", 0.0)
+    with open(file_path, "rb") as f:
+        raw = f.read()
+    _progress("Parsing JSON", 0.1)
+    trace = orjson.loads(raw)
+    all_events = trace.get("traceEvents", [])
+    props = trace.get("deviceProperties", [{}])[0]
+    is_amd = trace.get("roctracer_version") or props.get("warpSize") == 64
+    platform = "AMD" if is_amd else "NVIDIA"
+    gpu_name = props.get("name", "MI300X" if is_amd else "Unknown GPU")
+    device_props = {
+        "name": gpu_name,
+        "compute_capability": f"{props.get('computeMajor', 0)}.{props.get('computeMinor', 0)}",
+        "total_memory_gb": props.get("totalGlobalMem", 0) / (1024**3),
+        "sm_count": props.get("numSms", 0),
+        "warp_size": props.get("warpSize", 32),
+        "max_threads_per_block": props.get("maxThreadsPerBlock", 0),
+        "shared_mem_per_block_kb": props.get("sharedMemPerBlock", 0) / 1024,
+    }
+    _progress("Processing events", 0.4)
+    pass_result = _process_events_single_pass(all_events, include_stacks=include_stacks)
+    _progress("Detecting architecture", 0.6)
+    kernel_names = [ev.get("name", "") for ev in pass_result.kernel_events]
+    architecture, _ = detect_architecture(kernel_names)
+    layer_kernels_dict, layer_warnings = segment_layers_by_architecture(
+        pass_result.kernel_events,
+        architecture,
+    )
+    # Convert layer_kernels_dict (layer_num -> kernels) to correlation_id -> layer_num mapping
+    if layer_kernels_dict:
+        layer_mapping: dict[int, int] = {}
+        for layer_num, kernels in layer_kernels_dict.items():
+            for kernel in kernels:
+                corr_id = kernel.get("args", {}).get("correlation")
+                if corr_id is not None:
+                    layer_mapping[corr_id] = layer_num
+    else:
+        layer_mapping = _build_layer_mapping(pass_result.correlation_groups)
+    kernel_data = []
+    kernel_patterns: dict[tuple[str, str], set[str]] = defaultdict(set)
+    sorted_phases = sorted(pass_result.phases, key=lambda p: p["ts_start"])
+    phase_starts = [p["ts_start"] for p in sorted_phases]
+    phase_types = [p["type"] for p in sorted_phases]
+    phase_ends = [p["ts_end"] for p in sorted_phases]
+    def _get_phase_for_timestamp(ts: int) -> str:
+        """Get phase for a timestamp using binary search. O(log n)."""
+        if not phase_starts:
+            return "decode"
+        idx = bisect.bisect_right(phase_starts, ts) - 1
+        if idx >= 0 and phase_starts[idx] <= ts <= phase_ends[idx]:
+            return phase_types[idx]
+        return "decode"
+    cpu_op_cache: dict[str, str | None] = {}
+    _progress("Classifying kernels", 0.7)
+    for ev in pass_result.kernel_events:
+        name_raw = ev["name"]
+        name = sys.intern(name_raw)
+        dur, ts = ev.get("dur", 0), ev["ts"]
+        corr_id = ev.get("args", {}).get("correlation")
+        ext_id = ev.get("args", {}).get("External id")
+        phase = _get_phase_for_timestamp(ts)
+        op, pattern = classify(name, platform)
+        kernel_patterns[(op.value, phase)].add(pattern)
+        layer = layer_mapping.get(corr_id) if corr_id is not None else None
+        cpu_op = pass_result.cpu_op_mapping.get(ext_id) if ext_id is not None else None
         python_stack: list[str] = []
+        if cpu_op is None and include_stacks:
+            if name in cpu_op_cache:
+                cpu_op = cpu_op_cache[name]
+            else:
+                cpu_op, python_stack = _get_python_stack_full(
+                    ts, pass_result.python_intervals, pass_result.python_by_id
+                )
+                cpu_op_cache[name] = cpu_op
+        kernel_data.append({
+            "name": name,
+            "dur_us": dur,
+            "phase": phase,
+            "op": op.value,
+            "pattern": pattern,
+            "layer": layer,
+            "correlation": corr_id,
+            "cpu_op": cpu_op,
+            "python_stack": python_stack,
+        })
+    _progress("Building DataFrame", 0.95)
+    df = pd.DataFrame(kernel_data)
+    kernel_events = pass_result.kernel_events
+    correlation_groups: dict[int, list[dict[str, Any]]] = defaultdict(list)
+    for ev in kernel_events:
+        corr_id = ev.get("args", {}).get("correlation")
+        if corr_id is not None:
+            correlation_groups[corr_id].append(ev)
+    _progress("Complete", 1.0)
+    return LoadedTrace(
+        platform=platform,
+        gpu_name=gpu_name,
+        device_props=device_props,
+        df=df,
+        patterns=dict(kernel_patterns),
+        layers=layer_mapping,
+        kernel_events=kernel_events,
+        all_events=all_events,
+        correlation_groups=dict(correlation_groups),
+        phases=pass_result.phases,
+    )
-        # If no CPU op via External ID, try Python stack trace
-        if cpu_op is None:
-            cpu_op, python_stack = _get_python_stack_full(
-                ts, python_intervals, python_by_id
-            )
+@dataclass
+class StreamingMetadata:
+    """Early metadata available before full trace processing."""
+    platform: str
+    gpu_name: str
+    device_props: dict[str, Any]
+    file_size_mb: float
-        kernel_data.append(
-            {
-                "name": name,
-                "dur_us": dur,
-                "phase": phase,
-                "op": op.value,
-                "pattern": pattern,
-                "layer": layer,
-                "correlation": corr_id,
-                "cpu_op": cpu_op,
-                "python_stack": python_stack,  # Full stack for JSON output
-            }
+def _extract_metadata_fast(file_path: Path) -> StreamingMetadata:
+    """Extract trace metadata instantly using streaming parser (~2ms).
+    Uses ijson to read only the deviceProperties section without
+    parsing the entire file. Falls back to full parse if ijson unavailable.
+    """
+    file_size_mb = file_path.stat().st_size / (1024 * 1024)
+    platform = "Unknown"
+    gpu_name = "Unknown GPU"
+    device_props: dict[str, Any] = {}
+    if ijson is None:
+        # Fallback: parse full file (slower but works)
+        with open(file_path, "rb") as f:
+            trace = orjson.loads(f.read())
+        props = trace.get("deviceProperties", [{}])[0]
+        is_amd = trace.get("roctracer_version") or props.get("warpSize") == 64
+        platform = "AMD" if is_amd else "NVIDIA"
+        gpu_name = props.get("name", "MI300X" if is_amd else "Unknown GPU")
+        device_props = {
+            "name": gpu_name,
+            "compute_capability": f"{props.get('computeMajor', 0)}.{props.get('computeMinor', 0)}",
+            "total_memory_gb": props.get("totalGlobalMem", 0) / (1024**3),
+            "sm_count": props.get("numSms", 0),
+            "warp_size": props.get("warpSize", 32),
+            "max_threads_per_block": props.get("maxThreadsPerBlock", 0),
+            "shared_mem_per_block_kb": props.get("sharedMemPerBlock", 0) / 1024,
+        }
+        return StreamingMetadata(
+            platform=platform,
+            gpu_name=gpu_name,
+            device_props=device_props,
+            file_size_mb=file_size_mb,
         )
+    with open(file_path, "rb") as f:
+        parser = ijson.parse(f)
+        for prefix, event, value in parser:
+            if prefix == "deviceProperties.item.name":
+                gpu_name = value
+            elif prefix == "deviceProperties.item.warpSize":
+                platform = "AMD" if value == 64 else "NVIDIA"
+            elif prefix == "deviceProperties.item.computeMajor":
+                device_props["compute_major"] = value
+            elif prefix == "deviceProperties.item.computeMinor":
+                device_props["compute_minor"] = value
+            elif prefix == "deviceProperties.item.totalGlobalMem":
+                device_props["total_memory_gb"] = value / (1024**3)
+            elif prefix == "deviceProperties.item.numSms":
+                device_props["sm_count"] = value
+            elif prefix == "deviceProperties.item.maxThreadsPerBlock":
+                device_props["max_threads_per_block"] = value
+            elif prefix == "deviceProperties.item.sharedMemPerBlock":
+                device_props["shared_mem_per_block_kb"] = value / 1024
+            elif prefix == "traceEvents.item":
+                # Hit first event, stop - we only need metadata
+                break
+    # Fallback platform detection
+    if platform == "Unknown":
+        platform = "AMD" if "MI" in gpu_name or "Instinct" in gpu_name else "NVIDIA"
+    device_props["name"] = gpu_name
+    device_props["compute_capability"] = f"{device_props.get('compute_major', 0)}.{device_props.get('compute_minor', 0)}"
+    device_props["warp_size"] = 64 if platform == "AMD" else 32
+    return StreamingMetadata(
+        platform=platform,
+        gpu_name=gpu_name,
+        device_props=device_props,
+        file_size_mb=file_size_mb,
+    )
-    return platform, gpu_name, device_props, pd.DataFrame(kernel_data), dict(kernel_patterns), layer_mapping
+def load_trace_streaming(
+    file_path: str | Path,
+    include_stacks: bool = True,
+    on_metadata: Callable[[StreamingMetadata], None] | None = None,
+    on_progress: ProgressCallback | None = None,
+) -> tuple[str, str, dict[str, Any], pd.DataFrame, dict[tuple[str, str], set[str]], dict[int, int]]:
+    """Load trace with instant metadata feedback.
+    Hybrid approach:
+    1. Phase 1 (~2ms): Extract metadata with ijson streaming
+    2. Phase 2 (full): Parse with orjson and process
+    The on_metadata callback fires immediately with GPU/platform info,
+    allowing the UI to show feedback while the full load continues.
+    Args:
+        file_path: Path to the trace JSON file
+        include_stacks: Whether to resolve Python call stacks
+        on_metadata: Callback for instant metadata (fires in ~2ms)
+        on_progress: Callback for progress updates during processing
+    """
+    file_path = Path(file_path)
+    # Phase 1: Instant metadata extraction (~2ms)
+    metadata = _extract_metadata_fast(file_path)
+    if on_metadata:
+        on_metadata(metadata)
+    # Phase 2: Full load with orjson (fast) + progress updates
+    return load_trace(
+        file_path,
+        include_stacks=include_stacks,
+        on_progress=on_progress,
+    )

wafer-core 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl

wafer-core 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl