PyPI - wafer-cli - Versions diffs - 0.2.36__py3-none-any.whl → 0.2.37__py3-none-any.whl - Mend

wafer-cli 0.2.36 py3-none-any.whl → 0.2.37 py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

wafer/ncu_analyze.py CHANGED Viewed

@@ -41,6 +41,32 @@ NCU_PATHS = {
 }
+# GPU SM counts for common NVIDIA GPUs (used for underfill detection)
+GPU_SM_COUNTS = {
+    "B200": 148,
+    "H100": 132,
+    "H200": 132,
+    "A100": 108,
+    "A10": 72,
+    "L4": 58,
+    "L40": 142,
+    "V100": 80,
+    "RTX 4090": 128,
+    "RTX 3090": 82,
+}
+def _get_sm_count_for_gpu(gpu_name: str) -> int:
+    """Get SM count for a GPU name. Returns 148 (B200) as default."""
+    if not gpu_name:
+        return 148
+    gpu_upper = gpu_name.upper()
+    for gpu_key, sm_count in GPU_SM_COUNTS.items():
+        if gpu_key.upper() in gpu_upper:
+            return sm_count
+    return 148  # Default to B200
 def _get_platform() -> str:
     """Get normalized platform name."""
     system = platform.system().lower()
@@ -85,197 +111,471 @@ def _get_install_command() -> str:
     return "Download from https://developer.nvidia.com/nsight-compute"
+def _parse_gpu_from_session(session_output: str) -> str:
+    """Parse GPU name from NCU session output."""
+    assert isinstance(session_output, str)
+    for line in session_output.split("\n"):
+        if "display_name" in line:
+            parts = line.split()
+            if len(parts) >= 2:
+                return " ".join(parts[1:])
+    return "Unknown"
+def _create_kernel_entry(kernel_name: str) -> dict:
+    """Create a new kernel metrics dict with default values."""
+    assert kernel_name, "kernel_name must not be empty"
+    return {
+        "name": kernel_name,
+        "duration_us": 0,
+        "duration_ms": 0,
+        "memory_throughput_pct": 0,
+        "compute_throughput_pct": 0,
+        "achieved_occupancy_pct": 0,
+        "theoretical_occupancy_pct": 0,
+        "registers_per_thread": 0,
+        "block_size": 0,
+        "grid_size": 0,
+        "waves_per_sm": 0,
+        "estimated_speedup_pct": 0,
+        "recommendations": [],
+    }
+def _parse_metric_line(kernel: dict, metric_line: str, parts: list[str], current_section: str | None) -> None:
+    """Parse a metric line and update the kernel dict in place."""
+    assert kernel is not None
+    assert parts, "parts must not be empty"
+    # Duration (in us)
+    if metric_line.startswith("Duration") and "us" in metric_line:
+        try:
+            value = float(parts[-1].replace(",", ""))
+            kernel["duration_us"] = value
+            kernel["duration_ms"] = value / 1000
+        except (ValueError, IndexError):
+            pass
+    # Memory Throughput (%)
+    elif "Memory Throughput" in metric_line and "%" in metric_line:
+        try:
+            kernel["memory_throughput_pct"] = float(parts[-1].replace(",", ""))
+        except (ValueError, IndexError):
+            pass
+    # Compute (SM) Throughput (%)
+    elif "Compute (SM) Throughput" in metric_line or "Compute Throughput" in metric_line:
+        try:
+            kernel["compute_throughput_pct"] = float(parts[-1].replace(",", ""))
+        except (ValueError, IndexError):
+            pass
+    # Achieved Occupancy (%)
+    elif "Achieved Occupancy" in metric_line and "%" in metric_line:
+        try:
+            kernel["achieved_occupancy_pct"] = float(parts[-1].replace(",", ""))
+        except (ValueError, IndexError):
+            pass
+    # Registers Per Thread
+    elif "Registers Per Thread" in metric_line:
+        try:
+            kernel["registers_per_thread"] = int(float(parts[-1].replace(",", "")))
+        except (ValueError, IndexError):
+            pass
+    # Block Size (only from Launch Statistics section)
+    elif metric_line.startswith("Block Size") and current_section == "Launch Statistics":
+        try:
+            kernel["block_size"] = int(float(parts[-1].replace(",", "")))
+        except (ValueError, IndexError):
+            pass
+    # Grid Size (only from Launch Statistics section)
+    elif metric_line.startswith("Grid Size") and current_section == "Launch Statistics":
+        try:
+            kernel["grid_size"] = int(float(parts[-1].replace(",", "")))
+        except (ValueError, IndexError):
+            pass
+    # Waves Per SM (key metric for underfill detection)
+    elif "Waves Per SM" in metric_line:
+        try:
+            kernel["waves_per_sm"] = float(parts[-1].replace(",", ""))
+        except (ValueError, IndexError):
+            pass
+    # Theoretical Occupancy (%)
+    elif "Theoretical Occupancy" in metric_line and "%" in metric_line:
+        try:
+            kernel["theoretical_occupancy_pct"] = float(parts[-1].replace(",", ""))
+        except (ValueError, IndexError):
+            pass
+def _extract_speedup(kernel: dict, stripped: str) -> None:
+    """Extract estimated speedup from recommendation line."""
+    import re
+    assert kernel is not None
+    for pattern in [r"Est\. Speedup:\s*([\d.]+)%", r"Est\. Local Speedup:\s*([\d.]+)%"]:
+        match = re.search(pattern, stripped)
+        if match:
+            try:
+                speedup = float(match.group(1))
+                if speedup > kernel["estimated_speedup_pct"]:
+                    kernel["estimated_speedup_pct"] = speedup
+            except ValueError:
+                pass
 def _parse_ncu_output(session_output: str, details_output: str) -> dict:
     """Parse NCU session and details output into structured data."""
     import re
+    assert isinstance(session_output, str)
+    assert isinstance(details_output, str)
     summary: dict = {
-        "gpu": "Unknown",
+        "gpu": _parse_gpu_from_session(session_output) if session_output else "Unknown",
         "kernels": [],
         "recommendations": [],
     }
-    # Parse session output for GPU name
-    if session_output:
-        for line in session_output.split("\n"):
-            if "display_name" in line:
-                parts = line.split()
-                if len(parts) >= 2:
-                    summary["gpu"] = " ".join(parts[1:])
-                break
-    # Parse details output for kernel metrics and recommendations
-    if details_output:
-        lines = details_output.split("\n")
-        current_kernel: dict | None = None
-        current_section: str | None = None
-        in_recommendation = False
-        recommendation_lines: list[str] = []
-        i = 0
-        while i < len(lines):
-            line = lines[i]
-            stripped = line.strip()
-            # Detect kernel header
-            if (
-                line.startswith("  ")
-                and not line.startswith("    ")
-                and "Context" in line
-                and "Device" in line
-            ):
-                match = re.match(r"^  (.+?)\s+\(\d+,\s*\d+,\s*\d+\)x\(\d+,\s*\d+,\s*\d+\)", line)
-                if match:
-                    kernel_name = match.group(1).strip()
-                    current_kernel = {
-                        "name": kernel_name,
-                        "duration_us": 0,
-                        "duration_ms": 0,
-                        "memory_throughput_pct": 0,
-                        "compute_throughput_pct": 0,
-                        "achieved_occupancy_pct": 0,
-                        "registers_per_thread": 0,
-                        "block_size": 0,
-                        "grid_size": 0,
-                        "estimated_speedup_pct": 0,
-                        "recommendations": [],
-                    }
-                    summary["kernels"].append(current_kernel)
-            # Detect section headers
-            if stripped.startswith("Section:"):
-                current_section = stripped.replace("Section:", "").strip()
-            # Parse metrics from table rows
-            if current_kernel and "          " in line:
-                parts = line.split()
-                if len(parts) >= 2:
-                    metric_line = stripped
-                    # Duration (in us)
-                    if metric_line.startswith("Duration") and "us" in metric_line:
-                        try:
-                            value = float(parts[-1].replace(",", ""))
-                            current_kernel["duration_us"] = value
-                            current_kernel["duration_ms"] = value / 1000
-                        except (ValueError, IndexError):
-                            pass
-                    # Memory Throughput (%)
-                    elif "Memory Throughput" in metric_line and "%" in metric_line:
-                        try:
-                            value = float(parts[-1].replace(",", ""))
-                            current_kernel["memory_throughput_pct"] = value
-                        except (ValueError, IndexError):
-                            pass
-                    # Compute (SM) Throughput (%)
-                    elif (
-                        "Compute (SM) Throughput" in metric_line
-                        or "Compute Throughput" in metric_line
-                    ):
-                        try:
-                            value = float(parts[-1].replace(",", ""))
-                            current_kernel["compute_throughput_pct"] = value
-                        except (ValueError, IndexError):
-                            pass
-                    # Achieved Occupancy (%)
-                    elif "Achieved Occupancy" in metric_line and "%" in metric_line:
-                        try:
-                            value = float(parts[-1].replace(",", ""))
-                            current_kernel["achieved_occupancy_pct"] = value
-                        except (ValueError, IndexError):
-                            pass
-                    # Registers Per Thread
-                    elif "Registers Per Thread" in metric_line:
-                        try:
-                            value = int(float(parts[-1].replace(",", "")))
-                            current_kernel["registers_per_thread"] = value
-                        except (ValueError, IndexError):
-                            pass
-                    # Block Size
-                    elif (
-                        metric_line.startswith("Block Size")
-                        and current_section == "Launch Statistics"
-                    ):
-                        try:
-                            value = int(float(parts[-1].replace(",", "")))
-                            current_kernel["block_size"] = value
-                        except (ValueError, IndexError):
-                            pass
-                    # Grid Size
-                    elif (
-                        metric_line.startswith("Grid Size")
-                        and current_section == "Launch Statistics"
-                    ):
-                        try:
-                            value = int(float(parts[-1].replace(",", "")))
-                            current_kernel["grid_size"] = value
-                        except (ValueError, IndexError):
-                            pass
-            # Parse recommendations (OPT and INF markers)
-            if stripped.startswith("OPT") or stripped.startswith("INF"):
-                in_recommendation = True
-                recommendation_lines = [stripped]
-                # Extract estimated speedup
-                if current_kernel and "Est. Speedup:" in stripped:
-                    speedup_match = re.search(r"Est\. Speedup:\s*([\d.]+)%", stripped)
-                    if speedup_match:
-                        try:
-                            speedup = float(speedup_match.group(1))
-                            if speedup > current_kernel["estimated_speedup_pct"]:
-                                current_kernel["estimated_speedup_pct"] = speedup
-                        except ValueError:
-                            pass
-                if current_kernel and "Est. Local Speedup:" in stripped:
-                    speedup_match = re.search(r"Est\. Local Speedup:\s*([\d.]+)%", stripped)
-                    if speedup_match:
-                        try:
-                            speedup = float(speedup_match.group(1))
-                            if speedup > current_kernel["estimated_speedup_pct"]:
-                                current_kernel["estimated_speedup_pct"] = speedup
-                        except ValueError:
-                            pass
-            elif in_recommendation:
-                if line.startswith("          ") and stripped:
-                    recommendation_lines.append(stripped)
-                elif (
-                    stripped.startswith("Section:")
-                    or stripped.startswith("---")
-                    or (stripped and not line.startswith(" "))
-                ):
-                    if recommendation_lines:
-                        full_rec = " ".join(recommendation_lines)
-                        if full_rec not in summary["recommendations"]:
-                            summary["recommendations"].append(full_rec)
-                        if current_kernel and full_rec not in current_kernel["recommendations"]:
-                            current_kernel["recommendations"].append(full_rec)
-                    in_recommendation = False
-                    recommendation_lines = []
-            i += 1
-        # Capture last recommendation if any
-        if recommendation_lines:
-            full_rec = " ".join(recommendation_lines)
-            if full_rec not in summary["recommendations"]:
-                summary["recommendations"].append(full_rec)
-            if current_kernel and full_rec not in current_kernel["recommendations"]:
-                current_kernel["recommendations"].append(full_rec)
+    if not details_output:
+        return summary
+    lines = details_output.split("\n")
+    current_kernel: dict | None = None
+    current_section: str | None = None
+    in_recommendation = False
+    recommendation_lines: list[str] = []
+    for line in lines:
+        stripped = line.strip()
+        # Detect kernel header
+        if line.startswith("  ") and not line.startswith("    ") and "Context" in line and "Device" in line:
+            match = re.match(r"^  (.+?)\s+\(\d+,\s*\d+,\s*\d+\)x\(\d+,\s*\d+,\s*\d+\)", line)
+            if match:
+                current_kernel = _create_kernel_entry(match.group(1).strip())
+                summary["kernels"].append(current_kernel)
+        # Detect section headers
+        if stripped.startswith("Section:"):
+            current_section = stripped.replace("Section:", "").strip()
+        # Parse metrics from table rows
+        if current_kernel and "          " in line:
+            parts = line.split()
+            if len(parts) >= 2:
+                _parse_metric_line(current_kernel, stripped, parts, current_section)
+        # Parse recommendations (OPT and INF markers)
+        if stripped.startswith("OPT") or stripped.startswith("INF"):
+            in_recommendation = True
+            recommendation_lines = [stripped]
+            if current_kernel:
+                _extract_speedup(current_kernel, stripped)
+        elif in_recommendation:
+            if line.startswith("          ") and stripped:
+                recommendation_lines.append(stripped)
+            elif stripped.startswith("Section:") or stripped.startswith("---") or (stripped and not line.startswith(" ")):
+                if recommendation_lines:
+                    full_rec = " ".join(recommendation_lines)
+                    if full_rec not in summary["recommendations"]:
+                        summary["recommendations"].append(full_rec)
+                    if current_kernel and full_rec not in current_kernel["recommendations"]:
+                        current_kernel["recommendations"].append(full_rec)
+                in_recommendation = False
+                recommendation_lines = []
+    # Capture last recommendation if any
+    if recommendation_lines:
+        full_rec = " ".join(recommendation_lines)
+        if full_rec not in summary["recommendations"]:
+            summary["recommendations"].append(full_rec)
+        if current_kernel and full_rec not in current_kernel["recommendations"]:
+            current_kernel["recommendations"].append(full_rec)
     return summary
+def _classify_underfill(
+    waves_per_sm: float, grid_size: int, num_sms: int
+) -> tuple[str | None, str | None]:
+    """Classify underfill type and severity based on metrics.
+    Returns:
+        (underfill_type, severity) where:
+        - underfill_type: "launch" | "resource" | None
+        - severity: "severe" | "moderate" | None
+    """
+    assert waves_per_sm >= 0, f"waves_per_sm must be non-negative, got {waves_per_sm}"
+    assert grid_size >= 0, f"grid_size must be non-negative, got {grid_size}"
+    assert num_sms > 0, f"num_sms must be positive, got {num_sms}"
+    is_grid_small = grid_size > 0 and grid_size < num_sms
+    if waves_per_sm > 0 and waves_per_sm < 1.0:
+        return ("launch" if is_grid_small else "resource", "severe")
+    if waves_per_sm > 0 and waves_per_sm < 2.0:
+        return ("launch" if is_grid_small else "resource", "moderate")
+    if is_grid_small:
+        return ("launch", "severe")
+    return (None, None)
+def _classify_occupancy(
+    achieved_occ: float, theoretical_occ: float
+) -> tuple[bool, str | None]:
+    """Classify occupancy issue.
+    Returns:
+        (is_low_occupancy, analysis_type) where:
+        - is_low_occupancy: True if achieved < 50%
+        - analysis_type: "runtime_issue" | "resource_limited" | None
+    """
+    assert achieved_occ >= 0, f"achieved_occ must be non-negative, got {achieved_occ}"
+    assert theoretical_occ >= 0, f"theoretical_occ must be non-negative, got {theoretical_occ}"
+    if achieved_occ <= 0 or achieved_occ >= 50:
+        return (False, None)
+    if theoretical_occ <= 0:
+        return (True, None)
+    occ_gap = theoretical_occ - achieved_occ
+    if theoretical_occ >= 50 and occ_gap > 20:
+        return (True, "runtime_issue")
+    if theoretical_occ < 50:
+        return (True, "resource_limited")
+    return (True, None)
+def _classify_throughput(
+    memory_tp: float, compute_tp: float, achieved_occ: float
+) -> tuple[bool, bool, bool]:
+    """Classify throughput observations.
+    Returns:
+        (has_high_memory, has_high_compute, has_both_low)
+    """
+    assert memory_tp >= 0, f"memory_tp must be non-negative, got {memory_tp}"
+    assert compute_tp >= 0, f"compute_tp must be non-negative, got {compute_tp}"
+    assert achieved_occ >= 0, f"achieved_occ must be non-negative, got {achieved_occ}"
+    has_high_memory = memory_tp > 60
+    has_high_compute = compute_tp > 60
+    has_both_low = memory_tp < 30 and compute_tp < 30 and achieved_occ >= 50
+    return (has_high_memory, has_high_compute, has_both_low)
+def _format_underfill_diagnosis(
+    underfill_type: str,
+    underfill_severity: str,
+    waves_per_sm: float,
+    grid_size: int,
+    num_sms: int,
+    achieved_occ: float,
+    theoretical_occ: float,
+    compute_tp: float,
+    memory_tp: float,
+    estimated_speedup: float,
+) -> list[str]:
+    """Format diagnosis lines for underfill issues. Returns early from _generate_diagnosis."""
+    assert underfill_type in ("launch", "resource")
+    assert underfill_severity in ("severe", "moderate")
+    severity_label = "UNDERFILL" if underfill_severity == "severe" else "LIMITED CONCURRENCY"
+    blocks_per_sm = grid_size / num_sms if grid_size > 0 else 0
+    lines = [f"**Primary Issue: {severity_label}**"]
+    if waves_per_sm > 0:
+        lines.append(f"- Waves per SM: {waves_per_sm:.2f} (often benefits from >2 to hide latency)")
+    if grid_size > 0:
+        lines.append(f"- Grid: {grid_size} blocks for {num_sms} SMs ({blocks_per_sm:.2f} blocks/SM)")
+    lines.append("- ⚠️ Compute/memory throughput % not reliable for global bottleneck; underfill dominates")
+    lines.append("")
+    if underfill_type == "launch":
+        lines.extend([
+            "**Type: LAUNCH-LIMITED** (grid smaller than SM count)",
+            "",
+            "**What WON'T help:**",
+            "- Reducing registers/shared memory (can't create more blocks than launched)",
+            "",
+            "**What MAY help:**",
+            "- Increase batch size or problem dimensions",
+            "- Split work into more blocks (e.g., tile over batch/head/rows; sequence tiling only if algorithm permits)",
+            "- Use persistent CTAs / work queue: launch ~k×SM blocks that pull tasks",
+            "- If inherently sequential, focus on per-block latency optimization",
+        ])
+    else:
+        lines.extend([
+            "**Type: RESOURCE-LIMITED** (grid is adequate, but few blocks fit per SM)",
+            "",
+            "**What MAY help:**",
+            "- Reduce registers per thread (__launch_bounds__, fewer local vars)",
+            "- Reduce shared memory per block (smaller tiles, multi-stage)",
+            "- Reduce block size to fit more blocks per SM",
+            "- Check 'Block Limit' in NCU Occupancy section for the limiter",
+            "",
+            "**Note:** If kernel is very short, waves/SM may be less indicative.",
+            "Confirm with Occupancy 'Block Limit' and duration metrics.",
+        ])
+    lines.extend(["", "**Raw metrics (interpret with caution due to underfill):**"])
+    lines.append(f"- Achieved Occupancy: {achieved_occ:.1f}%")
+    if theoretical_occ > 0:
+        lines.append(f"- Theoretical Occupancy: {theoretical_occ:.1f}%")
+    lines.append(f"- Compute Throughput: {compute_tp:.1f}%")
+    lines.append(f"- Memory Throughput: {memory_tp:.1f}%")
+    if estimated_speedup > 0:
+        lines.append(f"- NCU Est. Speedup potential: {estimated_speedup:.1f}%")
+    lines.append("")
+    return lines
+def _format_occupancy_diagnosis(
+    achieved_occ: float,
+    theoretical_occ: float,
+    occupancy_analysis: str | None,
+) -> list[str]:
+    """Format diagnosis lines for low occupancy issues."""
+    assert achieved_occ >= 0
+    lines = ["**Observation: Low Achieved Occupancy**", f"- Achieved: {achieved_occ:.1f}%"]
+    if theoretical_occ > 0:
+        lines.append(f"- Theoretical: {theoretical_occ:.1f}%")
+        if occupancy_analysis == "runtime_issue":
+            lines.extend([
+                "",
+                "**Analysis: Large gap between theoretical and achieved**",
+                "- Theoretical is high, so this is NOT a resource limit (regs/shmem)",
+                "- Likely causes: load imbalance, barriers, short kernel duration, tail effects",
+                "- Check if work is evenly distributed across blocks",
+            ])
+        elif occupancy_analysis == "resource_limited":
+            lines.extend([
+                "",
+                "**Analysis: Theoretical occupancy is also low**",
+                "- This IS a resource limit (registers, shared memory, or block size)",
+                "- Check 'Block Limit' in NCU Occupancy section for the specific limiter",
+            ])
+    lines.extend([
+        "",
+        "**General suggestions:**",
+        "- If register-limited: try __launch_bounds__, reduce local arrays",
+        "- If shared-mem-limited: reduce tile sizes or use multi-stage",
+        "- If runtime-limited: check barriers, load balance, kernel duration",
+        "",
+    ])
+    return lines
+def _format_throughput_diagnosis(
+    has_high_memory: bool,
+    has_high_compute: bool,
+    has_both_low: bool,
+    memory_tp: float,
+    compute_tp: float,
+) -> list[str]:
+    """Format diagnosis lines for throughput observations."""
+    lines: list[str] = []
+    if has_high_memory or has_high_compute:
+        lines.append("**Throughput observations:**")
+        if has_high_memory:
+            lines.append(f"- Memory throughput relatively high ({memory_tp:.1f}%)")
+            lines.append("  - May benefit from: better caching, shared memory tiling, coalesced access")
+        if has_high_compute:
+            lines.append(f"- Compute throughput relatively high ({compute_tp:.1f}%)")
+            lines.append("  - May benefit from: reduced instruction count, better ILP")
+            lines.append("  - Check which pipeline is saturated (FP32/FP16/INT/SFU/TensorCore) if available")
+        lines.append("")
+    elif has_both_low:
+        lines.extend([
+            "**Observation: Both % of peak are low**",
+            "- Likely: latency-bound, sync-bound, dependency stalls, or non-peak pipelines",
+            "- This can happen with: integer-heavy, SFU-heavy, or control-flow-heavy kernels",
+            "- Check instruction mix / pipeline utilization metrics if available",
+            "- Check NCU stall reasons (smsp__warp_issue_stalled_*) for more detail",
+            "",
+        ])
+    return lines
+def _generate_diagnosis(kernel: dict, num_sms: int = 148) -> list[str]:
+    """Generate actionable diagnosis based on kernel metrics.
+    Uses a prioritized decision order:
+    1. Underfill check (waves_per_sm < 2 OR grid_size < num_sms) - overrides other diagnoses
+    2. Occupancy limiters (theoretical vs achieved gap analysis)
+    3. General observations (avoid strong "bound" labels without stall data)
+    """
+    assert isinstance(kernel, dict), "kernel must be a dict"
+    assert num_sms > 0, f"num_sms must be positive, got {num_sms}"
+    # Extract metrics (single assignments)
+    grid_size = kernel.get('grid_size', 0)
+    achieved_occ = kernel.get('achieved_occupancy_pct', 0)
+    theoretical_occ = kernel.get('theoretical_occupancy_pct', 0)
+    compute_tp = kernel.get('compute_throughput_pct', 0)
+    memory_tp = kernel.get('memory_throughput_pct', 0)
+    estimated_speedup = kernel.get('estimated_speedup_pct', 0)
+    waves_per_sm = kernel.get('waves_per_sm', 0)
+    # Skip if we don't have enough data
+    if grid_size == 0 and achieved_occ == 0 and waves_per_sm == 0:
+        return []
+    # Compute all classifications upfront (single assignments)
+    underfill_type, underfill_severity = _classify_underfill(waves_per_sm, grid_size, num_sms)
+    is_low_occupancy, occupancy_analysis = _classify_occupancy(achieved_occ, theoretical_occ)
+    has_high_memory, has_high_compute, has_both_low = _classify_throughput(memory_tp, compute_tp, achieved_occ)
+    # Derived flags (single assignments)
+    has_underfill = underfill_type is not None
+    has_throughput_obs = has_high_memory or has_high_compute or has_both_low
+    # Build output
+    lines = ["#### 🔍 Diagnosis", ""]
+    # PRIORITY 1: Underfill (overrides other diagnoses)
+    if has_underfill:
+        lines.extend(_format_underfill_diagnosis(
+            underfill_type, underfill_severity, waves_per_sm, grid_size, num_sms,
+            achieved_occ, theoretical_occ, compute_tp, memory_tp, estimated_speedup,
+        ))
+        return lines
+    # PRIORITY 2: Low occupancy (when NOT caused by underfill)
+    if is_low_occupancy:
+        lines.extend(_format_occupancy_diagnosis(achieved_occ, theoretical_occ, occupancy_analysis))
+    # PRIORITY 3: Throughput observations
+    lines.extend(_format_throughput_diagnosis(has_high_memory, has_high_compute, has_both_low, memory_tp, compute_tp))
+    # Show NCU's own recommendations if present
+    if estimated_speedup > 0:
+        lines.extend([f"**NCU estimated speedup potential: {estimated_speedup:.1f}%**",
+                      "- See NCU recommendations below for specific suggestions", ""])
+    # No major issues detected
+    if not (has_underfill or is_low_occupancy or has_throughput_obs):
+        lines.extend(["**Status: No obvious bottleneck detected**",
+                      f"- Occupancy: {achieved_occ:.1f}%, Compute: {compute_tp:.1f}%, Memory: {memory_tp:.1f}%",
+                      "- Consider profiling with --set full for stall breakdown",
+                      "- Or the kernel may already be well-optimized for its workload", ""])
+    return lines
 def _generate_text_output(filename: str, summary: dict) -> str:
     """Generate human-readable markdown text from summary."""
     timestamp = datetime.now().isoformat()
+    gpu_name = summary.get('gpu', 'Unknown')
+    num_sms = _get_sm_count_for_gpu(gpu_name)
     lines = [
         "# NCU Profiling Analysis",
@@ -283,7 +583,7 @@ def _generate_text_output(filename: str, summary: dict) -> str:
         f"Generated: {timestamp}",
         "",
         "## GPU Information",
-        f"- Device: {summary.get('gpu', 'Unknown')}",
+        f"- Device: {gpu_name}",
         "",
         "## Kernel Summary",
         "",
@@ -301,10 +601,15 @@ def _generate_text_output(filename: str, summary: dict) -> str:
             f"- Grid Size: {kernel.get('grid_size', 0)}",
             "",
         ])
+        # Add actionable diagnosis
+        diagnosis = _generate_diagnosis(kernel, num_sms=num_sms)
+        if diagnosis:
+            lines.extend(diagnosis)
     if summary.get("recommendations"):
         lines.extend([
-            "## Recommendations",
+            "## NCU Recommendations",
             "",
         ])
         for i, rec in enumerate(summary["recommendations"], 1):
@@ -534,6 +839,8 @@ def _analyze_remote_api(
 def _generate_ncu_api_text_output(filename: str, result: dict) -> str:
     """Generate human-readable text from NCU API result."""
     timestamp = datetime.now().isoformat()
+    gpu_name = result.get('gpu', 'Unknown')
+    num_sms = _get_sm_count_for_gpu(gpu_name)
     lines = [
         "# NCU Profiling Analysis",
@@ -542,7 +849,7 @@ def _generate_ncu_api_text_output(filename: str, result: dict) -> str:
         f"Report ID: {result.get('report_id', 'N/A')}",
         "",
         "## GPU Information",
-        f"- Device: {result.get('gpu', 'Unknown')}",
+        f"- Device: {gpu_name}",
         "",
         "## Kernel Summary",
         "",
@@ -556,8 +863,23 @@ def _generate_ncu_api_text_output(filename: str, result: dict) -> str:
             f"- Achieved Occupancy: {kernel.get('achieved_occupancy_pct', kernel.get('occupancy', 0)):.1f}%",
             f"- Compute Throughput: {kernel.get('compute_throughput_pct', kernel.get('sm_throughput', 0)):.1f}%",
             f"- Memory Throughput: {kernel.get('memory_throughput_pct', kernel.get('mem_throughput', 0)):.1f}%",
+            f"- Grid Size: {kernel.get('grid_size', 0)}",
+            f"- Block Size: {kernel.get('block_size', 0)}",
             "",
         ])
+        # Add actionable diagnosis (normalize field names from API)
+        normalized_kernel = {
+            'grid_size': kernel.get('grid_size', 0),
+            'block_size': kernel.get('block_size', 0),
+            'achieved_occupancy_pct': kernel.get('achieved_occupancy_pct', kernel.get('occupancy', 0)),
+            'compute_throughput_pct': kernel.get('compute_throughput_pct', kernel.get('sm_throughput', 0)),
+            'memory_throughput_pct': kernel.get('memory_throughput_pct', kernel.get('mem_throughput', 0)),
+            'registers_per_thread': kernel.get('registers_per_thread', 0),
+        }
+        diagnosis = _generate_diagnosis(normalized_kernel, num_sms=num_sms)
+        if diagnosis:
+            lines.extend(diagnosis)
     # Add source correlation summary if present
     source_data = result.get("source_correlation", [])

wafer/targets.py CHANGED Viewed

@@ -4,7 +4,7 @@ CRUD operations for GPU targets stored in ~/.wafer/targets/.
 """
 import tomllib
-from dataclasses import asdict
+from dataclasses import asdict, fields
 from pathlib import Path
 from typing import Any
@@ -18,6 +18,12 @@ from wafer_core.utils.kernel_utils.targets.config import (
     WorkspaceTarget,
 )
+def _filter_dataclass_fields(data: dict[str, Any], dataclass_type: type) -> dict[str, Any]:
+    """Filter dict to only include fields that exist in the dataclass."""
+    valid_fields = {f.name for f in fields(dataclass_type)}
+    return {k: v for k, v in data.items() if k in valid_fields}
 # Default paths
 WAFER_DIR = Path.home() / ".wafer"
 TARGETS_DIR = WAFER_DIR / "targets"
@@ -64,17 +70,17 @@ def _parse_target(data: dict[str, Any]) -> TargetConfig:
         data_copy["gpu_ids"] = tuple(data_copy["gpu_ids"])
     if target_type == "baremetal":
-        return BaremetalTarget(**data_copy)
+        return BaremetalTarget(**_filter_dataclass_fields(data_copy, BaremetalTarget))
     elif target_type == "vm":
-        return VMTarget(**data_copy)
+        return VMTarget(**_filter_dataclass_fields(data_copy, VMTarget))
     elif target_type == "modal":
-        return ModalTarget(**data_copy)
+        return ModalTarget(**_filter_dataclass_fields(data_copy, ModalTarget))
     elif target_type == "workspace":
-        return WorkspaceTarget(**data_copy)
+        return WorkspaceTarget(**_filter_dataclass_fields(data_copy, WorkspaceTarget))
     elif target_type == "runpod":
-        return RunPodTarget(**data_copy)
+        return RunPodTarget(**_filter_dataclass_fields(data_copy, RunPodTarget))
     elif target_type == "digitalocean":
-        return DigitalOceanTarget(**data_copy)
+        return DigitalOceanTarget(**_filter_dataclass_fields(data_copy, DigitalOceanTarget))
     else:
         raise ValueError(
             f"Unknown target type: {target_type}. Must be baremetal, vm, modal, workspace, runpod, or digitalocean"

{wafer_cli-0.2.36.dist-info → wafer_cli-0.2.37.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wafer-cli
-Version: 0.2.36
+Version: 0.2.37
 Summary: CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown

{wafer_cli-0.2.36.dist-info → wafer_cli-0.2.37.dist-info}/RECORD RENAMED Viewed

@@ -16,7 +16,7 @@ wafer/global_config.py,sha256=fhaR_RU3ufMksDmOohH1OLeQ0JT0SDW1hEip_zaP75k,11345
 wafer/gpu_run.py,sha256=TwqXy72T7f2I7e6n5WWod3xgxCPnDhU0BgLsB4CUoQY,9716
 wafer/inference.py,sha256=tZCO5i05FKY27ewis3CSBHFBeFbXY3xwj0DSjdoMY9s,4314
 wafer/kernel_scope.py,sha256=YtnxknAChkJoeU_vIdxiqWsAITGBeabp9OGIK-X32i0,20796
-wafer/ncu_analyze.py,sha256=rAWzKQRZEY6E_CL3gAWUaW3uZ4kvQVZskVCPDpsFJuE,24633
+wafer/ncu_analyze.py,sha256=8id2eJRuBabxINnUF0M6SQtS1YbAWBM3pzIN8xkxMCE,37139
 wafer/nsys_analyze.py,sha256=AhNcjPaapB0QCbqiHRXvyy-ccjevvVwEyxes84D28JU,36124
 wafer/nsys_profile.py,sha256=QFBl8pkr8r4uRNdNUO9gY-obj9slqpOgVYFZ_sXu6Nw,15478
 wafer/output.py,sha256=8jw5ifvIMK8ldyBMGW4NhrKvJPl66TV2Y2fJ5Tlhh1I,8293
@@ -27,7 +27,7 @@ wafer/rocprof_systems.py,sha256=4IWbMcbYk1x_8iS7P3FC_u5sgH6EXADCtR2lV9id80M,1862
 wafer/specs_cli.py,sha256=frMEKwMflxVNpFlAuxprmr33ZZ1Oeh2lB0KWZ4oZWzw,4360
 wafer/ssh_keys.py,sha256=9kSdhV_dg9T6pQu2JmNQptarkkwGtN9rLyRkI1bW4i4,8094
 wafer/target_lock.py,sha256=SDKhNzv2N7gsphGflcNni9FE5YYuAMuEthngAJEo4Gs,7809
-wafer/targets.py,sha256=9r-iRWoKSH5cQl1LcamaX-T7cNVOg99ngIm_hlRk-qU,26922
+wafer/targets.py,sha256=XeEZeOykNBnjJLnCqpoXAnzeqbp6MWZRIW9A26BKqdU,27469
 wafer/targets_cli.py,sha256=Oe3e02rSXeNrMbe_Qv9DNfQ8dEOKodtU7BbQQWxlNwA,16348
 wafer/targets_ops.py,sha256=jN1oIBx0mutxRNE9xpIc7SaBxPkVmOyus2eqn0kEKNI,21475
 wafer/trace_compare.py,sha256=IBVSGI8u5A10haDzL4eQ0R24fM1G_dd1F3-4iEkG1EQ,6349
@@ -43,8 +43,8 @@ wafer/templates/optimize_kernelbench.py,sha256=aoOA13zWEl89r6QW03xF9NKxQ7j4mWe9r
 wafer/templates/optimize_vllm.py,sha256=_D1rDP9wHA8CCvmoUrdLEW94MiaK4nAYJ-jbnpAvq7A,6154
 wafer/templates/trace_analyze.py,sha256=B7CiRlsokERzBjLL-k49kGjpU2zlJZqzTE05xbRS1WI,2878
 wafer/tests/test_eval_cli_parity.py,sha256=SGmaj2NGBZ7GdDF53bXsECvQbV21iHZw8YeL_MJOLk0,7206
-wafer_cli-0.2.36.dist-info/METADATA,sha256=POYE0Ub7A0rETiPPssuxv13NIqe_1yHNsAO0ddg_bxk,6461
-wafer_cli-0.2.36.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-wafer_cli-0.2.36.dist-info/entry_points.txt,sha256=WqB7hB__WhtPY8y1cO2sZiUz7fCq6Ik-usAigpeFvWE,41
-wafer_cli-0.2.36.dist-info/top_level.txt,sha256=2MK1IVMWfpLL8BZCQ3E9aG6L6L666gSA_teYlwan4fs,6
-wafer_cli-0.2.36.dist-info/RECORD,,
+wafer_cli-0.2.37.dist-info/METADATA,sha256=LOnnD6sSASC_Tf0qFMa5hBBUR6qJiMCkZysy2y4NdZw,6461
+wafer_cli-0.2.37.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+wafer_cli-0.2.37.dist-info/entry_points.txt,sha256=WqB7hB__WhtPY8y1cO2sZiUz7fCq6Ik-usAigpeFvWE,41
+wafer_cli-0.2.37.dist-info/top_level.txt,sha256=2MK1IVMWfpLL8BZCQ3E9aG6L6L666gSA_teYlwan4fs,6
+wafer_cli-0.2.37.dist-info/RECORD,,

{wafer_cli-0.2.36.dist-info → wafer_cli-0.2.37.dist-info}/WHEEL RENAMED Viewed

File without changes

{wafer_cli-0.2.36.dist-info → wafer_cli-0.2.37.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{wafer_cli-0.2.36.dist-info → wafer_cli-0.2.37.dist-info}/top_level.txt RENAMED Viewed

File without changes

wafer-cli 0.2.36__py3-none-any.whl → 0.2.37__py3-none-any.whl

wafer-cli 0.2.36__py3-none-any.whl → 0.2.37__py3-none-any.whl