wafer-cli 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,61 @@
+ """Template for querying GPU documentation.
+
+ Usage:
+     wafer wevin -t ask-docs "How do bank conflicts occur?"
+     wafer wevin -t ask-docs --args corpus=./cuda-docs/ "Explain warp divergence"
+ """
+
+ try:
+     from wafer_core.rollouts.templates import TemplateConfig
+ except ImportError:
+     from rollouts.templates import TemplateConfig
+
+ # NOTE: Agent tends to prefer bash (find, ls) over glob/grep tools despite system prompt
+ # guidance. Expanded allowlist so this works. TODO: improve error display when blocked
+ # commands are attempted (currently shows ❌ but error message not visible in TUI).
+ template = TemplateConfig(
+     # Identity
+     name="ask-docs",
+     description="Query GPU documentation to answer technical questions",
+     # System prompt
+     system_prompt="""You are a GPU programming expert helping answer questions about CUDA, GPU architecture, and kernel optimization.
+
+ Your task: Answer the user's question using the available documentation and tools.
+
+ You have these tools available:
+ - **glob**: Find files by pattern (e.g., glob pattern="**/*.md")
+ - **grep**: Search file contents (e.g., grep pattern="shared memory" path=".")
+ - **read**: Read file contents (e.g., read file_path="./guide.md")
+ - **bash**: Run shell commands (ls, find, cat, head, tail, wc, jq, python -c)
+
+ Strategy:
+ 1. Use the glob tool to find relevant documentation files (e.g., glob pattern="**/*.md")
+ 2. Use the grep tool to search for relevant content (e.g., grep pattern="your topic")
+ 3. Use the read tool to examine promising files
+ 4. Synthesize a clear, accurate answer
+
+ Prefer glob/grep/read tools over bash equivalents when possible, but bash is available for common commands.
+
+ Output your answer directly. Be concise but thorough. Include code examples when relevant.
+ """,
+     # Tools
+     tools=["read", "glob", "grep", "bash"],
+     bash_allowlist=[
+         "ls",
+         "find",
+         "cat",
+         "head",
+         "tail",
+         "wc",
+         "jq",
+         "python -c",
+     ],
+     # Model config
+     model="anthropic/claude-sonnet-4-5-20250929",
+     max_tokens=8192,
+     # Thinking config - disabled for simple doc queries
+     thinking=False,
+     thinking_budget=10000,
+     # Execution mode - multi-turn for follow-up questions
+     single_turn=False,
+ )
@@ -0,0 +1,71 @@
+ """Template for optimizing GPU kernels.
+
+ Usage:
+     wafer wevin -t optimize-kernel --args kernel=./matmul.cu "Optimize for H100"
+     wafer wevin -t optimize-kernel --args kernel=./attention.cu --args target=A100 "Reduce memory bandwidth"
+ """
+
+ try:
+     from wafer_core.rollouts.templates import TemplateConfig
+ except ImportError:
+     from rollouts.templates import TemplateConfig
+
+ template = TemplateConfig(
+     # Identity
+     name="optimize-kernel",
+     description="Optimize GPU kernel implementations for performance",
+     # System prompt
+     system_prompt="""You are a GPU kernel optimization expert. Your task is to optimize kernel code for maximum performance.
+
+ Kernel file(s): $kernel
+ Target GPU: $target
+
+ Strategy:
+ 1. Read and understand the current implementation
+ 2. Run `wafer evaluate` to get baseline performance metrics
+ 3. Identify optimization opportunities:
+    - Memory access patterns (coalescing, bank conflicts)
+    - Occupancy and register usage
+    - Warp divergence
+    - Instruction-level parallelism
+ 4. Implement optimizations using edit tool
+ 5. Re-run `wafer evaluate` to verify improvements
+ 6. Iterate until target performance is achieved
+
+ Commands:
+ - `wafer evaluate --impl <file> --reference <ref> --test-cases <tests>` - Run evaluation
+ - `wafer evaluate --impl <file> --reference <ref> --test-cases <tests> --profile` - With NCU profiling
+ - `wafer remote-run "<command>"` - Run arbitrary commands on remote GPU
+
+ Output:
+ - Summary of optimizations applied
+ - Before/after performance comparison
+ - Explanation of key changes
+
+ IMPORTANT: Always verify correctness with wafer evaluate before claiming success.
+ """,
+     # Tools
+     tools=["read", "write", "edit", "glob", "grep", "bash"],
+     bash_allowlist=[
+         "wafer evaluate",
+         "wafer remote-run",
+         "wafer nvidia ncu",
+         "wafer nvidia nsys",
+         "wafer nvidia perfetto",
+         "jq",
+         "python -c",
+     ],
+     # Model config - use thinking for complex optimization reasoning
+     model="anthropic/claude-sonnet-4-5-20250929",
+     max_tokens=16384,
+     # Thinking config - enabled for complex kernel optimization
+     thinking=True,
+     thinking_budget=10000,
+     # Execution mode - multi-turn for iterative optimization
+     single_turn=False,
+     # Template variables
+     defaults={
+         "kernel": "./kernel.cu",
+         "target": "H100",
+     },
+ )
@@ -0,0 +1,137 @@
+ """Template for KernelBench optimization - matches eval system prompt.
+
+ Usage:
+     # Run on a specific problem
+     wafer agent -t optimize-kernelbench \
+         --args reference=/path/to/problem.py \
+         --args pool=kernelbench-pool \
+         --args backend=hip \
+         --json \
+         "Optimize the Softmax kernel"
+
+     # Watch in real-time with JSON streaming
+     wafer agent -t optimize-kernelbench \
+         --args reference=./23_Softmax.py \
+         --json
+
+ Variables:
+     - reference: Path to the KernelBench problem file (required)
+     - pool: Target pool name (default: kernelbench-pool)
+     - target: Single target name (alternative to pool)
+     - backend: Backend type - hip or cuda (default: hip)
+ """
+
+ try:
+     from wafer_core.rollouts.templates import TemplateConfig
+ except ImportError:
+     from rollouts.templates import TemplateConfig
+
+ # System prompt matches optimize_kernelbench_eval/base_config.py SYSTEM_PROMPT
+ SYSTEM_PROMPT = """\
+ You are a GPU kernel optimization expert. Your task is to write optimized GPU kernels that are correct and faster than the PyTorch baseline.
+
+ IMPORTANT: You do NOT have a local GPU. You MUST use `wafer evaluate kernelbench` to test kernels on remote GPU hardware.
+
+ ## Kernel Format (KernelBench)
+
+ The reference file contains a PyTorch `Model` class. You must write a `ModelNew` class that:
+ 1. Has the same `__init__` signature as `Model`
+ 2. Has a `forward()` method with the same input/output signature
+ 3. Uses custom $backend_upper kernels for the computation (NOT PyTorch ops like F.scaled_dot_product_attention or torch.matmul)
+
+ The reference file also provides:
+ - `get_inputs()` - generates test inputs for forward()
+ - `get_init_inputs()` - generates constructor arguments
+
+ ## Available Tools
+
+ - read(file_path): Read source files
+ - write(file_path, content): Write your optimized kernel
+ - glob(pattern): Find files by pattern
+ - grep(pattern): Search code
+ - bash(command): Run shell commands including wafer CLI
+
+ ## Workflow
+
+ 1. Read the reference problem file to understand what `Model` does
+ 2. Analyze the computation and identify optimization opportunities
+ 3. Write an optimized `ModelNew` class with custom $backend_upper kernels using `__global__` kernel definitions and `torch.utils.cpp_extension.load_inline`
+ 4. Test with: `wafer evaluate kernelbench $target_flag --backend $backend --impl <your_file.py> --reference <problem.py> --benchmark`
+ 5. Iterate based on feedback until correct and fast
+
+ ## Example Command
+
+ ```bash
+ wafer evaluate kernelbench \\
+   $target_flag \\
+   --backend $backend \\
+   --impl optimized_kernel.py \\
+   --reference $reference \\
+   --benchmark
+ ```
+
+ ## Profiling Tools (USE THESE!)
+
+ When your kernel is slower than expected, use profiling to understand WHY:
+
+ - `wafer rocprof profile --impl <file> --reference <ref>` - AMD GPU profiling
+ - `wafer nvidia ncu --impl <file> --reference <ref>` - NVIDIA NCU profiling
+
+ ## CRITICAL: Reactive Debugging
+
+ After EVERY `wafer evaluate` call:
+ 1. Check the speedup result
+ 2. If speedup < 1.0x (slowdown), STOP and analyze:
+    - Run profiling to identify the bottleneck
+    - Ask: "Why is this slow?" before trying another approach
+ 3. Don't just try random optimizations - understand the root cause
+
+ Your kernel MUST:
+ - Pass correctness tests (outputs match reference within tolerance)
+ - Achieve speedup > 1.0x over PyTorch baseline
+ - Use actual $backend_upper kernels (with `__global__` definitions), NOT PyTorch ops
+
+ You MUST run `wafer evaluate kernelbench` to verify your kernel. Your score depends on actual measured results."""
+
+ template = TemplateConfig(
+     # Identity
+     name="optimize-kernelbench",
+     description="Optimize KernelBench problems (matches eval system prompt)",
+     # System prompt
+     system_prompt=SYSTEM_PROMPT,
+     # Tools
+     tools=["read", "write", "edit", "glob", "grep", "bash"],
+     bash_allowlist=[
+         "wafer evaluate",
+         "wafer nvidia ncu",
+         "wafer nvidia nsys",
+         "wafer rocprof",
+         "wafer compiler-analyze",
+         "python",
+         "python3",
+         "timeout",
+         "ls",
+         "cat",
+         "head",
+         "tail",
+         "wc",
+         "pwd",
+         "which",
+     ],
+     # Model config - match eval settings
+     model="anthropic/claude-opus-4-5-20251101",
+     max_tokens=8192,
+     # No thinking by default (match eval), can override with --thinking
+     thinking=False,
+     # Multi-turn for iterative optimization
+     single_turn=False,
+     # Template variables
+     defaults={
+         "reference": "./problem.py",
+         "pool": "kernelbench-pool",
+         "target": "",  # If set, overrides pool
+         "backend": "hip",
+         "backend_upper": "HIP",  # Auto-computed from backend
+         "target_flag": "--pool kernelbench-pool",  # Auto-computed
+     },
+ )
@@ -0,0 +1,74 @@
+ """Template for analyzing GPU performance traces.
+
+ Usage:
+     wafer wevin -t trace-analyze --args trace=./profile.ncu-rep "What's the bottleneck?"
+     wafer wevin -t trace-analyze --args trace=./trace.nsys-rep "Why is kernel X slow?"
+     wafer wevin -t trace-analyze --args trace=./trace.json "Analyze this PyTorch trace"
+ """
+
+ try:
+     from wafer_core.rollouts.templates import TemplateConfig
+ except ImportError:
+     from rollouts.templates import TemplateConfig
+
+ template = TemplateConfig(
+     # Identity
+     name="trace-analyze",
+     description="Analyze GPU performance traces (NCU, NSYS, Perfetto, PyTorch)",
+     # System prompt
+     system_prompt="""You are a GPU performance analysis expert. Your task is to analyze performance traces and identify optimization opportunities.
+
+ Trace file: $trace
+
+ Strategy:
+ 1. Identify the trace type by extension:
+    - `.ncu-rep` → NVIDIA Nsight Compute profile
+    - `.nsys-rep` → NVIDIA Nsight Systems trace
+    - `.json` or `.pt.trace.json` → PyTorch profiler trace (Chrome trace format)
+    - `.perfetto` or `.pftrace` → Perfetto trace
+
+ 2. Use the appropriate wafer analyze command:
+    - `wafer nvidia ncu analyze <file>` for NCU profiles
+    - `wafer nvidia nsys analyze <file>` for NSYS traces
+    - `wafer nvidia perfetto query <file> "<SQL>"` for Perfetto OR PyTorch JSON traces
+    - `wafer nvidia perfetto tables <file>` to list available tables
+
+ 3. For PyTorch/Perfetto traces, useful SQL queries:
+    - `SELECT DISTINCT cat FROM slice` - list event categories
+    - `SELECT name, dur/1000000.0 as dur_ms FROM slice WHERE cat = 'kernel' ORDER BY dur DESC LIMIT 20` - slowest GPU kernels
+    - `SELECT name, SUM(dur)/1000000.0 as total_ms, COUNT(*) as count FROM slice WHERE cat = 'kernel' GROUP BY name ORDER BY total_ms DESC` - kernel time breakdown
+    - `SELECT name, dur/1000000.0 as dur_ms FROM slice WHERE cat = 'cpu_op' ORDER BY dur DESC LIMIT 20` - slowest CPU ops
+
+ 4. Identify bottlenecks and provide actionable recommendations
+
+ Output format:
+ - Summary of key findings
+ - Performance bottlenecks identified (ranked by impact)
+ - Specific optimization recommendations with expected improvements
+ - Code changes if applicable
+
+ Use `--json` flags when available for structured output that's easier to parse.
+ """,
+     # Tools
+     tools=["read", "glob", "grep", "bash"],
+     bash_allowlist=[
+         "wafer nvidia ncu",
+         "wafer nvidia nsys",
+         "wafer nvidia perfetto",
+         "wafer nvidia tracelens",
+         "jq",
+         "python -c",
+     ],
+     # Model config
+     model="anthropic/claude-sonnet-4-5-20250929",
+     max_tokens=8192,
+     # Thinking config - disabled for trace analysis (mostly parsing)
+     thinking=False,
+     thinking_budget=10000,
+     # Execution mode - single turn for one-shot analysis
+     single_turn=True,
+     # Template variables
+     defaults={
+         "trace": "./profile.ncu-rep",
+     },
+ )
wafer/tracelens.py ADDED
@@ -0,0 +1,218 @@
+ """TraceLens CLI wrapper.
+
+ Provides human-readable CLI interface for TraceLens operations.
+ This follows the same pattern as rocprof_sdk.py and other CLI wrappers.
+ """
+
+ import json
+ import sys
+ from dataclasses import asdict
+
+
+ def print_usage() -> None:
+     """Print CLI usage information."""
+     print("Usage: wafer tracelens <subcommand> [options]", file=sys.stderr)
+     print("", file=sys.stderr)
+     print("Subcommands:", file=sys.stderr)
+     print("  check              Check TraceLens installation status", file=sys.stderr)
+     print("  report TRACE       Generate performance report from trace file", file=sys.stderr)
+     print("  compare A B        Compare two performance reports", file=sys.stderr)
+     print("  collective DIR     Generate multi-rank collective report", file=sys.stderr)
+     print("", file=sys.stderr)
+     print("Report Options:", file=sys.stderr)
+     print("  --output PATH      Output file path", file=sys.stderr)
+     print("  --format FORMAT    Trace format: auto, pytorch, rocprof, jax", file=sys.stderr)
+     print("  --short-kernel     Include short kernel analysis", file=sys.stderr)
+     print("  --kernel-details   Include detailed kernel breakdown", file=sys.stderr)
+     print("  --json             Output result as JSON", file=sys.stderr)
+     print("", file=sys.stderr)
+     print("Examples:", file=sys.stderr)
+     print("  wafer tracelens check", file=sys.stderr)
+     print("  wafer tracelens report trace.json", file=sys.stderr)
+     print("  wafer tracelens report trace.json --format pytorch --kernel-details", file=sys.stderr)
+     print("  wafer tracelens compare baseline.xlsx candidate.xlsx", file=sys.stderr)
+     print("  wafer tracelens collective ./traces --world-size 8", file=sys.stderr)
+
+
+ def check_command(json_output: bool = False) -> str:
+     """CLI wrapper for checking TraceLens installation.
+
+     Args:
+         json_output: If True, return JSON; otherwise print human-readable
+
+     Returns:
+         Status message or JSON string
+     """
+     from wafer_core.lib.tracelens import check_installation
+
+     result = check_installation()
+
+     if json_output:
+         return json.dumps(asdict(result), indent=2)
+     else:
+         if result.installed:
+             print("✓ TraceLens is installed", file=sys.stderr)
+             if result.version:
+                 print(f"  Version: {result.version}", file=sys.stderr)
+             if result.commands_available:
+                 print("  Available commands:", file=sys.stderr)
+                 for cmd in result.commands_available:
+                     print(f"    - {cmd}", file=sys.stderr)
+             return "TraceLens is installed"
+         else:
+             print("✗ TraceLens is not installed", file=sys.stderr)
+             if result.install_command:
+                 print(f"  Install: {result.install_command}", file=sys.stderr)
+             return "TraceLens is not installed"
+
+
+ def report_command(
+     trace_path: str,
+     output_path: str | None = None,
+     trace_format: str = "auto",
+     short_kernel: bool = False,
+     kernel_details: bool = False,
+     json_output: bool = False,
+ ) -> str:
+     """CLI wrapper for generating performance report.
+
+     Args:
+         trace_path: Path to trace file
+         output_path: Optional output path for Excel report
+         trace_format: Trace format (auto, pytorch, rocprof, jax)
+         short_kernel: Include short kernel analysis
+         kernel_details: Include detailed kernel breakdown
+         json_output: If True, return JSON; otherwise print human-readable
+
+     Returns:
+         Success message or JSON string
+
+     Raises:
+         RuntimeError: If report generation fails
+     """
+     from wafer_core.lib.tracelens import generate_perf_report
+     from wafer_core.lib.tracelens.types import TraceFormat
+
+     format_map = {
+         "auto": TraceFormat.AUTO,
+         "pytorch": TraceFormat.PYTORCH,
+         "rocprof": TraceFormat.ROCPROF,
+         "jax": TraceFormat.JAX,
+     }
+
+     result = generate_perf_report(
+         trace_path=trace_path,
+         output_path=output_path,
+         trace_format=format_map.get(trace_format, TraceFormat.AUTO),
+         short_kernel_study=short_kernel,
+         kernel_details=kernel_details,
+     )
+
+     if json_output:
+         return json.dumps(asdict(result), indent=2)
+     else:
+         if result.success:
+             print("✓ Report generated successfully", file=sys.stderr)
+             print(f"  Output: {result.output_path}", file=sys.stderr)
+             print(f"  Format: {result.trace_format}", file=sys.stderr)
+             return "Report generated"
+         else:
+             print("✗ Report generation failed", file=sys.stderr)
+             if result.error:
+                 print(f"  Error: {result.error}", file=sys.stderr)
+             if result.stderr:
+                 print("  stderr:", file=sys.stderr)
+                 print(result.stderr, file=sys.stderr)
+             raise RuntimeError(result.error or "Report generation failed")
+
+
+ def compare_command(
+     baseline_path: str,
+     candidate_path: str,
+     output_path: str | None = None,
+     baseline_name: str = "baseline",
+     candidate_name: str = "candidate",
+     json_output: bool = False,
+ ) -> str:
+     """CLI wrapper for comparing two performance reports.
+
+     Args:
+         baseline_path: Path to baseline Excel report
+         candidate_path: Path to candidate Excel report
+         output_path: Optional output path for comparison file
+         baseline_name: Display name for baseline
+         candidate_name: Display name for candidate
+         json_output: If True, return JSON; otherwise print human-readable
+
+     Returns:
+         Success message or JSON string
+
+     Raises:
+         RuntimeError: If comparison fails
+     """
+     from wafer_core.lib.tracelens import compare_reports
+
+     result = compare_reports(
+         baseline_path=baseline_path,
+         candidate_path=candidate_path,
+         output_path=output_path,
+         baseline_name=baseline_name,
+         candidate_name=candidate_name,
+     )
+
+     if json_output:
+         return json.dumps(asdict(result), indent=2)
+     else:
+         if result.success:
+             print("✓ Comparison complete", file=sys.stderr)
+             print(f"  Output: {result.output_path}", file=sys.stderr)
+             return "Comparison complete"
+         else:
+             print("✗ Comparison failed", file=sys.stderr)
+             if result.error:
+                 print(f"  Error: {result.error}", file=sys.stderr)
+             raise RuntimeError(result.error or "Comparison failed")
+
+
+ def collective_command(
+     trace_dir: str,
+     world_size: int,
+     output_path: str | None = None,
+     json_output: bool = False,
+ ) -> str:
+     """CLI wrapper for generating multi-rank collective report.
+
+     Args:
+         trace_dir: Directory containing trace files for all ranks
+         world_size: Number of ranks (GPUs)
+         output_path: Optional output path for report
+         json_output: If True, return JSON; otherwise print human-readable
+
+     Returns:
+         Success message or JSON string
+
+     Raises:
+         RuntimeError: If report generation fails
+     """
+     from wafer_core.lib.tracelens import generate_collective_report
+
+     result = generate_collective_report(
+         trace_dir=trace_dir,
+         world_size=world_size,
+         output_path=output_path,
+     )
+
+     if json_output:
+         return json.dumps(asdict(result), indent=2)
+     else:
+         if result.success:
+             print("✓ Collective report generated", file=sys.stderr)
+             print(f"  World size: {result.world_size}", file=sys.stderr)
+             if result.output_path:
+                 print(f"  Output: {result.output_path}", file=sys.stderr)
+             return "Collective report generated"
+         else:
+             print("✗ Collective report failed", file=sys.stderr)
+             if result.error:
+                 print(f"  Error: {result.error}", file=sys.stderr)
+             raise RuntimeError(result.error or "Collective report failed")
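
The module above exposes per-subcommand wrappers but no argument parsing; the wiring into `wafer tracelens` lives elsewhere in the CLI. Below is a minimal dispatcher sketch for two of the subcommands, using argparse flags that mirror `print_usage()`; it is illustrative only, not the actual wafer entry point, and the `wafer.tracelens` module path is assumed.

```python
import argparse
import sys

from wafer.tracelens import check_command, report_command  # assumed module path


def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser(prog="wafer tracelens")
    sub = parser.add_subparsers(dest="subcommand", required=True)

    check = sub.add_parser("check")
    check.add_argument("--json", action="store_true")

    report = sub.add_parser("report")
    report.add_argument("trace")
    report.add_argument("--output")
    report.add_argument("--format", default="auto")
    report.add_argument("--short-kernel", action="store_true")
    report.add_argument("--kernel-details", action="store_true")
    report.add_argument("--json", action="store_true")

    args = parser.parse_args(argv)
    if args.subcommand == "check":
        out = check_command(json_output=args.json)
    else:
        out = report_command(
            trace_path=args.trace,
            output_path=args.output,
            trace_format=args.format,
            short_kernel=args.short_kernel,
            kernel_details=args.kernel_details,
            json_output=args.json,
        )
    if args.json:
        print(out)  # wrappers return a JSON string when --json is set
    return 0


if __name__ == "__main__":
    sys.exit(main())
```
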