wafer-cli 0.2.24__py3-none-any.whl → 0.2.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/GUIDE.md +1 -1
- wafer/agent_defaults.py +42 -0
- wafer/billing.py +6 -6
- wafer/cli.py +454 -86
- wafer/cli_instructions.py +143 -0
- wafer/corpus.py +7 -1
- wafer/evaluate.py +13 -6
- wafer/kernel_scope.py +1 -1
- wafer/ncu_analyze.py +1 -1
- wafer/nsys_analyze.py +1 -1
- wafer/skills/wafer-guide/SKILL.md +22 -6
- wafer/ssh_keys.py +6 -6
- wafer/templates/ask_docs.py +1 -1
- wafer/templates/optimize_kernel.py +1 -1
- wafer/templates/optimize_kernelbench.py +17 -62
- wafer/templates/trace_analyze.py +1 -1
- wafer/tests/test_eval_cli_parity.py +199 -0
- wafer/trace_compare.py +183 -0
- wafer/wevin_cli.py +68 -9
- wafer/workspaces.py +8 -8
- wafer_cli-0.2.25.dist-info/METADATA +107 -0
- wafer_cli-0.2.25.dist-info/RECORD +45 -0
- wafer_cli-0.2.24.dist-info/METADATA +0 -16
- wafer_cli-0.2.24.dist-info/RECORD +0 -41
- {wafer_cli-0.2.24.dist-info → wafer_cli-0.2.25.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.24.dist-info → wafer_cli-0.2.25.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.24.dist-info → wafer_cli-0.2.25.dist-info}/top_level.txt +0 -0

wafer/cli_instructions.py
ADDED
@@ -0,0 +1,143 @@
+"""Generate agent system prompt instructions from the wafer CLI's own --help text.
+
+Walks the typer/click command tree and extracts help text for commands
+matching the bash_allowlist. This ensures agent instructions stay in sync
+with the CLI — the --help text is the single source of truth for both
+human users and AI agents.
+
+Usage:
+    from wafer.cli_instructions import build_cli_instructions
+
+    instructions = build_cli_instructions([
+        "wafer evaluate",
+        "wafer nvidia ncu",
+        "wafer rocprof profile",
+        "python",  # non-wafer commands are skipped
+    ])
+"""
+
+from __future__ import annotations
+
+import click
+import typer.main
+
+
+def _resolve_command(root: click.BaseCommand, parts: list[str]) -> click.BaseCommand | None:
+    """Walk the click command tree to find a (sub)command by name parts.
+
+    Args:
+        root: The root click command (from typer.main.get_command)
+        parts: Command path segments, e.g. ["evaluate", "kernelbench"]
+
+    Returns:
+        The click command at that path, or None if not found.
+    """
+    cmd = root
+    for part in parts:
+        if not isinstance(cmd, click.MultiCommand):
+            return None
+        ctx = click.Context(cmd, info_name=part)
+        child = cmd.get_command(ctx, part)
+        if child is None:
+            return None
+        cmd = child
+    return cmd
+
+
+def _format_command_help(cmd_path: str, cmd: click.BaseCommand) -> str:
+    """Format a single command's help text for inclusion in a system prompt.
+
+    Extracts the description and option help text (skipping --help itself).
+    """
+    lines = [f"### `{cmd_path}`"]
+
+    if cmd.help:
+        lines.append(cmd.help.strip())
+
+    # Extract option help
+    option_lines = []
+    for param in getattr(cmd, "params", []):
+        if not isinstance(param, click.Option):
+            continue
+        # Skip --help
+        if param.name == "help":
+            continue
+        name = "/".join(param.opts)
+        type_name = param.type.name.upper() if hasattr(param.type, "name") else ""
+        help_text = param.help or ""
+        is_flag = type_name in ("BOOL", "BOOLEAN") or param.is_flag
+        if type_name and not is_flag:
+            option_lines.append(f" {name} {type_name} {help_text}")
+        else:
+            option_lines.append(f" {name} {help_text}")
+
+    if option_lines:
+        lines.append("")
+        lines.append("Options:")
+        lines.extend(option_lines)
+
+    # List subcommands if this is a group
+    if isinstance(cmd, click.MultiCommand):
+        ctx = click.Context(cmd, info_name=cmd_path.split()[-1])
+        subcmd_names = cmd.list_commands(ctx)
+        if subcmd_names:
+            subcmd_lines = []
+            for name in subcmd_names:
+                subcmd = cmd.get_command(ctx, name)
+                if subcmd:
+                    desc = (subcmd.help or subcmd.short_help or "").strip().split("\n")[0]
+                    subcmd_lines.append(f" {cmd_path} {name} {desc}")
+            if subcmd_lines:
+                lines.append("")
+                lines.append("Subcommands:")
+                lines.extend(subcmd_lines)
+
+    return "\n".join(lines)
+
+
+def build_cli_instructions(bash_allowlist: list[str]) -> str:
+    """Generate CLI instruction text from --help for allowed wafer commands.
+
+    Walks the typer/click command tree and extracts help text for each
+    wafer command in the bash_allowlist. Non-wafer commands (python, ls, etc.)
+    are skipped.
+
+    Args:
+        bash_allowlist: List of allowed bash command prefixes.
+            Example: ["wafer evaluate", "wafer nvidia ncu", "python"]
+
+    Returns:
+        Markdown-formatted CLI instructions, or empty string if no wafer
+        commands are in the allowlist.
+    """
+    if not bash_allowlist:
+        return ""
+
+    # Filter to wafer commands only
+    wafer_commands = [cmd for cmd in bash_allowlist if cmd.startswith("wafer ")]
+    if not wafer_commands:
+        return ""
+
+    # Lazy import to avoid circular deps at module level
+    from wafer.cli import app
+
+    root = typer.main.get_command(app)
+
+    sections = []
+    for cmd_str in wafer_commands:
+        # "wafer evaluate kernelbench" -> ["evaluate", "kernelbench"]
+        parts = cmd_str.split()[1:]  # drop "wafer" prefix
+        cmd = _resolve_command(root, parts)
+        if cmd is None:
+            # Command not found in tree — skip silently
+            continue
+        sections.append(_format_command_help(cmd_str, cmd))
+
+    if not sections:
+        return ""
+
+    header = (
+        "## Wafer CLI Commands\n\n"
+        "You do not have a local GPU. Use the wafer CLI to run on remote GPU hardware.\n"
+    )
+    return header + "\n\n".join(sections)

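For orientation, here is a minimal sketch of how this module might be wired into an agent prompt. Only `build_cli_instructions` and the `wafer.agent_defaults` names appear elsewhere in this release; the composition below is an assumption about what `wevin_cli.py` does, not a copy of it.

```python
# Hypothetical wiring sketch; not taken verbatim from wevin_cli.py.
from wafer.agent_defaults import KERNELBENCH_BASH_ALLOWLIST  # new module in 0.2.25
from wafer.cli_instructions import build_cli_instructions

TASK_PROMPT = "You are a GPU kernel optimization expert..."  # task-specific part

# Auto-generate the "## Wafer CLI Commands" section from the CLI's own --help text,
# then append it so the agent instructions never drift from the installed CLI.
cli_docs = build_cli_instructions(KERNELBENCH_BASH_ALLOWLIST)
system_prompt = TASK_PROMPT + ("\n\n" + cli_docs if cli_docs else "")
```
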
wafer/corpus.py
CHANGED
@@ -160,11 +160,17 @@ CORPORA: dict[CorpusName, CorpusConfig] = {
             paths=["docs"],
             branch="develop_deprecated",
         ),
-        # HipKittens - high-performance AMD kernels
+        # HipKittens - high-performance AMD kernels (main branch: MI350X/CDNA4+)
         RepoSource(
             repo="HazyResearch/HipKittens",
             paths=["docs", "kernels", "include"],
         ),
+        # HipKittens cdna3 branch - MI300X/MI325X (gfx942)
+        RepoSource(
+            repo="HazyResearch/HipKittens",
+            paths=["kernels", "include", "tests"],
+            branch="cdna3",
+        ),
         # vLLM AMD kernels
         RepoSource(
             repo="vllm-project/vllm",

wafer/evaluate.py
CHANGED
@@ -3496,7 +3496,7 @@ def _build_modal_kernelbench_script(
     # Install CUTLASS headers (for cute/tensor.hpp and cutlass/util/*.h) from GitHub
     # The nvidia-cutlass-dsl pip package doesn't include the C++ headers needed for nvcc
     # IMPORTANT: symlink to /usr/local/cuda/include because nvcc searches there by default
-    cutlass_install =
+    cutlass_install = """
     .run_commands([
         # Clone CUTLASS headers from GitHub (shallow clone, full include tree)
         # Use simple shallow clone - sparse-checkout can be buggy in some environments
@@ -3512,7 +3512,7 @@ def _build_modal_kernelbench_script(
         index_url="https://pypi.nvidia.com",
         extra_index_url="https://pypi.org/simple",
     )
-
+    """

     inputs_write = ""
     if inputs_code_b64:
@@ -3772,7 +3772,7 @@ async def run_evaluate_kernelbench_modal(
     result_json = None
     for line in stdout.split("\n"):
         if line.startswith("EVAL_RESULT_JSON:"):
-            result_json = line[len("EVAL_RESULT_JSON:"):]
+            result_json = line[len("EVAL_RESULT_JSON:") :]
             break

     if not result_json:
@@ -4486,6 +4486,7 @@ async def run_evaluate_kernelbench_runpod(
     # Find Python with PyTorch - check common locations on RunPod
     python_exe = "python3"
     for candidate in [
+        "/opt/venv/bin/python3",
         "/opt/conda/envs/py_3.10/bin/python3",
         "/opt/conda/bin/python3",
     ]:
@@ -4630,7 +4631,9 @@ async def run_evaluate_kernelbench_baremetal_direct(
     """
     # Reuse the AMD function but with CUDA env vars
     # The logic is identical, just the GPU env var is different
-    return await _run_evaluate_kernelbench_baremetal_direct_impl(
+    return await _run_evaluate_kernelbench_baremetal_direct_impl(
+        args, target, gpu_env_var="CUDA_VISIBLE_DEVICES"
+    )


 async def run_evaluate_kernelbench_baremetal_amd(
@@ -4642,7 +4645,9 @@ async def run_evaluate_kernelbench_baremetal_amd(
     Runs evaluation script directly on host (no Docker) for AMD GPUs
     that have PyTorch/ROCm installed.
     """
-    return await _run_evaluate_kernelbench_baremetal_direct_impl(
+    return await _run_evaluate_kernelbench_baremetal_direct_impl(
+        args, target, gpu_env_var="HIP_VISIBLE_DEVICES"
+    )


 async def _run_evaluate_kernelbench_baremetal_direct_impl(
@@ -4809,7 +4814,9 @@ async def _run_evaluate_kernelbench_baremetal_direct_impl(
         # AMD: PYTORCH_ROCM_ARCH for faster compile
         rocm_arch = _get_rocm_arch(target.compute_capability)
         arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
-        env_vars =
+        env_vars = (
+            f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
+        )
     else:
         # NVIDIA: just set CUDA_VISIBLE_DEVICES
         env_vars = f"CUDA_VISIBLE_DEVICES={gpu_id} PYTHONUNBUFFERED=1"

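The RunPod hunk above prepends `/opt/venv/bin/python3` to the interpreter candidates. A rough sketch of the probing pattern it extends is shown below; the helper name and the `import torch` check are illustrative, not lifted from `wafer/evaluate.py`.

```python
# Illustrative sketch of picking a PyTorch-capable interpreter on a RunPod host.
import subprocess

CANDIDATES = [
    "/opt/venv/bin/python3",                # added in 0.2.25
    "/opt/conda/envs/py_3.10/bin/python3",
    "/opt/conda/bin/python3",
]

def find_python_with_torch() -> str:
    for candidate in CANDIDATES:
        try:
            probe = subprocess.run([candidate, "-c", "import torch"], capture_output=True)
        except FileNotFoundError:
            continue  # candidate interpreter not present on this image
        if probe.returncode == 0:
            return candidate
    return "python3"  # fall back to whatever is on PATH
```
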
wafer/kernel_scope.py
CHANGED
@@ -95,7 +95,7 @@ def analyze_command(
     if not api_url or not auth_headers:
         raise RuntimeError(
             "API authentication required for .co file analysis. "
-            "Run 'wafer login' first."
+            "Run 'wafer auth login' first."
         )
     result = analyze_code_object(target_path, api_url, auth_headers)
     # ISA files - use kernel_index parameter

wafer/ncu_analyze.py
CHANGED
@@ -520,7 +520,7 @@ def _analyze_remote_api(

     except httpx.HTTPStatusError as e:
         if e.response.status_code == 401:
-            raise RuntimeError("Not authenticated. Run: wafer login") from e
+            raise RuntimeError("Not authenticated. Run: wafer auth login") from e
         raise RuntimeError(f"API error: {e.response.status_code} - {e.response.text}") from e
     except httpx.RequestError as e:
         raise RuntimeError(f"Could not reach API: {e}") from e

wafer/nsys_analyze.py
CHANGED
@@ -844,7 +844,7 @@ def _analyze_remote_api(

     except httpx.HTTPStatusError as e:
         if e.response.status_code == 401:
-            raise RuntimeError("Not authenticated. Run: wafer login") from e
+            raise RuntimeError("Not authenticated. Run: wafer auth login") from e
         raise RuntimeError(f"API error: {e.response.status_code} - {e.response.text}") from e
     except httpx.RequestError as e:
         raise RuntimeError(f"Could not reach API: {e}") from e

wafer/skills/wafer-guide/SKILL.md
CHANGED
@@ -16,7 +16,7 @@ Before using Wafer CLI commands, install the tool:
 uv tool install wafer-cli

 # Authenticate (one-time setup)
-wafer login
+wafer auth login

 ```

@@ -71,15 +71,31 @@ Test correctness and measure speedup against a reference:
 wafer evaluate make-template ./my-kernel
 # Creates: kernel.py, reference.py, test_cases.json

-#
-
+# test_cases.json format:
+# [{"name": "small", "n": 1024, "seed": 42}, {"name": "large", "n": 1048576, "seed": 42}]
+# Each dict is passed as **kwargs to generate_input() in reference.py
+
+# Run correctness check (GPUMode functional format)
+wafer evaluate gpumode \
   --impl ./my-kernel/kernel.py \
   --reference ./my-kernel/reference.py \
   --test-cases ./my-kernel/test_cases.json \
   --target <target-name>

-#
-wafer evaluate
+# Run correctness + benchmark (measures speedup vs reference)
+wafer evaluate gpumode \
+  --impl ./my-kernel/kernel.py \
+  --reference ./my-kernel/reference.py \
+  --test-cases ./my-kernel/test_cases.json \
+  --target <target-name> --benchmark
+
+# Run with defensive timing (detects evaluation hacking)
+wafer evaluate gpumode ... --benchmark --defensive
+
+# KernelBench format (ModelNew class)
+wafer evaluate kernelbench \
+  --impl my_kernel.py --reference problem.py \
+  --target <target-name> --stages all
 ```

 ### 4. AI-Assisted Optimization
@@ -126,4 +142,4 @@ wafer config targets init runpod # RunPod cloud GPUs
 wafer config targets init digitalocean # DigitalOcean AMD GPUs
 ```

-Then use: `wafer evaluate --target <name> ...`
+Then use: `wafer evaluate gpumode --target <name> ...`

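The guide above notes that each `test_cases.json` entry is forwarded as keyword arguments to `generate_input()` in `reference.py`. A hypothetical `reference.py` sketch for the GPUMode functional format follows; only `generate_input()` is named in the guide, so the reference entry point `ref_kernel()` and the exact scaffold produced by `wafer evaluate make-template` are assumptions.

```python
# Hypothetical reference.py sketch; function names other than generate_input are assumed.
import torch

def generate_input(n: int, seed: int, **kwargs) -> torch.Tensor:
    # Each test_cases.json entry, e.g. {"name": "small", "n": 1024, "seed": 42},
    # arrives here as keyword arguments; extra keys land in **kwargs.
    gen = torch.Generator(device="cuda").manual_seed(seed)
    return torch.randn(n, device="cuda", generator=gen)

def ref_kernel(x: torch.Tensor) -> torch.Tensor:
    # Reference computation the optimized kernel.py is checked and benchmarked against.
    return torch.square(x)
```
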
wafer/ssh_keys.py
CHANGED
@@ -1,6 +1,6 @@
 """SSH Keys CLI - Manage SSH public keys for workspace access.

-This module provides the implementation for the `wafer ssh-keys` subcommand.
+This module provides the implementation for the `wafer config ssh-keys` subcommand.
 Users register their SSH public keys here, which are then installed in all
 workspaces they attach to (BYOK - Bring Your Own Key model).
 """
@@ -94,7 +94,7 @@ def list_ssh_keys(json_output: bool = False) -> str:
         keys = response.json()
     except httpx.HTTPStatusError as e:
         if e.response.status_code == 401:
-            raise RuntimeError("Not authenticated. Run: wafer login") from e
+            raise RuntimeError("Not authenticated. Run: wafer auth login") from e
         raise RuntimeError(f"API error: {e.response.status_code} - {e.response.text}") from e
     except httpx.RequestError as e:
         raise RuntimeError(f"Could not reach API: {e}") from e
@@ -107,7 +107,7 @@ def list_ssh_keys(json_output: bool = False) -> str:
             "No SSH keys registered.\n"
             "\n"
             "Add your SSH key:\n"
-            " wafer ssh-keys add\n"
+            " wafer config ssh-keys add\n"
             "\n"
             "This will auto-detect your key from ~/.ssh/"
         )
@@ -149,7 +149,7 @@ def add_ssh_key(
             " ssh-keygen -t ed25519\n"
             "\n"
             "Or specify a path:\n"
-            " wafer ssh-keys add /path/to/key.pub"
+            " wafer config ssh-keys add /path/to/key.pub"
         )
         pubkey_path = detected[0]

@@ -202,7 +202,7 @@ def add_ssh_key(
         key_data = response.json()
     except httpx.HTTPStatusError as e:
         if e.response.status_code == 401:
-            raise RuntimeError("Not authenticated. Run: wafer login") from e
+            raise RuntimeError("Not authenticated. Run: wafer auth login") from e
         if e.response.status_code == 400:
             # Parse error detail
             try:
@@ -248,7 +248,7 @@ def remove_ssh_key(key_id: str, json_output: bool = False) -> str:
         response.raise_for_status()
     except httpx.HTTPStatusError as e:
         if e.response.status_code == 401:
-            raise RuntimeError("Not authenticated. Run: wafer login") from e
+            raise RuntimeError("Not authenticated. Run: wafer auth login") from e
         if e.response.status_code == 404:
             raise RuntimeError(f"SSH key not found: {key_id}") from e
         raise RuntimeError(f"API error: {e.response.status_code} - {e.response.text}") from e

wafer/templates/ask_docs.py
CHANGED
@@ -51,7 +51,7 @@ Output your answer directly. Be concise but thorough. Include code examples when
         "python -c",
     ],
     # Model config
-    model="anthropic/claude-
+    model="anthropic/claude-opus-4-5-20251101",
     max_tokens=8192,
     # Thinking config - disabled for simple doc queries
     thinking=False,

wafer/templates/optimize_kernel.py
CHANGED
@@ -56,7 +56,7 @@ IMPORTANT: Always verify correctness with wafer evaluate before claiming success
         "python -c",
     ],
     # Model config - use thinking for complex optimization reasoning
-    model="anthropic/claude-
+    model="anthropic/claude-opus-4-5-20251101",
     max_tokens=16384,
     # Thinking config - enabled for complex kernel optimization
     thinking=True,

wafer/templates/optimize_kernelbench.py
CHANGED
@@ -1,4 +1,4 @@
-"""Template for KernelBench optimization
+"""Template for KernelBench optimization.

 Usage:
     # Run on a specific problem
@@ -26,12 +26,18 @@ try:
 except ImportError:
     from rollouts.templates import TemplateConfig

-
+from wafer.agent_defaults import ENABLED_TOOLS, KERNELBENCH_BASH_ALLOWLIST
+
+# Task-specific instructions only — must stay in sync with the eval's SYSTEM_PROMPT
+# in research/evals/optimize_kernelbench_eval/.../base_config.py.
+# Run test_eval_cli_parity.py to verify.
+# Wafer CLI command docs are auto-generated from --help text and composed
+# at runtime by wevin_cli.py (see wafer.cli_instructions.build_cli_instructions).
+# TODO: Consider having both eval and template import SYSTEM_PROMPT from a shared
+# module so there's only one copy to maintain.
 SYSTEM_PROMPT = """\
 You are a GPU kernel optimization expert. Your task is to write optimized GPU kernels that are correct and faster than the PyTorch baseline.

-IMPORTANT: You do NOT have a local GPU. You MUST use `wafer evaluate kernelbench` to test kernels on remote GPU hardware.
-
 ## Kernel Format (KernelBench)

 The reference file contains a PyTorch `Model` class. You must write a `ModelNew` class that:
@@ -43,49 +49,14 @@ The reference file also provides:
 - `get_inputs()` - generates test inputs for forward()
 - `get_init_inputs()` - generates constructor arguments

-## Available Tools
-
-- read(file_path): Read source files
-- write(file_path, content): Write your optimized kernel
-- glob(pattern): Find files by pattern
-- grep(pattern): Search code
-- bash(command): Run shell commands including wafer CLI
-
 ## Workflow

 1. Read the reference problem file to understand what `Model` does
 2. Analyze the computation and identify optimization opportunities
 3. Write an optimized `ModelNew` class with custom $backend_upper kernels using `__global__` kernel definitions and `torch.utils.cpp_extension.load_inline`
-4. Test with: `wafer evaluate kernelbench $target_flag --backend $backend --impl
+4. Test with: `wafer evaluate kernelbench $target_flag --backend $backend --impl optimized.py --reference <problem.py> --benchmark`
 5. Iterate based on feedback until correct and fast

-## Example Command
-
-```bash
-wafer evaluate kernelbench \\
-    $target_flag \\
-    --backend $backend \\
-    --impl optimized_kernel.py \\
-    --reference $reference \\
-    --benchmark
-```
-
-## Profiling Tools (USE THESE!)
-
-When your kernel is slower than expected, use profiling to understand WHY:
-
-- `wafer rocprof profile --impl <file> --reference <ref>` - AMD GPU profiling
-- `wafer nvidia ncu --impl <file> --reference <ref>` - NVIDIA NCU profiling
-
-## CRITICAL: Reactive Debugging
-
-After EVERY `wafer evaluate` call:
-1. Check the speedup result
-2. If speedup < 1.0x (slowdown), STOP and analyze:
-   - Run profiling to identify the bottleneck
-   - Ask: "Why is this slow?" before trying another approach
-3. Don't just try random optimizations - understand the root cause
-
 Your kernel MUST:
 - Pass correctness tests (outputs match reference within tolerance)
 - Achieve speedup > 1.0x over PyTorch baseline
@@ -96,32 +67,16 @@ You MUST run `wafer evaluate kernelbench` to verify your kernel. Your score depe
 template = TemplateConfig(
     # Identity
     name="optimize-kernelbench",
-    description="Optimize KernelBench problems
-    # System prompt
+    description="Optimize KernelBench problems",
+    # System prompt (task-specific; CLI docs appended at runtime)
     system_prompt=SYSTEM_PROMPT,
     # Tools
-    tools=
-    bash_allowlist=
-
-        "wafer nvidia ncu",
-        "wafer nvidia nsys",
-        "wafer rocprof",
-        "wafer compiler-analyze",
-        "python",
-        "python3",
-        "timeout",
-        "ls",
-        "cat",
-        "head",
-        "tail",
-        "wc",
-        "pwd",
-        "which",
-    ],
-    # Model config - match eval settings
+    tools=ENABLED_TOOLS,
+    bash_allowlist=KERNELBENCH_BASH_ALLOWLIST,
+    # Model config
     model="anthropic/claude-opus-4-5-20251101",
     max_tokens=8192,
-    # No thinking by default
+    # No thinking by default, can override with --thinking
     thinking=False,
     # Multi-turn for iterative optimization
     single_turn=False,

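This template now imports its tool list and bash allowlist from the new `wafer/agent_defaults.py`, whose diff is not shown above. The sketch below is a rough guess at its shape, reconstructed from the inline lists the template dropped; treat every entry as an assumption rather than the released contents.

```python
# wafer/agent_defaults.py (hypothetical reconstruction, not the actual file contents)

# Tool names inferred from the "## Available Tools" section removed from the prompt.
ENABLED_TOOLS = ["read", "write", "glob", "grep", "bash"]

# Entries inferred from the allowlist removed above; the first entry is assumed from
# the workflow step that tests with `wafer evaluate kernelbench`.
KERNELBENCH_BASH_ALLOWLIST = [
    "wafer evaluate kernelbench",
    "wafer nvidia ncu",
    "wafer nvidia nsys",
    "wafer rocprof",
    "wafer compiler-analyze",
    "python",
    "python3",
    "timeout",
    "ls",
    "cat",
    "head",
    "tail",
    "wc",
    "pwd",
    "which",
]
```
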
wafer/templates/trace_analyze.py
CHANGED
@@ -60,7 +60,7 @@ Use `--json` flags when available for structured output that's easier to parse.
         "python -c",
     ],
     # Model config
-    model="anthropic/claude-
+    model="anthropic/claude-opus-4-5-20251101",
    max_tokens=8192,
     # Thinking config - disabled for trace analysis (mostly parsing)
     thinking=False,