PyPI - wafer-cli - Versions diffs - 0.2.46__tar.gz → 0.2.48__tar.gz - Mend

wafer-cli 0.2.46 (tar.gz) → 0.2.48 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (73)

{wafer_cli-0.2.46 → wafer_cli-0.2.48}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wafer-cli
-Version: 0.2.46
+Version: 0.2.48
 Summary: CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown

{wafer_cli-0.2.46 → wafer_cli-0.2.48}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "wafer-cli"
-version = "0.2.46"
+version = "0.2.48"
 description = "CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels"
 readme = "README.md"
 requires-python = ">=3.11"

{wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/agent_defaults.py RENAMED Viewed

@@ -239,6 +239,8 @@ AUDIT_BASH_ALLOWLIST: list[str] = [
     "wafer amd rocprof-sdk",
     "wafer amd rocprof-systems",
     "wafer compiler-analyze",
+    # Sub-agents
+    "wafer agent -t ask-docs",
     # Misc
     "timeout",
 ]

{wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/cli.py RENAMED Viewed

@@ -1539,65 +1539,25 @@ _make_agent_alias("wevin", "Alias for 'wafer agent'.")
 @evaluate_app.callback(invoke_without_command=True)
-def evaluate(  # noqa: PLR0913
-    ctx: typer.Context,
-    implementation: Path | None = typer.Option(
-        None, "--impl", "-i", help="Path to implementation kernel file"
-    ),
-    reference: Path | None = typer.Option(
-        None, "--reference", help="Path to reference kernel file"
-    ),
-    test_cases: Path | None = typer.Option(
-        None, "--test-cases", help="Path to test cases JSON file"
-    ),
-    target: str | None = typer.Option(
-        None,
-        "--target",
-        "-t",
-        help="GPU target name. See 'wafer config targets list' for available targets.",
-        autocompletion=complete_target_name,
-    ),
-    benchmark: bool = typer.Option(False, "--benchmark", help="Run performance benchmarks"),
-    profile: bool = typer.Option(False, "--profile", help="Enable profiling"),
-    defensive: bool = typer.Option(
-        False, "--defensive", help="Enable defensive timing to detect evaluation hacking"
-    ),
-    sync_artifacts: bool = typer.Option(
-        True, "--sync-artifacts/--no-sync-artifacts", help="Download artifacts"
-    ),
-    gpu_id: int | None = typer.Option(None, "--gpu-id", help="Override GPU ID"),
-) -> None:
-    """Run kernel evaluation on a remote GPU target.
+def evaluate(ctx: typer.Context) -> None:
+    """Test kernel correctness and performance.
-    Uses the functional format: custom_kernel(inputs) and ref_kernel(inputs).
+    Use one of the subcommands to evaluate your kernel:
-    The evaluation checks:
-      1. Correctness: Does the kernel produce the same output as the reference?
-      2. Performance (--benchmark): How fast is it compared to the reference?
-      3. Defense (--defensive): Detects evaluation hacking (stream injection, etc.)
+    - gpumode: Functional format (custom_kernel/ref_kernel functions)
+    - kernelbench: KernelBench format (ModelNew class)
     Examples:
-        # Basic correctness check
-        wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json
-        # With benchmarking on a specific target
-        wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json \\
-            --target vultr-b200 --benchmark
-        # Full evaluation with defensive timing (detects cheating)
-        wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json \\
-            --benchmark --defensive
+        wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json
+        wafer evaluate kernelbench --impl impl.py --reference ref.py --benchmark
-    Subcommands:
-        gpumode        Use GPUMode format (functional) - RECOMMENDED
-        kernelbench    Use KernelBench format (ModelNew class)
-        make-template  Generate template files for this format (deprecated)
+    Run 'wafer evaluate gpumode --help' or 'wafer evaluate kernelbench --help' for options.
     """
-    # If a subcommand is being invoked, skip the main evaluation logic
+    # If a subcommand is being invoked, skip
     if ctx.invoked_subcommand is not None:
         return
-    # Bare 'wafer evaluate' is no longer supported - must use subcommand
+    # Bare 'wafer evaluate' shows help
     typer.echo("Error: 'wafer evaluate' requires a subcommand.", err=True)
     typer.echo("", err=True)
     typer.echo("Available subcommands:", err=True)
@@ -1622,134 +1582,6 @@ def evaluate(  # noqa: PLR0913
     raise typer.Exit(1)
-TEMPLATE_KERNEL = '''\
-import torch
-import triton
-import triton.language as tl
-@triton.jit
-def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
-    """Triton kernel for element-wise addition."""
-    pid = tl.program_id(0)
-    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
-    mask = offsets < n_elements
-    x = tl.load(x_ptr + offsets, mask=mask)
-    y = tl.load(y_ptr + offsets, mask=mask)
-    tl.store(output_ptr + offsets, x + y, mask=mask)
-def custom_kernel(inputs: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
-    """Your optimized kernel implementation.
-    Args:
-        inputs: Tuple from generate_input() - passed as single argument
-    Returns:
-        Output tensor matching ref_kernel output
-    """
-    x, y = inputs  # Unpack the input tuple
-    output = torch.empty_like(x)
-    n_elements = x.numel()
-    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
-    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
-    return output
-'''
-TEMPLATE_REFERENCE = '''\
-import torch
-def ref_kernel(inputs: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
-    """Ground truth implementation.
-    Args:
-        inputs: Tuple from generate_input() - passed as single argument
-    Returns:
-        Expected output tensor
-    """
-    x, y = inputs  # Unpack the input tuple
-    return x + y
-def generate_input(n: int, seed: int = 42, **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
-    """Generate test inputs based on test case parameters.
-    Called with params from test_cases.json. The returned tuple is passed
-    as a single argument to both ref_kernel and custom_kernel.
-    Args:
-        n: Size of tensors (from test case)
-        seed: Random seed for reproducibility
-        **kwargs: Any other params from test case
-    Returns:
-        Tuple of inputs (passed as single arg to kernels)
-    """
-    torch.manual_seed(seed)
-    x = torch.randn(n, device="cuda", dtype=torch.float32)
-    y = torch.randn(n, device="cuda", dtype=torch.float32)
-    return (x, y)
-'''
-TEMPLATE_TEST_CASES = """\
-[
-  {"name": "small", "n": 1024, "seed": 42},
-  {"name": "medium", "n": 65536, "seed": 42},
-  {"name": "large", "n": 1048576, "seed": 42}
-]
-"""
-@evaluate_app.command("make-template")
-def evaluate_make_template(
-    output_dir: Path = typer.Argument(
-        Path("."),
-        help="Directory to write template files (default: current directory)",
-    ),
-    force: bool = typer.Option(False, "--force", "-f", help="Overwrite existing files"),
-) -> None:
-    """Generate template files for wafer evaluate (functional format).
-    Creates three files:
-    - kernel.py: Implementation template with custom_kernel
-    - reference.py: Reference template with ref_kernel and generate_input
-    - test_cases.json: Test case parameters
-    Examples:
-        wafer evaluate make-template                # Write to current directory
-        wafer evaluate make-template ./my-kernel    # Write to specific directory
-        wafer evaluate make-template --force        # Overwrite existing files
-    """
-    output_dir = output_dir.resolve()
-    output_dir.mkdir(parents=True, exist_ok=True)
-    files = [
-        ("kernel.py", TEMPLATE_KERNEL),
-        ("reference.py", TEMPLATE_REFERENCE),
-        ("test_cases.json", TEMPLATE_TEST_CASES),
-    ]
-    for filename, content in files:
-        path = output_dir / filename
-        if path.exists() and not force:
-            typer.echo(f"Skipping {path} (already exists, use --force to overwrite)")
-            continue
-        path.write_text(content)
-        typer.echo(f"Created {path}")
-    typer.echo("")
-    typer.echo("Next steps:")
-    typer.echo(f"  1. Edit {output_dir / 'kernel.py'} with your optimized implementation")
-    typer.echo(f"  2. Edit {output_dir / 'reference.py'} with the ground truth + input generator")
-    typer.echo(f"  3. Edit {output_dir / 'test_cases.json'} with your test parameters")
-    typer.echo("  4. Run:")
-    typer.echo(f"     wafer evaluate --impl {output_dir / 'kernel.py'} \\")
-    typer.echo(f"         --reference {output_dir / 'reference.py'} \\")
-    typer.echo(f"         --test-cases {output_dir / 'test_cases.json'} --benchmark")
 # =============================================================================
 # KernelBench format evaluation
 # =============================================================================

{wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/cli_instructions.py RENAMED Viewed

@@ -5,19 +5,24 @@ matching the bash_allowlist. This ensures agent instructions stay in sync
 with the CLI — the --help text is the single source of truth for both
 human users and AI agents.
+Also generates help text for wafer agent templates (e.g., "wafer agent -t ask-docs")
+by loading template metadata from the template registry.
 Usage:
     from wafer.cli_instructions import build_cli_instructions
     instructions = build_cli_instructions([
         "wafer evaluate",
         "wafer nvidia ncu",
-        "wafer rocprof profile",
+        "wafer agent -t ask-docs",
         "python",  # non-wafer commands are skipped
     ])
 """
 from __future__ import annotations
+import re
 import click
 import typer.main
@@ -133,11 +138,65 @@ def build_cli_instructions(bash_allowlist: list[str]) -> str:
             continue
         sections.append(_format_command_help(cmd_str, cmd))
-    if not sections:
+    # Also generate help for agent templates
+    template_help = _build_template_instructions(bash_allowlist)
+    if not sections and not template_help:
         return ""
     header = (
         "## Wafer CLI Commands\n\n"
         "You do not have a local GPU. Use the wafer CLI to run on remote GPU hardware.\n"
     )
-    return header + "\n\n".join(sections)
+    result = header + "\n\n".join(sections)
+    if template_help:
+        result += "\n\n" + template_help
+    return result
+def _build_template_instructions(bash_allowlist: list[str]) -> str:
+    """Generate help text for wafer agent templates in the allowlist.
+    Looks for commands matching "wafer agent -t <template>" or
+    "wafer agent --template <template>" and loads their descriptions
+    from the template registry.
+    Args:
+        bash_allowlist: List of allowed bash command prefixes.
+    Returns:
+        Markdown-formatted template help, or empty string if no templates found.
+    """
+    # Match patterns like "wafer agent -t ask-docs" or "wafer agent --template ask-docs"
+    template_pattern = re.compile(r"wafer agent\s+(?:-t|--template)\s+(\S+)")
+    template_names = []
+    for cmd in bash_allowlist:
+        match = template_pattern.match(cmd)
+        if match:
+            template_names.append((cmd, match.group(1)))
+    if not template_names:
+        return ""
+    # Lazy import to avoid circular deps
+    try:
+        from wafer_core.rollouts.templates import load_template
+    except ImportError:
+        return ""
+    sections = []
+    for cmd, template_name in template_names:
+        try:
+            template = load_template(template_name)
+            desc = template.description or f"Run the {template_name} agent template"
+            sections.append(f"### `{cmd}`\n{desc}")
+        except FileNotFoundError:
+            # Template not found — skip silently
+            continue
+    if not sections:
+        return ""
+    return "## Wafer Agent Templates\n\n" + "\n\n".join(sections)

wafer_cli-0.2.48/wafer/templates/audit.py ADDED Viewed

@@ -0,0 +1,120 @@
+"""Template for auditing GPU kernels.
+Usage:
+    wafer agent -t audit --args dir=./my_project --args cmd="make && ./bench" "Find the performance bottleneck"
+    wafer agent -t audit --args dir=. --args cmd="hipcc kernel.hip -o kernel && ./kernel" "Why is this slow?"
+"""
+try:
+    from wafer.agent_defaults import AUDIT_BASH_ALLOWLIST, AUDIT_ENABLED_TOOLS
+except ImportError:
+    # Fallback for when wafer-cli package isn't installed
+    AUDIT_ENABLED_TOOLS = ["read", "glob", "grep", "bash"]
+    AUDIT_BASH_ALLOWLIST = [
+        "ls",
+        "cat",
+        "head",
+        "tail",
+        "wc",
+        "find",
+        "grep",
+        "rg",
+        "pwd",
+        "tree",
+        "which",
+        "diff",
+        "sort",
+        "mkdir",
+        "make",
+        "cmake",
+        "nvcc",
+        "hipcc",
+        "g++",
+        "gcc",
+        "clang",
+        "python",
+        "python3",
+        "./",
+        "wafer evaluate",
+        "wafer nvidia ncu",
+        "wafer nvidia nsys",
+        "wafer amd rocprof-compute",
+        "wafer amd rocprof-sdk",
+        "wafer amd rocprof-systems",
+        "wafer compiler-analyze",
+        "wafer agent -t ask-docs",
+        "timeout",
+    ]
+try:
+    from wafer_core.rollouts.templates import TemplateConfig
+except ImportError:
+    from rollouts.templates import TemplateConfig
+template = TemplateConfig(
+    # Identity
+    name="audit",
+    description="Audit GPU kernels for performance issues, correctness bugs, and optimization opportunities",
+    # System prompt
+    system_prompt="""You are a GPU kernel auditing expert. Your task is to analyze kernel code, identify problems, and explain what's wrong and how to fix it.
+Working directory: $dir
+Build/run command: $cmd
+## Strategy
+1. Read the kernel source code to understand what it does
+2. Run the build/run command to compile and execute:
+   ```bash
+   $cmd
+   ```
+3. Analyze the output for errors, warnings, or performance data
+4. For architectural questions about the target GPU (AMD MI300X, NVIDIA H100, etc.), query the documentation:
+   ```bash
+   wafer agent -t ask-docs --corpus amd "your question about MI300X architecture"
+   wafer agent -t ask-docs --corpus cuda "your question about NVIDIA/CUDA"
+   ```
+   Use this for: wave/warp scheduling, occupancy limits, LDS/shared memory sizing, memory hierarchy, instruction throughput, XCD/GCD topology, MFMA/tensor core specifics.
+5. Identify concrete issues in the code:
+   - Correctness bugs (race conditions, out-of-bounds, incorrect results)
+   - Performance problems (uncoalesced memory access, bank conflicts, low occupancy, warp divergence)
+   - Architectural mismatches (tile sizes vs hardware limits, missing pipelining, suboptimal wave utilization)
+   - Missed optimization opportunities (producer-consumer patterns, software pipelining, wave specialization)
+6. For each issue, explain:
+   - What the problem is
+   - Where in the code it occurs (file + line)
+   - Why it matters (quantify impact if possible, cite architecture specs)
+   - How to fix it (concrete code change, not hand-waving)
+## Output
+Produce a structured audit report:
+1. Summary (one paragraph)
+2. Issues found (ranked by severity/impact)
+3. Suggested fixes (concrete, actionable)
+Be specific. "Use shared memory" is not useful. "Lines 45-62: the inner loop loads A[k][threadIdx.x] from global memory on every iteration. Tile this into shared memory with a 32x32 block to reduce global loads by 32x" is useful.
+Focus on architectural issues, not just micro-optimizations:
+- Is the tile size appropriate for the target GPU's wave/warp structure?
+- Is there opportunity for pipelining or overlapping memory and compute?
+- Could wave/warp specialization (producer-consumer pattern) help?
+- Are occupancy limits being hit due to register or LDS/shared memory pressure?
+IMPORTANT: Ground every claim in evidence from the code, profiler output, or architecture documentation. Use ask-docs for architectural facts you're unsure about.""",
+    # Tools - read-only plus bash for compilation/profiling
+    tools=AUDIT_ENABLED_TOOLS,
+    bash_allowlist=AUDIT_BASH_ALLOWLIST,
+    # Model config - use thinking for deep analysis
+    model="anthropic/claude-sonnet-4-5-20250929",
+    max_tokens=16384,
+    thinking=True,
+    thinking_budget=10000,
+    # Multi-turn for follow-up questions
+    single_turn=False,
+    # Template variables
+    defaults={
+        "dir": ".",
+        "cmd": "echo 'No build command provided'",
+    },
+)

{wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer_cli.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wafer-cli
-Version: 0.2.46
+Version: 0.2.48
 Summary: CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown

{wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer_cli.egg-info/SOURCES.txt RENAMED Viewed

@@ -57,6 +57,7 @@ wafer/skills/wafer-guide/SKILL.md
 wafer/templates/__init__.py
 wafer/templates/aiter_optimize.py
 wafer/templates/ask_docs.py
+wafer/templates/audit.py
 wafer/templates/optimize_kernel.py
 wafer/templates/optimize_kernelbench.py
 wafer/templates/optimize_vllm.py