wafer-cli 0.2.45__tar.gz → 0.2.47__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/PKG-INFO +1 -1
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/pyproject.toml +1 -1
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/agent_defaults.py +2 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/cli.py +10 -178
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/cli_instructions.py +62 -3
- wafer_cli-0.2.47/wafer/templates/audit.py +120 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer_cli.egg-info/PKG-INFO +1 -1
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer_cli.egg-info/SOURCES.txt +1 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/README.md +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/setup.cfg +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_analytics.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_auth.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_billing.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_cli_coverage.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_cli_parity_integration.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_config_integration.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_file_operations_integration.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_kernel_scope_cli.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_nsys_analyze.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_nsys_profile.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_output.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_rocprof_compute_integration.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_skill_commands.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_ssh_integration.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_targets_ops.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_wevin_cli.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/tests/test_workflow_integration.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/GUIDE.md +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/__init__.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/analytics.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/api_client.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/auth.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/autotuner.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/baseline.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/billing.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/config.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/corpus.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/evaluate.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/global_config.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/gpu_run.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/inference.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/kernel_scope.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/ncu_analyze.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/nsys_analyze.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/nsys_profile.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/output.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/problems.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/rocprof_compute.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/rocprof_sdk.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/rocprof_systems.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/skills/wafer-guide/SKILL.md +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/specs_cli.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/ssh_keys.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/target_lock.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/targets.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/targets_cli.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/targets_ops.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/templates/__init__.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/templates/aiter_optimize.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/templates/ask_docs.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/templates/optimize_kernel.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/templates/optimize_kernelbench.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/templates/optimize_vllm.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/templates/trace_analyze.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/tests/test_eval_cli_parity.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/trace_compare.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/tracelens.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/wevin_cli.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer/workspaces.py +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer_cli.egg-info/dependency_links.txt +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer_cli.egg-info/entry_points.txt +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer_cli.egg-info/requires.txt +0 -0
- {wafer_cli-0.2.45 → wafer_cli-0.2.47}/wafer_cli.egg-info/top_level.txt +0 -0
|
@@ -1539,65 +1539,25 @@ _make_agent_alias("wevin", "Alias for 'wafer agent'.")
|
|
|
1539
1539
|
|
|
1540
1540
|
|
|
1541
1541
|
@evaluate_app.callback(invoke_without_command=True)
|
|
1542
|
-
def evaluate(
|
|
1543
|
-
|
|
1544
|
-
implementation: Path | None = typer.Option(
|
|
1545
|
-
None, "--impl", "-i", help="Path to implementation kernel file"
|
|
1546
|
-
),
|
|
1547
|
-
reference: Path | None = typer.Option(
|
|
1548
|
-
None, "--reference", help="Path to reference kernel file"
|
|
1549
|
-
),
|
|
1550
|
-
test_cases: Path | None = typer.Option(
|
|
1551
|
-
None, "--test-cases", help="Path to test cases JSON file"
|
|
1552
|
-
),
|
|
1553
|
-
target: str | None = typer.Option(
|
|
1554
|
-
None,
|
|
1555
|
-
"--target",
|
|
1556
|
-
"-t",
|
|
1557
|
-
help="GPU target name. See 'wafer config targets list' for available targets.",
|
|
1558
|
-
autocompletion=complete_target_name,
|
|
1559
|
-
),
|
|
1560
|
-
benchmark: bool = typer.Option(False, "--benchmark", help="Run performance benchmarks"),
|
|
1561
|
-
profile: bool = typer.Option(False, "--profile", help="Enable profiling"),
|
|
1562
|
-
defensive: bool = typer.Option(
|
|
1563
|
-
False, "--defensive", help="Enable defensive timing to detect evaluation hacking"
|
|
1564
|
-
),
|
|
1565
|
-
sync_artifacts: bool = typer.Option(
|
|
1566
|
-
True, "--sync-artifacts/--no-sync-artifacts", help="Download artifacts"
|
|
1567
|
-
),
|
|
1568
|
-
gpu_id: int | None = typer.Option(None, "--gpu-id", help="Override GPU ID"),
|
|
1569
|
-
) -> None:
|
|
1570
|
-
"""Run kernel evaluation on a remote GPU target.
|
|
1542
|
+
def evaluate(ctx: typer.Context) -> None:
|
|
1543
|
+
"""Test kernel correctness and performance.
|
|
1571
1544
|
|
|
1572
|
-
|
|
1545
|
+
Use one of the subcommands to evaluate your kernel:
|
|
1573
1546
|
|
|
1574
|
-
|
|
1575
|
-
|
|
1576
|
-
2. Performance (--benchmark): How fast is it compared to the reference?
|
|
1577
|
-
3. Defense (--defensive): Detects evaluation hacking (stream injection, etc.)
|
|
1547
|
+
- gpumode: Functional format (custom_kernel/ref_kernel functions)
|
|
1548
|
+
- kernelbench: KernelBench format (ModelNew class)
|
|
1578
1549
|
|
|
1579
1550
|
Examples:
|
|
1580
|
-
|
|
1581
|
-
wafer evaluate --impl
|
|
1582
|
-
|
|
1583
|
-
# With benchmarking on a specific target
|
|
1584
|
-
wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json \\
|
|
1585
|
-
--target vultr-b200 --benchmark
|
|
1586
|
-
|
|
1587
|
-
# Full evaluation with defensive timing (detects cheating)
|
|
1588
|
-
wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json \\
|
|
1589
|
-
--benchmark --defensive
|
|
1551
|
+
wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json
|
|
1552
|
+
wafer evaluate kernelbench --impl impl.py --reference ref.py --benchmark
|
|
1590
1553
|
|
|
1591
|
-
|
|
1592
|
-
gpumode Use GPUMode format (functional) - RECOMMENDED
|
|
1593
|
-
kernelbench Use KernelBench format (ModelNew class)
|
|
1594
|
-
make-template Generate template files for this format (deprecated)
|
|
1554
|
+
Run 'wafer evaluate gpumode --help' or 'wafer evaluate kernelbench --help' for options.
|
|
1595
1555
|
"""
|
|
1596
|
-
# If a subcommand is being invoked, skip
|
|
1556
|
+
# If a subcommand is being invoked, skip
|
|
1597
1557
|
if ctx.invoked_subcommand is not None:
|
|
1598
1558
|
return
|
|
1599
1559
|
|
|
1600
|
-
# Bare 'wafer evaluate'
|
|
1560
|
+
# Bare 'wafer evaluate' shows help
|
|
1601
1561
|
typer.echo("Error: 'wafer evaluate' requires a subcommand.", err=True)
|
|
1602
1562
|
typer.echo("", err=True)
|
|
1603
1563
|
typer.echo("Available subcommands:", err=True)
|
|
@@ -1622,134 +1582,6 @@ def evaluate( # noqa: PLR0913
|
|
|
1622
1582
|
raise typer.Exit(1)
|
|
1623
1583
|
|
|
1624
1584
|
|
|
1625
|
-
TEMPLATE_KERNEL = '''\
|
|
1626
|
-
import torch
|
|
1627
|
-
import triton
|
|
1628
|
-
import triton.language as tl
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
@triton.jit
|
|
1632
|
-
def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
|
|
1633
|
-
"""Triton kernel for element-wise addition."""
|
|
1634
|
-
pid = tl.program_id(0)
|
|
1635
|
-
offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
|
|
1636
|
-
mask = offsets < n_elements
|
|
1637
|
-
x = tl.load(x_ptr + offsets, mask=mask)
|
|
1638
|
-
y = tl.load(y_ptr + offsets, mask=mask)
|
|
1639
|
-
tl.store(output_ptr + offsets, x + y, mask=mask)
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
def custom_kernel(inputs: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
|
|
1643
|
-
"""Your optimized kernel implementation.
|
|
1644
|
-
|
|
1645
|
-
Args:
|
|
1646
|
-
inputs: Tuple from generate_input() - passed as single argument
|
|
1647
|
-
|
|
1648
|
-
Returns:
|
|
1649
|
-
Output tensor matching ref_kernel output
|
|
1650
|
-
"""
|
|
1651
|
-
x, y = inputs # Unpack the input tuple
|
|
1652
|
-
output = torch.empty_like(x)
|
|
1653
|
-
n_elements = x.numel()
|
|
1654
|
-
grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
|
|
1655
|
-
add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
|
|
1656
|
-
return output
|
|
1657
|
-
'''
|
|
1658
|
-
|
|
1659
|
-
TEMPLATE_REFERENCE = '''\
|
|
1660
|
-
import torch
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
def ref_kernel(inputs: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
|
|
1664
|
-
"""Ground truth implementation.
|
|
1665
|
-
|
|
1666
|
-
Args:
|
|
1667
|
-
inputs: Tuple from generate_input() - passed as single argument
|
|
1668
|
-
|
|
1669
|
-
Returns:
|
|
1670
|
-
Expected output tensor
|
|
1671
|
-
"""
|
|
1672
|
-
x, y = inputs # Unpack the input tuple
|
|
1673
|
-
return x + y
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
def generate_input(n: int, seed: int = 42, **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
|
|
1677
|
-
"""Generate test inputs based on test case parameters.
|
|
1678
|
-
|
|
1679
|
-
Called with params from test_cases.json. The returned tuple is passed
|
|
1680
|
-
as a single argument to both ref_kernel and custom_kernel.
|
|
1681
|
-
|
|
1682
|
-
Args:
|
|
1683
|
-
n: Size of tensors (from test case)
|
|
1684
|
-
seed: Random seed for reproducibility
|
|
1685
|
-
**kwargs: Any other params from test case
|
|
1686
|
-
|
|
1687
|
-
Returns:
|
|
1688
|
-
Tuple of inputs (passed as single arg to kernels)
|
|
1689
|
-
"""
|
|
1690
|
-
torch.manual_seed(seed)
|
|
1691
|
-
x = torch.randn(n, device="cuda", dtype=torch.float32)
|
|
1692
|
-
y = torch.randn(n, device="cuda", dtype=torch.float32)
|
|
1693
|
-
return (x, y)
|
|
1694
|
-
'''
|
|
1695
|
-
|
|
1696
|
-
TEMPLATE_TEST_CASES = """\
|
|
1697
|
-
[
|
|
1698
|
-
{"name": "small", "n": 1024, "seed": 42},
|
|
1699
|
-
{"name": "medium", "n": 65536, "seed": 42},
|
|
1700
|
-
{"name": "large", "n": 1048576, "seed": 42}
|
|
1701
|
-
]
|
|
1702
|
-
"""
|
|
1703
|
-
|
|
1704
|
-
|
|
1705
|
-
@evaluate_app.command("make-template")
|
|
1706
|
-
def evaluate_make_template(
|
|
1707
|
-
output_dir: Path = typer.Argument(
|
|
1708
|
-
Path("."),
|
|
1709
|
-
help="Directory to write template files (default: current directory)",
|
|
1710
|
-
),
|
|
1711
|
-
force: bool = typer.Option(False, "--force", "-f", help="Overwrite existing files"),
|
|
1712
|
-
) -> None:
|
|
1713
|
-
"""Generate template files for wafer evaluate (functional format).
|
|
1714
|
-
|
|
1715
|
-
Creates three files:
|
|
1716
|
-
- kernel.py: Implementation template with custom_kernel
|
|
1717
|
-
- reference.py: Reference template with ref_kernel and generate_input
|
|
1718
|
-
- test_cases.json: Test case parameters
|
|
1719
|
-
|
|
1720
|
-
Examples:
|
|
1721
|
-
wafer evaluate make-template # Write to current directory
|
|
1722
|
-
wafer evaluate make-template ./my-kernel # Write to specific directory
|
|
1723
|
-
wafer evaluate make-template --force # Overwrite existing files
|
|
1724
|
-
"""
|
|
1725
|
-
output_dir = output_dir.resolve()
|
|
1726
|
-
output_dir.mkdir(parents=True, exist_ok=True)
|
|
1727
|
-
|
|
1728
|
-
files = [
|
|
1729
|
-
("kernel.py", TEMPLATE_KERNEL),
|
|
1730
|
-
("reference.py", TEMPLATE_REFERENCE),
|
|
1731
|
-
("test_cases.json", TEMPLATE_TEST_CASES),
|
|
1732
|
-
]
|
|
1733
|
-
|
|
1734
|
-
for filename, content in files:
|
|
1735
|
-
path = output_dir / filename
|
|
1736
|
-
if path.exists() and not force:
|
|
1737
|
-
typer.echo(f"Skipping {path} (already exists, use --force to overwrite)")
|
|
1738
|
-
continue
|
|
1739
|
-
path.write_text(content)
|
|
1740
|
-
typer.echo(f"Created {path}")
|
|
1741
|
-
|
|
1742
|
-
typer.echo("")
|
|
1743
|
-
typer.echo("Next steps:")
|
|
1744
|
-
typer.echo(f" 1. Edit {output_dir / 'kernel.py'} with your optimized implementation")
|
|
1745
|
-
typer.echo(f" 2. Edit {output_dir / 'reference.py'} with the ground truth + input generator")
|
|
1746
|
-
typer.echo(f" 3. Edit {output_dir / 'test_cases.json'} with your test parameters")
|
|
1747
|
-
typer.echo(" 4. Run:")
|
|
1748
|
-
typer.echo(f" wafer evaluate --impl {output_dir / 'kernel.py'} \\")
|
|
1749
|
-
typer.echo(f" --reference {output_dir / 'reference.py'} \\")
|
|
1750
|
-
typer.echo(f" --test-cases {output_dir / 'test_cases.json'} --benchmark")
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
1585
|
# =============================================================================
|
|
1754
1586
|
# KernelBench format evaluation
|
|
1755
1587
|
# =============================================================================
|
|
@@ -5,19 +5,24 @@ matching the bash_allowlist. This ensures agent instructions stay in sync
|
|
|
5
5
|
with the CLI — the --help text is the single source of truth for both
|
|
6
6
|
human users and AI agents.
|
|
7
7
|
|
|
8
|
+
Also generates help text for wafer agent templates (e.g., "wafer agent -t ask-docs")
|
|
9
|
+
by loading template metadata from the template registry.
|
|
10
|
+
|
|
8
11
|
Usage:
|
|
9
12
|
from wafer.cli_instructions import build_cli_instructions
|
|
10
13
|
|
|
11
14
|
instructions = build_cli_instructions([
|
|
12
15
|
"wafer evaluate",
|
|
13
16
|
"wafer nvidia ncu",
|
|
14
|
-
"wafer
|
|
17
|
+
"wafer agent -t ask-docs",
|
|
15
18
|
"python", # non-wafer commands are skipped
|
|
16
19
|
])
|
|
17
20
|
"""
|
|
18
21
|
|
|
19
22
|
from __future__ import annotations
|
|
20
23
|
|
|
24
|
+
import re
|
|
25
|
+
|
|
21
26
|
import click
|
|
22
27
|
import typer.main
|
|
23
28
|
|
|
@@ -133,11 +138,65 @@ def build_cli_instructions(bash_allowlist: list[str]) -> str:
|
|
|
133
138
|
continue
|
|
134
139
|
sections.append(_format_command_help(cmd_str, cmd))
|
|
135
140
|
|
|
136
|
-
|
|
141
|
+
# Also generate help for agent templates
|
|
142
|
+
template_help = _build_template_instructions(bash_allowlist)
|
|
143
|
+
|
|
144
|
+
if not sections and not template_help:
|
|
137
145
|
return ""
|
|
138
146
|
|
|
139
147
|
header = (
|
|
140
148
|
"## Wafer CLI Commands\n\n"
|
|
141
149
|
"You do not have a local GPU. Use the wafer CLI to run on remote GPU hardware.\n"
|
|
142
150
|
)
|
|
143
|
-
|
|
151
|
+
|
|
152
|
+
result = header + "\n\n".join(sections)
|
|
153
|
+
if template_help:
|
|
154
|
+
result += "\n\n" + template_help
|
|
155
|
+
return result
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _build_template_instructions(bash_allowlist: list[str]) -> str:
|
|
159
|
+
"""Generate help text for wafer agent templates in the allowlist.
|
|
160
|
+
|
|
161
|
+
Looks for commands matching "wafer agent -t <template>" or
|
|
162
|
+
"wafer agent --template <template>" and loads their descriptions
|
|
163
|
+
from the template registry.
|
|
164
|
+
|
|
165
|
+
Args:
|
|
166
|
+
bash_allowlist: List of allowed bash command prefixes.
|
|
167
|
+
|
|
168
|
+
Returns:
|
|
169
|
+
Markdown-formatted template help, or empty string if no templates found.
|
|
170
|
+
"""
|
|
171
|
+
# Match patterns like "wafer agent -t ask-docs" or "wafer agent --template ask-docs"
|
|
172
|
+
template_pattern = re.compile(r"wafer agent\s+(?:-t|--template)\s+(\S+)")
|
|
173
|
+
|
|
174
|
+
template_names = []
|
|
175
|
+
for cmd in bash_allowlist:
|
|
176
|
+
match = template_pattern.match(cmd)
|
|
177
|
+
if match:
|
|
178
|
+
template_names.append((cmd, match.group(1)))
|
|
179
|
+
|
|
180
|
+
if not template_names:
|
|
181
|
+
return ""
|
|
182
|
+
|
|
183
|
+
# Lazy import to avoid circular deps
|
|
184
|
+
try:
|
|
185
|
+
from wafer_core.rollouts.templates import load_template
|
|
186
|
+
except ImportError:
|
|
187
|
+
return ""
|
|
188
|
+
|
|
189
|
+
sections = []
|
|
190
|
+
for cmd, template_name in template_names:
|
|
191
|
+
try:
|
|
192
|
+
template = load_template(template_name)
|
|
193
|
+
desc = template.description or f"Run the {template_name} agent template"
|
|
194
|
+
sections.append(f"### `{cmd}`\n{desc}")
|
|
195
|
+
except FileNotFoundError:
|
|
196
|
+
# Template not found — skip silently
|
|
197
|
+
continue
|
|
198
|
+
|
|
199
|
+
if not sections:
|
|
200
|
+
return ""
|
|
201
|
+
|
|
202
|
+
return "## Wafer Agent Templates\n\n" + "\n\n".join(sections)
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Template for auditing GPU kernels.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
wafer agent -t audit --args dir=./my_project --args cmd="make && ./bench" "Find the performance bottleneck"
|
|
5
|
+
wafer agent -t audit --args dir=. --args cmd="hipcc kernel.hip -o kernel && ./kernel" "Why is this slow?"
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from wafer.agent_defaults import AUDIT_BASH_ALLOWLIST, AUDIT_ENABLED_TOOLS
|
|
10
|
+
except ImportError:
|
|
11
|
+
# Fallback for when wafer-cli package isn't installed
|
|
12
|
+
AUDIT_ENABLED_TOOLS = ["read", "glob", "grep", "bash"]
|
|
13
|
+
AUDIT_BASH_ALLOWLIST = [
|
|
14
|
+
"ls",
|
|
15
|
+
"cat",
|
|
16
|
+
"head",
|
|
17
|
+
"tail",
|
|
18
|
+
"wc",
|
|
19
|
+
"find",
|
|
20
|
+
"grep",
|
|
21
|
+
"rg",
|
|
22
|
+
"pwd",
|
|
23
|
+
"tree",
|
|
24
|
+
"which",
|
|
25
|
+
"diff",
|
|
26
|
+
"sort",
|
|
27
|
+
"mkdir",
|
|
28
|
+
"make",
|
|
29
|
+
"cmake",
|
|
30
|
+
"nvcc",
|
|
31
|
+
"hipcc",
|
|
32
|
+
"g++",
|
|
33
|
+
"gcc",
|
|
34
|
+
"clang",
|
|
35
|
+
"python",
|
|
36
|
+
"python3",
|
|
37
|
+
"./",
|
|
38
|
+
"wafer evaluate",
|
|
39
|
+
"wafer nvidia ncu",
|
|
40
|
+
"wafer nvidia nsys",
|
|
41
|
+
"wafer amd rocprof-compute",
|
|
42
|
+
"wafer amd rocprof-sdk",
|
|
43
|
+
"wafer amd rocprof-systems",
|
|
44
|
+
"wafer compiler-analyze",
|
|
45
|
+
"wafer agent -t ask-docs",
|
|
46
|
+
"timeout",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
try:
|
|
50
|
+
from wafer_core.rollouts.templates import TemplateConfig
|
|
51
|
+
except ImportError:
|
|
52
|
+
from rollouts.templates import TemplateConfig
|
|
53
|
+
|
|
54
|
+
template = TemplateConfig(
|
|
55
|
+
# Identity
|
|
56
|
+
name="audit",
|
|
57
|
+
description="Audit GPU kernels for performance issues, correctness bugs, and optimization opportunities",
|
|
58
|
+
# System prompt
|
|
59
|
+
system_prompt="""You are a GPU kernel auditing expert. Your task is to analyze kernel code, identify problems, and explain what's wrong and how to fix it.
|
|
60
|
+
|
|
61
|
+
Working directory: $dir
|
|
62
|
+
Build/run command: $cmd
|
|
63
|
+
|
|
64
|
+
## Strategy
|
|
65
|
+
|
|
66
|
+
1. Read the kernel source code to understand what it does
|
|
67
|
+
2. Run the build/run command to compile and execute:
|
|
68
|
+
```bash
|
|
69
|
+
$cmd
|
|
70
|
+
```
|
|
71
|
+
3. Analyze the output for errors, warnings, or performance data
|
|
72
|
+
4. For architectural questions about the target GPU (AMD MI300X, NVIDIA H100, etc.), query the documentation:
|
|
73
|
+
```bash
|
|
74
|
+
wafer agent -t ask-docs --corpus amd "your question about MI300X architecture"
|
|
75
|
+
wafer agent -t ask-docs --corpus cuda "your question about NVIDIA/CUDA"
|
|
76
|
+
```
|
|
77
|
+
Use this for: wave/warp scheduling, occupancy limits, LDS/shared memory sizing, memory hierarchy, instruction throughput, XCD/GCD topology, MFMA/tensor core specifics.
|
|
78
|
+
5. Identify concrete issues in the code:
|
|
79
|
+
- Correctness bugs (race conditions, out-of-bounds, incorrect results)
|
|
80
|
+
- Performance problems (uncoalesced memory access, bank conflicts, low occupancy, warp divergence)
|
|
81
|
+
- Architectural mismatches (tile sizes vs hardware limits, missing pipelining, suboptimal wave utilization)
|
|
82
|
+
- Missed optimization opportunities (producer-consumer patterns, software pipelining, wave specialization)
|
|
83
|
+
6. For each issue, explain:
|
|
84
|
+
- What the problem is
|
|
85
|
+
- Where in the code it occurs (file + line)
|
|
86
|
+
- Why it matters (quantify impact if possible, cite architecture specs)
|
|
87
|
+
- How to fix it (concrete code change, not hand-waving)
|
|
88
|
+
|
|
89
|
+
## Output
|
|
90
|
+
|
|
91
|
+
Produce a structured audit report:
|
|
92
|
+
1. Summary (one paragraph)
|
|
93
|
+
2. Issues found (ranked by severity/impact)
|
|
94
|
+
3. Suggested fixes (concrete, actionable)
|
|
95
|
+
|
|
96
|
+
Be specific. "Use shared memory" is not useful. "Lines 45-62: the inner loop loads A[k][threadIdx.x] from global memory on every iteration. Tile this into shared memory with a 32x32 block to reduce global loads by 32x" is useful.
|
|
97
|
+
|
|
98
|
+
Focus on architectural issues, not just micro-optimizations:
|
|
99
|
+
- Is the tile size appropriate for the target GPU's wave/warp structure?
|
|
100
|
+
- Is there opportunity for pipelining or overlapping memory and compute?
|
|
101
|
+
- Could wave/warp specialization (producer-consumer pattern) help?
|
|
102
|
+
- Are occupancy limits being hit due to register or LDS/shared memory pressure?
|
|
103
|
+
|
|
104
|
+
IMPORTANT: Ground every claim in evidence from the code, profiler output, or architecture documentation. Use ask-docs for architectural facts you're unsure about.""",
|
|
105
|
+
# Tools - read-only plus bash for compilation/profiling
|
|
106
|
+
tools=AUDIT_ENABLED_TOOLS,
|
|
107
|
+
bash_allowlist=AUDIT_BASH_ALLOWLIST,
|
|
108
|
+
# Model config - use thinking for deep analysis
|
|
109
|
+
model="anthropic/claude-sonnet-4-5-20250929",
|
|
110
|
+
max_tokens=16384,
|
|
111
|
+
thinking=True,
|
|
112
|
+
thinking_budget=10000,
|
|
113
|
+
# Multi-turn for follow-up questions
|
|
114
|
+
single_turn=False,
|
|
115
|
+
# Template variables
|
|
116
|
+
defaults={
|
|
117
|
+
"dir": ".",
|
|
118
|
+
"cmd": "echo 'No build command provided'",
|
|
119
|
+
},
|
|
120
|
+
)
|
|
@@ -57,6 +57,7 @@ wafer/skills/wafer-guide/SKILL.md
|
|
|
57
57
|
wafer/templates/__init__.py
|
|
58
58
|
wafer/templates/aiter_optimize.py
|
|
59
59
|
wafer/templates/ask_docs.py
|
|
60
|
+
wafer/templates/audit.py
|
|
60
61
|
wafer/templates/optimize_kernel.py
|
|
61
62
|
wafer/templates/optimize_kernelbench.py
|
|
62
63
|
wafer/templates/optimize_vllm.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|