wafer-cli 0.2.46__tar.gz → 0.2.48__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/PKG-INFO +1 -1
  2. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/pyproject.toml +1 -1
  3. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/agent_defaults.py +2 -0
  4. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/cli.py +10 -178
  5. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/cli_instructions.py +62 -3
  6. wafer_cli-0.2.48/wafer/templates/audit.py +120 -0
  7. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer_cli.egg-info/PKG-INFO +1 -1
  8. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer_cli.egg-info/SOURCES.txt +1 -0
  9. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/README.md +0 -0
  10. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/setup.cfg +0 -0
  11. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_analytics.py +0 -0
  12. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_auth.py +0 -0
  13. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_billing.py +0 -0
  14. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_cli_coverage.py +0 -0
  15. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_cli_parity_integration.py +0 -0
  16. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_config_integration.py +0 -0
  17. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_file_operations_integration.py +0 -0
  18. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_kernel_scope_cli.py +0 -0
  19. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_nsys_analyze.py +0 -0
  20. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_nsys_profile.py +0 -0
  21. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_output.py +0 -0
  22. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_rocprof_compute_integration.py +0 -0
  23. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_skill_commands.py +0 -0
  24. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_ssh_integration.py +0 -0
  25. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_targets_ops.py +0 -0
  26. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_wevin_cli.py +0 -0
  27. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/tests/test_workflow_integration.py +0 -0
  28. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/GUIDE.md +0 -0
  29. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/__init__.py +0 -0
  30. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/analytics.py +0 -0
  31. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/api_client.py +0 -0
  32. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/auth.py +0 -0
  33. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/autotuner.py +0 -0
  34. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/baseline.py +0 -0
  35. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/billing.py +0 -0
  36. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/config.py +0 -0
  37. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/corpus.py +0 -0
  38. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/evaluate.py +0 -0
  39. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/global_config.py +0 -0
  40. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/gpu_run.py +0 -0
  41. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/inference.py +0 -0
  42. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/kernel_scope.py +0 -0
  43. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/ncu_analyze.py +0 -0
  44. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/nsys_analyze.py +0 -0
  45. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/nsys_profile.py +0 -0
  46. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/output.py +0 -0
  47. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/problems.py +0 -0
  48. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/rocprof_compute.py +0 -0
  49. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/rocprof_sdk.py +0 -0
  50. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/rocprof_systems.py +0 -0
  51. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/skills/wafer-guide/SKILL.md +0 -0
  52. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/specs_cli.py +0 -0
  53. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/ssh_keys.py +0 -0
  54. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/target_lock.py +0 -0
  55. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/targets.py +0 -0
  56. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/targets_cli.py +0 -0
  57. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/targets_ops.py +0 -0
  58. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/templates/__init__.py +0 -0
  59. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/templates/aiter_optimize.py +0 -0
  60. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/templates/ask_docs.py +0 -0
  61. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/templates/optimize_kernel.py +0 -0
  62. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/templates/optimize_kernelbench.py +0 -0
  63. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/templates/optimize_vllm.py +0 -0
  64. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/templates/trace_analyze.py +0 -0
  65. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/tests/test_eval_cli_parity.py +0 -0
  66. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/trace_compare.py +0 -0
  67. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/tracelens.py +0 -0
  68. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/wevin_cli.py +0 -0
  69. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer/workspaces.py +0 -0
  70. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer_cli.egg-info/dependency_links.txt +0 -0
  71. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer_cli.egg-info/entry_points.txt +0 -0
  72. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer_cli.egg-info/requires.txt +0 -0
  73. {wafer_cli-0.2.46 → wafer_cli-0.2.48}/wafer_cli.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wafer-cli
3
- Version: 0.2.46
3
+ Version: 0.2.48
4
4
  Summary: CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels
5
5
  Requires-Python: >=3.11
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "wafer-cli"
3
- version = "0.2.46"
3
+ version = "0.2.48"
4
4
  description = "CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -239,6 +239,8 @@ AUDIT_BASH_ALLOWLIST: list[str] = [
239
239
  "wafer amd rocprof-sdk",
240
240
  "wafer amd rocprof-systems",
241
241
  "wafer compiler-analyze",
242
+ # Sub-agents
243
+ "wafer agent -t ask-docs",
242
244
  # Misc
243
245
  "timeout",
244
246
  ]
@@ -1539,65 +1539,25 @@ _make_agent_alias("wevin", "Alias for 'wafer agent'.")
1539
1539
 
1540
1540
 
1541
1541
  @evaluate_app.callback(invoke_without_command=True)
1542
- def evaluate( # noqa: PLR0913
1543
- ctx: typer.Context,
1544
- implementation: Path | None = typer.Option(
1545
- None, "--impl", "-i", help="Path to implementation kernel file"
1546
- ),
1547
- reference: Path | None = typer.Option(
1548
- None, "--reference", help="Path to reference kernel file"
1549
- ),
1550
- test_cases: Path | None = typer.Option(
1551
- None, "--test-cases", help="Path to test cases JSON file"
1552
- ),
1553
- target: str | None = typer.Option(
1554
- None,
1555
- "--target",
1556
- "-t",
1557
- help="GPU target name. See 'wafer config targets list' for available targets.",
1558
- autocompletion=complete_target_name,
1559
- ),
1560
- benchmark: bool = typer.Option(False, "--benchmark", help="Run performance benchmarks"),
1561
- profile: bool = typer.Option(False, "--profile", help="Enable profiling"),
1562
- defensive: bool = typer.Option(
1563
- False, "--defensive", help="Enable defensive timing to detect evaluation hacking"
1564
- ),
1565
- sync_artifacts: bool = typer.Option(
1566
- True, "--sync-artifacts/--no-sync-artifacts", help="Download artifacts"
1567
- ),
1568
- gpu_id: int | None = typer.Option(None, "--gpu-id", help="Override GPU ID"),
1569
- ) -> None:
1570
- """Run kernel evaluation on a remote GPU target.
1542
+ def evaluate(ctx: typer.Context) -> None:
1543
+ """Test kernel correctness and performance.
1571
1544
 
1572
- Uses the functional format: custom_kernel(inputs) and ref_kernel(inputs).
1545
+ Use one of the subcommands to evaluate your kernel:
1573
1546
 
1574
- The evaluation checks:
1575
- 1. Correctness: Does the kernel produce the same output as the reference?
1576
- 2. Performance (--benchmark): How fast is it compared to the reference?
1577
- 3. Defense (--defensive): Detects evaluation hacking (stream injection, etc.)
1547
+ - gpumode: Functional format (custom_kernel/ref_kernel functions)
1548
+ - kernelbench: KernelBench format (ModelNew class)
1578
1549
 
1579
1550
  Examples:
1580
- # Basic correctness check
1581
- wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json
1582
-
1583
- # With benchmarking on a specific target
1584
- wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json \\
1585
- --target vultr-b200 --benchmark
1586
-
1587
- # Full evaluation with defensive timing (detects cheating)
1588
- wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json \\
1589
- --benchmark --defensive
1551
+ wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json
1552
+ wafer evaluate kernelbench --impl impl.py --reference ref.py --benchmark
1590
1553
 
1591
- Subcommands:
1592
- gpumode Use GPUMode format (functional) - RECOMMENDED
1593
- kernelbench Use KernelBench format (ModelNew class)
1594
- make-template Generate template files for this format (deprecated)
1554
+ Run 'wafer evaluate gpumode --help' or 'wafer evaluate kernelbench --help' for options.
1595
1555
  """
1596
- # If a subcommand is being invoked, skip the main evaluation logic
1556
+ # If a subcommand is being invoked, skip
1597
1557
  if ctx.invoked_subcommand is not None:
1598
1558
  return
1599
1559
 
1600
- # Bare 'wafer evaluate' is no longer supported - must use subcommand
1560
+ # Bare 'wafer evaluate' shows help
1601
1561
  typer.echo("Error: 'wafer evaluate' requires a subcommand.", err=True)
1602
1562
  typer.echo("", err=True)
1603
1563
  typer.echo("Available subcommands:", err=True)
@@ -1622,134 +1582,6 @@ def evaluate( # noqa: PLR0913
1622
1582
  raise typer.Exit(1)
1623
1583
 
1624
1584
 
1625
- TEMPLATE_KERNEL = '''\
1626
- import torch
1627
- import triton
1628
- import triton.language as tl
1629
-
1630
-
1631
- @triton.jit
1632
- def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
1633
- """Triton kernel for element-wise addition."""
1634
- pid = tl.program_id(0)
1635
- offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
1636
- mask = offsets < n_elements
1637
- x = tl.load(x_ptr + offsets, mask=mask)
1638
- y = tl.load(y_ptr + offsets, mask=mask)
1639
- tl.store(output_ptr + offsets, x + y, mask=mask)
1640
-
1641
-
1642
- def custom_kernel(inputs: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
1643
- """Your optimized kernel implementation.
1644
-
1645
- Args:
1646
- inputs: Tuple from generate_input() - passed as single argument
1647
-
1648
- Returns:
1649
- Output tensor matching ref_kernel output
1650
- """
1651
- x, y = inputs # Unpack the input tuple
1652
- output = torch.empty_like(x)
1653
- n_elements = x.numel()
1654
- grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
1655
- add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
1656
- return output
1657
- '''
1658
-
1659
- TEMPLATE_REFERENCE = '''\
1660
- import torch
1661
-
1662
-
1663
- def ref_kernel(inputs: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
1664
- """Ground truth implementation.
1665
-
1666
- Args:
1667
- inputs: Tuple from generate_input() - passed as single argument
1668
-
1669
- Returns:
1670
- Expected output tensor
1671
- """
1672
- x, y = inputs # Unpack the input tuple
1673
- return x + y
1674
-
1675
-
1676
- def generate_input(n: int, seed: int = 42, **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
1677
- """Generate test inputs based on test case parameters.
1678
-
1679
- Called with params from test_cases.json. The returned tuple is passed
1680
- as a single argument to both ref_kernel and custom_kernel.
1681
-
1682
- Args:
1683
- n: Size of tensors (from test case)
1684
- seed: Random seed for reproducibility
1685
- **kwargs: Any other params from test case
1686
-
1687
- Returns:
1688
- Tuple of inputs (passed as single arg to kernels)
1689
- """
1690
- torch.manual_seed(seed)
1691
- x = torch.randn(n, device="cuda", dtype=torch.float32)
1692
- y = torch.randn(n, device="cuda", dtype=torch.float32)
1693
- return (x, y)
1694
- '''
1695
-
1696
- TEMPLATE_TEST_CASES = """\
1697
- [
1698
- {"name": "small", "n": 1024, "seed": 42},
1699
- {"name": "medium", "n": 65536, "seed": 42},
1700
- {"name": "large", "n": 1048576, "seed": 42}
1701
- ]
1702
- """
1703
-
1704
-
1705
- @evaluate_app.command("make-template")
1706
- def evaluate_make_template(
1707
- output_dir: Path = typer.Argument(
1708
- Path("."),
1709
- help="Directory to write template files (default: current directory)",
1710
- ),
1711
- force: bool = typer.Option(False, "--force", "-f", help="Overwrite existing files"),
1712
- ) -> None:
1713
- """Generate template files for wafer evaluate (functional format).
1714
-
1715
- Creates three files:
1716
- - kernel.py: Implementation template with custom_kernel
1717
- - reference.py: Reference template with ref_kernel and generate_input
1718
- - test_cases.json: Test case parameters
1719
-
1720
- Examples:
1721
- wafer evaluate make-template # Write to current directory
1722
- wafer evaluate make-template ./my-kernel # Write to specific directory
1723
- wafer evaluate make-template --force # Overwrite existing files
1724
- """
1725
- output_dir = output_dir.resolve()
1726
- output_dir.mkdir(parents=True, exist_ok=True)
1727
-
1728
- files = [
1729
- ("kernel.py", TEMPLATE_KERNEL),
1730
- ("reference.py", TEMPLATE_REFERENCE),
1731
- ("test_cases.json", TEMPLATE_TEST_CASES),
1732
- ]
1733
-
1734
- for filename, content in files:
1735
- path = output_dir / filename
1736
- if path.exists() and not force:
1737
- typer.echo(f"Skipping {path} (already exists, use --force to overwrite)")
1738
- continue
1739
- path.write_text(content)
1740
- typer.echo(f"Created {path}")
1741
-
1742
- typer.echo("")
1743
- typer.echo("Next steps:")
1744
- typer.echo(f" 1. Edit {output_dir / 'kernel.py'} with your optimized implementation")
1745
- typer.echo(f" 2. Edit {output_dir / 'reference.py'} with the ground truth + input generator")
1746
- typer.echo(f" 3. Edit {output_dir / 'test_cases.json'} with your test parameters")
1747
- typer.echo(" 4. Run:")
1748
- typer.echo(f" wafer evaluate --impl {output_dir / 'kernel.py'} \\")
1749
- typer.echo(f" --reference {output_dir / 'reference.py'} \\")
1750
- typer.echo(f" --test-cases {output_dir / 'test_cases.json'} --benchmark")
1751
-
1752
-
1753
1585
  # =============================================================================
1754
1586
  # KernelBench format evaluation
1755
1587
  # =============================================================================
@@ -5,19 +5,24 @@ matching the bash_allowlist. This ensures agent instructions stay in sync
5
5
  with the CLI — the --help text is the single source of truth for both
6
6
  human users and AI agents.
7
7
 
8
+ Also generates help text for wafer agent templates (e.g., "wafer agent -t ask-docs")
9
+ by loading template metadata from the template registry.
10
+
8
11
  Usage:
9
12
  from wafer.cli_instructions import build_cli_instructions
10
13
 
11
14
  instructions = build_cli_instructions([
12
15
  "wafer evaluate",
13
16
  "wafer nvidia ncu",
14
- "wafer rocprof profile",
17
+ "wafer agent -t ask-docs",
15
18
  "python", # non-wafer commands are skipped
16
19
  ])
17
20
  """
18
21
 
19
22
  from __future__ import annotations
20
23
 
24
+ import re
25
+
21
26
  import click
22
27
  import typer.main
23
28
 
@@ -133,11 +138,65 @@ def build_cli_instructions(bash_allowlist: list[str]) -> str:
133
138
  continue
134
139
  sections.append(_format_command_help(cmd_str, cmd))
135
140
 
136
- if not sections:
141
+ # Also generate help for agent templates
142
+ template_help = _build_template_instructions(bash_allowlist)
143
+
144
+ if not sections and not template_help:
137
145
  return ""
138
146
 
139
147
  header = (
140
148
  "## Wafer CLI Commands\n\n"
141
149
  "You do not have a local GPU. Use the wafer CLI to run on remote GPU hardware.\n"
142
150
  )
143
- return header + "\n\n".join(sections)
151
+
152
+ result = header + "\n\n".join(sections)
153
+ if template_help:
154
+ result += "\n\n" + template_help
155
+ return result
156
+
157
+
158
+ def _build_template_instructions(bash_allowlist: list[str]) -> str:
159
+ """Generate help text for wafer agent templates in the allowlist.
160
+
161
+ Looks for commands matching "wafer agent -t <template>" or
162
+ "wafer agent --template <template>" and loads their descriptions
163
+ from the template registry.
164
+
165
+ Args:
166
+ bash_allowlist: List of allowed bash command prefixes.
167
+
168
+ Returns:
169
+ Markdown-formatted template help, or empty string if no templates found.
170
+ """
171
+ # Match patterns like "wafer agent -t ask-docs" or "wafer agent --template ask-docs"
172
+ template_pattern = re.compile(r"wafer agent\s+(?:-t|--template)\s+(\S+)")
173
+
174
+ template_names = []
175
+ for cmd in bash_allowlist:
176
+ match = template_pattern.match(cmd)
177
+ if match:
178
+ template_names.append((cmd, match.group(1)))
179
+
180
+ if not template_names:
181
+ return ""
182
+
183
+ # Lazy import to avoid circular deps
184
+ try:
185
+ from wafer_core.rollouts.templates import load_template
186
+ except ImportError:
187
+ return ""
188
+
189
+ sections = []
190
+ for cmd, template_name in template_names:
191
+ try:
192
+ template = load_template(template_name)
193
+ desc = template.description or f"Run the {template_name} agent template"
194
+ sections.append(f"### `{cmd}`\n{desc}")
195
+ except FileNotFoundError:
196
+ # Template not found — skip silently
197
+ continue
198
+
199
+ if not sections:
200
+ return ""
201
+
202
+ return "## Wafer Agent Templates\n\n" + "\n\n".join(sections)
@@ -0,0 +1,120 @@
1
+ """Template for auditing GPU kernels.
2
+
3
+ Usage:
4
+ wafer agent -t audit --args dir=./my_project --args cmd="make && ./bench" "Find the performance bottleneck"
5
+ wafer agent -t audit --args dir=. --args cmd="hipcc kernel.hip -o kernel && ./kernel" "Why is this slow?"
6
+ """
7
+
8
+ try:
9
+ from wafer.agent_defaults import AUDIT_BASH_ALLOWLIST, AUDIT_ENABLED_TOOLS
10
+ except ImportError:
11
+ # Fallback for when wafer-cli package isn't installed
12
+ AUDIT_ENABLED_TOOLS = ["read", "glob", "grep", "bash"]
13
+ AUDIT_BASH_ALLOWLIST = [
14
+ "ls",
15
+ "cat",
16
+ "head",
17
+ "tail",
18
+ "wc",
19
+ "find",
20
+ "grep",
21
+ "rg",
22
+ "pwd",
23
+ "tree",
24
+ "which",
25
+ "diff",
26
+ "sort",
27
+ "mkdir",
28
+ "make",
29
+ "cmake",
30
+ "nvcc",
31
+ "hipcc",
32
+ "g++",
33
+ "gcc",
34
+ "clang",
35
+ "python",
36
+ "python3",
37
+ "./",
38
+ "wafer evaluate",
39
+ "wafer nvidia ncu",
40
+ "wafer nvidia nsys",
41
+ "wafer amd rocprof-compute",
42
+ "wafer amd rocprof-sdk",
43
+ "wafer amd rocprof-systems",
44
+ "wafer compiler-analyze",
45
+ "wafer agent -t ask-docs",
46
+ "timeout",
47
+ ]
48
+
49
+ try:
50
+ from wafer_core.rollouts.templates import TemplateConfig
51
+ except ImportError:
52
+ from rollouts.templates import TemplateConfig
53
+
54
+ template = TemplateConfig(
55
+ # Identity
56
+ name="audit",
57
+ description="Audit GPU kernels for performance issues, correctness bugs, and optimization opportunities",
58
+ # System prompt
59
+ system_prompt="""You are a GPU kernel auditing expert. Your task is to analyze kernel code, identify problems, and explain what's wrong and how to fix it.
60
+
61
+ Working directory: $dir
62
+ Build/run command: $cmd
63
+
64
+ ## Strategy
65
+
66
+ 1. Read the kernel source code to understand what it does
67
+ 2. Run the build/run command to compile and execute:
68
+ ```bash
69
+ $cmd
70
+ ```
71
+ 3. Analyze the output for errors, warnings, or performance data
72
+ 4. For architectural questions about the target GPU (AMD MI300X, NVIDIA H100, etc.), query the documentation:
73
+ ```bash
74
+ wafer agent -t ask-docs --corpus amd "your question about MI300X architecture"
75
+ wafer agent -t ask-docs --corpus cuda "your question about NVIDIA/CUDA"
76
+ ```
77
+ Use this for: wave/warp scheduling, occupancy limits, LDS/shared memory sizing, memory hierarchy, instruction throughput, XCD/GCD topology, MFMA/tensor core specifics.
78
+ 5. Identify concrete issues in the code:
79
+ - Correctness bugs (race conditions, out-of-bounds, incorrect results)
80
+ - Performance problems (uncoalesced memory access, bank conflicts, low occupancy, warp divergence)
81
+ - Architectural mismatches (tile sizes vs hardware limits, missing pipelining, suboptimal wave utilization)
82
+ - Missed optimization opportunities (producer-consumer patterns, software pipelining, wave specialization)
83
+ 6. For each issue, explain:
84
+ - What the problem is
85
+ - Where in the code it occurs (file + line)
86
+ - Why it matters (quantify impact if possible, cite architecture specs)
87
+ - How to fix it (concrete code change, not hand-waving)
88
+
89
+ ## Output
90
+
91
+ Produce a structured audit report:
92
+ 1. Summary (one paragraph)
93
+ 2. Issues found (ranked by severity/impact)
94
+ 3. Suggested fixes (concrete, actionable)
95
+
96
+ Be specific. "Use shared memory" is not useful. "Lines 45-62: the inner loop loads A[k][threadIdx.x] from global memory on every iteration. Tile this into shared memory with a 32x32 block to reduce global loads by 32x" is useful.
97
+
98
+ Focus on architectural issues, not just micro-optimizations:
99
+ - Is the tile size appropriate for the target GPU's wave/warp structure?
100
+ - Is there opportunity for pipelining or overlapping memory and compute?
101
+ - Could wave/warp specialization (producer-consumer pattern) help?
102
+ - Are occupancy limits being hit due to register or LDS/shared memory pressure?
103
+
104
+ IMPORTANT: Ground every claim in evidence from the code, profiler output, or architecture documentation. Use ask-docs for architectural facts you're unsure about.""",
105
+ # Tools - read-only plus bash for compilation/profiling
106
+ tools=AUDIT_ENABLED_TOOLS,
107
+ bash_allowlist=AUDIT_BASH_ALLOWLIST,
108
+ # Model config - use thinking for deep analysis
109
+ model="anthropic/claude-sonnet-4-5-20250929",
110
+ max_tokens=16384,
111
+ thinking=True,
112
+ thinking_budget=10000,
113
+ # Multi-turn for follow-up questions
114
+ single_turn=False,
115
+ # Template variables
116
+ defaults={
117
+ "dir": ".",
118
+ "cmd": "echo 'No build command provided'",
119
+ },
120
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wafer-cli
3
- Version: 0.2.46
3
+ Version: 0.2.48
4
4
  Summary: CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels
5
5
  Requires-Python: >=3.11
6
6
  Description-Content-Type: text/markdown
@@ -57,6 +57,7 @@ wafer/skills/wafer-guide/SKILL.md
57
57
  wafer/templates/__init__.py
58
58
  wafer/templates/aiter_optimize.py
59
59
  wafer/templates/ask_docs.py
60
+ wafer/templates/audit.py
60
61
  wafer/templates/optimize_kernel.py
61
62
  wafer/templates/optimize_kernelbench.py
62
63
  wafer/templates/optimize_vllm.py
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes