wafer-cli 0.2.31__py3-none-any.whl → 0.2.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/GUIDE.md +1 -1
- wafer/agent_defaults.py +157 -2
- wafer/billing.py +6 -6
- wafer/cli.py +432 -346
- wafer/corpus.py +6 -72
- wafer/evaluate.py +143 -81
- wafer/global_config.py +0 -13
- wafer/kernel_scope.py +1 -1
- wafer/ncu_analyze.py +1 -1
- wafer/nsys_analyze.py +1 -1
- wafer/skills/wafer-guide/SKILL.md +6 -22
- wafer/ssh_keys.py +6 -6
- wafer/targets_ops.py +2 -29
- wafer/templates/aiter_optimize.py +59 -0
- wafer/templates/optimize_kernel.py +2 -4
- wafer/templates/optimize_kernelbench.py +62 -17
- wafer/templates/optimize_vllm.py +156 -0
- wafer/trace_compare.py +48 -139
- wafer/wevin_cli.py +1 -12
- wafer/workspaces.py +8 -8
- wafer_cli-0.2.33.dist-info/METADATA +260 -0
- {wafer_cli-0.2.31.dist-info → wafer_cli-0.2.33.dist-info}/RECORD +25 -23
- wafer_cli-0.2.31.dist-info/METADATA +0 -107
- {wafer_cli-0.2.31.dist-info → wafer_cli-0.2.33.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.31.dist-info → wafer_cli-0.2.33.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.31.dist-info → wafer_cli-0.2.33.dist-info}/top_level.txt +0 -0
wafer/GUIDE.md
CHANGED
@@ -7,7 +7,7 @@ GPU development primitives for LLM agents.
 Run code on cloud GPUs instantly with workspaces:
 
 ```bash
-wafer
+wafer login                              # One-time auth
 wafer workspaces create dev --gpu B200   # Create workspace (NVIDIA B200)
 wafer workspaces exec dev -- python -c "import torch; print(torch.cuda.get_device_name(0))"
 wafer workspaces sync dev ./my-project   # Sync files
wafer/agent_defaults.py
CHANGED
@@ -1,8 +1,8 @@
 """Shared agent defaults for kernel optimization tasks.
 
 Single source of truth for bash allowlists and enabled tools used by both:
-- CLI templates (apps/wafer-cli/wafer/templates
-- Eval configs (research/evals
+- CLI templates (apps/wafer-cli/wafer/templates/*.py)
+- Eval configs (research/evals/*_eval/*.py)
 
 Import from here instead of defining your own copy.
 """
@@ -12,6 +12,9 @@ from __future__ import annotations
 # Tools available to the agent (coding environment tools)
 ENABLED_TOOLS: list[str] = ["read", "write", "edit", "glob", "grep", "bash"]
 
+# vLLM-specific tools (same as ENABLED_TOOLS for now)
+VLLM_ENABLED_TOOLS: list[str] = ["read", "write", "edit", "glob", "grep", "bash"]
+
 # Bash commands allowed for kernel optimization agents.
 # Uses prefix matching — "wafer evaluate" also allows "wafer evaluate kernelbench".
 KERNELBENCH_BASH_ALLOWLIST: list[str] = [
@@ -40,3 +43,155 @@ KERNELBENCH_BASH_ALLOWLIST: list[str] = [
     "pwd",
     "which",
 ]
+
+# Tools available to aiter optimization agents (full coding environment)
+AITER_ENABLED_TOOLS: list[str] = ["read", "write", "edit", "glob", "grep", "bash"]
+
+# System prompt for aiter optimization (shared between eval and template)
+# Uses {op_name}, {test_file}, {target_flag} placeholders
+AITER_SYSTEM_PROMPT = """\
+You are a GPU kernel optimization expert specializing in AMD MI300X and the aiter library.
+
+## Context
+
+aiter (ROCm/aiter) is AMD's centralized repository for high-performance AI operators.
+Operators are implemented using Triton kernels, Composable Kernel (CK), or HIP/ROCm.
+
+Each operator has a test in `op_tests/test_{{op}}.py` that validates correctness and
+measures performance against a reference implementation.
+
+## Your Task
+
+1. **Understand the operator**: Read the test file and trace imports to find implementation
+2. **Establish baseline**: Run the evaluation to measure current performance
+   ```bash
+   # Quick check with one shape (fast iteration)
+   wafer evaluate aiter --aiter-dir . --cmd "python op_tests/test_{{op}}.py --mnk 128,32,8192" {target_flag}
+
+   # Full test suite (final validation)
+   wafer evaluate aiter --aiter-dir . --cmd "python op_tests/test_{{op}}.py" {target_flag}
+   ```
+3. **Identify optimizations**: Look for memory access patterns, occupancy, instruction selection
+4. **Implement changes**: Modify the operator to improve performance
+5. **Validate**: Re-run evaluation to verify correctness and measure speedup
+6. **Iterate**: Use quick checks during development, full suite for final validation
+
+## Finding Source Files
+
+The aiter codebase structure varies by operator. To find implementation files:
+
+1. **Start with the test file**: `op_tests/test_{{op}}.py`
+   - Read imports to see what modules are used
+   - Look for the main function being tested
+
+2. **Check common locations** (not all ops have all of these):
+   - `aiter/ops/{{op}}.py` — High-level Python API (some ops)
+   - `aiter/triton_kernels/` — Triton kernel implementations
+   - `csrc/kernels/` — CUDA/HIP kernel implementations
+   - `csrc/py_itfs_cu/` — Python interface CUDA files
+   - `csrc/cktile_*/` — Composable Kernel tile implementations
+
+3. **Search for the op name**:
+   ```bash
+   find . -name "*{{op}}*" -type f | grep -v __pycache__
+   grep -r "def {{function_name}}" aiter/ csrc/ --include="*.py" --include="*.cu"
+   ```
+
+## Key Directories
+
+- `aiter/` — Main package with operator implementations
+- `aiter/ops/` — High-level operator APIs (some ops)
+- `aiter/triton_kernels/` — Triton kernel implementations
+- `csrc/` — C++/CUDA/HIP implementations
+- `op_tests/` — Tests for each operator
+- `aiter/configs/` — Tuned configurations (CSV files)
+
+## Output
+
+Your goal is to produce:
+1. Modified operator code with optimizations
+2. Benchmark results showing correctness and speedup
+3. A summary of what you changed and why
+
+The optimization should be correct (pass the op_test) and faster than baseline."""
+
+# Bash commands allowed for aiter optimization agents.
+AITER_BASH_ALLOWLIST: list[str] = [
+    # Read-only
+    "ls",
+    "cat",
+    "head",
+    "tail",
+    "wc",
+    "find",
+    "grep",
+    "rg",
+    "pwd",
+    "tree",
+    "which",
+    "diff",
+    "sort",
+    # Filesystem
+    "mkdir",
+    "cp",
+    "mv",
+    # Git
+    "git diff",
+    "git status",
+    "git log",
+    # Compilation
+    "hipcc",
+    "g++",
+    "gcc",
+    "clang",
+    "python",
+    "python3",
+    "pip",
+    "pytest",
+    # Execution — allows running compiled binaries and python scripts
+    "./",
+    # Kernel evaluation
+    "wafer evaluate aiter",
+    # Profiling — AMD
+    "wafer amd rocprof-compute",
+    "wafer amd rocprof-sdk",
+    "wafer amd rocprof-systems",
+    "wafer amd isa",
+    # Sub-agents
+    "wafer agent -t ask-docs",
+    # Misc
+    "timeout",
+]
+
+# Bash commands allowed for vLLM kernel optimization agents.
+VLLM_BASH_ALLOWLIST: list[str] = [
+    # vLLM evaluation
+    "wafer evaluate vllm",
+    # vLLM's own test and benchmark commands (run inside vllm dir)
+    "pytest",
+    # Profiling — AMD
+    "wafer amd rocprof-compute",
+    "wafer amd rocprof-sdk",
+    "wafer amd rocprof-systems",
+    # Profiling — NVIDIA
+    "wafer nvidia ncu",
+    "wafer nvidia nsys",
+    # Analysis
+    "wafer compiler-analyze",
+    # Sub-agents
+    "wafer agent -t ask-docs",
+    # General utilities
+    "python",
+    "python3",
+    "pip",
+    "timeout",
+    "ls",
+    "cat",
+    "head",
+    "tail",
+    "wc",
+    "pwd",
+    "which",
+    "cd",
+    "git",
+]
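The allowlists above are consumed by prefix matching (per the comment on KERNELBENCH_BASH_ALLOWLIST), and AITER_SYSTEM_PROMPT doubles its braces (`{{op}}`) so that only `{target_flag}` is substituted at format time. A minimal sketch of both behaviors, assuming only `{target_flag}` appears as a single-brace placeholder in the prompt body; the `is_allowed` helper and the flag value are illustrative, not code shipped in this wheel:

```python
from wafer.agent_defaults import AITER_BASH_ALLOWLIST, AITER_SYSTEM_PROMPT


def is_allowed(command: str, allowlist: list[str]) -> bool:
    """Hypothetical prefix-match check: "wafer evaluate aiter" also permits longer invocations."""
    return any(command.startswith(prefix) for prefix in allowlist)


# Only {target_flag} is filled in; {{op}} collapses to the literal {op} in the rendered prompt.
prompt = AITER_SYSTEM_PROMPT.format(target_flag="--target <gpu>")  # placeholder value, supplied by the template/eval config

assert is_allowed("wafer evaluate aiter --aiter-dir .", AITER_BASH_ALLOWLIST)
assert not is_allowed("rm -rf build/", AITER_BASH_ALLOWLIST)
assert "test_{op}.py" in prompt
```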
wafer/billing.py
CHANGED
@@ -1,6 +1,6 @@
 """Billing CLI - Manage credits and subscription.
 
-This module provides the implementation for the `wafer
+This module provides the implementation for the `wafer billing` subcommand.
 """
 
 import json
@@ -126,7 +126,7 @@ def format_usage_text(usage: dict) -> str:
     lines.extend([
         "",
         "Upgrade to Pro for hardware counters and credit topups:",
-        " wafer
+        " wafer billing portal",
     ])
 
     return "\n".join(lines)
@@ -153,7 +153,7 @@ def get_usage(json_output: bool = False) -> str:
         usage = response.json()
     except httpx.HTTPStatusError as e:
         if e.response.status_code == 401:
-            raise RuntimeError("Not authenticated. Run: wafer
+            raise RuntimeError("Not authenticated. Run: wafer login") from e
         raise RuntimeError(f"API error: {e.response.status_code} - {e.response.text}") from e
     except httpx.RequestError as e:
         raise RuntimeError(f"Could not reach API: {e}") from e
@@ -188,7 +188,7 @@ def create_topup(amount_cents: int) -> dict:
         return response.json()
     except httpx.HTTPStatusError as e:
         if e.response.status_code == 401:
-            raise RuntimeError("Not authenticated. Run: wafer
+            raise RuntimeError("Not authenticated. Run: wafer login") from e
         if e.response.status_code == 400:
             # Invalid amount
             try:
@@ -200,7 +200,7 @@ def create_topup(amount_cents: int) -> dict:
             # Start tier or other restriction
             raise RuntimeError(
                 "Topup not available for your subscription tier.\n"
-                "Upgrade your subscription first: wafer
+                "Upgrade your subscription first: wafer billing portal"
             ) from e
         if e.response.status_code == 503:
             raise RuntimeError("Billing service temporarily unavailable. Please try again later.") from e
@@ -227,7 +227,7 @@ def get_portal_url() -> dict:
         return response.json()
     except httpx.HTTPStatusError as e:
         if e.response.status_code == 401:
-            raise RuntimeError("Not authenticated. Run: wafer
+            raise RuntimeError("Not authenticated. Run: wafer login") from e
         raise RuntimeError(f"API error: {e.response.status_code} - {e.response.text}") from e
     except httpx.RequestError as e:
         raise RuntimeError(f"Could not reach API: {e}") from e