wafer-cli 0.2.32__tar.gz → 0.2.34__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer_cli-0.2.34/PKG-INFO +260 -0
- wafer_cli-0.2.34/README.md +242 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/pyproject.toml +1 -1
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_analytics.py +2 -2
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_billing.py +15 -15
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_cli_coverage.py +47 -1
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_cli_parity_integration.py +47 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/GUIDE.md +1 -1
- wafer_cli-0.2.34/wafer/agent_defaults.py +197 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/billing.py +6 -6
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/cli.py +432 -348
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/corpus.py +6 -72
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/evaluate.py +143 -81
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/global_config.py +0 -13
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/kernel_scope.py +1 -1
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/ncu_analyze.py +1 -1
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/nsys_analyze.py +1 -1
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/skills/wafer-guide/SKILL.md +6 -22
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/ssh_keys.py +6 -6
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/targets_ops.py +2 -29
- wafer_cli-0.2.34/wafer/templates/aiter_optimize.py +59 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/templates/optimize_kernel.py +2 -4
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/templates/optimize_kernelbench.py +62 -17
- wafer_cli-0.2.34/wafer/templates/optimize_vllm.py +156 -0
- wafer_cli-0.2.34/wafer/trace_compare.py +183 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/wevin_cli.py +1 -12
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/workspaces.py +8 -8
- wafer_cli-0.2.34/wafer_cli.egg-info/PKG-INFO +260 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer_cli.egg-info/SOURCES.txt +2 -0
- wafer_cli-0.2.32/PKG-INFO +0 -107
- wafer_cli-0.2.32/README.md +0 -89
- wafer_cli-0.2.32/wafer/agent_defaults.py +0 -42
- wafer_cli-0.2.32/wafer/trace_compare.py +0 -274
- wafer_cli-0.2.32/wafer_cli.egg-info/PKG-INFO +0 -107
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/setup.cfg +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_auth.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_config_integration.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_file_operations_integration.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_kernel_scope_cli.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_nsys_analyze.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_nsys_profile.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_output.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_rocprof_compute_integration.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_skill_commands.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_ssh_integration.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_targets_ops.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_wevin_cli.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/tests/test_workflow_integration.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/__init__.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/analytics.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/api_client.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/auth.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/autotuner.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/baseline.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/cli_instructions.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/config.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/gpu_run.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/inference.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/nsys_profile.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/output.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/problems.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/rocprof_compute.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/rocprof_sdk.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/rocprof_systems.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/specs_cli.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/target_lock.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/targets.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/targets_cli.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/templates/__init__.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/templates/ask_docs.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/templates/trace_analyze.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/tests/test_eval_cli_parity.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer/tracelens.py +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer_cli.egg-info/dependency_links.txt +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer_cli.egg-info/entry_points.txt +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer_cli.egg-info/requires.txt +0 -0
- {wafer_cli-0.2.32 → wafer_cli-0.2.34}/wafer_cli.egg-info/top_level.txt +0 -0
wafer_cli-0.2.34/PKG-INFO (new file)
@@ -0,0 +1,260 @@
Metadata-Version: 2.4
Name: wafer-cli
Version: 0.2.34
Summary: CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: typer>=0.12.0
Requires-Dist: trio>=0.24.0
Requires-Dist: trio-asyncio>=0.15.0
Requires-Dist: wafer-core>=0.1.0
Requires-Dist: perfetto>=0.16.0
Requires-Dist: posthog>=3.0.0
Provides-Extra: dev
Requires-Dist: pytest>=8.0.0; extra == "dev"
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
Requires-Dist: diff-cover>=8.0.0; extra == "dev"
Requires-Dist: ruff>=0.4.0; extra == "dev"

# Wafer CLI

Run GPU workloads, optimize kernels, and query GPU documentation.

## Getting Started

```bash
# Install
cd apps/wafer-cli && uv sync

# Use staging (workspaces and other features require staging)
wafer config set api.environment staging

# Login
wafer login

# Run a command on a remote GPU
wafer remote-run -- nvidia-smi
```

## Commands

### `wafer login` / `wafer logout` / `wafer whoami`

Authenticate with GitHub OAuth.

```bash
wafer login   # Opens browser for GitHub OAuth
wafer whoami  # Show current user
wafer logout  # Remove credentials
```

### `wafer remote-run`

Run any command on a remote GPU.

```bash
wafer remote-run -- nvidia-smi
wafer remote-run --upload-dir ./my_code -- python3 train.py
```

### `wafer workspaces`

Create and manage persistent GPU environments.

**Available GPUs:**

- `MI300X` - AMD Instinct MI300X (192GB HBM3, ROCm)
- `B200` - NVIDIA Blackwell B200 (180GB HBM3e, CUDA) - default

```bash
wafer workspaces list
wafer workspaces create my-workspace --gpu B200 --wait  # NVIDIA B200
wafer workspaces create amd-dev --gpu MI300X            # AMD MI300X
wafer workspaces ssh <workspace-id>
wafer workspaces delete <workspace-id>
```

### `wafer agent`

AI assistant for GPU kernel development. Helps with CUDA/Triton optimization, documentation queries, and performance analysis.

```bash
wafer agent "What is TMEM in CuTeDSL?"
wafer agent -s "optimize this kernel" < kernel.py
```

### `wafer evaluate`

Evaluate kernel correctness and performance against a reference implementation.

**Functional format** (default):
```bash
# Generate template files
wafer evaluate make-template ./my-kernel

# Run evaluation
wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json --benchmark
```

The implementation must define `custom_kernel(inputs)`; the reference must define `ref_kernel(inputs)` and `generate_input(**params)`.

**KernelBench format** (ModelNew class):
```bash
# Extract a KernelBench problem as template
wafer evaluate kernelbench make-template level1/1

# Run evaluation
wafer evaluate kernelbench --impl my_kernel.py --reference problem.py --benchmark
```

The implementation must define `class ModelNew(nn.Module)`; the reference must define `class Model`, `get_inputs()`, and `get_init_inputs()`.

### `wafer wevin -t ask-docs`

Query GPU documentation using the docs template.

```bash
wafer wevin -t ask-docs --json -s "What causes bank conflicts in shared memory?"
```

### `wafer corpus`

Download documentation to the local filesystem for agents to search.

```bash
wafer corpus list
wafer corpus download cuda-programming-guide
```

---

## Customization

### `wafer remote-run` options

```bash
wafer remote-run --image pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel -- python3 script.py
wafer remote-run --require-hwc -- ncu --set full python3 bench.py  # Hardware counters for NCU
```

### `wafer evaluate` options

```bash
wafer evaluate --impl k.py --reference r.py --test-cases t.json \
  --target vultr-b200 \
  --benchmark \
  --profile
# --target: pick a specific GPU target
# --benchmark: measure performance
# --profile: enable torch.profiler + NCU
```

### `wafer push` for multi-command workflows

```bash
WORKSPACE=$(wafer push ./project)
wafer remote-run --workspace-id $WORKSPACE -- python3 test1.py
wafer remote-run --workspace-id $WORKSPACE -- python3 test2.py
```

### Profile analysis

```bash
wafer nvidia ncu analyze profile.ncu-rep
wafer nvidia nsys analyze profile.nsys-rep
```

---

## Advanced

### Local targets

Bypass the API and SSH directly to your own GPUs:

```bash
wafer targets list
wafer targets add ./my-gpu.toml
wafer targets default my-gpu
```

### Defensive evaluation

Detect evaluation hacking (stream injection, lazy evaluation, etc.):

```bash
wafer evaluate --impl k.py --reference r.py --test-cases t.json --benchmark --defensive
```

### Other tools

```bash
wafer perfetto <trace.json> --query "SELECT * FROM slice"  # Perfetto SQL queries
wafer capture ./script.py          # Capture execution snapshot
wafer compiler-analyze kernel.ptx  # Analyze PTX/SASS
```

### ROCm profiling (AMD GPUs)

```bash
wafer rocprof-sdk ...
wafer rocprof-systems ...
wafer rocprof-compute ...
```

---

## Shell Completion

Enable tab completion for commands, options, and target names:

```bash
# Install completion (zsh/bash/fish)
wafer --install-completion

# Then restart your terminal, or source your shell config:
source ~/.zshrc  # or ~/.bashrc
```

Now you can tab-complete:

- Commands: `wafer eva<TAB>` → `wafer evaluate`
- Options: `wafer evaluate --<TAB>`
- Target names: `wafer evaluate --target v<TAB>` → `wafer evaluate --target vultr-b200`
- File paths: `wafer evaluate --impl ./<TAB>`

---

## AI Assistant Skills

Install the Wafer CLI skill to make wafer commands discoverable by your AI coding assistant:

```bash
# Install for all supported tools (Claude Code, Codex CLI, Cursor)
wafer skill install

# Install for a specific tool
wafer skill install -t cursor  # Cursor
wafer skill install -t claude  # Claude Code
wafer skill install -t codex   # Codex CLI

# Check installation status
wafer skill status

# Uninstall
wafer skill uninstall
```

### Installing from GitHub (Cursor)

You can also install the skill directly from GitHub in Cursor:

1. Open Cursor Settings (Cmd+Shift+J / Ctrl+Shift+J)
2. Navigate to **Rules** → **Add Rule** → **Remote Rule (Github)**
3. Enter: `https://github.com/wafer-ai/skills`
4. Cursor will automatically discover skills in `.cursor/skills/`

The skill provides comprehensive guidance for GPU kernel development, including documentation lookup, trace analysis, kernel evaluation, and optimization workflows.

---

## Requirements

- Python 3.11+
- GitHub account (for authentication)
wafer_cli-0.2.34/README.md (new file)
@@ -0,0 +1,242 @@
# Wafer CLI

Run GPU workloads, optimize kernels, and query GPU documentation.

## Getting Started

```bash
# Install
cd apps/wafer-cli && uv sync

# Use staging (workspaces and other features require staging)
wafer config set api.environment staging

# Login
wafer login

# Run a command on a remote GPU
wafer remote-run -- nvidia-smi
```

## Commands

### `wafer login` / `wafer logout` / `wafer whoami`

Authenticate with GitHub OAuth.

```bash
wafer login   # Opens browser for GitHub OAuth
wafer whoami  # Show current user
wafer logout  # Remove credentials
```

### `wafer remote-run`

Run any command on a remote GPU.

```bash
wafer remote-run -- nvidia-smi
wafer remote-run --upload-dir ./my_code -- python3 train.py
```

### `wafer workspaces`

Create and manage persistent GPU environments.

**Available GPUs:**

- `MI300X` - AMD Instinct MI300X (192GB HBM3, ROCm)
- `B200` - NVIDIA Blackwell B200 (180GB HBM3e, CUDA) - default

```bash
wafer workspaces list
wafer workspaces create my-workspace --gpu B200 --wait  # NVIDIA B200
wafer workspaces create amd-dev --gpu MI300X            # AMD MI300X
wafer workspaces ssh <workspace-id>
wafer workspaces delete <workspace-id>
```

### `wafer agent`

AI assistant for GPU kernel development. Helps with CUDA/Triton optimization, documentation queries, and performance analysis.

```bash
wafer agent "What is TMEM in CuTeDSL?"
wafer agent -s "optimize this kernel" < kernel.py
```

### `wafer evaluate`

Evaluate kernel correctness and performance against a reference implementation.

**Functional format** (default):
```bash
# Generate template files
wafer evaluate make-template ./my-kernel

# Run evaluation
wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json --benchmark
```

The implementation must define `custom_kernel(inputs)`; the reference must define `ref_kernel(inputs)` and `generate_input(**params)`.
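For orientation, a minimal file pair in this format might look like the following sketch. Only the three required names (`custom_kernel`, `ref_kernel`, `generate_input`) come from the contract above; the tensor shapes, the `size` parameter, and the dict-of-tensors convention are illustrative assumptions:

```python
# ref.py -- reference side (illustrative sketch, not the packaged template)
import torch

def generate_input(size: int = 1024, **params) -> dict:
    # "size" is a hypothetical parameter; real values would come from the
    # --test-cases JSON file.
    return {
        "x": torch.randn(size, device="cuda"),
        "y": torch.randn(size, device="cuda"),
    }

def ref_kernel(inputs: dict) -> torch.Tensor:
    # Ground truth the implementation is checked against.
    return inputs["x"] + inputs["y"]
```

```python
# kernel.py -- implementation under test (illustrative sketch)
import torch

def custom_kernel(inputs: dict) -> torch.Tensor:
    # Must match ref_kernel's output for every generated input; in practice
    # this would dispatch to a custom CUDA/Triton kernel.
    return torch.add(inputs["x"], inputs["y"])
```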
**KernelBench format** (ModelNew class):
```bash
# Extract a KernelBench problem as template
wafer evaluate kernelbench make-template level1/1

# Run evaluation
wafer evaluate kernelbench --impl my_kernel.py --reference problem.py --benchmark
```

The implementation must define `class ModelNew(nn.Module)`; the reference must define `class Model`, `get_inputs()`, and `get_init_inputs()`.
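Concretely, a minimal reference/implementation pair might look like the sketch below; the ReLU body and the input size are illustrative stand-ins, and only the class and function names are fixed by the format:

```python
import torch
import torch.nn as nn

# problem.py -- reference side (illustrative sketch)
class Model(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x)

def get_inputs() -> list:
    # Tensors passed to forward() during evaluation.
    return [torch.randn(4096, device="cuda")]

def get_init_inputs() -> list:
    # Constructor arguments for Model / ModelNew (none here).
    return []
```

```python
import torch
import torch.nn as nn

# my_kernel.py -- implementation under test (illustrative sketch)
class ModelNew(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Stand-in for a custom CUDA/Triton kernel; must match Model's output.
        return torch.clamp_min(x, 0.0)
```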
### `wafer wevin -t ask-docs`

Query GPU documentation using the docs template.

```bash
wafer wevin -t ask-docs --json -s "What causes bank conflicts in shared memory?"
```

### `wafer corpus`

Download documentation to the local filesystem for agents to search.

```bash
wafer corpus list
wafer corpus download cuda-programming-guide
```

---

## Customization

### `wafer remote-run` options

```bash
wafer remote-run --image pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel -- python3 script.py
wafer remote-run --require-hwc -- ncu --set full python3 bench.py  # Hardware counters for NCU
```

### `wafer evaluate` options

```bash
wafer evaluate --impl k.py --reference r.py --test-cases t.json \
  --target vultr-b200 \
  --benchmark \
  --profile
# --target: pick a specific GPU target
# --benchmark: measure performance
# --profile: enable torch.profiler + NCU
```

### `wafer push` for multi-command workflows

```bash
WORKSPACE=$(wafer push ./project)
wafer remote-run --workspace-id $WORKSPACE -- python3 test1.py
wafer remote-run --workspace-id $WORKSPACE -- python3 test2.py
```

### Profile analysis

```bash
wafer nvidia ncu analyze profile.ncu-rep
wafer nvidia nsys analyze profile.nsys-rep
```

---

## Advanced

### Local targets

Bypass the API and SSH directly to your own GPUs:

```bash
wafer targets list
wafer targets add ./my-gpu.toml
wafer targets default my-gpu
```

### Defensive evaluation

Detect evaluation hacking (stream injection, lazy evaluation, etc.):

```bash
wafer evaluate --impl k.py --reference r.py --test-cases t.json --benchmark --defensive
```

### Other tools

```bash
wafer perfetto <trace.json> --query "SELECT * FROM slice"  # Perfetto SQL queries
wafer capture ./script.py          # Capture execution snapshot
wafer compiler-analyze kernel.ptx  # Analyze PTX/SASS
```
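The `--query` flag takes Perfetto trace-processor SQL, so aggregations work as well as `SELECT *`. A sketch, assuming a trace at `trace.json` (the `slice` table's `name` and `dur` columns are standard trace-processor schema):

```bash
# Top 10 slice names by total duration
wafer perfetto trace.json --query \
  "SELECT name, COUNT(*) AS calls, SUM(dur) AS total_dur
   FROM slice GROUP BY name ORDER BY total_dur DESC LIMIT 10"
```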
### ROCm profiling (AMD GPUs)

```bash
wafer rocprof-sdk ...
wafer rocprof-systems ...
wafer rocprof-compute ...
```

---

## Shell Completion

Enable tab completion for commands, options, and target names:

```bash
# Install completion (zsh/bash/fish)
wafer --install-completion

# Then restart your terminal, or source your shell config:
source ~/.zshrc  # or ~/.bashrc
```

Now you can tab-complete:

- Commands: `wafer eva<TAB>` → `wafer evaluate`
- Options: `wafer evaluate --<TAB>`
- Target names: `wafer evaluate --target v<TAB>` → `wafer evaluate --target vultr-b200`
- File paths: `wafer evaluate --impl ./<TAB>`

---

## AI Assistant Skills

Install the Wafer CLI skill to make wafer commands discoverable by your AI coding assistant:

```bash
# Install for all supported tools (Claude Code, Codex CLI, Cursor)
wafer skill install

# Install for a specific tool
wafer skill install -t cursor  # Cursor
wafer skill install -t claude  # Claude Code
wafer skill install -t codex   # Codex CLI

# Check installation status
wafer skill status

# Uninstall
wafer skill uninstall
```

### Installing from GitHub (Cursor)

You can also install the skill directly from GitHub in Cursor:

1. Open Cursor Settings (Cmd+Shift+J / Ctrl+Shift+J)
2. Navigate to **Rules** → **Add Rule** → **Remote Rule (Github)**
3. Enter: `https://github.com/wafer-ai/skills`
4. Cursor will automatically discover skills in `.cursor/skills/`

The skill provides comprehensive guidance for GPU kernel development, including documentation lookup, trace analysis, kernel evaluation, and optimization workflows.

---

## Requirements

- Python 3.11+
- GitHub account (for authentication)
tests/test_analytics.py
@@ -467,7 +467,7 @@ class TestLoginLogoutAnalytics:
             patch("wafer.analytics.track_login") as mock_track_login, \
             patch("wafer.analytics.init_analytics", return_value=True):

-            runner.invoke(app, ["
+            runner.invoke(app, ["login", "--token", "test-token"])

             # track_login should be called
             mock_track_login.assert_called_once_with("test-user-id", "test@example.com")
@@ -484,7 +484,7 @@ class TestLoginLogoutAnalytics:
             patch("wafer.analytics.track_logout") as mock_track_logout, \
             patch("wafer.analytics.init_analytics", return_value=True):

-            result = runner.invoke(app, ["
+            result = runner.invoke(app, ["logout"])

             assert result.exit_code == 0
             mock_track_logout.assert_called_once()
tests/test_billing.py
@@ -210,7 +210,7 @@ class TestBillingUsageCommand:
         )
         mock_client.return_value.__enter__.return_value.get.return_value = mock_response

-        result = runner.invoke(app, ["
+        result = runner.invoke(app, ["billing"])

         assert result.exit_code != 0
         assert "login" in result.output.lower()
@@ -242,7 +242,7 @@ class TestBillingUsageCommand:
         mock_response.raise_for_status.return_value = None
         mock_client.return_value.__enter__.return_value.get.return_value = mock_response

-        result = runner.invoke(app, ["
+        result = runner.invoke(app, ["billing", "--json"])

         assert result.exit_code == 0
         data = json.loads(result.stdout)
@@ -275,7 +275,7 @@ class TestBillingUsageCommand:
         mock_response.raise_for_status.return_value = None
         mock_client.return_value.__enter__.return_value.get.return_value = mock_response

-        result = runner.invoke(app, ["
+        result = runner.invoke(app, ["billing"])

         assert result.exit_code == 0
         assert "Pro" in result.output
@@ -294,7 +294,7 @@ class TestBillingUsageCommand:
             httpx.RequestError("Connection failed")
         )

-        result = runner.invoke(app, ["
+        result = runner.invoke(app, ["billing"])

         assert result.exit_code != 0
         assert "error" in result.output.lower() or "reach" in result.output.lower()
@@ -317,7 +317,7 @@ class TestBillingTopupCommand:
         )
         mock_client.return_value.__enter__.return_value.post.return_value = mock_response

-        result = runner.invoke(app, ["
+        result = runner.invoke(app, ["billing", "topup"])

         assert result.exit_code != 0
         assert "login" in result.output.lower()
@@ -343,7 +343,7 @@ class TestBillingTopupCommand:
         mock_client.return_value.__enter__.return_value.post.return_value = mock_response

         with patch("webbrowser.open") as mock_browser:
-            result = runner.invoke(app, ["
+            result = runner.invoke(app, ["billing", "topup"])

             assert result.exit_code == 0
             # Verify $25 = 2500 cents was sent
@@ -372,7 +372,7 @@ class TestBillingTopupCommand:
         mock_client.return_value.__enter__.return_value.post.return_value = mock_response

         with patch("webbrowser.open") as mock_browser:
-            result = runner.invoke(app, ["
+            result = runner.invoke(app, ["billing", "topup", "100"])

             assert result.exit_code == 0
             call_args = mock_client.return_value.__enter__.return_value.post.call_args
@@ -381,14 +381,14 @@ class TestBillingTopupCommand:

     def test_amount_below_minimum(self) -> None:
         """Amount below $10 should error."""
-        result = runner.invoke(app, ["
+        result = runner.invoke(app, ["billing", "topup", "5"])

         assert result.exit_code != 0
         assert "10" in result.output  # Should mention minimum

     def test_amount_above_maximum(self) -> None:
         """Amount above $500 should error."""
-        result = runner.invoke(app, ["
+        result = runner.invoke(app, ["billing", "topup", "600"])

         assert result.exit_code != 0
         assert "500" in result.output  # Should mention maximum
@@ -410,7 +410,7 @@ class TestBillingTopupCommand:
         )
         mock_client.return_value.__enter__.return_value.post.return_value = mock_response

-        result = runner.invoke(app, ["
+        result = runner.invoke(app, ["billing", "topup"])

         assert result.exit_code != 0
         assert "upgrade" in result.output.lower() or "portal" in result.output.lower()
@@ -436,7 +436,7 @@ class TestBillingTopupCommand:
         mock_client.return_value.__enter__.return_value.post.return_value = mock_response

         with patch("webbrowser.open") as mock_browser:
-            result = runner.invoke(app, ["
+            result = runner.invoke(app, ["billing", "topup", "--no-browser"])

             assert result.exit_code == 0
             assert "https://checkout.stripe.com/test" in result.output
@@ -460,7 +460,7 @@ class TestBillingPortalCommand:
         )
         mock_client.return_value.__enter__.return_value.post.return_value = mock_response

-        result = runner.invoke(app, ["
+        result = runner.invoke(app, ["billing", "portal"])

         assert result.exit_code != 0
         assert "login" in result.output.lower()
@@ -483,7 +483,7 @@ class TestBillingPortalCommand:
         mock_client.return_value.__enter__.return_value.post.return_value = mock_response

         with patch("webbrowser.open") as mock_browser:
-            result = runner.invoke(app, ["
+            result = runner.invoke(app, ["billing", "portal"])

             assert result.exit_code == 0
             mock_browser.assert_called_once_with("https://billing.stripe.com/test")
@@ -506,7 +506,7 @@ class TestBillingPortalCommand:
         mock_client.return_value.__enter__.return_value.post.return_value = mock_response

         with patch("webbrowser.open") as mock_browser:
-            result = runner.invoke(app, ["
+            result = runner.invoke(app, ["billing", "portal", "--no-browser"])

             assert result.exit_code == 0
             assert "https://billing.stripe.com/test" in result.output
@@ -528,4 +528,4 @@ class TestInsufficientCreditsError:
         message = _friendly_error(402, '{"detail": "Insufficient credits"}', "test-workspace")

         assert "credit" in message.lower()
-        assert "wafer
+        assert "wafer billing" in message.lower()