wafer-cli 0.2.53__tar.gz → 0.2.55__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/PKG-INFO +32 -56
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/README.md +31 -55
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/pyproject.toml +2 -2
- wafer_cli-0.2.55/tests/test_agent_template_discovery.py +500 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_analytics.py +43 -4
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_auth.py +83 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_billing.py +31 -23
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_cli_coverage.py +174 -236
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_cli_parity_integration.py +30 -174
- wafer_cli-0.2.55/tests/test_config_show.py +30 -0
- wafer_cli-0.2.55/tests/test_corpus_lockdown.py +161 -0
- wafer_cli-0.2.55/tests/test_deps.py +94 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_distributed_traces_cli.py +39 -37
- wafer_cli-0.2.55/tests/test_docker_progress.py +161 -0
- wafer_cli-0.2.55/tests/test_evaluate_ux.py +214 -0
- wafer_cli-0.2.55/tests/test_first_run.py +174 -0
- wafer_cli-0.2.55/tests/test_inference.py +32 -0
- wafer_cli-0.2.55/tests/test_json_output.py +458 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_kernel_scope_cli.py +36 -36
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_ncu_run_e2e.py +10 -10
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_ncu_run_local_e2e.py +1 -40
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_nsys_profile.py +2 -2
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_output.py +10 -5
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_rocprof_compute_integration.py +12 -2
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_skill_commands.py +12 -41
- wafer_cli-0.2.55/tests/test_status.py +329 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_targets_ops.py +9 -2
- wafer_cli-0.2.55/tests/test_token_waste.py +217 -0
- wafer_cli-0.2.55/tests/test_ux_improvements.py +575 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_wevin_cli.py +50 -102
- wafer_cli-0.2.55/wafer/GUIDE.md +112 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/agent_defaults.py +96 -198
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/analytics.py +67 -70
- wafer_cli-0.2.55/wafer/api_client.py +10 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/auth.py +1 -100
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/autotuner.py +1 -218
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/baseline.py +11 -117
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/billing.py +32 -22
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/cli.py +1658 -3470
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/cli_instructions.py +18 -56
- wafer_cli-0.2.55/wafer/deps.py +246 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/distributed_traces.py +16 -115
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/evaluate.py +101 -1281
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/global_config.py +5 -112
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/gpu_run.py +30 -73
- wafer_cli-0.2.55/wafer/inference.py +69 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/kernel_scope.py +10 -98
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/ncu_analyze.py +58 -152
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/ncu_run.py +41 -70
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/nsys_analyze.py +18 -234
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/nsys_profile.py +2 -100
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/output.py +30 -40
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/problems.py +1 -83
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/rocprof_compute.py +29 -93
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/rocprof_sdk.py +17 -47
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/rocprof_systems.py +38 -77
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/skills/packed-ops-guide/SKILL.md +3 -3
- wafer_cli-0.2.55/wafer/skills/wafer-guide/SKILL.md +205 -0
- wafer_cli-0.2.55/wafer/skills/wafer-guide/commands.md +125 -0
- wafer_cli-0.2.55/wafer/skills/wafer-guide/evaluate.md +85 -0
- wafer_cli-0.2.55/wafer/skills/wafer-guide/pitfalls.md +8 -0
- wafer_cli-0.2.55/wafer/skills/wafer-guide/profiling.md +47 -0
- wafer_cli-0.2.55/wafer/skills/wafer-guide/workspaces.md +22 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/specs_cli.py +3 -3
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/ssh_keys.py +11 -60
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/targets.py +1 -185
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/targets_cli.py +12 -91
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/targets_ops.py +6 -130
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/templates/aiter_optimize.py +5 -10
- wafer_cli-0.2.55/wafer/templates/ask_docs.py +32 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/templates/audit.py +11 -25
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/templates/optimize_kernel.py +14 -14
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/templates/optimize_kernelbench.py +11 -11
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/templates/optimize_vllm.py +2 -2
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/templates/trace_analyze.py +11 -11
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/trace_compare.py +23 -68
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/tracelens.py +8 -5
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/wevin_cli.py +76 -172
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/workspaces.py +48 -274
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer_cli.egg-info/PKG-INFO +32 -56
- wafer_cli-0.2.55/wafer_cli.egg-info/SOURCES.txt +93 -0
- wafer_cli-0.2.53/tests/test_config_integration.py +0 -50
- wafer_cli-0.2.53/tests/test_workflow_integration.py +0 -147
- wafer_cli-0.2.53/wafer/GUIDE.md +0 -118
- wafer_cli-0.2.53/wafer/api_client.py +0 -195
- wafer_cli-0.2.53/wafer/config.py +0 -105
- wafer_cli-0.2.53/wafer/corpora/amd/amd_instinct_gpu_specs.md +0 -252
- wafer_cli-0.2.53/wafer/corpora/amd/cdna2/01-architecture-overview.md +0 -65
- wafer_cli-0.2.53/wafer/corpora/amd/cdna2/02-matrix-instructions.md +0 -85
- wafer_cli-0.2.53/wafer/corpora/amd/cdna2/README.md +0 -21
- wafer_cli-0.2.53/wafer/corpora/amd/cdna3-isa/01-introduction.md +0 -87
- wafer_cli-0.2.53/wafer/corpora/amd/cdna3-isa/02-program-organization.md +0 -149
- wafer_cli-0.2.53/wafer/corpora/amd/cdna3-isa/03-kernel-state.md +0 -326
- wafer_cli-0.2.53/wafer/corpora/amd/cdna3-isa/04-program-flow-control.md +0 -216
- wafer_cli-0.2.53/wafer/corpora/amd/cdna3-isa/05-scalar-alu.md +0 -263
- wafer_cli-0.2.53/wafer/corpora/amd/cdna3-isa/06-vector-alu.md +0 -277
- wafer_cli-0.2.53/wafer/corpora/amd/cdna3-isa/07-matrix-instructions.md +0 -346
- wafer_cli-0.2.53/wafer/corpora/amd/cdna3-isa/08-scalar-memory.md +0 -145
- wafer_cli-0.2.53/wafer/corpora/amd/cdna3-isa/09-vector-memory.md +0 -247
- wafer_cli-0.2.53/wafer/corpora/amd/cdna3-isa/10-flat-memory.md +0 -227
- wafer_cli-0.2.53/wafer/corpora/amd/cdna3-isa/11-data-share.md +0 -237
- wafer_cli-0.2.53/wafer/corpora/amd/cdna3-isa/README.md +0 -49
- wafer_cli-0.2.53/wafer/corpora/amd/composable-kernel/01-ck-overview.md +0 -217
- wafer_cli-0.2.53/wafer/corpora/amd/hip/01-hip-programming-model.md +0 -143
- wafer_cli-0.2.53/wafer/corpora/amd/hip/02-hip-memory-management.md +0 -183
- wafer_cli-0.2.53/wafer/corpora/amd/hip/03-hip-synchronization.md +0 -211
- wafer_cli-0.2.53/wafer/corpora/amd/hip/04-hip-intrinsics.md +0 -254
- wafer_cli-0.2.53/wafer/corpora/amd/rocm-profiling/01-rocprofiler-overview.md +0 -174
- wafer_cli-0.2.53/wafer/corpora/common/flash-attention/01-flash-attention-overview.md +0 -185
- wafer_cli-0.2.53/wafer/corpora/common/vllm/01-vllm-overview.md +0 -208
- wafer_cli-0.2.53/wafer/corpora/nvidia/blackwell/01-architecture-overview.md +0 -133
- wafer_cli-0.2.53/wafer/corpora/nvidia/cuda-guide/01-cuda-programming-model.md +0 -133
- wafer_cli-0.2.53/wafer/corpora/nvidia/cuda-guide/02-cuda-memory-management.md +0 -202
- wafer_cli-0.2.53/wafer/corpora/nvidia/cuda-guide/03-cuda-best-practices.md +0 -201
- wafer_cli-0.2.53/wafer/corpora/nvidia/cuda-guide/04-cuda-streams-events.md +0 -255
- wafer_cli-0.2.53/wafer/corpora/nvidia/cutlass/01-cutlass-overview.md +0 -165
- wafer_cli-0.2.53/wafer/corpora/nvidia/hopper/01-overview.md +0 -113
- wafer_cli-0.2.53/wafer/corpora/nvidia/hopper/02-streaming-multiprocessor.md +0 -143
- wafer_cli-0.2.53/wafer/corpora/nvidia/hopper/03-tensor-cores.md +0 -158
- wafer_cli-0.2.53/wafer/corpora/nvidia/hopper/04-memory-hierarchy.md +0 -219
- wafer_cli-0.2.53/wafer/corpora/nvidia/hopper/05-synchronization.md +0 -242
- wafer_cli-0.2.53/wafer/corpora/nvidia/hopper/README.md +0 -40
- wafer_cli-0.2.53/wafer/corpora/nvidia/nsight/01-nsight-compute-overview.md +0 -167
- wafer_cli-0.2.53/wafer/corpora/nvidia/nsight/02-nsight-systems.md +0 -187
- wafer_cli-0.2.53/wafer/corpora/nvidia/ptx-isa/01-ptx-overview.md +0 -169
- wafer_cli-0.2.53/wafer/corpora/nvidia/ptx-isa/02-ptx-tensor-operations.md +0 -179
- wafer_cli-0.2.53/wafer/corpora/nvidia/triton/01-triton-overview.md +0 -203
- wafer_cli-0.2.53/wafer/corpus.py +0 -693
- wafer_cli-0.2.53/wafer/inference.py +0 -148
- wafer_cli-0.2.53/wafer/skills/wafer-guide/SKILL.md +0 -319
- wafer_cli-0.2.53/wafer/templates/ask_docs.py +0 -61
- wafer_cli-0.2.53/wafer_cli.egg-info/SOURCES.txt +0 -120
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/setup.cfg +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_file_operations_integration.py +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_ncu_run.py +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_nsys_analyze.py +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/tests/test_ssh_integration.py +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/__init__.py +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/target_lock.py +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/templates/__init__.py +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/templates/optimize_flashinfer.py +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer/tests/test_eval_cli_parity.py +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer_cli.egg-info/dependency_links.txt +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer_cli.egg-info/entry_points.txt +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer_cli.egg-info/requires.txt +0 -0
- {wafer_cli-0.2.53 → wafer_cli-0.2.55}/wafer_cli.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: wafer-cli
|
|
3
|
-
Version: 0.2.53
|
|
3
|
+
Version: 0.2.55
|
|
4
4
|
Summary: CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels
|
|
5
5
|
Requires-Python: >=3.11
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -67,11 +67,11 @@ Create and manage persistent GPU environments.
|
|
|
67
67
|
- `B200` - NVIDIA Blackwell B200 (180GB HBM3e, CUDA) - default
|
|
68
68
|
|
|
69
69
|
```bash
|
|
70
|
-
wafer
|
|
71
|
-
wafer
|
|
72
|
-
wafer
|
|
73
|
-
wafer
|
|
74
|
-
wafer
|
|
70
|
+
wafer target workspace list
|
|
71
|
+
wafer target workspace create my-workspace --gpu B200 --wait # NVIDIA B200
|
|
72
|
+
wafer target workspace create amd-dev --gpu MI300X # AMD MI300X
|
|
73
|
+
wafer target workspace ssh <workspace-id>
|
|
74
|
+
wafer target workspace delete <workspace-id>
|
|
75
75
|
```
|
|
76
76
|
|
|
77
77
|
### `wafer agent`
|
|
@@ -83,17 +83,17 @@ wafer agent "What is TMEM in CuTeDSL?"
|
|
|
83
83
|
wafer agent -s "optimize this kernel" < kernel.py
|
|
84
84
|
```
|
|
85
85
|
|
|
86
|
-
### `wafer
|
|
86
|
+
### `wafer tool eval`
|
|
87
87
|
|
|
88
88
|
Evaluate kernel correctness and performance against a reference implementation.
|
|
89
89
|
|
|
90
90
|
**Functional format** (default):
|
|
91
91
|
```bash
|
|
92
92
|
# Generate template files
|
|
93
|
-
wafer
|
|
93
|
+
wafer tool eval make-template ./my-kernel
|
|
94
94
|
|
|
95
95
|
# Run evaluation
|
|
96
|
-
wafer
|
|
96
|
+
wafer tool eval gpumode --impl kernel.py --reference ref.py --test-cases tests.json --benchmark
|
|
97
97
|
```
|
|
98
98
|
|
|
99
99
|
The implementation must define `custom_kernel(inputs)`, the reference must define `ref_kernel(inputs)` and `generate_input(**params)`.
|
|
@@ -101,64 +101,40 @@ The implementation must define `custom_kernel(inputs)`, the reference must defin
|
|
|
101
101
|
**KernelBench format** (ModelNew class):
|
|
102
102
|
```bash
|
|
103
103
|
# Extract a KernelBench problem as template
|
|
104
|
-
wafer
|
|
104
|
+
wafer tool eval kernelbench make-template level1/1
|
|
105
105
|
|
|
106
106
|
# Run evaluation
|
|
107
|
-
wafer
|
|
107
|
+
wafer tool eval kernelbench --impl my_kernel.py --reference problem.py --benchmark
|
|
108
108
|
```
|
|
109
109
|
|
|
110
110
|
The implementation must define `class ModelNew(nn.Module)`, the reference must define `class Model`, `get_inputs()`, and `get_init_inputs()`.
|
|
111
111
|
|
|
112
|
-
### `wafer
|
|
112
|
+
### `wafer agent -t ask-docs`
|
|
113
113
|
|
|
114
|
-
Query GPU documentation using the docs template.
|
|
114
|
+
Query GPU documentation using the docs template. Uses the `ask_docs` tool to search wafer's documentation corpus via the API.
|
|
115
115
|
|
|
116
116
|
```bash
|
|
117
|
-
wafer
|
|
118
|
-
```
|
|
119
|
-
|
|
120
|
-
### `wafer corpus`
|
|
121
|
-
|
|
122
|
-
Download documentation to local filesystem for agents to search.
|
|
123
|
-
|
|
124
|
-
```bash
|
|
125
|
-
wafer corpus list
|
|
126
|
-
wafer corpus download cuda-programming-guide
|
|
117
|
+
wafer agent -t ask-docs -s "What causes bank conflicts in shared memory?"
|
|
127
118
|
```
|
|
128
119
|
|
|
129
120
|
---
|
|
130
121
|
|
|
131
122
|
## Customization
|
|
132
123
|
|
|
133
|
-
### `wafer
|
|
124
|
+
### `wafer tool eval` options
|
|
134
125
|
|
|
135
126
|
```bash
|
|
136
|
-
wafer
|
|
137
|
-
wafer remote-run --require-hwc -- ncu --set full python3 bench.py # Hardware counters for NCU
|
|
138
|
-
```
|
|
139
|
-
|
|
140
|
-
### `wafer evaluate` options
|
|
141
|
-
|
|
142
|
-
```bash
|
|
143
|
-
wafer evaluate --impl k.py --reference r.py --test-cases t.json \
|
|
127
|
+
wafer tool eval gpumode --impl k.py --reference r.py --test-cases t.json \
|
|
144
128
|
--target vultr-b200 \ # Specific GPU target
|
|
145
129
|
--benchmark \ # Measure performance
|
|
146
130
|
--profile # Enable torch.profiler + NCU
|
|
147
131
|
```
|
|
148
132
|
|
|
149
|
-
### `wafer push` for multi-command workflows
|
|
150
|
-
|
|
151
|
-
```bash
|
|
152
|
-
WORKSPACE=$(wafer push ./project)
|
|
153
|
-
wafer remote-run --workspace-id $WORKSPACE -- python3 test1.py
|
|
154
|
-
wafer remote-run --workspace-id $WORKSPACE -- python3 test2.py
|
|
155
|
-
```
|
|
156
|
-
|
|
157
133
|
### Profile analysis
|
|
158
134
|
|
|
159
135
|
```bash
|
|
160
|
-
wafer
|
|
161
|
-
wafer nvidia nsys analyze profile.nsys-rep
|
|
136
|
+
wafer tool ncu analyze profile.ncu-rep
|
|
137
|
+
wafer tool nsys analyze profile.nsys-rep
|
|
162
138
|
```
|
|
163
139
|
|
|
164
140
|
---
|
|
@@ -170,9 +146,9 @@ wafer nvidia nsys analyze profile.nsys-rep
|
|
|
170
146
|
Bypass the API and SSH directly to your own GPUs:
|
|
171
147
|
|
|
172
148
|
```bash
|
|
173
|
-
wafer
|
|
174
|
-
wafer
|
|
175
|
-
wafer targets default my-gpu
|
|
149
|
+
wafer target config list
|
|
150
|
+
wafer target config add ./my-gpu.toml
|
|
151
|
+
wafer target config default my-gpu
|
|
176
152
|
```
|
|
177
153
|
|
|
178
154
|
### Defensive evaluation
|
|
@@ -180,23 +156,23 @@ wafer targets default my-gpu
|
|
|
180
156
|
Detect evaluation hacking (stream injection, lazy evaluation, etc.):
|
|
181
157
|
|
|
182
158
|
```bash
|
|
183
|
-
wafer
|
|
159
|
+
wafer tool eval gpumode --impl k.py --reference r.py --test-cases t.json --benchmark --defensive
|
|
184
160
|
```
|
|
185
161
|
|
|
186
162
|
### Other tools
|
|
187
163
|
|
|
188
164
|
```bash
|
|
189
|
-
wafer perfetto <trace.json> --query "SELECT * FROM slice" # Perfetto SQL queries
|
|
190
|
-
wafer capture ./script.py # Capture execution snapshot
|
|
191
|
-
wafer compiler-analyze kernel.ptx
|
|
165
|
+
wafer tool perfetto <trace.json> --query "SELECT * FROM slice" # Perfetto SQL queries
|
|
166
|
+
wafer tool capture ./script.py # Capture execution snapshot
|
|
167
|
+
wafer compiler-analyze kernel.ptx # Analyze PTX/SASS
|
|
192
168
|
```
|
|
193
169
|
|
|
194
170
|
### ROCm profiling (AMD GPUs)
|
|
195
171
|
|
|
196
172
|
```bash
|
|
197
|
-
wafer rocprof-sdk ...
|
|
198
|
-
wafer rocprof-systems ...
|
|
199
|
-
wafer rocprof-compute ...
|
|
173
|
+
wafer tool rocprof-sdk ...
|
|
174
|
+
wafer tool rocprof-systems ...
|
|
175
|
+
wafer tool rocprof-compute ...
|
|
200
176
|
```
|
|
201
177
|
|
|
202
178
|
---
|
|
@@ -214,10 +190,10 @@ source ~/.zshrc # or ~/.bashrc
|
|
|
214
190
|
```
|
|
215
191
|
|
|
216
192
|
Now you can tab-complete:
|
|
217
|
-
- Commands: `wafer
|
|
218
|
-
- Options: `wafer
|
|
219
|
-
- Target names: `wafer
|
|
220
|
-
- File paths: `wafer
|
|
193
|
+
- Commands: `wafer tool ev<TAB>` → `wafer tool eval`
|
|
194
|
+
- Options: `wafer tool eval --<TAB>`
|
|
195
|
+
- Target names: `wafer tool eval --target v<TAB>` → `wafer tool eval --target vultr-b200`
|
|
196
|
+
- File paths: `wafer tool eval gpumode --impl ./<TAB>`
|
|
221
197
|
|
|
222
198
|
---
|
|
223
199
|
|
|
@@ -49,11 +49,11 @@ Create and manage persistent GPU environments.
|
|
|
49
49
|
- `B200` - NVIDIA Blackwell B200 (180GB HBM3e, CUDA) - default
|
|
50
50
|
|
|
51
51
|
```bash
|
|
52
|
-
wafer
|
|
53
|
-
wafer
|
|
54
|
-
wafer
|
|
55
|
-
wafer
|
|
56
|
-
wafer
|
|
52
|
+
wafer target workspace list
|
|
53
|
+
wafer target workspace create my-workspace --gpu B200 --wait # NVIDIA B200
|
|
54
|
+
wafer target workspace create amd-dev --gpu MI300X # AMD MI300X
|
|
55
|
+
wafer target workspace ssh <workspace-id>
|
|
56
|
+
wafer target workspace delete <workspace-id>
|
|
57
57
|
```
|
|
58
58
|
|
|
59
59
|
### `wafer agent`
|
|
@@ -65,17 +65,17 @@ wafer agent "What is TMEM in CuTeDSL?"
|
|
|
65
65
|
wafer agent -s "optimize this kernel" < kernel.py
|
|
66
66
|
```
|
|
67
67
|
|
|
68
|
-
### `wafer
|
|
68
|
+
### `wafer tool eval`
|
|
69
69
|
|
|
70
70
|
Evaluate kernel correctness and performance against a reference implementation.
|
|
71
71
|
|
|
72
72
|
**Functional format** (default):
|
|
73
73
|
```bash
|
|
74
74
|
# Generate template files
|
|
75
|
-
wafer
|
|
75
|
+
wafer tool eval make-template ./my-kernel
|
|
76
76
|
|
|
77
77
|
# Run evaluation
|
|
78
|
-
wafer
|
|
78
|
+
wafer tool eval gpumode --impl kernel.py --reference ref.py --test-cases tests.json --benchmark
|
|
79
79
|
```
|
|
80
80
|
|
|
81
81
|
The implementation must define `custom_kernel(inputs)`, the reference must define `ref_kernel(inputs)` and `generate_input(**params)`.
|
|
@@ -83,64 +83,40 @@ The implementation must define `custom_kernel(inputs)`, the reference must defin
|
|
|
83
83
|
**KernelBench format** (ModelNew class):
|
|
84
84
|
```bash
|
|
85
85
|
# Extract a KernelBench problem as template
|
|
86
|
-
wafer
|
|
86
|
+
wafer tool eval kernelbench make-template level1/1
|
|
87
87
|
|
|
88
88
|
# Run evaluation
|
|
89
|
-
wafer
|
|
89
|
+
wafer tool eval kernelbench --impl my_kernel.py --reference problem.py --benchmark
|
|
90
90
|
```
|
|
91
91
|
|
|
92
92
|
The implementation must define `class ModelNew(nn.Module)`, the reference must define `class Model`, `get_inputs()`, and `get_init_inputs()`.
|
|
93
93
|
|
|
94
|
-
### `wafer
|
|
94
|
+
### `wafer agent -t ask-docs`
|
|
95
95
|
|
|
96
|
-
Query GPU documentation using the docs template.
|
|
96
|
+
Query GPU documentation using the docs template. Uses the `ask_docs` tool to search wafer's documentation corpus via the API.
|
|
97
97
|
|
|
98
98
|
```bash
|
|
99
|
-
wafer
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
### `wafer corpus`
|
|
103
|
-
|
|
104
|
-
Download documentation to local filesystem for agents to search.
|
|
105
|
-
|
|
106
|
-
```bash
|
|
107
|
-
wafer corpus list
|
|
108
|
-
wafer corpus download cuda-programming-guide
|
|
99
|
+
wafer agent -t ask-docs -s "What causes bank conflicts in shared memory?"
|
|
109
100
|
```
|
|
110
101
|
|
|
111
102
|
---
|
|
112
103
|
|
|
113
104
|
## Customization
|
|
114
105
|
|
|
115
|
-
### `wafer
|
|
106
|
+
### `wafer tool eval` options
|
|
116
107
|
|
|
117
108
|
```bash
|
|
118
|
-
wafer
|
|
119
|
-
wafer remote-run --require-hwc -- ncu --set full python3 bench.py # Hardware counters for NCU
|
|
120
|
-
```
|
|
121
|
-
|
|
122
|
-
### `wafer evaluate` options
|
|
123
|
-
|
|
124
|
-
```bash
|
|
125
|
-
wafer evaluate --impl k.py --reference r.py --test-cases t.json \
|
|
109
|
+
wafer tool eval gpumode --impl k.py --reference r.py --test-cases t.json \
|
|
126
110
|
--target vultr-b200 \ # Specific GPU target
|
|
127
111
|
--benchmark \ # Measure performance
|
|
128
112
|
--profile # Enable torch.profiler + NCU
|
|
129
113
|
```
|
|
130
114
|
|
|
131
|
-
### `wafer push` for multi-command workflows
|
|
132
|
-
|
|
133
|
-
```bash
|
|
134
|
-
WORKSPACE=$(wafer push ./project)
|
|
135
|
-
wafer remote-run --workspace-id $WORKSPACE -- python3 test1.py
|
|
136
|
-
wafer remote-run --workspace-id $WORKSPACE -- python3 test2.py
|
|
137
|
-
```
|
|
138
|
-
|
|
139
115
|
### Profile analysis
|
|
140
116
|
|
|
141
117
|
```bash
|
|
142
|
-
wafer
|
|
143
|
-
wafer nvidia nsys analyze profile.nsys-rep
|
|
118
|
+
wafer tool ncu analyze profile.ncu-rep
|
|
119
|
+
wafer tool nsys analyze profile.nsys-rep
|
|
144
120
|
```
|
|
145
121
|
|
|
146
122
|
---
|
|
@@ -152,9 +128,9 @@ wafer nvidia nsys analyze profile.nsys-rep
|
|
|
152
128
|
Bypass the API and SSH directly to your own GPUs:
|
|
153
129
|
|
|
154
130
|
```bash
|
|
155
|
-
wafer
|
|
156
|
-
wafer
|
|
157
|
-
wafer targets default my-gpu
|
|
131
|
+
wafer target config list
|
|
132
|
+
wafer target config add ./my-gpu.toml
|
|
133
|
+
wafer target config default my-gpu
|
|
158
134
|
```
|
|
159
135
|
|
|
160
136
|
### Defensive evaluation
|
|
@@ -162,23 +138,23 @@ wafer targets default my-gpu
|
|
|
162
138
|
Detect evaluation hacking (stream injection, lazy evaluation, etc.):
|
|
163
139
|
|
|
164
140
|
```bash
|
|
165
|
-
wafer
|
|
141
|
+
wafer tool eval gpumode --impl k.py --reference r.py --test-cases t.json --benchmark --defensive
|
|
166
142
|
```
|
|
167
143
|
|
|
168
144
|
### Other tools
|
|
169
145
|
|
|
170
146
|
```bash
|
|
171
|
-
wafer perfetto <trace.json> --query "SELECT * FROM slice" # Perfetto SQL queries
|
|
172
|
-
wafer capture ./script.py # Capture execution snapshot
|
|
173
|
-
wafer compiler-analyze kernel.ptx
|
|
147
|
+
wafer tool perfetto <trace.json> --query "SELECT * FROM slice" # Perfetto SQL queries
|
|
148
|
+
wafer tool capture ./script.py # Capture execution snapshot
|
|
149
|
+
wafer compiler-analyze kernel.ptx # Analyze PTX/SASS
|
|
174
150
|
```
|
|
175
151
|
|
|
176
152
|
### ROCm profiling (AMD GPUs)
|
|
177
153
|
|
|
178
154
|
```bash
|
|
179
|
-
wafer rocprof-sdk ...
|
|
180
|
-
wafer rocprof-systems ...
|
|
181
|
-
wafer rocprof-compute ...
|
|
155
|
+
wafer tool rocprof-sdk ...
|
|
156
|
+
wafer tool rocprof-systems ...
|
|
157
|
+
wafer tool rocprof-compute ...
|
|
182
158
|
```
|
|
183
159
|
|
|
184
160
|
---
|
|
@@ -196,10 +172,10 @@ source ~/.zshrc # or ~/.bashrc
|
|
|
196
172
|
```
|
|
197
173
|
|
|
198
174
|
Now you can tab-complete:
|
|
199
|
-
- Commands: `wafer
|
|
200
|
-
- Options: `wafer
|
|
201
|
-
- Target names: `wafer
|
|
202
|
-
- File paths: `wafer
|
|
175
|
+
- Commands: `wafer tool ev<TAB>` → `wafer tool eval`
|
|
176
|
+
- Options: `wafer tool eval --<TAB>`
|
|
177
|
+
- Target names: `wafer tool eval --target v<TAB>` → `wafer tool eval --target vultr-b200`
|
|
178
|
+
- File paths: `wafer tool eval gpumode --impl ./<TAB>`
|
|
203
179
|
|
|
204
180
|
---
|
|
205
181
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "wafer-cli"
|
|
3
|
-
version = "0.2.53"
|
|
3
|
+
version = "0.2.55"
|
|
4
4
|
description = "CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.11"
|
|
@@ -37,7 +37,7 @@ where = ["."]
|
|
|
37
37
|
include = ["wafer*"]
|
|
38
38
|
|
|
39
39
|
[tool.setuptools.package-data]
|
|
40
|
-
wafer = ["GUIDE.md", "skills
|
|
40
|
+
wafer = ["GUIDE.md", "skills/*/*.md"]
|
|
41
41
|
|
|
42
42
|
[tool.ruff]
|
|
43
43
|
line-length = 100
|