wafer-cli 0.2.52__tar.gz → 0.2.54__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry to which they were published. It is provided for informational purposes only.
Files changed (146)
  1. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/PKG-INFO +32 -56
  2. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/README.md +31 -55
  3. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/pyproject.toml +3 -2
  4. wafer_cli-0.2.54/tests/test_agent_template_discovery.py +500 -0
  5. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_analytics.py +43 -4
  6. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_auth.py +83 -0
  7. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_billing.py +31 -23
  8. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_cli_coverage.py +174 -236
  9. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_cli_parity_integration.py +30 -174
  10. wafer_cli-0.2.54/tests/test_config_show.py +30 -0
  11. wafer_cli-0.2.54/tests/test_corpus_lockdown.py +161 -0
  12. wafer_cli-0.2.54/tests/test_deps.py +94 -0
  13. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_distributed_traces_cli.py +39 -37
  14. wafer_cli-0.2.54/tests/test_docker_progress.py +161 -0
  15. wafer_cli-0.2.54/tests/test_evaluate_ux.py +214 -0
  16. wafer_cli-0.2.54/tests/test_first_run.py +174 -0
  17. wafer_cli-0.2.54/tests/test_inference.py +32 -0
  18. wafer_cli-0.2.54/tests/test_json_output.py +458 -0
  19. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_kernel_scope_cli.py +36 -36
  20. wafer_cli-0.2.54/tests/test_ncu_run.py +337 -0
  21. wafer_cli-0.2.54/tests/test_ncu_run_e2e.py +225 -0
  22. wafer_cli-0.2.54/tests/test_ncu_run_local_e2e.py +176 -0
  23. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_nsys_profile.py +2 -2
  24. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_output.py +10 -5
  25. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_rocprof_compute_integration.py +12 -2
  26. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_skill_commands.py +12 -41
  27. wafer_cli-0.2.54/tests/test_status.py +329 -0
  28. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_targets_ops.py +9 -2
  29. wafer_cli-0.2.54/tests/test_token_waste.py +217 -0
  30. wafer_cli-0.2.54/tests/test_ux_improvements.py +575 -0
  31. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_wevin_cli.py +50 -102
  32. wafer_cli-0.2.54/wafer/GUIDE.md +112 -0
  33. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/agent_defaults.py +96 -198
  34. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/analytics.py +67 -70
  35. wafer_cli-0.2.54/wafer/api_client.py +10 -0
  36. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/auth.py +1 -100
  37. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/autotuner.py +1 -218
  38. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/baseline.py +11 -117
  39. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/billing.py +32 -22
  40. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/cli.py +1709 -3435
  41. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/cli_instructions.py +18 -56
  42. wafer_cli-0.2.54/wafer/deps.py +246 -0
  43. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/distributed_traces.py +16 -115
  44. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/evaluate.py +101 -1281
  45. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/global_config.py +5 -112
  46. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/gpu_run.py +30 -73
  47. wafer_cli-0.2.54/wafer/inference.py +69 -0
  48. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/kernel_scope.py +41 -96
  49. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/ncu_analyze.py +58 -152
  50. wafer_cli-0.2.54/wafer/ncu_run.py +343 -0
  51. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/nsys_analyze.py +18 -234
  52. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/nsys_profile.py +2 -100
  53. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/output.py +30 -40
  54. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/problems.py +1 -83
  55. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/rocprof_compute.py +29 -93
  56. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/rocprof_sdk.py +17 -47
  57. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/rocprof_systems.py +38 -77
  58. wafer_cli-0.2.54/wafer/skills/packed-ops-guide/SKILL.md +212 -0
  59. wafer_cli-0.2.54/wafer/skills/wafer-guide/SKILL.md +205 -0
  60. wafer_cli-0.2.54/wafer/skills/wafer-guide/commands.md +125 -0
  61. wafer_cli-0.2.54/wafer/skills/wafer-guide/evaluate.md +85 -0
  62. wafer_cli-0.2.54/wafer/skills/wafer-guide/pitfalls.md +8 -0
  63. wafer_cli-0.2.54/wafer/skills/wafer-guide/profiling.md +47 -0
  64. wafer_cli-0.2.54/wafer/skills/wafer-guide/workspaces.md +22 -0
  65. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/specs_cli.py +3 -3
  66. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/ssh_keys.py +11 -60
  67. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/targets.py +1 -185
  68. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/targets_cli.py +12 -91
  69. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/targets_ops.py +6 -130
  70. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/templates/aiter_optimize.py +5 -10
  71. wafer_cli-0.2.54/wafer/templates/ask_docs.py +32 -0
  72. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/templates/audit.py +11 -25
  73. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/templates/optimize_kernel.py +14 -14
  74. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/templates/optimize_kernelbench.py +11 -11
  75. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/templates/optimize_vllm.py +2 -2
  76. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/templates/trace_analyze.py +11 -11
  77. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/trace_compare.py +23 -68
  78. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/tracelens.py +8 -5
  79. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/wevin_cli.py +76 -172
  80. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/workspaces.py +48 -274
  81. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer_cli.egg-info/PKG-INFO +32 -56
  82. wafer_cli-0.2.54/wafer_cli.egg-info/SOURCES.txt +93 -0
  83. wafer_cli-0.2.52/tests/test_config_integration.py +0 -50
  84. wafer_cli-0.2.52/tests/test_workflow_integration.py +0 -147
  85. wafer_cli-0.2.52/wafer/GUIDE.md +0 -118
  86. wafer_cli-0.2.52/wafer/api_client.py +0 -195
  87. wafer_cli-0.2.52/wafer/config.py +0 -105
  88. wafer_cli-0.2.52/wafer/corpora/amd/amd_instinct_gpu_specs.md +0 -252
  89. wafer_cli-0.2.52/wafer/corpora/amd/cdna2/01-architecture-overview.md +0 -65
  90. wafer_cli-0.2.52/wafer/corpora/amd/cdna2/02-matrix-instructions.md +0 -85
  91. wafer_cli-0.2.52/wafer/corpora/amd/cdna2/README.md +0 -21
  92. wafer_cli-0.2.52/wafer/corpora/amd/cdna3-isa/01-introduction.md +0 -87
  93. wafer_cli-0.2.52/wafer/corpora/amd/cdna3-isa/02-program-organization.md +0 -149
  94. wafer_cli-0.2.52/wafer/corpora/amd/cdna3-isa/03-kernel-state.md +0 -326
  95. wafer_cli-0.2.52/wafer/corpora/amd/cdna3-isa/04-program-flow-control.md +0 -216
  96. wafer_cli-0.2.52/wafer/corpora/amd/cdna3-isa/05-scalar-alu.md +0 -263
  97. wafer_cli-0.2.52/wafer/corpora/amd/cdna3-isa/06-vector-alu.md +0 -277
  98. wafer_cli-0.2.52/wafer/corpora/amd/cdna3-isa/07-matrix-instructions.md +0 -346
  99. wafer_cli-0.2.52/wafer/corpora/amd/cdna3-isa/08-scalar-memory.md +0 -145
  100. wafer_cli-0.2.52/wafer/corpora/amd/cdna3-isa/09-vector-memory.md +0 -247
  101. wafer_cli-0.2.52/wafer/corpora/amd/cdna3-isa/10-flat-memory.md +0 -227
  102. wafer_cli-0.2.52/wafer/corpora/amd/cdna3-isa/11-data-share.md +0 -237
  103. wafer_cli-0.2.52/wafer/corpora/amd/cdna3-isa/README.md +0 -49
  104. wafer_cli-0.2.52/wafer/corpora/amd/composable-kernel/01-ck-overview.md +0 -217
  105. wafer_cli-0.2.52/wafer/corpora/amd/hip/01-hip-programming-model.md +0 -143
  106. wafer_cli-0.2.52/wafer/corpora/amd/hip/02-hip-memory-management.md +0 -183
  107. wafer_cli-0.2.52/wafer/corpora/amd/hip/03-hip-synchronization.md +0 -211
  108. wafer_cli-0.2.52/wafer/corpora/amd/hip/04-hip-intrinsics.md +0 -254
  109. wafer_cli-0.2.52/wafer/corpora/amd/rocm-profiling/01-rocprofiler-overview.md +0 -174
  110. wafer_cli-0.2.52/wafer/corpora/common/flash-attention/01-flash-attention-overview.md +0 -185
  111. wafer_cli-0.2.52/wafer/corpora/common/vllm/01-vllm-overview.md +0 -208
  112. wafer_cli-0.2.52/wafer/corpora/nvidia/blackwell/01-architecture-overview.md +0 -133
  113. wafer_cli-0.2.52/wafer/corpora/nvidia/cuda-guide/01-cuda-programming-model.md +0 -133
  114. wafer_cli-0.2.52/wafer/corpora/nvidia/cuda-guide/02-cuda-memory-management.md +0 -202
  115. wafer_cli-0.2.52/wafer/corpora/nvidia/cuda-guide/03-cuda-best-practices.md +0 -201
  116. wafer_cli-0.2.52/wafer/corpora/nvidia/cuda-guide/04-cuda-streams-events.md +0 -255
  117. wafer_cli-0.2.52/wafer/corpora/nvidia/cutlass/01-cutlass-overview.md +0 -165
  118. wafer_cli-0.2.52/wafer/corpora/nvidia/hopper/01-overview.md +0 -113
  119. wafer_cli-0.2.52/wafer/corpora/nvidia/hopper/02-streaming-multiprocessor.md +0 -143
  120. wafer_cli-0.2.52/wafer/corpora/nvidia/hopper/03-tensor-cores.md +0 -158
  121. wafer_cli-0.2.52/wafer/corpora/nvidia/hopper/04-memory-hierarchy.md +0 -219
  122. wafer_cli-0.2.52/wafer/corpora/nvidia/hopper/05-synchronization.md +0 -242
  123. wafer_cli-0.2.52/wafer/corpora/nvidia/hopper/README.md +0 -40
  124. wafer_cli-0.2.52/wafer/corpora/nvidia/nsight/01-nsight-compute-overview.md +0 -167
  125. wafer_cli-0.2.52/wafer/corpora/nvidia/nsight/02-nsight-systems.md +0 -187
  126. wafer_cli-0.2.52/wafer/corpora/nvidia/ptx-isa/01-ptx-overview.md +0 -169
  127. wafer_cli-0.2.52/wafer/corpora/nvidia/ptx-isa/02-ptx-tensor-operations.md +0 -179
  128. wafer_cli-0.2.52/wafer/corpora/nvidia/triton/01-triton-overview.md +0 -203
  129. wafer_cli-0.2.52/wafer/corpus.py +0 -693
  130. wafer_cli-0.2.52/wafer/inference.py +0 -148
  131. wafer_cli-0.2.52/wafer/skills/wafer-guide/SKILL.md +0 -319
  132. wafer_cli-0.2.52/wafer/templates/ask_docs.py +0 -61
  133. wafer_cli-0.2.52/wafer_cli.egg-info/SOURCES.txt +0 -115
  134. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/setup.cfg +0 -0
  135. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_file_operations_integration.py +0 -0
  136. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_nsys_analyze.py +0 -0
  137. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/tests/test_ssh_integration.py +0 -0
  138. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/__init__.py +0 -0
  139. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/target_lock.py +0 -0
  140. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/templates/__init__.py +0 -0
  141. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/templates/optimize_flashinfer.py +0 -0
  142. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer/tests/test_eval_cli_parity.py +0 -0
  143. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer_cli.egg-info/dependency_links.txt +0 -0
  144. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer_cli.egg-info/entry_points.txt +0 -0
  145. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer_cli.egg-info/requires.txt +0 -0
  146. {wafer_cli-0.2.52 → wafer_cli-0.2.54}/wafer_cli.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wafer-cli
3
- Version: 0.2.52
3
+ Version: 0.2.54
4
4
  Summary: CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels
5
5
  Requires-Python: >=3.11
6
6
  Description-Content-Type: text/markdown
@@ -67,11 +67,11 @@ Create and manage persistent GPU environments.
67
67
  - `B200` - NVIDIA Blackwell B200 (180GB HBM3e, CUDA) - default
68
68
 
69
69
  ```bash
70
- wafer workspaces list
71
- wafer workspaces create my-workspace --gpu B200 --wait # NVIDIA B200
72
- wafer workspaces create amd-dev --gpu MI300X # AMD MI300X
73
- wafer workspaces ssh <workspace-id>
74
- wafer workspaces delete <workspace-id>
70
+ wafer target workspace list
71
+ wafer target workspace create my-workspace --gpu B200 --wait # NVIDIA B200
72
+ wafer target workspace create amd-dev --gpu MI300X # AMD MI300X
73
+ wafer target workspace ssh <workspace-id>
74
+ wafer target workspace delete <workspace-id>
75
75
  ```
76
76
 
77
77
  ### `wafer agent`
@@ -83,17 +83,17 @@ wafer agent "What is TMEM in CuTeDSL?"
83
83
  wafer agent -s "optimize this kernel" < kernel.py
84
84
  ```
85
85
 
86
- ### `wafer evaluate`
86
+ ### `wafer tool eval`
87
87
 
88
88
  Evaluate kernel correctness and performance against a reference implementation.
89
89
 
90
90
  **Functional format** (default):
91
91
  ```bash
92
92
  # Generate template files
93
- wafer evaluate make-template ./my-kernel
93
+ wafer tool eval make-template ./my-kernel
94
94
 
95
95
  # Run evaluation
96
- wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json --benchmark
96
+ wafer tool eval gpumode --impl kernel.py --reference ref.py --test-cases tests.json --benchmark
97
97
  ```
98
98
 
99
99
  The implementation must define `custom_kernel(inputs)`, the reference must define `ref_kernel(inputs)` and `generate_input(**params)`.
@@ -101,64 +101,40 @@ The implementation must define `custom_kernel(inputs)`, the reference must defin
101
101
  **KernelBench format** (ModelNew class):
102
102
  ```bash
103
103
  # Extract a KernelBench problem as template
104
- wafer evaluate kernelbench make-template level1/1
104
+ wafer tool eval kernelbench make-template level1/1
105
105
 
106
106
  # Run evaluation
107
- wafer evaluate kernelbench --impl my_kernel.py --reference problem.py --benchmark
107
+ wafer tool eval kernelbench --impl my_kernel.py --reference problem.py --benchmark
108
108
  ```
109
109
 
110
110
  The implementation must define `class ModelNew(nn.Module)`, the reference must define `class Model`, `get_inputs()`, and `get_init_inputs()`.
111
111
 
112
- ### `wafer wevin -t ask-docs`
112
+ ### `wafer agent -t ask-docs`
113
113
 
114
- Query GPU documentation using the docs template.
114
+ Query GPU documentation using the docs template. Uses the `ask_docs` tool to search wafer's documentation corpus via the API.
115
115
 
116
116
  ```bash
117
- wafer wevin -t ask-docs --json -s "What causes bank conflicts in shared memory?"
118
- ```
119
-
120
- ### `wafer corpus`
121
-
122
- Download documentation to local filesystem for agents to search.
123
-
124
- ```bash
125
- wafer corpus list
126
- wafer corpus download cuda-programming-guide
117
+ wafer agent -t ask-docs -s "What causes bank conflicts in shared memory?"
127
118
  ```
128
119
 
129
120
  ---
130
121
 
131
122
  ## Customization
132
123
 
133
- ### `wafer remote-run` options
124
+ ### `wafer tool eval` options
134
125
 
135
126
  ```bash
136
- wafer remote-run --image pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel -- python3 script.py
137
- wafer remote-run --require-hwc -- ncu --set full python3 bench.py # Hardware counters for NCU
138
- ```
139
-
140
- ### `wafer evaluate` options
141
-
142
- ```bash
143
- wafer evaluate --impl k.py --reference r.py --test-cases t.json \
127
+ wafer tool eval gpumode --impl k.py --reference r.py --test-cases t.json \
144
128
  --target vultr-b200 \ # Specific GPU target
145
129
  --benchmark \ # Measure performance
146
130
  --profile # Enable torch.profiler + NCU
147
131
  ```
148
132
 
149
- ### `wafer push` for multi-command workflows
150
-
151
- ```bash
152
- WORKSPACE=$(wafer push ./project)
153
- wafer remote-run --workspace-id $WORKSPACE -- python3 test1.py
154
- wafer remote-run --workspace-id $WORKSPACE -- python3 test2.py
155
- ```
156
-
157
133
  ### Profile analysis
158
134
 
159
135
  ```bash
160
- wafer nvidia ncu analyze profile.ncu-rep
161
- wafer nvidia nsys analyze profile.nsys-rep
136
+ wafer tool ncu analyze profile.ncu-rep
137
+ wafer tool nsys analyze profile.nsys-rep
162
138
  ```
163
139
 
164
140
  ---
@@ -170,9 +146,9 @@ wafer nvidia nsys analyze profile.nsys-rep
170
146
  Bypass the API and SSH directly to your own GPUs:
171
147
 
172
148
  ```bash
173
- wafer targets list
174
- wafer targets add ./my-gpu.toml
175
- wafer targets default my-gpu
149
+ wafer target config list
150
+ wafer target config add ./my-gpu.toml
151
+ wafer target config default my-gpu
176
152
  ```
177
153
 
178
154
  ### Defensive evaluation
@@ -180,23 +156,23 @@ wafer targets default my-gpu
180
156
  Detect evaluation hacking (stream injection, lazy evaluation, etc.):
181
157
 
182
158
  ```bash
183
- wafer evaluate --impl k.py --reference r.py --test-cases t.json --benchmark --defensive
159
+ wafer tool eval gpumode --impl k.py --reference r.py --test-cases t.json --benchmark --defensive
184
160
  ```
185
161
 
186
162
  ### Other tools
187
163
 
188
164
  ```bash
189
- wafer perfetto <trace.json> --query "SELECT * FROM slice" # Perfetto SQL queries
190
- wafer capture ./script.py # Capture execution snapshot
191
- wafer compiler-analyze kernel.ptx # Analyze PTX/SASS
165
+ wafer tool perfetto <trace.json> --query "SELECT * FROM slice" # Perfetto SQL queries
166
+ wafer tool capture ./script.py # Capture execution snapshot
167
+ wafer compiler-analyze kernel.ptx # Analyze PTX/SASS
192
168
  ```
193
169
 
194
170
  ### ROCm profiling (AMD GPUs)
195
171
 
196
172
  ```bash
197
- wafer rocprof-sdk ...
198
- wafer rocprof-systems ...
199
- wafer rocprof-compute ...
173
+ wafer tool rocprof-sdk ...
174
+ wafer tool rocprof-systems ...
175
+ wafer tool rocprof-compute ...
200
176
  ```
201
177
 
202
178
  ---
@@ -214,10 +190,10 @@ source ~/.zshrc # or ~/.bashrc
214
190
  ```
215
191
 
216
192
  Now you can tab-complete:
217
- - Commands: `wafer eva<TAB>` → `wafer evaluate`
218
- - Options: `wafer evaluate --<TAB>`
219
- - Target names: `wafer evaluate --target v<TAB>` → `wafer evaluate --target vultr-b200`
220
- - File paths: `wafer evaluate --impl ./<TAB>`
193
+ - Commands: `wafer tool ev<TAB>` → `wafer tool eval`
194
+ - Options: `wafer tool eval --<TAB>`
195
+ - Target names: `wafer tool eval --target v<TAB>` → `wafer tool eval --target vultr-b200`
196
+ - File paths: `wafer tool eval gpumode --impl ./<TAB>`
221
197
 
222
198
  ---
223
199
 
@@ -49,11 +49,11 @@ Create and manage persistent GPU environments.
49
49
  - `B200` - NVIDIA Blackwell B200 (180GB HBM3e, CUDA) - default
50
50
 
51
51
  ```bash
52
- wafer workspaces list
53
- wafer workspaces create my-workspace --gpu B200 --wait # NVIDIA B200
54
- wafer workspaces create amd-dev --gpu MI300X # AMD MI300X
55
- wafer workspaces ssh <workspace-id>
56
- wafer workspaces delete <workspace-id>
52
+ wafer target workspace list
53
+ wafer target workspace create my-workspace --gpu B200 --wait # NVIDIA B200
54
+ wafer target workspace create amd-dev --gpu MI300X # AMD MI300X
55
+ wafer target workspace ssh <workspace-id>
56
+ wafer target workspace delete <workspace-id>
57
57
  ```
58
58
 
59
59
  ### `wafer agent`
@@ -65,17 +65,17 @@ wafer agent "What is TMEM in CuTeDSL?"
65
65
  wafer agent -s "optimize this kernel" < kernel.py
66
66
  ```
67
67
 
68
- ### `wafer evaluate`
68
+ ### `wafer tool eval`
69
69
 
70
70
  Evaluate kernel correctness and performance against a reference implementation.
71
71
 
72
72
  **Functional format** (default):
73
73
  ```bash
74
74
  # Generate template files
75
- wafer evaluate make-template ./my-kernel
75
+ wafer tool eval make-template ./my-kernel
76
76
 
77
77
  # Run evaluation
78
- wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json --benchmark
78
+ wafer tool eval gpumode --impl kernel.py --reference ref.py --test-cases tests.json --benchmark
79
79
  ```
80
80
 
81
81
  The implementation must define `custom_kernel(inputs)`, the reference must define `ref_kernel(inputs)` and `generate_input(**params)`.
@@ -83,64 +83,40 @@ The implementation must define `custom_kernel(inputs)`, the reference must defin
83
83
  **KernelBench format** (ModelNew class):
84
84
  ```bash
85
85
  # Extract a KernelBench problem as template
86
- wafer evaluate kernelbench make-template level1/1
86
+ wafer tool eval kernelbench make-template level1/1
87
87
 
88
88
  # Run evaluation
89
- wafer evaluate kernelbench --impl my_kernel.py --reference problem.py --benchmark
89
+ wafer tool eval kernelbench --impl my_kernel.py --reference problem.py --benchmark
90
90
  ```
91
91
 
92
92
  The implementation must define `class ModelNew(nn.Module)`, the reference must define `class Model`, `get_inputs()`, and `get_init_inputs()`.
93
93
 
94
- ### `wafer wevin -t ask-docs`
94
+ ### `wafer agent -t ask-docs`
95
95
 
96
- Query GPU documentation using the docs template.
96
+ Query GPU documentation using the docs template. Uses the `ask_docs` tool to search wafer's documentation corpus via the API.
97
97
 
98
98
  ```bash
99
- wafer wevin -t ask-docs --json -s "What causes bank conflicts in shared memory?"
100
- ```
101
-
102
- ### `wafer corpus`
103
-
104
- Download documentation to local filesystem for agents to search.
105
-
106
- ```bash
107
- wafer corpus list
108
- wafer corpus download cuda-programming-guide
99
+ wafer agent -t ask-docs -s "What causes bank conflicts in shared memory?"
109
100
  ```
110
101
 
111
102
  ---
112
103
 
113
104
  ## Customization
114
105
 
115
- ### `wafer remote-run` options
106
+ ### `wafer tool eval` options
116
107
 
117
108
  ```bash
118
- wafer remote-run --image pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel -- python3 script.py
119
- wafer remote-run --require-hwc -- ncu --set full python3 bench.py # Hardware counters for NCU
120
- ```
121
-
122
- ### `wafer evaluate` options
123
-
124
- ```bash
125
- wafer evaluate --impl k.py --reference r.py --test-cases t.json \
109
+ wafer tool eval gpumode --impl k.py --reference r.py --test-cases t.json \
126
110
  --target vultr-b200 \ # Specific GPU target
127
111
  --benchmark \ # Measure performance
128
112
  --profile # Enable torch.profiler + NCU
129
113
  ```
130
114
 
131
- ### `wafer push` for multi-command workflows
132
-
133
- ```bash
134
- WORKSPACE=$(wafer push ./project)
135
- wafer remote-run --workspace-id $WORKSPACE -- python3 test1.py
136
- wafer remote-run --workspace-id $WORKSPACE -- python3 test2.py
137
- ```
138
-
139
115
  ### Profile analysis
140
116
 
141
117
  ```bash
142
- wafer nvidia ncu analyze profile.ncu-rep
143
- wafer nvidia nsys analyze profile.nsys-rep
118
+ wafer tool ncu analyze profile.ncu-rep
119
+ wafer tool nsys analyze profile.nsys-rep
144
120
  ```
145
121
 
146
122
  ---
@@ -152,9 +128,9 @@ wafer nvidia nsys analyze profile.nsys-rep
152
128
  Bypass the API and SSH directly to your own GPUs:
153
129
 
154
130
  ```bash
155
- wafer targets list
156
- wafer targets add ./my-gpu.toml
157
- wafer targets default my-gpu
131
+ wafer target config list
132
+ wafer target config add ./my-gpu.toml
133
+ wafer target config default my-gpu
158
134
  ```
159
135
 
160
136
  ### Defensive evaluation
@@ -162,23 +138,23 @@ wafer targets default my-gpu
162
138
  Detect evaluation hacking (stream injection, lazy evaluation, etc.):
163
139
 
164
140
  ```bash
165
- wafer evaluate --impl k.py --reference r.py --test-cases t.json --benchmark --defensive
141
+ wafer tool eval gpumode --impl k.py --reference r.py --test-cases t.json --benchmark --defensive
166
142
  ```
167
143
 
168
144
  ### Other tools
169
145
 
170
146
  ```bash
171
- wafer perfetto <trace.json> --query "SELECT * FROM slice" # Perfetto SQL queries
172
- wafer capture ./script.py # Capture execution snapshot
173
- wafer compiler-analyze kernel.ptx # Analyze PTX/SASS
147
+ wafer tool perfetto <trace.json> --query "SELECT * FROM slice" # Perfetto SQL queries
148
+ wafer tool capture ./script.py # Capture execution snapshot
149
+ wafer compiler-analyze kernel.ptx # Analyze PTX/SASS
174
150
  ```
175
151
 
176
152
  ### ROCm profiling (AMD GPUs)
177
153
 
178
154
  ```bash
179
- wafer rocprof-sdk ...
180
- wafer rocprof-systems ...
181
- wafer rocprof-compute ...
155
+ wafer tool rocprof-sdk ...
156
+ wafer tool rocprof-systems ...
157
+ wafer tool rocprof-compute ...
182
158
  ```
183
159
 
184
160
  ---
@@ -196,10 +172,10 @@ source ~/.zshrc # or ~/.bashrc
196
172
  ```
197
173
 
198
174
  Now you can tab-complete:
199
- - Commands: `wafer eva<TAB>` → `wafer evaluate`
200
- - Options: `wafer evaluate --<TAB>`
201
- - Target names: `wafer evaluate --target v<TAB>` → `wafer evaluate --target vultr-b200`
202
- - File paths: `wafer evaluate --impl ./<TAB>`
175
+ - Commands: `wafer tool ev<TAB>` → `wafer tool eval`
176
+ - Options: `wafer tool eval --<TAB>`
177
+ - Target names: `wafer tool eval --target v<TAB>` → `wafer tool eval --target vultr-b200`
178
+ - File paths: `wafer tool eval gpumode --impl ./<TAB>`
203
179
 
204
180
  ---
205
181
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "wafer-cli"
3
- version = "0.2.52"
3
+ version = "0.2.54"
4
4
  description = "CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -37,7 +37,7 @@ where = ["."]
37
37
  include = ["wafer*"]
38
38
 
39
39
  [tool.setuptools.package-data]
40
- wafer = ["GUIDE.md", "skills/*/SKILL.md", "corpora/**/*.md"]
40
+ wafer = ["GUIDE.md", "skills/*/*.md"]
41
41
 
42
42
  [tool.ruff]
43
43
  line-length = 100
@@ -78,6 +78,7 @@ ignore = [
78
78
 
79
79
  [tool.ruff.lint.per-file-ignores]
80
80
  "tests/**/*.py" = ["ANN001", "ANN201", "ANN202", "ANN204"] # Don't require type annotations in tests
81
+ "tests/test_ncu_run_local_e2e.py" = ["PLR0915"] # E2E test has a long sequential flow by design
81
82
  "wafer/evaluate.py" = ["PLR0915", "PLR1702", "E402", "PLW2901", "ASYNC221"] # complex deployment flows - TODO: refactor
82
83
  "wafer/output.py" = ["ANN401"] # Output collector uses **kwargs for flexible event data
83
84
  "wafer/autotuner.py" = ["PLR0915", "PLR1702", "B007", "B904"] # complex sweep logic - TODO: refactor