wafer-cli 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/GUIDE.md +118 -0
- wafer/__init__.py +3 -0
- wafer/analytics.py +306 -0
- wafer/api_client.py +195 -0
- wafer/auth.py +432 -0
- wafer/autotuner.py +1080 -0
- wafer/billing.py +233 -0
- wafer/cli.py +7289 -0
- wafer/config.py +105 -0
- wafer/corpus.py +366 -0
- wafer/evaluate.py +4593 -0
- wafer/global_config.py +350 -0
- wafer/gpu_run.py +307 -0
- wafer/inference.py +148 -0
- wafer/kernel_scope.py +552 -0
- wafer/ncu_analyze.py +651 -0
- wafer/nsys_analyze.py +1042 -0
- wafer/nsys_profile.py +510 -0
- wafer/output.py +248 -0
- wafer/problems.py +357 -0
- wafer/rocprof_compute.py +490 -0
- wafer/rocprof_sdk.py +274 -0
- wafer/rocprof_systems.py +520 -0
- wafer/skills/wafer-guide/SKILL.md +129 -0
- wafer/ssh_keys.py +261 -0
- wafer/target_lock.py +270 -0
- wafer/targets.py +842 -0
- wafer/targets_ops.py +717 -0
- wafer/templates/__init__.py +0 -0
- wafer/templates/ask_docs.py +61 -0
- wafer/templates/optimize_kernel.py +71 -0
- wafer/templates/optimize_kernelbench.py +137 -0
- wafer/templates/trace_analyze.py +74 -0
- wafer/tracelens.py +218 -0
- wafer/wevin_cli.py +577 -0
- wafer/workspaces.py +852 -0
- wafer_cli-0.2.14.dist-info/METADATA +16 -0
- wafer_cli-0.2.14.dist-info/RECORD +41 -0
- wafer_cli-0.2.14.dist-info/WHEEL +5 -0
- wafer_cli-0.2.14.dist-info/entry_points.txt +2 -0
- wafer_cli-0.2.14.dist-info/top_level.txt +1 -0
wafer/rocprof_systems.py
ADDED
|
@@ -0,0 +1,520 @@
|
|
|
1
|
+
"""ROCprof-Systems - CLI wrapper for rocprof-sys tools.
|
|
2
|
+
|
|
3
|
+
This module provides the CLI wrapper for the `wafer rocprof-systems` command.
|
|
4
|
+
It supports multiple subcommands for different rocprof-sys tools:
|
|
5
|
+
- check: Check rocprof-sys installation
|
|
6
|
+
- run: Run system profiling with rocprof-sys-run
|
|
7
|
+
- analyze: Analyze profiling output files
|
|
8
|
+
|
|
9
|
+
This follows the design in Wafer-391: ROCprofiler Tools Architecture.
|
|
10
|
+
Architecture pattern matches rocprof_sdk.py.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import shlex
|
|
15
|
+
import sys
|
|
16
|
+
from dataclasses import asdict
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def print_usage() -> None:
    """Print CLI usage for `wafer rocprof-systems` to stderr.

    Fixes two inconsistencies with the rest of this module: the previous text
    omitted the `sample`, `instrument`, and `query` subcommands (all
    implemented below), and it listed `--cpus`/`--gpus` under "Run Options"
    even though only `sample_command` accepts them.
    """
    # One literal instead of ~40 print() calls: easier to keep in sync with
    # the actual subcommand implementations in this module.
    usage = """\
Usage: wafer rocprof-systems <subcommand> [options]

Subcommands:
  check                  Check rocprof-sys installation status
  run COMMAND            Profile a command with rocprof-sys-run
  sample COMMAND         Sampling profiling with rocprof-sys-sample
  analyze FILE           Analyze profiling output file (JSON/text)
  instrument COMMAND     Binary instrumentation with rocprof-sys-instrument
  query                  Query available metrics and components

Run Options:
  --output-dir DIR       Output directory for results (default: current directory)
  --trace                Generate detailed trace (Perfetto output)
  --profile              Generate call-stack-based profile
  --flat-profile         Generate flat profile
  --sample               Enable sampling profiling
  --host                 Enable host metrics (CPU freq, memory)
  --device               Enable device metrics (GPU temp, memory)
  --wait SECONDS         Wait time before collecting data
  --duration SECONDS     Duration of data collection
  --use-rocm             Enable ROCm backend (default: true)
  --use-sampling         Enable sampling backend
  --use-kokkosp          Enable Kokkos profiling backend
  --use-mpip             Enable MPI profiling backend
  --use-rocpd            Enable rocpd database output
  --json                 Output result as JSON

Sample Options:
  --frequency HZ         Sampling frequency in Hz
  --cpus CPU_IDS         Comma-separated CPU IDs to sample (e.g., 0,1,2)
  --gpus GPU_IDS         Comma-separated GPU IDs to sample (e.g., 0)

Analyze Options:
  --json                 Output result as JSON

Examples:
  wafer rocprof-systems check
  wafer rocprof-systems run './my_app --arg' --trace
  wafer rocprof-systems run './kernel' --trace --profile --output-dir ./results
  wafer rocprof-systems run './app' --host --device --duration 10
  wafer rocprof-systems analyze wall_clock-12345.json
  wafer rocprof-systems analyze wall-clock.txt --json"""
    print(usage, file=sys.stderr)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def check_command(json_output: bool = False) -> str:
    """Report rocprof-sys installation status.

    Args:
        json_output: If True, return a JSON document; otherwise print a
            human-readable report to stderr.

    Returns:
        A short status string, or the JSON document when json_output is set.
    """
    from wafer_core.lib.rocprofiler.systems import check_installation  # pragma: no cover

    status = check_installation()

    if json_output:
        return json.dumps(asdict(status), indent=2)

    # Human-readable path: handle the not-installed case first.
    if not status.installed:
        print("✗ rocprof-sys tools are not installed", file=sys.stderr)
        if status.install_command:
            print(f"  {status.install_command}", file=sys.stderr)
        return "rocprof-sys tools are not installed"

    print("✓ rocprof-sys tools are installed", file=sys.stderr)
    for tool_name, tool_path in status.paths.items():
        print(f"  {tool_name}: {tool_path}", file=sys.stderr)
        if tool_name in status.versions:
            print(f"    Version: {status.versions[tool_name]}", file=sys.stderr)
    return "rocprof-sys tools are installed"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def run_command(
    command: str,
    output_dir: str | None = None,
    trace: bool = False,
    profile: bool = False,
    flat_profile: bool = False,
    sample: bool = False,
    host: bool = False,
    device: bool = False,
    wait: float | None = None,
    duration: float | None = None,
    use_rocm: bool = True,
    use_sampling: bool = False,
    use_kokkosp: bool = False,
    use_mpip: bool = False,
    use_rocpd: bool = False,
    json_output: bool = False,
) -> str:
    """Run rocprof-sys-run system profiling on a shell command.

    Args:
        command: Shell command to profile (parsed with shlex).
        output_dir: Output directory for results.
        trace: Generate detailed trace (Perfetto).
        profile: Generate call-stack-based profile.
        flat_profile: Generate flat profile.
        sample: Enable sampling profiling.
        host: Enable host metrics.
        device: Enable device metrics.
        wait: Wait time before collecting data (seconds).
        duration: Duration of data collection (seconds).
        use_rocm: Enable ROCm backend.
        use_sampling: Enable sampling backend.
        use_kokkosp: Enable Kokkos profiling backend.
        use_mpip: Enable MPI profiling backend.
        use_rocpd: Enable rocpd database output.
        json_output: If True, return JSON; otherwise print human-readable.

    Returns:
        Success message or JSON string.

    Raises:
        RuntimeError: If profiling fails.
    """
    from wafer_core.lib.rocprofiler.systems import run_systems_profile  # pragma: no cover

    result = run_systems_profile(
        command=shlex.split(command),  # shell string -> argv list
        output_dir=Path(output_dir) if output_dir else None,
        trace=trace,
        profile=profile,
        flat_profile=flat_profile,
        sample=sample,
        host=host,
        device=device,
        wait=wait,
        duration=duration,
        use_rocm=use_rocm,
        use_sampling=use_sampling,
        use_kokkosp=use_kokkosp,
        use_mpip=use_mpip,
        use_rocpd=use_rocpd,
    )

    if json_output:
        return json.dumps(asdict(result), indent=2)

    # Failure path first so the success path reads straight through.
    if not result.success:
        print("✗ System profiling failed", file=sys.stderr)
        if result.error:
            print(f"  Error: {result.error}", file=sys.stderr)
        if result.stderr:
            print("  stderr:", file=sys.stderr)
            print(result.stderr, file=sys.stderr)
        raise RuntimeError(result.error or "System profiling failed")

    print("✓ System profiling completed successfully", file=sys.stderr)
    if result.output_files:
        print("  Output files:", file=sys.stderr)
        for out_file in result.output_files:
            print(f"    - {out_file}", file=sys.stderr)

        # Hint at the next step when a Perfetto trace was produced.
        if any("perfetto" in out_file for out_file in result.output_files):
            print(
                "",
                file=sys.stderr,
            )
            print(
                "  Tip: Open Perfetto traces at https://ui.perfetto.dev",
                file=sys.stderr,
            )

    if result.stdout:
        print("", file=sys.stderr)
        print("Output:", file=sys.stderr)
        print(result.stdout, file=sys.stderr)
    return "System profiling completed"
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def analyze_command(
    file_path: str,
    json_output: bool = False,
) -> str:
    """Analyze rocprof-sys output file.

    Loads the file via the library analyzer, then either returns the result
    as JSON or prints a human-readable report (summary, metadata, and a
    per-function timing table) to stderr.

    Args:
        file_path: Path to output file (JSON or text)
        json_output: If True, return JSON; otherwise print human-readable

    Returns:
        Analysis summary or JSON string

    Raises:
        RuntimeError: If analysis fails
    """
    # Imported lazily so the CLI loads even when wafer_core is unavailable.
    from wafer_core.lib.rocprofiler.systems.run.analyzer import analyze_file  # pragma: no cover

    result = analyze_file(Path(file_path))

    if json_output:
        result_dict = asdict(result)
        # Convert SystemMetrics objects to dicts
        # NOTE(review): asdict() already recurses into nested dataclass
        # fields, so this re-conversion looks redundant — confirm whether
        # result.functions can hold non-dataclass entries before removing.
        if result.functions:
            result_dict["functions"] = [asdict(f) for f in result.functions]
        return json.dumps(result_dict, indent=2)
    else:
        if result.success:
            print("✓ Analysis completed", file=sys.stderr)
            print(f"  Format: {result.file_format}", file=sys.stderr)

            # High-level totals; each key is optional in the summary dict.
            if result.summary:
                print(
                    f"  Functions: {result.summary.get('total_functions', 0)}",
                    file=sys.stderr,
                )
                if "total_time_ms" in result.summary:
                    total_ms = result.summary.get("total_time_ms", 0)
                    print(f"  Total Time: {total_ms:.3f} ms", file=sys.stderr)
                if "total_calls" in result.summary:
                    calls = result.summary.get("total_calls", 0)
                    print(f"  Total Calls: {calls}", file=sys.stderr)

            # Run environment captured by rocprof-sys; every field optional.
            if result.metadata:
                print("", file=sys.stderr)
                print("Metadata:", file=sys.stderr)
                if result.metadata.get("pid"):
                    print(f"  PID: {result.metadata['pid']}", file=sys.stderr)
                if result.metadata.get("user"):
                    print(f"  User: {result.metadata['user']}", file=sys.stderr)
                if result.metadata.get("working_directory"):
                    print(f"  Working Directory: {result.metadata['working_directory']}", file=sys.stderr)
                if result.metadata.get("cpu_model"):
                    print(f"  CPU: {result.metadata['cpu_model']}", file=sys.stderr)
                if result.metadata.get("rocm_version"):
                    print(f"  ROCm Version: {result.metadata['rocm_version']}", file=sys.stderr)
                if result.metadata.get("launch_date") and result.metadata.get("launch_time"):
                    print(f"  Launch: {result.metadata['launch_date']} {result.metadata['launch_time']}", file=sys.stderr)

            print("", file=sys.stderr)

            # Print function table
            if result.functions:
                print("Function Summary:", file=sys.stderr)
                print(
                    f"{'Name':<50} {'Calls':>10} {'Total (ms)':>15} {'Mean (ms)':>15}",
                    file=sys.stderr,
                )
                # 92 = 50 + 10 + 15 + 15 column widths + 2 separating spaces.
                print("-" * 92, file=sys.stderr)

                for f in result.functions[:20]:  # Limit to first 20
                    # Timing fields may be None; treat missing as zero.
                    calls = f.call_count or 0
                    total_ms = (f.total_time_ns or 0) / 1_000_000
                    mean_ms = (f.mean_time_ns or 0) / 1_000_000
                    # Truncate long function names
                    name = f.function_name[:47] + "..." if len(f.function_name) > 50 else f.function_name
                    print(
                        f"{name:<50} {calls:>10} {total_ms:>15.3f} {mean_ms:>15.3f}",
                        file=sys.stderr,
                    )

                if len(result.functions) > 20:
                    print(
                        f"... and {len(result.functions) - 20} more functions",
                        file=sys.stderr,
                    )

            return "Analysis completed"
        else:
            print("✗ Analysis failed", file=sys.stderr)
            if result.error:
                print(f"  Error: {result.error}", file=sys.stderr)
            raise RuntimeError(result.error or "Analysis failed")
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def sample_command(
    command: str,
    output_dir: str | None = None,
    frequency: int | None = None,
    trace: bool = False,
    profile: bool = False,
    flat_profile: bool = False,
    host: bool = False,
    device: bool = False,
    wait: float | None = None,
    duration: float | None = None,
    cpus: str | None = None,
    gpus: str | None = None,
    json_output: bool = False,
) -> str:
    """Run sampling profiling with rocprof-sys-sample.

    Args:
        command: Shell command to profile (parsed with shlex).
        output_dir: Output directory for results.
        frequency: Sampling frequency in Hz.
        trace: Generate detailed trace.
        profile: Generate call-stack-based profile.
        flat_profile: Generate flat profile.
        host: Enable host metrics.
        device: Enable device metrics.
        wait: Wait time before collecting data (seconds).
        duration: Duration of data collection (seconds).
        cpus: Comma-separated CPU IDs to sample (e.g., "0,1,2").
        gpus: Comma-separated GPU IDs to sample (e.g., "0").
        json_output: If True, return JSON; otherwise print human-readable.

    Returns:
        Success message or JSON string.

    Raises:
        RuntimeError: If sampling fails.
    """
    from wafer_core.lib.rocprofiler.systems import run_sampling  # pragma: no cover

    def _parse_id_list(spec: str | None) -> list[int] | None:
        # "0,1,2" -> [0, 1, 2]; empty/None stays None.
        return [int(part.strip()) for part in spec.split(",")] if spec else None

    result = run_sampling(
        command=shlex.split(command),
        output_dir=Path(output_dir) if output_dir else None,
        freq=frequency,
        trace=trace,
        profile=profile,
        flat_profile=flat_profile,
        host=host,
        device=device,
        wait=wait,
        duration=duration,
        cpus=_parse_id_list(cpus),
        gpus=_parse_id_list(gpus),
    )

    if json_output:
        return json.dumps(asdict(result), indent=2)

    # Report failure first; success path follows.
    if not result.success:
        print("✗ Sampling failed", file=sys.stderr)
        if result.error:
            print(f"  Error: {result.error}", file=sys.stderr)
        if result.stderr:
            print("  stderr:", file=sys.stderr)
            print(result.stderr, file=sys.stderr)
        raise RuntimeError(result.error or "Sampling failed")

    print("✓ Sampling completed successfully", file=sys.stderr)
    if result.output_files:
        print("  Output files:", file=sys.stderr)
        for out_file in result.output_files:
            print(f"    - {out_file}", file=sys.stderr)
    return "Sampling completed"
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def instrument_command(
    command: str,
    output_dir: str | None = None,
    simulate: bool = False,
    function_include: list[str] | None = None,
    function_exclude: list[str] | None = None,
    json_output: bool = False,
) -> str:
    """Run binary instrumentation with rocprof-sys-instrument.

    Args:
        command: Shell command to instrument (parsed with shlex).
        output_dir: Output directory for results.
        simulate: Simulate instrumentation without creating binary.
        function_include: Function patterns to include.
        function_exclude: Function patterns to exclude.
        json_output: If True, return JSON; otherwise print human-readable.

    Returns:
        Success message or JSON string.

    Raises:
        RuntimeError: If instrumentation fails.
    """
    from wafer_core.lib.rocprofiler.systems import run_instrumentation  # pragma: no cover

    # NOTE: this library entry point takes the directory as `output`,
    # unlike run/sample which use `output_dir`.
    result = run_instrumentation(
        command=shlex.split(command),
        output=Path(output_dir) if output_dir else None,
        simulate=simulate,
        function_include=function_include,
        function_exclude=function_exclude,
    )

    if json_output:
        return json.dumps(asdict(result), indent=2)

    # Failure branch first keeps the happy path unindented.
    if not result.success:
        print("✗ Instrumentation failed", file=sys.stderr)
        if result.error:
            print(f"  Error: {result.error}", file=sys.stderr)
        if result.stderr:
            print("  stderr:", file=sys.stderr)
            print(result.stderr, file=sys.stderr)
        raise RuntimeError(result.error or "Instrumentation failed")

    print("✓ Instrumentation completed successfully", file=sys.stderr)
    if result.output_files:
        print("  Output files:", file=sys.stderr)
        for produced in result.output_files:
            print(f"    - {produced}", file=sys.stderr)
    return "Instrumentation completed"
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def query_command(
    components: bool = False,
    hw_counters: bool = False,
    all_metrics: bool = False,
    filter_pattern: str | None = None,
    json_output: bool = False,
) -> str:
    """Query available profiling metrics and components.

    Args:
        components: Query available components.
        hw_counters: Query hardware counters.
        all_metrics: Query all available metrics.
        filter_pattern: Filter pattern for results.
        json_output: If True, return JSON; otherwise print human-readable.

    Returns:
        Query results or JSON string.

    Raises:
        RuntimeError: If query fails.
    """
    from wafer_core.lib.rocprofiler.systems import query_available_metrics  # pragma: no cover

    result = query_available_metrics(
        components=components,
        hw_counters=hw_counters,
        all_metrics=all_metrics,
        filter_pattern=filter_pattern,
    )

    if json_output:
        return json.dumps(asdict(result), indent=2)

    if not result.success:
        print("✗ Query failed", file=sys.stderr)
        if result.error:
            print(f"  Error: {result.error}", file=sys.stderr)
        raise RuntimeError(result.error or "Query failed")

    print("✓ Query completed", file=sys.stderr)
    if result.output:
        print("", file=sys.stderr)
        # Query payload goes to stdout; status messages stay on stderr.
        print(result.output)
    return "Query completed"
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: wafer-guide
|
|
3
|
+
description: GPU kernel development with the Wafer CLI. Use when working on CUDA/HIP kernels, profiling GPU code, or optimizing kernel performance.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Wafer CLI
|
|
7
|
+
|
|
8
|
+
GPU development primitives for optimizing CUDA and HIP kernels.
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
Before using Wafer CLI commands, install the tool:
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
# Install wafer-cli using uv (recommended)
|
|
16
|
+
uv tool install wafer-cli
|
|
17
|
+
|
|
18
|
+
# Authenticate (one-time setup)
|
|
19
|
+
wafer login
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## When to Use This Skill
|
|
24
|
+
|
|
25
|
+
Activate this skill when:
|
|
26
|
+
- Writing or optimizing CUDA/HIP kernels
|
|
27
|
+
- Profiling GPU code with NCU, NSYS, or ROCprof
|
|
28
|
+
- Evaluating kernel correctness and performance
|
|
29
|
+
- Looking up GPU programming documentation (CUDA, CUTLASS, HIP)
|
|
30
|
+
|
|
31
|
+
## Core Workflows
|
|
32
|
+
|
|
33
|
+
### 1. Documentation Lookup
|
|
34
|
+
|
|
35
|
+
Query indexed GPU documentation:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
# Download corpus (one-time)
|
|
39
|
+
wafer corpus download cuda
|
|
40
|
+
wafer corpus download cutlass
|
|
41
|
+
wafer corpus download hip
|
|
42
|
+
|
|
43
|
+
# Query documentation
|
|
44
|
+
wafer wevin -t ask-docs --corpus cuda "What is warp divergence?"
|
|
45
|
+
wafer wevin -t ask-docs --corpus cutlass "What is a TiledMma?"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### 2. Trace Analysis
|
|
49
|
+
|
|
50
|
+
Analyze NCU, NSYS, or PyTorch profiler traces:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# AI-assisted analysis
|
|
54
|
+
wafer wevin -t trace-analyze --args trace=./profile.ncu-rep "Why is this kernel slow?"
|
|
55
|
+
|
|
56
|
+
# Direct trace queries (PyTorch/Perfetto JSON)
|
|
57
|
+
wafer nvidia perfetto query trace.json \
|
|
58
|
+
"SELECT name, dur/1e6 as ms FROM slice WHERE cat='kernel' ORDER BY dur DESC LIMIT 10"
|
|
59
|
+
|
|
60
|
+
# NCU/NSYS analysis
|
|
61
|
+
wafer nvidia ncu analyze profile.ncu-rep
|
|
62
|
+
wafer nvidia nsys analyze profile.nsys-rep
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### 3. Kernel Evaluation
|
|
66
|
+
|
|
67
|
+
Test correctness and measure speedup against a reference:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# Generate template files
|
|
71
|
+
wafer evaluate make-template ./my-kernel
|
|
72
|
+
# Creates: kernel.py, reference.py, test_cases.json
|
|
73
|
+
|
|
74
|
+
# Run evaluation on a configured target
|
|
75
|
+
wafer evaluate \
|
|
76
|
+
--impl ./my-kernel/kernel.py \
|
|
77
|
+
--reference ./my-kernel/reference.py \
|
|
78
|
+
--test-cases ./my-kernel/test_cases.json \
|
|
79
|
+
--target <target-name>
|
|
80
|
+
|
|
81
|
+
# With profiling
|
|
82
|
+
wafer evaluate ... --profile
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### 4. AI-Assisted Optimization
|
|
86
|
+
|
|
87
|
+
Iteratively optimize a kernel with evaluation feedback:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
wafer wevin -t optimize-kernel \
|
|
91
|
+
--args kernel=./my_kernel.cu \
|
|
92
|
+
--args target=H100 \
|
|
93
|
+
"Optimize this GEMM for memory bandwidth"
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### 5. Remote Execution
|
|
97
|
+
|
|
98
|
+
Run on cloud GPU workspaces:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
wafer workspaces list
|
|
102
|
+
wafer workspaces create my-workspace --gpu H100
|
|
103
|
+
wafer workspaces exec <id> "python train.py"
|
|
104
|
+
wafer workspaces ssh <id>
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Command Reference
|
|
108
|
+
|
|
109
|
+
| Command | Description |
|
|
110
|
+
|---------|-------------|
|
|
111
|
+
| `wafer corpus list\|download\|path` | Manage documentation corpora |
|
|
112
|
+
| `wafer evaluate` | Test kernel correctness/performance |
|
|
113
|
+
| `wafer nvidia ncu\|nsys\|perfetto` | NVIDIA profiling tools |
|
|
114
|
+
| `wafer amd isa\|rocprof-compute` | AMD profiling tools |
|
|
115
|
+
| `wafer workspaces` | Cloud GPU environments |
|
|
116
|
+
| `wafer wevin -t <template>` | AI-assisted workflows |
|
|
117
|
+
| `wafer config targets` | Configure GPU targets |
|
|
118
|
+
|
|
119
|
+
## Target Configuration
|
|
120
|
+
|
|
121
|
+
Targets define GPU access methods. Initialize with:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
wafer config targets init ssh # Your own GPU via SSH
|
|
125
|
+
wafer config targets init runpod # RunPod cloud GPUs
|
|
126
|
+
wafer config targets init digitalocean # DigitalOcean AMD GPUs
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
Then use: `wafer evaluate --target <name> ...`
|