wafer-cli 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/GUIDE.md +18 -7
- wafer/api_client.py +4 -0
- wafer/cli.py +1177 -278
- wafer/corpus.py +158 -32
- wafer/evaluate.py +75 -6
- wafer/kernel_scope.py +132 -31
- wafer/nsys_analyze.py +903 -73
- wafer/nsys_profile.py +511 -0
- wafer/output.py +241 -0
- wafer/skills/wafer-guide/SKILL.md +13 -0
- wafer/ssh_keys.py +261 -0
- wafer/targets_ops.py +718 -0
- wafer/wevin_cli.py +127 -18
- wafer/workspaces.py +232 -184
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/METADATA +1 -1
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/RECORD +19 -15
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/top_level.txt +0 -0
wafer/cli.py
CHANGED
|
@@ -182,7 +182,12 @@ workspaces_app = typer.Typer(
|
|
|
182
182
|
|
|
183
183
|
Workspaces are on-demand cloud GPU environments. Requires authentication (wafer login).
|
|
184
184
|
|
|
185
|
-
|
|
185
|
+
Available GPUs:
|
|
186
|
+
MI300X AMD Instinct MI300X (192GB HBM3, ROCm)
|
|
187
|
+
B200 NVIDIA Blackwell B200 (180GB HBM3e, CUDA)
|
|
188
|
+
|
|
189
|
+
Commands:
|
|
190
|
+
wafer workspaces create dev --gpu B200 # Create workspace
|
|
186
191
|
wafer workspaces exec dev -- python x.py # Run commands
|
|
187
192
|
wafer workspaces ssh dev # Interactive SSH
|
|
188
193
|
wafer workspaces sync dev ./project # Sync files
|
|
@@ -190,6 +195,36 @@ Workspaces are on-demand cloud GPU environments. Requires authentication (wafer
|
|
|
190
195
|
)
|
|
191
196
|
app.add_typer(workspaces_app, name="workspaces")
|
|
192
197
|
|
|
198
|
+
# SSH Key management (BYOK - Bring Your Own Key)
|
|
199
|
+
ssh_keys_app = typer.Typer(
|
|
200
|
+
help="""Manage SSH public keys for workspace access.
|
|
201
|
+
|
|
202
|
+
Register your SSH public keys here. These keys are installed in all workspaces
|
|
203
|
+
you provision, enabling SSH access from any machine with your private key.
|
|
204
|
+
|
|
205
|
+
wafer ssh-keys list # List registered keys
|
|
206
|
+
wafer ssh-keys add # Add key (auto-detects ~/.ssh/id_ed25519.pub)
|
|
207
|
+
wafer ssh-keys add ~/.ssh/id_rsa.pub --name laptop # Add specific key
|
|
208
|
+
wafer ssh-keys remove <key-id> # Remove a key"""
|
|
209
|
+
)
|
|
210
|
+
app.add_typer(ssh_keys_app, name="ssh-keys")
|
|
211
|
+
|
|
212
|
+
# Target operations (exec/ssh/sync on configured targets)
|
|
213
|
+
targets_ops_app = typer.Typer(
|
|
214
|
+
help="""Execute commands on configured GPU targets.
|
|
215
|
+
|
|
216
|
+
Run commands, SSH, or sync files to targets without going through evaluate.
|
|
217
|
+
Useful for exploratory work, debugging, or custom scripts.
|
|
218
|
+
|
|
219
|
+
wafer targets exec my-target -- python test.py # Run command
|
|
220
|
+
wafer targets ssh my-target # Interactive SSH
|
|
221
|
+
wafer targets sync my-target ./local_dir # Sync files
|
|
222
|
+
|
|
223
|
+
Supports: RunPod, DigitalOcean (auto-provisions), SSH targets (baremetal/vm).
|
|
224
|
+
Configure targets with: wafer config targets init ..."""
|
|
225
|
+
)
|
|
226
|
+
app.add_typer(targets_ops_app, name="targets")
|
|
227
|
+
|
|
193
228
|
# Billing management
|
|
194
229
|
billing_app = typer.Typer(help="Manage billing, credits, and subscription")
|
|
195
230
|
app.add_typer(billing_app, name="billing")
|
|
@@ -257,13 +292,100 @@ nvidia_app.add_typer(tracelens_app, name="tracelens")
|
|
|
257
292
|
amd_app = typer.Typer(help="AMD GPU profiling and analysis tools")
|
|
258
293
|
app.add_typer(amd_app, name="amd")
|
|
259
294
|
|
|
260
|
-
# ISA
|
|
261
|
-
isa_app = typer.Typer(help="ISA analysis for AMD GPU
|
|
295
|
+
# Unified ISA Analyzer - supports both .co files and Triton artifacts
|
|
296
|
+
isa_app = typer.Typer(help="ISA analysis for AMD GPU kernels (.co, .s, .ll, .ttgir files)")
|
|
262
297
|
amd_app.add_typer(isa_app, name="isa")
|
|
263
298
|
|
|
264
|
-
#
|
|
265
|
-
|
|
266
|
-
|
|
299
|
+
# =============================================================================
|
|
300
|
+
# Roofline analysis (wafer roofline)
|
|
301
|
+
# =============================================================================
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
@app.command("roofline")
def roofline_cmd(
    gpu: str | None = typer.Option(
        None, "--gpu", "-g", help="GPU name (e.g., H100, B200, MI300X, A100)"
    ),
    bytes_moved: float | None = typer.Option(
        None, "--bytes", "-b", help="Theoretical minimum bytes moved"
    ),
    flops: float | None = typer.Option(None, "--flops", "-f", help="Theoretical minimum FLOPs"),
    time_ms: float | None = typer.Option(
        None, "--time-ms", "-t", help="Actual kernel time in milliseconds"
    ),
    dtype: str = typer.Option(
        "fp16", "--dtype", "-d", help="Data type for compute ceiling (fp16, fp32, bf16, fp8, int8)"
    ),
    list_gpus: bool = typer.Option(False, "--list-gpus", help="List available GPU specs and exit"),
) -> None:
    """Analyze kernel performance against roofline model.

    The roofline model shows the theoretical speed-of-light (SOL) for your kernel
    based on whether it's memory-bound or compute-bound.

    You need to provide:
    - The GPU you ran on
    - Theoretical minimum bytes moved (not actual - what the algorithm requires)
    - Theoretical minimum FLOPs
    - Actual measured kernel time

    Example:
        # Analyze a matmul kernel (4096x4096x4096, FP16)
        # Theoretical: 2*M*N*K FLOPs = 137.4 GFLOP
        # Theoretical bytes: (M*K + K*N + M*N) * 2 = 100.7 MB
        wafer roofline --gpu H100 --bytes 100.7e6 --flops 137.4e9 --time-ms 85

        # Analyze a memory-bound elementwise add (1B elements FP32)
        # Reads 2 tensors, writes 1 = 12 GB total
        # 1B adds = 1 GFLOP
        wafer roofline --gpu H100 --bytes 12e9 --flops 1e9 --time-ms 4 --dtype fp32

        # List available GPUs
        wafer roofline --list-gpus
    """
    # Imported inside the command body, so wafer_core.roofline is only loaded
    # when this command actually runs.
    from wafer_core.roofline import get_gpu_spec, roofline_analysis
    from wafer_core.roofline import list_gpus as get_all_gpus

    # --list-gpus short-circuits: print one summary line per GPU and stop
    # before any of the analysis options are validated.
    if list_gpus:
        typer.echo("Available GPUs:")
        for name in get_all_gpus():
            spec = get_gpu_spec(name)
            typer.echo(
                f"  {name}: {spec.peak_bandwidth_gbps:.0f} GB/s, {spec.peak_tflops_fp16:.0f} TFLOPS FP16"
            )
        return

    # The analysis options default to None so that --list-gpus can be used on
    # its own; for an actual analysis every one of them must be supplied.
    required = (
        ("--gpu", gpu),
        ("--bytes", bytes_moved),
        ("--flops", flops),
        ("--time-ms", time_ms),
    )
    missing = [flag for flag, value in required if value is None]

    if missing:
        typer.echo(f"Error: Missing required options: {', '.join(missing)}", err=True)
        typer.echo("", err=True)
        typer.echo("Run 'wafer roofline --help' for usage.", err=True)
        raise typer.Exit(1)

    try:
        result = roofline_analysis(
            gpu=gpu,
            dtype=dtype,
            bytes_moved=bytes_moved,
            flops=flops,
            time_ms=time_ms,
        )
    except ValueError as e:
        # A ValueError from the analysis is reported as a plain CLI error
        # (stderr + exit code 1) rather than a traceback.
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from None

    typer.echo(result.format_report())
|
|
388
|
+
|
|
267
389
|
|
|
268
390
|
# =============================================================================
|
|
269
391
|
# Skill management (wafer skill ...)
|
|
@@ -279,21 +401,22 @@ def skill_install(
|
|
|
279
401
|
"all",
|
|
280
402
|
"--target",
|
|
281
403
|
"-t",
|
|
282
|
-
help="Target tool: claude, codex, or all",
|
|
404
|
+
help="Target tool: claude, codex, cursor, or all",
|
|
283
405
|
),
|
|
284
406
|
force: bool = typer.Option(False, "--force", "-f", help="Overwrite existing skill"),
|
|
285
407
|
) -> None:
|
|
286
408
|
"""Install the wafer-guide skill for AI coding assistants.
|
|
287
409
|
|
|
288
410
|
Installs the bundled skill to make wafer commands discoverable by
|
|
289
|
-
Claude Code
|
|
411
|
+
Claude Code, OpenAI Codex CLI, and/or Cursor.
|
|
290
412
|
|
|
291
413
|
Skills follow the open agent skills specification (agentskills.io).
|
|
292
414
|
|
|
293
415
|
Examples:
|
|
294
|
-
wafer skill install # Install for
|
|
416
|
+
wafer skill install # Install for all tools
|
|
295
417
|
wafer skill install -t claude # Install for Claude Code only
|
|
296
418
|
wafer skill install -t codex # Install for Codex CLI only
|
|
419
|
+
wafer skill install -t cursor # Install for Cursor only
|
|
297
420
|
wafer skill install --force # Overwrite existing installation
|
|
298
421
|
"""
|
|
299
422
|
# Locate bundled skill
|
|
@@ -311,9 +434,13 @@ def skill_install(
|
|
|
311
434
|
))
|
|
312
435
|
if target in ("all", "codex"):
|
|
313
436
|
targets_to_install.append(("Codex CLI", Path.home() / ".codex" / "skills" / "wafer-guide"))
|
|
437
|
+
if target in ("all", "cursor"):
|
|
438
|
+
targets_to_install.append(("Cursor", Path.home() / ".cursor" / "skills" / "wafer-guide"))
|
|
314
439
|
|
|
315
440
|
if not targets_to_install:
|
|
316
|
-
typer.echo(
|
|
441
|
+
typer.echo(
|
|
442
|
+
f"Error: Unknown target '{target}'. Use: claude, codex, cursor, or all", err=True
|
|
443
|
+
)
|
|
317
444
|
raise typer.Exit(1)
|
|
318
445
|
|
|
319
446
|
for tool_name, dest_path in targets_to_install:
|
|
@@ -348,14 +475,15 @@ def skill_uninstall(
|
|
|
348
475
|
"all",
|
|
349
476
|
"--target",
|
|
350
477
|
"-t",
|
|
351
|
-
help="Target tool: claude, codex, or all",
|
|
478
|
+
help="Target tool: claude, codex, cursor, or all",
|
|
352
479
|
),
|
|
353
480
|
) -> None:
|
|
354
481
|
"""Uninstall the wafer-guide skill.
|
|
355
482
|
|
|
356
483
|
Examples:
|
|
357
|
-
wafer skill uninstall # Uninstall from
|
|
484
|
+
wafer skill uninstall # Uninstall from all tools
|
|
358
485
|
wafer skill uninstall -t claude # Uninstall from Claude Code only
|
|
486
|
+
wafer skill uninstall -t cursor # Uninstall from Cursor only
|
|
359
487
|
"""
|
|
360
488
|
targets_to_uninstall: list[tuple[str, Path]] = []
|
|
361
489
|
|
|
@@ -369,9 +497,16 @@ def skill_uninstall(
|
|
|
369
497
|
"Codex CLI",
|
|
370
498
|
Path.home() / ".codex" / "skills" / "wafer-guide",
|
|
371
499
|
))
|
|
500
|
+
if target in ("all", "cursor"):
|
|
501
|
+
targets_to_uninstall.append((
|
|
502
|
+
"Cursor",
|
|
503
|
+
Path.home() / ".cursor" / "skills" / "wafer-guide",
|
|
504
|
+
))
|
|
372
505
|
|
|
373
506
|
if not targets_to_uninstall:
|
|
374
|
-
typer.echo(
|
|
507
|
+
typer.echo(
|
|
508
|
+
f"Error: Unknown target '{target}'. Use: claude, codex, cursor, or all", err=True
|
|
509
|
+
)
|
|
375
510
|
raise typer.Exit(1)
|
|
376
511
|
|
|
377
512
|
for tool_name, dest_path in targets_to_uninstall:
|
|
@@ -406,6 +541,7 @@ def skill_status() -> None:
|
|
|
406
541
|
installations = [
|
|
407
542
|
("Claude Code", Path.home() / ".claude" / "skills" / "wafer-guide"),
|
|
408
543
|
("Codex CLI", Path.home() / ".codex" / "skills" / "wafer-guide"),
|
|
544
|
+
("Cursor", Path.home() / ".cursor" / "skills" / "wafer-guide"),
|
|
409
545
|
]
|
|
410
546
|
|
|
411
547
|
for tool_name, path in installations:
|
|
@@ -1114,6 +1250,11 @@ def agent( # noqa: PLR0913
|
|
|
1114
1250
|
"--list-sessions",
|
|
1115
1251
|
help="List recent sessions and exit",
|
|
1116
1252
|
),
|
|
1253
|
+
get_session: str | None = typer.Option(
|
|
1254
|
+
None,
|
|
1255
|
+
"--get-session",
|
|
1256
|
+
help="Get session by ID and print messages (use with --json)",
|
|
1257
|
+
),
|
|
1117
1258
|
tools: str | None = typer.Option(
|
|
1118
1259
|
None,
|
|
1119
1260
|
"--tools",
|
|
@@ -1160,47 +1301,7 @@ def agent( # noqa: PLR0913
|
|
|
1160
1301
|
None,
|
|
1161
1302
|
"--corpus",
|
|
1162
1303
|
"-c",
|
|
1163
|
-
help="Documentation corpus to use (cuda, cutlass, hip). Must be downloaded first.",
|
|
1164
|
-
),
|
|
1165
|
-
# Legacy kernel optimization options (hidden, for backwards compat)
|
|
1166
|
-
problem: Path | None = typer.Option(
|
|
1167
|
-
None,
|
|
1168
|
-
"--problem",
|
|
1169
|
-
hidden=True,
|
|
1170
|
-
help="[Legacy] Path to problem YAML config file",
|
|
1171
|
-
),
|
|
1172
|
-
reference: Path | None = typer.Option(
|
|
1173
|
-
None,
|
|
1174
|
-
"--reference",
|
|
1175
|
-
"--ref",
|
|
1176
|
-
hidden=True,
|
|
1177
|
-
help="[Legacy] Path to reference kernel file",
|
|
1178
|
-
),
|
|
1179
|
-
description: str | None = typer.Option(
|
|
1180
|
-
None,
|
|
1181
|
-
"--description",
|
|
1182
|
-
"--desc",
|
|
1183
|
-
hidden=True,
|
|
1184
|
-
help="[Legacy] Problem description",
|
|
1185
|
-
),
|
|
1186
|
-
test: list[str] | None = typer.Option(
|
|
1187
|
-
None,
|
|
1188
|
-
"--test",
|
|
1189
|
-
hidden=True,
|
|
1190
|
-
help="[Legacy] Test case",
|
|
1191
|
-
),
|
|
1192
|
-
benchmark: list[str] | None = typer.Option(
|
|
1193
|
-
None,
|
|
1194
|
-
"--benchmark",
|
|
1195
|
-
"-b",
|
|
1196
|
-
hidden=True,
|
|
1197
|
-
help="[Legacy] Benchmark case",
|
|
1198
|
-
),
|
|
1199
|
-
speedup_target: float | None = typer.Option(
|
|
1200
|
-
None,
|
|
1201
|
-
"--speedup",
|
|
1202
|
-
hidden=True,
|
|
1203
|
-
help="[Legacy] Speedup target",
|
|
1304
|
+
help="Documentation corpus to use (cuda, cutlass, hip, amd). Must be downloaded first.",
|
|
1204
1305
|
),
|
|
1205
1306
|
) -> None:
|
|
1206
1307
|
"""AI assistant for GPU kernel development.
|
|
@@ -1287,20 +1388,15 @@ def agent( # noqa: PLR0913
|
|
|
1287
1388
|
prompt=actual_prompt,
|
|
1288
1389
|
interactive=use_tui,
|
|
1289
1390
|
single_turn=single_turn,
|
|
1290
|
-
problem=str(problem) if problem else None,
|
|
1291
|
-
reference=str(reference) if reference else None,
|
|
1292
|
-
description=description,
|
|
1293
|
-
tests=list(test) if test else None,
|
|
1294
|
-
benchmarks=list(benchmark) if benchmark else None,
|
|
1295
1391
|
model=model,
|
|
1296
|
-
max_turns=max_turns,
|
|
1297
|
-
speedup_target=speedup_target,
|
|
1298
1392
|
resume=resume,
|
|
1299
1393
|
from_turn=from_turn,
|
|
1300
1394
|
list_sessions=list_sessions,
|
|
1395
|
+
get_session=get_session,
|
|
1301
1396
|
tools=tools.split(",") if tools else None,
|
|
1302
1397
|
allow_spawn=allow_spawn,
|
|
1303
1398
|
max_tool_fails=max_tool_fails,
|
|
1399
|
+
max_turns=max_turns,
|
|
1304
1400
|
json_output=json_output,
|
|
1305
1401
|
template=template,
|
|
1306
1402
|
template_args=parsed_template_args,
|
|
@@ -1310,7 +1406,7 @@ def agent( # noqa: PLR0913
|
|
|
1310
1406
|
|
|
1311
1407
|
# =============================================================================
|
|
1312
1408
|
# Evaluate command
|
|
1313
|
-
# Hidden aliases for
|
|
1409
|
+
# Hidden aliases for agent command
|
|
1314
1410
|
def _make_agent_alias(name: str, doc: str) -> None:
|
|
1315
1411
|
"""Create a hidden alias that delegates to agent()."""
|
|
1316
1412
|
|
|
@@ -1325,6 +1421,7 @@ def _make_agent_alias(name: str, doc: str) -> None:
|
|
|
1325
1421
|
resume: str | None = typer.Option(None, "--resume", "-r"),
|
|
1326
1422
|
from_turn: int | None = typer.Option(None, "--from-turn"),
|
|
1327
1423
|
list_sessions: bool = typer.Option(False, "--list-sessions"),
|
|
1424
|
+
get_session: str | None = typer.Option(None, "--get-session"),
|
|
1328
1425
|
tools: str | None = typer.Option(None, "--tools"),
|
|
1329
1426
|
allow_spawn: bool = typer.Option(False, "--allow-spawn"),
|
|
1330
1427
|
max_tool_fails: int | None = typer.Option(None, "--max-tool-fails"),
|
|
@@ -1334,12 +1431,6 @@ def _make_agent_alias(name: str, doc: str) -> None:
|
|
|
1334
1431
|
template: str | None = typer.Option(None, "--template", "-t"),
|
|
1335
1432
|
template_args: list[str] | None = typer.Option(None, "--args"),
|
|
1336
1433
|
corpus: str | None = typer.Option(None, "--corpus"),
|
|
1337
|
-
problem: Path | None = typer.Option(None, "--problem", hidden=True),
|
|
1338
|
-
reference: Path | None = typer.Option(None, "--reference", hidden=True),
|
|
1339
|
-
description: str | None = typer.Option(None, "--description", hidden=True),
|
|
1340
|
-
test: list[Path] | None = typer.Option(None, "--test", hidden=True),
|
|
1341
|
-
benchmark: list[Path] | None = typer.Option(None, "--benchmark", hidden=True),
|
|
1342
|
-
speedup_target: float | None = typer.Option(None, "--speedup-target", hidden=True),
|
|
1343
1434
|
) -> None:
|
|
1344
1435
|
agent(
|
|
1345
1436
|
prompt=prompt,
|
|
@@ -1349,6 +1440,7 @@ def _make_agent_alias(name: str, doc: str) -> None:
|
|
|
1349
1440
|
resume=resume,
|
|
1350
1441
|
from_turn=from_turn,
|
|
1351
1442
|
list_sessions=list_sessions,
|
|
1443
|
+
get_session=get_session,
|
|
1352
1444
|
tools=tools,
|
|
1353
1445
|
allow_spawn=allow_spawn,
|
|
1354
1446
|
max_tool_fails=max_tool_fails,
|
|
@@ -1358,12 +1450,6 @@ def _make_agent_alias(name: str, doc: str) -> None:
|
|
|
1358
1450
|
template=template,
|
|
1359
1451
|
template_args=template_args,
|
|
1360
1452
|
corpus=corpus,
|
|
1361
|
-
problem=problem,
|
|
1362
|
-
reference=reference,
|
|
1363
|
-
description=description,
|
|
1364
|
-
test=test,
|
|
1365
|
-
benchmark=benchmark,
|
|
1366
|
-
speedup_target=speedup_target,
|
|
1367
1453
|
)
|
|
1368
1454
|
|
|
1369
1455
|
alias_cmd.__doc__ = doc
|
|
@@ -1649,7 +1735,7 @@ def kernelbench_list_problems() -> None:
|
|
|
1649
1735
|
|
|
1650
1736
|
|
|
1651
1737
|
@kernelbench_app.callback(invoke_without_command=True)
|
|
1652
|
-
def kernelbench_evaluate( # noqa: PLR0913
|
|
1738
|
+
def kernelbench_evaluate( # noqa: PLR0913, PLR0915
|
|
1653
1739
|
ctx: typer.Context,
|
|
1654
1740
|
implementation: Path | None = typer.Option(
|
|
1655
1741
|
None,
|
|
@@ -1685,10 +1771,22 @@ def kernelbench_evaluate( # noqa: PLR0913
|
|
|
1685
1771
|
defensive: bool = typer.Option(
|
|
1686
1772
|
False, "--defensive", help="Enable defensive timing to detect evaluation hacking"
|
|
1687
1773
|
),
|
|
1774
|
+
backend: str | None = typer.Option(
|
|
1775
|
+
None,
|
|
1776
|
+
"--backend",
|
|
1777
|
+
help="Kernel backend for static validation (hip, cuda, triton, cute, tilelang, thunderkittens). "
|
|
1778
|
+
"When specified, validates that the implementation uses the correct backend primitives.",
|
|
1779
|
+
),
|
|
1688
1780
|
sync_artifacts: bool = typer.Option(
|
|
1689
1781
|
True, "--sync-artifacts/--no-sync-artifacts", help="Download artifacts"
|
|
1690
1782
|
),
|
|
1691
1783
|
gpu_id: int | None = typer.Option(None, "--gpu-id", help="Override GPU ID"),
|
|
1784
|
+
json_output: bool = typer.Option(
|
|
1785
|
+
False, "--json", help="Output as single JSON object (machine-readable)"
|
|
1786
|
+
),
|
|
1787
|
+
jsonl_output: bool = typer.Option(
|
|
1788
|
+
False, "--jsonl", help="Output as streaming JSON Lines (one object per event)"
|
|
1789
|
+
),
|
|
1692
1790
|
) -> None:
|
|
1693
1791
|
"""Run kernel evaluation in KernelBench format (ModelNew class).
|
|
1694
1792
|
|
|
@@ -1744,6 +1842,10 @@ def kernelbench_evaluate( # noqa: PLR0913
|
|
|
1744
1842
|
raise typer.Exit(1)
|
|
1745
1843
|
|
|
1746
1844
|
from .evaluate import KernelBenchEvaluateArgs, run_evaluate_kernelbench
|
|
1845
|
+
from .output import OutputCollector, format_evaluate_result, get_output_format
|
|
1846
|
+
|
|
1847
|
+
output_format = get_output_format(json_output, jsonl_output)
|
|
1848
|
+
collector = OutputCollector(format=output_format)
|
|
1747
1849
|
|
|
1748
1850
|
# If pool specified, acquire a target from the pool
|
|
1749
1851
|
resolved_target = target or ""
|
|
@@ -1756,32 +1858,36 @@ def kernelbench_evaluate( # noqa: PLR0913
|
|
|
1756
1858
|
try:
|
|
1757
1859
|
pool_targets = get_pool(pool)
|
|
1758
1860
|
except FileNotFoundError as e:
|
|
1759
|
-
|
|
1861
|
+
collector.set_error("pool", "PoolNotFound", pool=pool, message=str(e))
|
|
1862
|
+
collector.finalize()
|
|
1760
1863
|
raise typer.Exit(1) from None
|
|
1761
1864
|
|
|
1762
1865
|
# Filter to only targets with valid auth
|
|
1763
1866
|
usable_targets, skipped = filter_pool_by_auth(pool_targets)
|
|
1764
1867
|
if skipped:
|
|
1765
|
-
|
|
1868
|
+
collector.emit("pool_auth_skip", targets=skipped)
|
|
1766
1869
|
|
|
1767
1870
|
if not usable_targets:
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
typer.echo(" Run 'wafer auth status' to see which providers need setup.", err=True)
|
|
1871
|
+
collector.set_error("pool", "NoUsableTargets", pool=pool)
|
|
1872
|
+
collector.finalize()
|
|
1771
1873
|
raise typer.Exit(1) from None
|
|
1772
1874
|
|
|
1773
|
-
|
|
1875
|
+
collector.emit("pool_acquire", pool=pool, count=len(usable_targets))
|
|
1774
1876
|
pool_lock_context = acquire_from_pool(usable_targets)
|
|
1775
1877
|
acquired_target = pool_lock_context.__enter__()
|
|
1776
1878
|
|
|
1777
1879
|
if acquired_target is None:
|
|
1778
|
-
|
|
1779
|
-
|
|
1880
|
+
# Exit context manager before raising to avoid resource leak
|
|
1881
|
+
pool_lock_context.__exit__(None, None, None)
|
|
1882
|
+
collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=usable_targets)
|
|
1883
|
+
collector.finalize()
|
|
1780
1884
|
raise typer.Exit(1)
|
|
1781
1885
|
|
|
1782
|
-
|
|
1886
|
+
collector.emit("pool_acquired", target=acquired_target)
|
|
1783
1887
|
resolved_target = acquired_target
|
|
1784
1888
|
|
|
1889
|
+
collector.target = resolved_target
|
|
1890
|
+
|
|
1785
1891
|
args = KernelBenchEvaluateArgs(
|
|
1786
1892
|
implementation=implementation,
|
|
1787
1893
|
reference=reference,
|
|
@@ -1791,41 +1897,45 @@ def kernelbench_evaluate( # noqa: PLR0913
|
|
|
1791
1897
|
inputs=inputs,
|
|
1792
1898
|
seed=seed,
|
|
1793
1899
|
defensive=defensive,
|
|
1900
|
+
backend=backend,
|
|
1794
1901
|
sync_artifacts=sync_artifacts,
|
|
1795
1902
|
gpu_id=gpu_id,
|
|
1796
1903
|
)
|
|
1797
1904
|
|
|
1905
|
+
collector.emit("started", target=resolved_target)
|
|
1906
|
+
|
|
1798
1907
|
try:
|
|
1799
1908
|
import trio_asyncio
|
|
1800
1909
|
|
|
1910
|
+
collector.emit("evaluation", status="running")
|
|
1801
1911
|
result = trio_asyncio.run(run_evaluate_kernelbench, args)
|
|
1802
1912
|
except KeyboardInterrupt:
|
|
1803
|
-
|
|
1913
|
+
collector.set_error("evaluation", "Interrupted", message="Interrupted by user")
|
|
1914
|
+
collector.finalize()
|
|
1804
1915
|
raise typer.Exit(130) from None
|
|
1805
1916
|
except Exception as e:
|
|
1806
|
-
|
|
1917
|
+
collector.set_error("evaluation", "Exception", message=str(e))
|
|
1918
|
+
collector.finalize()
|
|
1807
1919
|
raise typer.Exit(1) from None
|
|
1808
1920
|
finally:
|
|
1809
1921
|
# Release pool lock if we acquired one
|
|
1810
1922
|
if pool_lock_context is not None:
|
|
1811
1923
|
pool_lock_context.__exit__(None, None, None)
|
|
1812
1924
|
|
|
1813
|
-
#
|
|
1925
|
+
# Build structured output
|
|
1926
|
+
eval_output = format_evaluate_result(result, target=resolved_target)
|
|
1927
|
+
collector._result = eval_output
|
|
1928
|
+
|
|
1929
|
+
# Print results based on output format
|
|
1814
1930
|
if result.success:
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
status = "PASS" if result.all_correct else "FAIL"
|
|
1818
|
-
typer.echo(f"Result: {status}")
|
|
1819
|
-
score_pct = f"{result.correctness_score:.1%}"
|
|
1820
|
-
typer.echo(f"Correctness: {result.passed_tests}/{result.total_tests} ({score_pct})")
|
|
1821
|
-
if result.geomean_speedup > 0:
|
|
1822
|
-
typer.echo(f"Speedup: {result.geomean_speedup:.2f}x")
|
|
1823
|
-
typer.echo("=" * 60)
|
|
1931
|
+
collector.output_text_result(result)
|
|
1932
|
+
collector.finalize()
|
|
1824
1933
|
|
|
1825
1934
|
if not result.all_correct:
|
|
1826
1935
|
raise typer.Exit(1)
|
|
1827
1936
|
else:
|
|
1828
|
-
|
|
1937
|
+
collector.output_text_error(result.error_message or "Unknown error")
|
|
1938
|
+
collector.finalize()
|
|
1829
1939
|
raise typer.Exit(1)
|
|
1830
1940
|
|
|
1831
1941
|
|
|
@@ -2182,6 +2292,8 @@ def gpumode_evaluate( # noqa: PLR0913, PLR0915
|
|
|
2182
2292
|
acquired_target = pool_lock_context.__enter__()
|
|
2183
2293
|
|
|
2184
2294
|
if acquired_target is None:
|
|
2295
|
+
# Exit context manager before raising to avoid resource leak
|
|
2296
|
+
pool_lock_context.__exit__(None, None, None)
|
|
2185
2297
|
typer.echo(f"Error: All targets in pool '{pool}' are busy", err=True)
|
|
2186
2298
|
typer.echo(f" Targets: {', '.join(usable_targets)}", err=True)
|
|
2187
2299
|
raise typer.Exit(1)
|
|
@@ -2402,6 +2514,7 @@ def _run_api_mode( # noqa: PLR0913
|
|
|
2402
2514
|
upload_dir: Path | None,
|
|
2403
2515
|
workspace_id: str | None,
|
|
2404
2516
|
gpu_id: int | None,
|
|
2517
|
+
gpu_count: int,
|
|
2405
2518
|
docker_image: str | None,
|
|
2406
2519
|
docker_entrypoint: str | None,
|
|
2407
2520
|
pull_image: bool,
|
|
@@ -2416,6 +2529,8 @@ def _run_api_mode( # noqa: PLR0913
|
|
|
2416
2529
|
typer.echo(f"Workspace: {workspace_id}")
|
|
2417
2530
|
if gpu_id is not None:
|
|
2418
2531
|
typer.echo(f"GPU: {gpu_id}")
|
|
2532
|
+
if gpu_count > 1:
|
|
2533
|
+
typer.echo(f"GPU count: {gpu_count}")
|
|
2419
2534
|
if docker_image:
|
|
2420
2535
|
typer.echo(f"Image: {docker_image}")
|
|
2421
2536
|
if docker_entrypoint:
|
|
@@ -2433,6 +2548,7 @@ def _run_api_mode( # noqa: PLR0913
|
|
|
2433
2548
|
upload_dir=upload_dir,
|
|
2434
2549
|
workspace_id=workspace_id,
|
|
2435
2550
|
gpu_id=gpu_id,
|
|
2551
|
+
gpu_count=gpu_count,
|
|
2436
2552
|
docker_image=docker_image,
|
|
2437
2553
|
docker_entrypoint=docker_entrypoint,
|
|
2438
2554
|
pull_image=pull_image,
|
|
@@ -2456,6 +2572,7 @@ def remote_run( # noqa: PLR0913
|
|
|
2456
2572
|
None, "--workspace-id", "-w", help="Workspace ID (from wafer push)"
|
|
2457
2573
|
),
|
|
2458
2574
|
gpu_id: int | None = typer.Option(None, "--gpu", "-g", help="GPU ID"),
|
|
2575
|
+
gpu_count: int = typer.Option(1, "--gpu-count", "-n", help="Number of GPUs (1-8)"),
|
|
2459
2576
|
docker_image: str | None = typer.Option(None, "--image", "-i", help="Docker image override"),
|
|
2460
2577
|
docker_entrypoint: str | None = typer.Option(
|
|
2461
2578
|
None, "--docker-entrypoint", help="Override Docker entrypoint (e.g., 'bash')"
|
|
@@ -2525,6 +2642,7 @@ def remote_run( # noqa: PLR0913
|
|
|
2525
2642
|
upload_dir,
|
|
2526
2643
|
workspace_id,
|
|
2527
2644
|
gpu_id,
|
|
2645
|
+
gpu_count,
|
|
2528
2646
|
docker_image,
|
|
2529
2647
|
docker_entrypoint,
|
|
2530
2648
|
pull_image,
|
|
@@ -4108,6 +4226,81 @@ def billing_portal(
|
|
|
4108
4226
|
raise typer.Exit(1) from None
|
|
4109
4227
|
|
|
4110
4228
|
|
|
4229
|
+
# =============================================================================
|
|
4230
|
+
# SSH Keys commands (BYOK - Bring Your Own Key)
|
|
4231
|
+
# =============================================================================
|
|
4232
|
+
|
|
4233
|
+
|
|
4234
|
+
@ssh_keys_app.command("list")
def ssh_keys_list(
    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
) -> None:
    """List all registered SSH public keys.

    Example:
        wafer ssh-keys list
        wafer ssh-keys list --json
    """
    # Imported inside the command body, so the helper module is only loaded
    # when this command runs.
    from .ssh_keys import list_ssh_keys

    try:
        # The helper returns the text to print; this command only echoes it.
        typer.echo(list_ssh_keys(json_output=json_output))
    except RuntimeError as err:
        # Report the failure on stderr and exit non-zero, chaining the cause.
        typer.echo(f"Error: {err}", err=True)
        raise typer.Exit(1) from err
|
|
4252
|
+
|
|
4253
|
+
|
|
4254
|
+
@ssh_keys_app.command("add")
def ssh_keys_add(
    pubkey_path: Path | None = typer.Argument(
        None, help="Path to public key file (auto-detects ~/.ssh/id_ed25519.pub if not specified)"
    ),
    name: str | None = typer.Option(None, "--name", "-n", help="Friendly name for the key"),
    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
) -> None:
    """Add an SSH public key.

    If no path is specified, auto-detects keys from ~/.ssh/ in preference order:
    id_ed25519.pub, id_rsa.pub, id_ecdsa.pub.

    Example:
        wafer ssh-keys add                                 # Auto-detect
        wafer ssh-keys add ~/.ssh/id_rsa.pub               # Specific file
        wafer ssh-keys add ~/.ssh/id_ed25519.pub --name laptop
    """
    # Imported inside the command body, so the helper module is only loaded
    # when this command runs.
    from .ssh_keys import add_ssh_key

    try:
        # pubkey_path may be None here; it is forwarded to the helper unchanged.
        typer.echo(add_ssh_key(pubkey_path=pubkey_path, name=name, json_output=json_output))
    except RuntimeError as err:
        # Report the failure on stderr and exit non-zero, chaining the cause.
        typer.echo(f"Error: {err}", err=True)
        raise typer.Exit(1) from err
|
|
4280
|
+
|
|
4281
|
+
|
|
4282
|
+
@ssh_keys_app.command("remove")
def ssh_keys_remove(
    key_id: str = typer.Argument(..., help="UUID of the SSH key to remove"),
    json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
) -> None:
    """Remove an SSH public key.

    Get the key ID from 'wafer ssh-keys list'.

    Example:
        wafer ssh-keys remove abc123-def456-...
    """
    # Imported inside the command body, so the helper module is only loaded
    # when this command runs.
    from .ssh_keys import remove_ssh_key

    try:
        # The helper returns the text to print; this command only echoes it.
        typer.echo(remove_ssh_key(key_id=key_id, json_output=json_output))
    except RuntimeError as err:
        # Report the failure on stderr and exit non-zero, chaining the cause.
        typer.echo(f"Error: {err}", err=True)
        raise typer.Exit(1) from err
|
|
4302
|
+
|
|
4303
|
+
|
|
4111
4304
|
# =============================================================================
|
|
4112
4305
|
# Workspaces commands
|
|
4113
4306
|
# =============================================================================
|
|
@@ -4136,21 +4329,34 @@ def workspaces_list(
|
|
|
4136
4329
|
@workspaces_app.command("create")
|
|
4137
4330
|
def workspaces_create(
|
|
4138
4331
|
name: str = typer.Argument(..., help="Workspace name"),
|
|
4139
|
-
gpu_type: str = typer.Option("B200", "--gpu", "-g", help="GPU type (
|
|
4332
|
+
gpu_type: str = typer.Option("B200", "--gpu", "-g", help="GPU type: MI300X (AMD) or B200 (NVIDIA, default)"),
|
|
4140
4333
|
image: str | None = typer.Option(None, "--image", "-i", help="Docker image (optional)"),
|
|
4334
|
+
wait: bool = typer.Option(False, "--wait", "-w", help="Wait for provisioning and show SSH credentials"),
|
|
4141
4335
|
json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
|
|
4142
4336
|
) -> None:
|
|
4143
4337
|
"""Create a new workspace.
|
|
4144
4338
|
|
|
4339
|
+
Available GPUs:
|
|
4340
|
+
MI300X AMD Instinct MI300X (192GB HBM3, ROCm)
|
|
4341
|
+
B200 NVIDIA Blackwell B200 (180GB HBM3e, CUDA)
|
|
4342
|
+
|
|
4145
4343
|
Example:
|
|
4146
|
-
wafer workspaces create my-kernel
|
|
4147
|
-
wafer workspaces create my-kernel --gpu
|
|
4344
|
+
wafer workspaces create my-kernel # B200 (default)
|
|
4345
|
+
wafer workspaces create my-kernel --gpu MI300X # AMD MI300X
|
|
4346
|
+
wafer workspaces create my-kernel --gpu B200 # NVIDIA B200
|
|
4148
4347
|
wafer workspaces create my-kernel --image pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel
|
|
4348
|
+
wafer workspaces create my-kernel --wait
|
|
4149
4349
|
"""
|
|
4150
4350
|
from .workspaces import create_workspace
|
|
4151
4351
|
|
|
4152
4352
|
try:
|
|
4153
|
-
result = create_workspace(
|
|
4353
|
+
result = create_workspace(
|
|
4354
|
+
name,
|
|
4355
|
+
gpu_type=gpu_type,
|
|
4356
|
+
image=image,
|
|
4357
|
+
wait=wait,
|
|
4358
|
+
json_output=json_output,
|
|
4359
|
+
)
|
|
4154
4360
|
typer.echo(result)
|
|
4155
4361
|
except RuntimeError as e:
|
|
4156
4362
|
typer.echo(f"Error: {e}", err=True)
|
|
@@ -4160,16 +4366,23 @@ def workspaces_create(
|
|
|
4160
4366
|
@workspaces_app.command("delete")
|
|
4161
4367
|
def workspaces_delete(
|
|
4162
4368
|
workspace_id: str = typer.Argument(..., help="Workspace ID to delete"),
|
|
4369
|
+
yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation prompt"),
|
|
4163
4370
|
json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
|
|
4164
4371
|
) -> None:
|
|
4165
4372
|
"""Delete a workspace.
|
|
4166
4373
|
|
|
4167
4374
|
Example:
|
|
4168
4375
|
wafer workspaces delete ws_abc123
|
|
4376
|
+
wafer workspaces delete ws_abc123 -y
|
|
4169
4377
|
"""
|
|
4170
4378
|
from .workspaces import delete_workspace
|
|
4171
4379
|
|
|
4172
4380
|
try:
|
|
4381
|
+
if not yes:
|
|
4382
|
+
confirm = typer.confirm(f"Delete workspace '{workspace_id}'?")
|
|
4383
|
+
if not confirm:
|
|
4384
|
+
typer.echo("Cancelled.")
|
|
4385
|
+
raise typer.Exit(0)
|
|
4173
4386
|
result = delete_workspace(workspace_id, json_output=json_output)
|
|
4174
4387
|
typer.echo(result)
|
|
4175
4388
|
except RuntimeError as e:
|
|
@@ -4177,32 +4390,6 @@ def workspaces_delete(
|
|
|
4177
4390
|
raise typer.Exit(1) from None
|
|
4178
4391
|
|
|
4179
4392
|
|
|
4180
|
-
@workspaces_app.command("attach")
|
|
4181
|
-
def workspaces_attach(
|
|
4182
|
-
workspace_id: str = typer.Argument(..., help="Workspace ID to attach to"),
|
|
4183
|
-
json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
|
|
4184
|
-
) -> None:
|
|
4185
|
-
"""Attach to a workspace (get SSH credentials).
|
|
4186
|
-
|
|
4187
|
-
This will:
|
|
4188
|
-
1. Start the workspace if needed
|
|
4189
|
-
2. Return SSH connection details
|
|
4190
|
-
3. Save the private key to ~/.wafer/keys/
|
|
4191
|
-
|
|
4192
|
-
Example:
|
|
4193
|
-
wafer workspaces attach ws_abc123
|
|
4194
|
-
wafer workspaces attach ws_abc123 --json
|
|
4195
|
-
"""
|
|
4196
|
-
from .workspaces import attach_workspace
|
|
4197
|
-
|
|
4198
|
-
try:
|
|
4199
|
-
result = attach_workspace(workspace_id, json_output=json_output)
|
|
4200
|
-
typer.echo(result)
|
|
4201
|
-
except RuntimeError as e:
|
|
4202
|
-
typer.echo(f"Error: {e}", err=True)
|
|
4203
|
-
raise typer.Exit(1) from None
|
|
4204
|
-
|
|
4205
|
-
|
|
4206
4393
|
@workspaces_app.command("show")
|
|
4207
4394
|
def workspaces_show(
|
|
4208
4395
|
workspace_id: str = typer.Argument(..., help="Workspace ID to show"),
|
|
@@ -4224,12 +4411,19 @@ def workspaces_show(
|
|
|
4224
4411
|
raise typer.Exit(1) from None
|
|
4225
4412
|
|
|
4226
4413
|
|
|
4227
|
-
@workspaces_app.command(
|
|
4414
|
+
@workspaces_app.command(
|
|
4415
|
+
"exec",
|
|
4416
|
+
context_settings={
|
|
4417
|
+
"allow_interspersed_args": False,
|
|
4418
|
+
"ignore_unknown_options": True,
|
|
4419
|
+
"allow_extra_args": True,
|
|
4420
|
+
},
|
|
4421
|
+
)
|
|
4228
4422
|
def workspaces_exec(
|
|
4423
|
+
ctx: typer.Context,
|
|
4229
4424
|
workspace: str | None = typer.Argument(
|
|
4230
4425
|
None, help="Workspace name or ID (optional if default set)"
|
|
4231
4426
|
),
|
|
4232
|
-
command: list[str] = typer.Argument(..., help="Command to execute"),
|
|
4233
4427
|
timeout: int | None = typer.Option(
|
|
4234
4428
|
None,
|
|
4235
4429
|
"--timeout",
|
|
@@ -4247,6 +4441,7 @@ def workspaces_exec(
|
|
|
4247
4441
|
baremetal: bool = typer.Option(
|
|
4248
4442
|
False, "--baremetal", help="Force baremetal target (for hardware counters like ncu/nsys)"
|
|
4249
4443
|
),
|
|
4444
|
+
pull_image: bool = typer.Option(False, "--pull-image", help="Pull image on target if missing"),
|
|
4250
4445
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
|
|
4251
4446
|
quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
|
|
4252
4447
|
) -> None:
|
|
@@ -4263,6 +4458,8 @@ def workspaces_exec(
|
|
|
4263
4458
|
If workspace is not specified, uses the default workspace from config,
|
|
4264
4459
|
or the only workspace if you have exactly one.
|
|
4265
4460
|
|
|
4461
|
+
IMPORTANT: Options must come before the workspace name.
|
|
4462
|
+
|
|
4266
4463
|
Examples:
|
|
4267
4464
|
wafer workspaces exec dev -- python train.py
|
|
4268
4465
|
wafer workspaces exec dev -- python -c "import torch; print(torch.cuda.is_available())"
|
|
@@ -4273,6 +4470,34 @@ def workspaces_exec(
|
|
|
4273
4470
|
from .global_config import get_defaults, get_preferences
|
|
4274
4471
|
from .workspaces import exec_command, resolve_workspace, sync_files
|
|
4275
4472
|
|
|
4473
|
+
# Enforce option ordering to avoid treating CLI flags as remote commands
|
|
4474
|
+
known_options = {
|
|
4475
|
+
"--timeout",
|
|
4476
|
+
"-t",
|
|
4477
|
+
"--sync",
|
|
4478
|
+
"-s",
|
|
4479
|
+
"--gpu",
|
|
4480
|
+
"--cpu",
|
|
4481
|
+
"--baremetal",
|
|
4482
|
+
"--pull-image",
|
|
4483
|
+
"--verbose",
|
|
4484
|
+
"-v",
|
|
4485
|
+
"--quiet",
|
|
4486
|
+
"-q",
|
|
4487
|
+
"--help",
|
|
4488
|
+
"-h",
|
|
4489
|
+
}
|
|
4490
|
+
for arg in ctx.args:
|
|
4491
|
+
if arg == "--":
|
|
4492
|
+
break
|
|
4493
|
+
if arg in known_options:
|
|
4494
|
+
typer.echo(
|
|
4495
|
+
"Error: options must come before the workspace name. "
|
|
4496
|
+
"Example: wafer workspaces exec --pull-image dev -- python -V",
|
|
4497
|
+
err=True,
|
|
4498
|
+
)
|
|
4499
|
+
raise typer.Exit(1)
|
|
4500
|
+
|
|
4276
4501
|
# Validate mutually exclusive routing flags
|
|
4277
4502
|
routing_flags = sum([gpu, cpu, baremetal])
|
|
4278
4503
|
if routing_flags > 1:
|
|
@@ -4339,27 +4564,30 @@ def workspaces_exec(
|
|
|
4339
4564
|
typer.echo(f"Error: {e}", err=True)
|
|
4340
4565
|
raise typer.Exit(1) from None
|
|
4341
4566
|
|
|
4567
|
+
# Get command from context args (passthrough after --)
|
|
4568
|
+
import shlex
|
|
4569
|
+
|
|
4570
|
+
command = list(ctx.args)
|
|
4571
|
+
if command and command[0] == "--":
|
|
4572
|
+
command = command[1:]
|
|
4573
|
+
|
|
4574
|
+
if not command:
|
|
4575
|
+
typer.echo("Error: No command specified", err=True)
|
|
4576
|
+
raise typer.Exit(1)
|
|
4577
|
+
|
|
4342
4578
|
if show_status:
|
|
4343
4579
|
typer.echo(f"[wafer] Executing (timeout: {effective_timeout}s)...", err=True)
|
|
4344
4580
|
|
|
4345
|
-
#
|
|
4346
|
-
|
|
4347
|
-
|
|
4348
|
-
|
|
4349
|
-
|
|
4350
|
-
|
|
4351
|
-
|
|
4352
|
-
|
|
4353
|
-
# 1. Single element: user quoted the whole command (e.g., "echo hello world")
|
|
4354
|
-
# -> use directly, don't re-quote
|
|
4355
|
-
# 2. Multiple elements: user passed separate args (e.g., -- python -c "print(1)")
|
|
4356
|
-
# -> use shlex.join to properly quote args with spaces
|
|
4357
|
-
if len(command) == 1:
|
|
4358
|
-
command_str = command[0]
|
|
4359
|
-
else:
|
|
4360
|
-
command_str = shlex.join(command)
|
|
4581
|
+
# Build command string
|
|
4582
|
+
# Handle two cases:
|
|
4583
|
+
# 1. Single element: user quoted the whole command (e.g., "echo hello world")
|
|
4584
|
+
# -> use directly, don't re-quote
|
|
4585
|
+
# 2. Multiple elements: user passed separate args (e.g., -- python -c "print(1)")
|
|
4586
|
+
# -> use shlex.join to properly quote args with spaces
|
|
4587
|
+
if len(command) == 1:
|
|
4588
|
+
command_str = command[0]
|
|
4361
4589
|
else:
|
|
4362
|
-
command_str = command
|
|
4590
|
+
command_str = shlex.join(command)
|
|
4363
4591
|
|
|
4364
4592
|
try:
|
|
4365
4593
|
exit_code = exec_command(
|
|
@@ -4367,6 +4595,7 @@ def workspaces_exec(
|
|
|
4367
4595
|
command=command_str,
|
|
4368
4596
|
timeout_seconds=effective_timeout,
|
|
4369
4597
|
routing=routing,
|
|
4598
|
+
pull_image=pull_image,
|
|
4370
4599
|
)
|
|
4371
4600
|
except RuntimeError as e:
|
|
4372
4601
|
typer.echo(f"Error: {e}", err=True)
|
|
@@ -4386,7 +4615,7 @@ def workspaces_ssh(
|
|
|
4386
4615
|
) -> None:
|
|
4387
4616
|
"""SSH into a workspace.
|
|
4388
4617
|
|
|
4389
|
-
|
|
4618
|
+
Uses workspace SSH credentials once the workspace is running.
|
|
4390
4619
|
If workspace is not specified, uses the default workspace.
|
|
4391
4620
|
|
|
4392
4621
|
Examples:
|
|
@@ -4395,7 +4624,7 @@ def workspaces_ssh(
|
|
|
4395
4624
|
"""
|
|
4396
4625
|
import os
|
|
4397
4626
|
|
|
4398
|
-
from .workspaces import
|
|
4627
|
+
from .workspaces import get_workspace_raw, resolve_workspace
|
|
4399
4628
|
|
|
4400
4629
|
# Resolve workspace
|
|
4401
4630
|
try:
|
|
@@ -4406,26 +4635,39 @@ def workspaces_ssh(
|
|
|
4406
4635
|
|
|
4407
4636
|
typer.echo(f"Connecting to workspace: {resolved_workspace}...", err=True)
|
|
4408
4637
|
|
|
4409
|
-
# Get SSH credentials
|
|
4638
|
+
# Get SSH credentials from workspace
|
|
4410
4639
|
try:
|
|
4411
|
-
|
|
4640
|
+
ws = get_workspace_raw(resolved_workspace)
|
|
4412
4641
|
except RuntimeError as e:
|
|
4413
4642
|
typer.echo(f"Error: {e}", err=True)
|
|
4414
4643
|
raise typer.Exit(1) from None
|
|
4415
4644
|
|
|
4416
|
-
|
|
4417
|
-
|
|
4418
|
-
|
|
4419
|
-
|
|
4420
|
-
|
|
4645
|
+
from .workspaces import VALID_STATUSES
|
|
4646
|
+
|
|
4647
|
+
workspace_status = ws.get("status")
|
|
4648
|
+
assert workspace_status in VALID_STATUSES, (
|
|
4649
|
+
f"Workspace {resolved_workspace} has invalid status '{workspace_status}'. "
|
|
4650
|
+
f"Valid statuses: {VALID_STATUSES}"
|
|
4651
|
+
)
|
|
4652
|
+
|
|
4653
|
+
if workspace_status != "running":
|
|
4654
|
+
typer.echo(f"Error: Workspace is {workspace_status}. Wait for it to be running.", err=True)
|
|
4655
|
+
raise typer.Exit(1)
|
|
4656
|
+
if not ws.get("ssh_host") or not ws.get("ssh_port") or not ws.get("ssh_user"):
|
|
4657
|
+
typer.echo("Error: SSH credentials not available yet.", err=True)
|
|
4658
|
+
raise typer.Exit(1)
|
|
4659
|
+
|
|
4660
|
+
# Build SSH args - key_path is None for BYOK model (uses default SSH key)
|
|
4661
|
+
ssh_args = ["ssh"]
|
|
4662
|
+
ssh_args.extend([
|
|
4421
4663
|
"-p",
|
|
4422
|
-
str(
|
|
4664
|
+
str(ws.get("ssh_port")),
|
|
4423
4665
|
"-o",
|
|
4424
4666
|
"StrictHostKeyChecking=no",
|
|
4425
4667
|
"-o",
|
|
4426
4668
|
"UserKnownHostsFile=/dev/null",
|
|
4427
|
-
f"{
|
|
4428
|
-
]
|
|
4669
|
+
f"{ws.get('ssh_user')}@{ws.get('ssh_host')}",
|
|
4670
|
+
])
|
|
4429
4671
|
|
|
4430
4672
|
# Replace current process with SSH
|
|
4431
4673
|
os.execvp("ssh", ssh_args)
|
|
@@ -4492,51 +4734,568 @@ def workspaces_sync(
|
|
|
4492
4734
|
|
|
4493
4735
|
|
|
4494
4736
|
# =============================================================================
|
|
4495
|
-
#
|
|
4737
|
+
# Target operations commands (exec/ssh/sync)
|
|
4496
4738
|
# =============================================================================
|
|
4497
4739
|
|
|
4498
4740
|
|
|
4499
|
-
@
|
|
4500
|
-
def
|
|
4501
|
-
|
|
4502
|
-
|
|
4503
|
-
|
|
4741
|
+
@targets_ops_app.command("exec", context_settings={"allow_interspersed_args": False})
def targets_exec(
    target: str = typer.Argument(
        ...,
        help="Target name",
        autocompletion=complete_target_name,
    ),
    command: list[str] = typer.Argument(..., help="Command to execute"),
    timeout: int | None = typer.Option(
        None,
        "--timeout",
        "-t",
        help="Execution timeout in seconds (default: 300)",
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
) -> None:
    """Execute a command on a configured target.

    Provisions the target if needed (RunPod, DigitalOcean), then runs the command via SSH.
    For cloud targets, the instance is kept alive after execution - use
    'wafer config targets cleanup <name>' to terminate.

    Supported targets: RunPod, DigitalOcean, SSH (baremetal/vm).
    Not supported: Modal (serverless), Local (no SSH), Workspace (use 'wafer workspaces exec').

    IMPORTANT: Options must come before the target name. Everything after the
    target name is passed through as the remote command.

    Examples:
        wafer targets exec runpod-mi300x -- python -c "import torch; print(torch.cuda.is_available())"
        wafer targets exec runpod-mi300x -- rocm-smi
        wafer targets exec my-ssh-server -- nvidia-smi
        wafer targets exec --timeout 60 runpod-mi300x "echo hello && ls -la"
    """
    import shlex

    from .global_config import get_preferences
    from .targets import load_target
    from .targets_ops import TargetExecError, exec_on_target_sync, get_target_ssh_info

    # Determine verbosity: explicit flags win, otherwise fall back to the user's preference.
    prefs = get_preferences()
    if quiet:
        show_status = False
    elif verbose:
        show_status = True
    else:
        show_status = prefs.mode == "explicit"

    # Load target
    try:
        target_config = load_target(target)
    except FileNotFoundError as e:
        typer.echo(f"Error: {e}", err=True)
        typer.echo("List available targets with: wafer config targets list", err=True)
        raise typer.Exit(1) from None
    except ValueError as e:
        typer.echo(f"Error loading target config: {e}", err=True)
        raise typer.Exit(1) from None

    # Build the command string BEFORE connecting: connecting may provision a
    # cloud instance, which should not happen for an invocation that is
    # going to be rejected anyway.
    if isinstance(command, list):
        # Remove leading "--" if present
        argv = command[1:] if command and command[0] == "--" else command

        if not argv:
            typer.echo("Error: No command specified", err=True)
            raise typer.Exit(1)

        # A single element means the user quoted the whole command line: pass
        # it through untouched. Several elements are separate argv entries
        # that need shell quoting to survive the trip over SSH.
        command_str = argv[0] if len(argv) == 1 else shlex.join(argv)
    else:
        # Non-CLI callers may hand over an already-assembled command.
        command_str = command

    # Default timeout
    effective_timeout = timeout if timeout is not None else 300

    if show_status:
        typer.echo(f"[wafer] Target: {target} ({type(target_config).__name__})", err=True)

    # Get SSH info (may provision)
    if show_status:
        typer.echo("[wafer] Connecting to target...", err=True)

    try:
        ssh_info = trio.run(get_target_ssh_info, target_config)
    except TargetExecError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from None

    if show_status:
        typer.echo(f"[wafer] Connected: {ssh_info.user}@{ssh_info.host}:{ssh_info.port}", err=True)
        typer.echo(f"[wafer] Executing (timeout: {effective_timeout}s)...", err=True)

    # Execute
    try:
        exit_code = exec_on_target_sync(ssh_info, command_str, effective_timeout)
    except TargetExecError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from None

    if show_status:
        typer.echo(f"[wafer] Exit code: {exit_code}", err=True)

    # Propagate the remote command's exit status as the CLI's own.
    raise typer.Exit(exit_code)
|
|
4849
|
+
|
|
4850
|
+
|
|
4851
|
+
@targets_ops_app.command("ssh")
def targets_ssh(
    target: str = typer.Argument(
        ...,
        help="Target name",
        autocompletion=complete_target_name,
    ),
) -> None:
    """SSH into a configured target.

    Provisions the target if needed (RunPod, DigitalOcean), then starts an interactive SSH session.
    For cloud targets, the instance is kept alive - use 'wafer config targets cleanup <name>' to terminate.

    Examples:
        wafer targets ssh runpod-mi300x
        wafer targets ssh my-baremetal-server
    """
    # Imported locally, mirroring workspaces_ssh, so this command does not
    # depend on a module-level import being present.
    import os

    from .targets import load_target
    from .targets_ops import TargetExecError, get_target_ssh_info

    # Load target
    try:
        target_config = load_target(target)
    except FileNotFoundError as e:
        typer.echo(f"Error: {e}", err=True)
        typer.echo("List available targets with: wafer config targets list", err=True)
        raise typer.Exit(1) from None
    except ValueError as e:
        typer.echo(f"Error loading target config: {e}", err=True)
        raise typer.Exit(1) from None

    typer.echo(f"Connecting to target: {target}...", err=True)

    # Get SSH info (may provision)
    try:
        ssh_info = trio.run(get_target_ssh_info, target_config)
    except TargetExecError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from None

    # Build SSH command. Host key checking is disabled and no known_hosts
    # file is written for this connection.
    # NOTE(review): assumes ssh_info.key_path is always set for targets - confirm.
    ssh_args = [
        "ssh",
        "-i",
        str(ssh_info.key_path),
        "-p",
        str(ssh_info.port),
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        "UserKnownHostsFile=/dev/null",
        f"{ssh_info.user}@{ssh_info.host}",
    ]

    # Replace current process with SSH; on success this call does not return.
    os.execvp("ssh", ssh_args)
|
|
4907
|
+
|
|
4908
|
+
|
|
4909
|
+
@targets_ops_app.command("sync")
def targets_sync(
    target: str = typer.Argument(
        ...,
        help="Target name",
        autocompletion=complete_target_name,
    ),
    path: Path = typer.Argument(..., help="Local file or directory to sync"),
    dest: str | None = typer.Option(
        None,
        "--dest",
        "-d",
        help="Remote destination path (default: /tmp/<basename>)",
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
) -> None:
    """Sync local files to a configured target.

    Uses rsync over SSH to copy files to the target. Provisions the target if needed.

    Examples:
        wafer targets sync runpod-mi300x ./my-project
        wafer targets sync runpod-mi300x ./script.py --dest /workspace/script.py
        wafer targets sync my-server ./kernels --dest /tmp/kernels
    """
    from .global_config import get_preferences
    from .targets import load_target
    from .targets_ops import TargetExecError, get_target_ssh_info, sync_to_target

    # --quiet beats --verbose; with neither flag the stored preference decides.
    prefs = get_preferences()
    show_status = (not quiet) and (verbose or prefs.mode == "explicit")

    def status(text: str) -> None:
        """Emit a [wafer] status line on stderr when status output is enabled."""
        if show_status:
            typer.echo(f"[wafer] {text}", err=True)

    # Reject a missing local path before touching the target at all.
    if not path.exists():
        typer.echo(f"Error: Path not found: {path}", err=True)
        raise typer.Exit(1)

    # Resolve the target configuration by name.
    try:
        target_config = load_target(target)
    except FileNotFoundError as missing:
        typer.echo(f"Error: {missing}", err=True)
        typer.echo("List available targets with: wafer config targets list", err=True)
        raise typer.Exit(1) from None
    except ValueError as invalid:
        typer.echo(f"Error loading target config: {invalid}", err=True)
        raise typer.Exit(1) from None

    status(f"Target: {target} ({type(target_config).__name__})")
    status("Connecting to target...")

    # Obtaining the SSH details may provision the target.
    try:
        ssh_info = trio.run(get_target_ssh_info, target_config)
    except TargetExecError as failure:
        typer.echo(f"Error: {failure}", err=True)
        raise typer.Exit(1) from None

    status(f"Connected: {ssh_info.user}@{ssh_info.host}:{ssh_info.port}")

    # Transfer, forwarding progress messages through the same status channel.
    try:
        synced = sync_to_target(ssh_info, path.resolve(), dest, status)
    except TargetExecError as failure:
        typer.echo(f"Error: {failure}", err=True)
        raise typer.Exit(1) from None

    status(f"Done. Synced {synced} files.")
|
|
4993
|
+
|
|
4994
|
+
|
|
4995
|
+
@targets_ops_app.command("scp")
def targets_scp(
    source: str = typer.Argument(..., help="Source path (prefix with target: for remote)"),
    dest: str = typer.Argument(..., help="Destination path (prefix with target: for remote)"),
    recursive: bool = typer.Option(False, "-r", "--recursive", help="Copy directories recursively"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
) -> None:
    """Copy files to/from a target using scp-style syntax.

    Use target: prefix to indicate remote paths. Exactly one of source or dest
    must be remote.

    Examples:
        wafer targets scp runpod-mi300x:/tmp/trace.json ./trace.json  # download
        wafer targets scp ./script.py runpod-mi300x:/tmp/script.py  # upload
        wafer targets scp -r ./kernels runpod-mi300x:/tmp/kernels  # upload dir
        wafer targets scp -r runpod-mi300x:/tmp/results ./results  # download dir
    """
    from .global_config import get_preferences
    from .targets import load_target
    from .targets_ops import TargetExecError, get_target_ssh_info, parse_scp_path, scp_transfer

    # Determine verbosity
    prefs = get_preferences()
    if quiet:
        show_status = False
    elif verbose:
        show_status = True
    else:
        show_status = prefs.mode == "explicit"

    # Parse source and dest into (target name or falsy, path) pairs
    source_target, source_path = parse_scp_path(source)
    dest_target, dest_path = parse_scp_path(dest)

    # Validate: exactly one must be remote
    if source_target and dest_target:
        typer.echo("Error: Both paths are remote. Use ssh to transfer between remotes.", err=True)
        raise typer.Exit(1)

    if not source_target and not dest_target:
        typer.echo("Error: Both paths are local. Use regular cp command.", err=True)
        raise typer.Exit(1)

    # Determine direction and target. Uses the same truthiness test as the
    # validation above so that an empty target name is never chosen as the
    # remote side.
    is_download = bool(source_target)
    target_name = source_target if is_download else dest_target

    # Load target
    try:
        target_config = load_target(target_name)
    except FileNotFoundError:
        typer.echo(f"Error: Target '{target_name}' not found.", err=True)
        typer.echo("Run 'wafer config targets list' to see available targets.", err=True)
        raise typer.Exit(1) from None
    except ValueError as e:
        typer.echo(f"Error loading target config: {e}", err=True)
        raise typer.Exit(1) from None

    # Validate local path exists (for upload). A download's source lives on
    # the target, so it cannot be checked here.
    if not is_download:
        local_path = Path(source_path)
        if not local_path.exists():
            typer.echo(f"Error: Local path '{source_path}' does not exist.", err=True)
            raise typer.Exit(1)
        if local_path.is_dir() and not recursive:
            typer.echo(
                f"Error: '{source_path}' is a directory. Use -r flag for recursive copy.", err=True
            )
            raise typer.Exit(1)

    if show_status:
        typer.echo(f"[wafer] Target: {target_name} ({type(target_config).__name__})", err=True)
        typer.echo("[wafer] Connecting to target...", err=True)

    # Get SSH info (may provision)
    try:
        ssh_info = trio.run(get_target_ssh_info, target_config)
    except TargetExecError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from None

    if show_status:
        typer.echo(f"[wafer] Connected: {ssh_info.user}@{ssh_info.host}:{ssh_info.port}", err=True)
        direction = "Downloading" if is_download else "Uploading"
        typer.echo(f"[wafer] {direction}...", err=True)

    # Transfer: the call is identical in both directions apart from the flag.
    try:
        scp_transfer(
            ssh_info, source_path, dest_path, is_download=is_download, recursive=recursive
        )
    except TargetExecError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from None

    if show_status:
        typer.echo("[wafer] Done.", err=True)
|
|
5095
|
+
|
|
5096
|
+
|
|
5097
|
+
@targets_ops_app.command("ensure")
def targets_ensure(  # noqa: PLR0915
    target: str | None = typer.Argument(
        None,
        help="Target name",
        autocompletion=complete_target_name,
    ),
    tool: str | None = typer.Argument(None, help="Tool to ensure is installed"),
    check_only: bool = typer.Option(False, "--check-only", "-c", help="Only check, don't install"),
    force: bool = typer.Option(False, "--force", "-f", help="Reinstall even if present"),
    list_tools: bool = typer.Option(False, "--list", "-l", help="List available tools"),
    timeout: int = typer.Option(300, "--timeout", "-t", help="Installation timeout in seconds"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show [wafer] status messages"),
    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress [wafer] status messages"),
) -> None:
    """Ensure a tool is installed on a target.

    Checks if a tool exists on the target and installs it if missing.
    Useful for profiling tools like rocprof-compute that aren't pre-installed.

    TARGET and TOOL are declared optional only so that --list can be used on
    its own; every other invocation requires both.

    Examples:
        wafer targets ensure runpod-mi300x rocprof-compute
        wafer targets ensure runpod-mi300x rocprof-compute --check-only
        wafer targets ensure runpod-mi300x rocprof-compute --force
        wafer targets ensure --list
    """
    from .global_config import get_preferences
    from .targets import load_target
    from .targets_ops import (
        TOOL_REGISTRY,
        TargetExecError,
        ensure_tool,
        exec_on_target_sync,
        get_target_platform,
        get_target_ssh_info,
    )

    # Handle --list flag: print the registry grouped by platform and stop.
    if list_tools:
        typer.echo("Available tools:\n")
        sections = (
            ("AMD tools:", "amd"),
            ("\nNVIDIA tools:", "nvidia"),
            ("\nCross-platform:", "any"),
        )
        registry = sorted(TOOL_REGISTRY.items())
        for heading, platform_key in sections:
            typer.echo(heading)
            for name, entry in registry:
                if entry.platform == platform_key:
                    auto = "auto-install" if entry.install_cmd else "manual"
                    typer.echo(f" {name:20} ({auto}) - {entry.description}")
        return

    # Require target and tool if not listing
    if not target:
        typer.echo("Error: Missing argument 'TARGET'", err=True)
        typer.echo("Usage: wafer targets ensure TARGET TOOL", err=True)
        typer.echo(" or: wafer targets ensure --list", err=True)
        raise typer.Exit(1)

    if not tool:
        typer.echo("Error: Missing argument 'TOOL'", err=True)
        typer.echo("Usage: wafer targets ensure TARGET TOOL", err=True)
        typer.echo(" or: wafer targets ensure --list", err=True)
        raise typer.Exit(1)

    # Check tool exists
    if tool not in TOOL_REGISTRY:
        typer.echo(f"Error: Unknown tool '{tool}'", err=True)
        typer.echo(f"Available tools: {', '.join(sorted(TOOL_REGISTRY.keys()))}", err=True)
        typer.echo("Run 'wafer targets ensure --list' for details.", err=True)
        raise typer.Exit(1)

    spec = TOOL_REGISTRY[tool]

    # Determine verbosity
    prefs = get_preferences()
    if quiet:
        show_status = False
    elif verbose:
        show_status = True
    else:
        show_status = prefs.mode == "explicit"

    # Load target
    try:
        target_config = load_target(target)
    except FileNotFoundError as e:
        typer.echo(f"Error: {e}", err=True)
        typer.echo("List available targets with: wafer config targets list", err=True)
        raise typer.Exit(1) from None
    except ValueError as e:
        typer.echo(f"Error loading target config: {e}", err=True)
        raise typer.Exit(1) from None

    # Platform validation: refuse a vendor-specific tool on the other vendor's
    # hardware before connecting (connecting may provision the target).
    platform = get_target_platform(target_config)
    if spec.platform != "any" and spec.platform != platform:
        typer.echo(
            f"Error: {tool} is an {spec.platform.upper()} tool but target '{target}' "
            f"is {platform.upper()}",
            err=True,
        )
        raise typer.Exit(1)

    if show_status:
        typer.echo(f"[wafer] Target: {target} ({platform.upper()})", err=True)
        typer.echo(f"[wafer] Checking for {tool}...", err=True)

    # Get SSH info (may provision)
    try:
        ssh_info = trio.run(get_target_ssh_info, target_config)
    except TargetExecError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from None

    if show_status:
        typer.echo(f"[wafer] Connected: {ssh_info.user}@{ssh_info.host}:{ssh_info.port}", err=True)

    # Check-only mode: run the tool's check command and report, never install.
    if check_only:
        try:
            exit_code = exec_on_target_sync(ssh_info, spec.check_cmd, timeout_seconds=30)
        except TargetExecError as e:
            typer.echo(f"Error: {e}", err=True)
            raise typer.Exit(1) from None
        if exit_code != 0:
            typer.echo(f"{tool} is NOT installed", err=True)
            raise typer.Exit(1)
        typer.echo(f"{tool} is installed")
        return

    # Ensure tool is installed
    result = ensure_tool(ssh_info, tool, force=force, timeout=timeout)

    if result.error:
        typer.echo(f"Error: {result.error}", err=True)
        raise typer.Exit(1)

    if result.already_installed:
        typer.echo(f"{tool} is already installed")
    elif result.installed:
        if result.verified:
            typer.echo(f"{tool} installed successfully")
        else:
            typer.echo(f"{tool} installed (verification skipped)")
|
|
5251
|
+
|
|
5252
|
+
|
|
5253
|
+
# =============================================================================
|
|
5254
|
+
# Perfetto trace analysis commands
|
|
5255
|
+
# =============================================================================
|
|
5256
|
+
|
|
5257
|
+
|
|
5258
|
+
@perfetto_app.command("query")
def perfetto_query(
    trace_path: Path = typer.Argument(..., help="Path to Perfetto trace file"),
    sql: str = typer.Argument(..., help="SQL query to execute"),
    # Declared as an on/off pair: a lone "--json" flag whose default is already
    # True cannot express both states, which left the table output unselectable.
    json_output: bool = typer.Option(
        True,
        "--json/--no-json",
        "-j",
        help="Output as JSON (use --no-json for tab-separated text)",
    ),
) -> None:
    """Execute SQL query against a Perfetto trace.

    Starts trace_processor, loads the trace, executes the query, and returns results.

    Output is JSON by default; pass --no-json for a tab-separated table.
    Exits with status 1 if the query reports an error or raises.

    Examples:
        wafer perfetto query trace.perfetto "SELECT * FROM slice LIMIT 10"
        wafer perfetto query trace.perfetto "SELECT name, dur FROM slice ORDER BY dur DESC LIMIT 5"
        wafer perfetto query trace.perfetto "SELECT name FROM slice LIMIT 3" --no-json
    """
    from wafer_core.lib.perfetto.perfetto_tool import PerfettoConfig, PerfettoTool

    config = PerfettoConfig(
        workspace_root=".",
        storage_dir=str(Path.home() / ".wafer" / "perfetto"),
    )
    tool = PerfettoTool(config)

    try:
        results, err = tool.query(sql, str(trace_path))
        if err:
            typer.echo(f"Error: {err}", err=True)
            raise typer.Exit(1)

        if json_output:
            typer.echo(json.dumps({"results": results, "count": len(results or [])}, indent=2))
        elif not results:
            typer.echo("No results")
        else:
            # Simple table output: header row from the first result's keys,
            # then one tab-separated line per row.
            headers = list(results[0].keys())
            typer.echo("\t".join(headers))
            for row in results:
                typer.echo("\t".join(str(row.get(h, "")) for h in headers))
    except typer.Exit:
        # typer.Exit derives from RuntimeError, so without this clause the
        # generic handler below would catch our own exit request and print a
        # second, meaningless "Error:" line.
        raise
    except Exception as e:
        # Top-level CLI boundary: report any failure and exit non-zero.
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from None
|
|
4542
5301
|
|
|
@@ -4774,13 +5533,39 @@ def ncu_analyze(
|
|
|
4774
5533
|
|
|
4775
5534
|
|
|
4776
5535
|
# =============================================================================
|
|
4777
|
-
# NSYS
|
|
5536
|
+
# NSYS commands
|
|
4778
5537
|
# =============================================================================
|
|
4779
5538
|
|
|
4780
5539
|
|
|
5540
|
+
@nsys_app.command("check")
def nsys_check() -> None:
    """Check if NSYS (Nsight Systems) is installed and show version.

    NSYS is required for local analysis. If not installed, shows install instructions.

    Examples:
        wafer nvidia nsys check
    """
    from .nsys_analyze import check_nsys_installation

    status = check_nsys_installation()

    # Guard clause: report the missing installation (plus the install hint,
    # when one is provided) and stop.
    if not status.installed:
        typer.echo("✗ NSYS not installed")
        if status.install_command:
            typer.echo(f" Install with: {status.install_command}")
        return

    typer.echo(f"✓ NSYS installed: {status.path}")
    if status.version:
        typer.echo(f" Version: {status.version}")
|
|
5561
|
+
|
|
5562
|
+
|
|
4781
5563
|
@nsys_app.command("analyze")
|
|
4782
5564
|
def nsys_analyze(
|
|
4783
5565
|
filepath: Path = typer.Argument(..., help="Path to .nsys-rep profile file"),
|
|
5566
|
+
output_dir: Path | None = typer.Option(
|
|
5567
|
+
None, "--output-dir", "-o", help="Output directory for analysis files"
|
|
5568
|
+
),
|
|
4784
5569
|
json_output: bool = typer.Option(
|
|
4785
5570
|
False, "--json", help="Output raw JSON instead of formatted text"
|
|
4786
5571
|
),
|
|
@@ -4789,6 +5574,12 @@ def nsys_analyze(
|
|
|
4789
5574
|
"--remote/--local",
|
|
4790
5575
|
help="Force remote (via API) or local analysis. Default: auto-detect (remote if nsys not installed locally)",
|
|
4791
5576
|
),
|
|
5577
|
+
target: str | None = typer.Option(
|
|
5578
|
+
None,
|
|
5579
|
+
"--target",
|
|
5580
|
+
"-t",
|
|
5581
|
+
help="Remote target: 'workspace:id' for workspace execution, or target name from ~/.wafer/targets/",
|
|
5582
|
+
),
|
|
4792
5583
|
) -> None:
|
|
4793
5584
|
"""Analyze an NVIDIA Nsight Systems profile (.nsys-rep file).
|
|
4794
5585
|
|
|
@@ -4797,10 +5588,20 @@ def nsys_analyze(
|
|
|
4797
5588
|
By default, uses local nsys if available, otherwise runs analysis
|
|
4798
5589
|
remotely via wafer-api (requires authentication: wafer login).
|
|
4799
5590
|
|
|
5591
|
+
Supports multiple execution modes:
|
|
5592
|
+
- Local: Uses local nsys CLI (no GPU required for analysis)
|
|
5593
|
+
- Remote API: Uploads file and runs analysis on Modal
|
|
5594
|
+
- Workspace: Runs analysis on a Wafer workspace via SSH
|
|
5595
|
+
- Target: Runs analysis on a configured target machine via SSH
|
|
5596
|
+
|
|
4800
5597
|
Examples:
|
|
4801
5598
|
wafer nvidia nsys analyze profile.nsys-rep
|
|
4802
5599
|
wafer nvidia nsys analyze profile.nsys-rep --json
|
|
5600
|
+
wafer nvidia nsys analyze profile.nsys-rep --local
|
|
4803
5601
|
wafer nvidia nsys analyze profile.nsys-rep --remote
|
|
5602
|
+
wafer nvidia nsys analyze profile.nsys-rep --target workspace:abc123
|
|
5603
|
+
wafer nvidia nsys analyze profile.nsys-rep --target vultr-b200
|
|
5604
|
+
wafer nvidia nsys analyze profile.nsys-rep -o ./results/
|
|
4804
5605
|
"""
|
|
4805
5606
|
from .nsys_analyze import analyze_nsys_profile
|
|
4806
5607
|
|
|
@@ -4812,11 +5613,20 @@ def nsys_analyze(
|
|
|
4812
5613
|
typer.echo(f"Error: Expected .nsys-rep file, got: {filepath.suffix}", err=True)
|
|
4813
5614
|
raise typer.Exit(1)
|
|
4814
5615
|
|
|
5616
|
+
# Warn if both remote flag and target are specified
|
|
5617
|
+
if target and remote is not None:
|
|
5618
|
+
typer.echo(
|
|
5619
|
+
"Warning: --target overrides --remote/--local flag",
|
|
5620
|
+
err=True,
|
|
5621
|
+
)
|
|
5622
|
+
|
|
4815
5623
|
try:
|
|
4816
5624
|
result = analyze_nsys_profile(
|
|
4817
5625
|
filepath,
|
|
4818
5626
|
json_output=json_output,
|
|
4819
5627
|
remote=remote,
|
|
5628
|
+
target=target,
|
|
5629
|
+
output_dir=output_dir,
|
|
4820
5630
|
)
|
|
4821
5631
|
typer.echo(result)
|
|
4822
5632
|
except FileNotFoundError as e:
|
|
@@ -4827,6 +5637,150 @@ def nsys_analyze(
|
|
|
4827
5637
|
raise typer.Exit(1) from None
|
|
4828
5638
|
|
|
4829
5639
|
|
|
5640
|
+
@nsys_app.command("profile", context_settings={"allow_interspersed_args": False})
def nsys_profile(
    command: list[str] = typer.Argument(..., help="Command to profile"),
    output: str = typer.Option(
        "profile",
        "--output",
        "-o",
        help="Output filename (without .nsys-rep extension)",
    ),
    trace: str | None = typer.Option(
        None,
        "--trace",
        "-t",
        help="Trace APIs to capture (comma-separated: cuda,nvtx,osrt,cudnn,cublas). Default: cuda",
    ),
    duration: int | None = typer.Option(
        None,
        "--duration",
        "-d",
        help="Maximum profiling duration in seconds",
    ),
    target: str | None = typer.Option(
        None,
        "--target",
        help="Remote target: 'workspace:id' for workspace execution, or target name from ~/.wafer/targets/",
    ),
    analyze: bool = typer.Option(
        False,
        "--analyze",
        "-a",
        help="Automatically analyze the profile after completion",
    ),
    json_output: bool = typer.Option(
        False,
        "--json",
        help="Output analysis as JSON (only with --analyze)",
    ),
    verbose: bool = typer.Option(
        False,
        "--verbose",
        "-v",
        help="Show verbose progress messages",
    ),
    extra_args: str | None = typer.Option(
        None,
        "--extra",
        help="Extra arguments to pass to nsys profile",
    ),
) -> None:
    """Profile a command with NVIDIA Nsight Systems.

    Runs nsys profile on the specified command and generates a .nsys-rep file.
    Profiling requires an NVIDIA GPU. Use --target to run on a remote GPU server
    or workspace.

    Examples:
        wafer nvidia nsys profile -- python train.py
        wafer nvidia nsys profile -o gemm_profile -- ./gemm_kernel
        wafer nvidia nsys profile --trace cuda,nvtx -- python model.py
        wafer nvidia nsys profile --duration 60 -- ./long_running_app
        wafer nvidia nsys profile --target workspace:abc123 -- python test.py
        wafer nvidia nsys profile --target vultr-b200 -- ./benchmark
        wafer nvidia nsys profile --analyze -- python train.py
        wafer nvidia nsys profile --analyze --json -- ./kernel > results.json
    """
    import shlex

    from .nsys_analyze import _parse_target
    from .nsys_profile import (
        NSYSProfileOptions,
        profile_and_analyze,
        profile_local,
        profile_remote_ssh,
        profile_workspace,
    )

    # Normalise the command into a single string.
    if isinstance(command, list):
        # Drop a leading "--" separator if it was passed through.
        argv = command[1:] if command and command[0] == "--" else command
        # A single argument is forwarded verbatim (it may already be a full
        # command line); several arguments are quoted and joined.
        command_str = argv[0] if len(argv) == 1 else shlex.join(argv)
    else:
        command_str = command

    if not command_str:
        typer.echo("Error: No command specified", err=True)
        raise typer.Exit(1)

    # Parse trace options. Whitespace around entries and empty entries (e.g.
    # "cuda, nvtx" or a trailing comma) are discarded so that only clean API
    # names are forwarded; nothing usable means "no explicit trace list".
    trace_list = None
    if trace:
        trace_list = [name.strip() for name in trace.split(",") if name.strip()] or None

    options = NSYSProfileOptions(
        command=command_str,
        output=output,
        trace=trace_list,
        duration=duration,
        extra_args=extra_args,
    )

    if verbose:
        typer.echo(f"[nsys] Command: {command_str}", err=True)
        if target:
            typer.echo(f"[nsys] Target: {target}", err=True)

    # Execute: either the combined profile+analyze path, or a plain profile run
    # dispatched on the kind of target.
    analysis_result = None
    if analyze:
        profile_result, analysis_result = profile_and_analyze(
            options,
            target=target,
            json_output=json_output,
            verbose=verbose,
        )
    elif target:
        target_type, target_id = _parse_target(target)
        if target_type == "workspace":
            profile_result = profile_workspace(target_id, options, verbose=verbose)
        else:
            profile_result = profile_remote_ssh(target_id, options, verbose=verbose)
    else:
        profile_result = profile_local(options, verbose=verbose)

    # Report results
    if not profile_result.success:
        typer.echo(f"Error: {profile_result.error}", err=True)
        if profile_result.stderr:
            typer.echo(f"stderr: {profile_result.stderr}", err=True)
        raise typer.Exit(1)

    if verbose or not analyze:
        typer.echo(f"Profile created: {profile_result.output_path}")

    # NOTE(review): a successful analysis is not echoed here; presumably
    # profile_and_analyze emits it itself - confirm against nsys_profile.py.
    if analysis_result and not analysis_result.success:
        typer.echo(f"Analysis error: {analysis_result.error}", err=True)
        raise typer.Exit(1)
|
|
5782
|
+
|
|
5783
|
+
|
|
4830
5784
|
# =============================================================================
|
|
4831
5785
|
# ROCprof-Compute commands
|
|
4832
5786
|
# =============================================================================
|
|
@@ -5959,13 +6913,14 @@ def capture_list_command(
|
|
|
5959
6913
|
|
|
5960
6914
|
@corpus_app.command("download")
|
|
5961
6915
|
def corpus_download(
|
|
5962
|
-
name: str = typer.Argument(..., help="Corpus name (cuda, cutlass, hip)"),
|
|
6916
|
+
name: str = typer.Argument(..., help="Corpus name (cuda, cutlass, hip, amd)"),
|
|
5963
6917
|
force: bool = typer.Option(False, "--force", "-f", help="Re-download even if exists"),
|
|
5964
6918
|
) -> None:
|
|
5965
6919
|
"""Download a documentation corpus for agent filesystem access.
|
|
5966
6920
|
|
|
5967
6921
|
Examples:
|
|
5968
6922
|
wafer corpus download cuda
|
|
6923
|
+
wafer corpus download amd
|
|
5969
6924
|
wafer corpus download cutlass --force
|
|
5970
6925
|
"""
|
|
5971
6926
|
from .corpus import CORPORA, download_corpus
|
|
@@ -6180,78 +7135,12 @@ def tracelens_collective(
|
|
|
6180
7135
|
|
|
6181
7136
|
|
|
6182
7137
|
# =============================================================================
|
|
6183
|
-
# ISA Analysis Commands
|
|
7138
|
+
# Unified ISA Analysis Commands (wafer amd isa ...)
|
|
6184
7139
|
# =============================================================================
|
|
6185
7140
|
|
|
6186
7141
|
|
|
6187
7142
|
@isa_app.command("analyze")
|
|
6188
7143
|
def isa_analyze(
|
|
6189
|
-
file: Path = typer.Argument(..., help="Path to .co file to analyze"),
|
|
6190
|
-
json_output: bool = typer.Option(False, "--json", help="Output as JSON"),
|
|
6191
|
-
) -> None:
|
|
6192
|
-
"""Analyze AMD GPU code object (.co file).
|
|
6193
|
-
|
|
6194
|
-
Extracts and analyzes ISA, showing register usage, instruction mix,
|
|
6195
|
-
spills, and other performance-relevant metrics.
|
|
6196
|
-
|
|
6197
|
-
The .co file is uploaded to the Wafer API server which has ROCm tools
|
|
6198
|
-
installed for analysis.
|
|
6199
|
-
|
|
6200
|
-
Examples:
|
|
6201
|
-
wafer isa analyze kernel.co
|
|
6202
|
-
wafer isa analyze kernel.co --json
|
|
6203
|
-
"""
|
|
6204
|
-
from dataclasses import asdict
|
|
6205
|
-
|
|
6206
|
-
from wafer_core.tools.isa_analysis_tools import analyze_isa, format_isa_summary
|
|
6207
|
-
|
|
6208
|
-
from .auth import get_auth_headers
|
|
6209
|
-
from .global_config import get_api_url
|
|
6210
|
-
|
|
6211
|
-
# Validate file exists
|
|
6212
|
-
if not file.exists():
|
|
6213
|
-
typer.echo(f"Error: File not found: {file}", err=True)
|
|
6214
|
-
raise typer.Exit(1)
|
|
6215
|
-
|
|
6216
|
-
if not file.suffix == ".co":
|
|
6217
|
-
typer.echo(f"Error: Expected .co file, got: {file.suffix}", err=True)
|
|
6218
|
-
raise typer.Exit(1)
|
|
6219
|
-
|
|
6220
|
-
# Get API URL and auth
|
|
6221
|
-
api_url = get_api_url()
|
|
6222
|
-
auth_headers = get_auth_headers()
|
|
6223
|
-
|
|
6224
|
-
if not auth_headers:
|
|
6225
|
-
typer.echo("Error: Not logged in. Run 'wafer login' first.", err=True)
|
|
6226
|
-
raise typer.Exit(1)
|
|
6227
|
-
|
|
6228
|
-
try:
|
|
6229
|
-
result = analyze_isa(
|
|
6230
|
-
co_file_path=file,
|
|
6231
|
-
api_url=api_url,
|
|
6232
|
-
auth_headers=auth_headers,
|
|
6233
|
-
)
|
|
6234
|
-
|
|
6235
|
-
if json_output:
|
|
6236
|
-
typer.echo(json.dumps(asdict(result)))
|
|
6237
|
-
else:
|
|
6238
|
-
typer.echo(format_isa_summary(result))
|
|
6239
|
-
|
|
6240
|
-
except FileNotFoundError as e:
|
|
6241
|
-
typer.echo(f"Error: {e}", err=True)
|
|
6242
|
-
raise typer.Exit(1) from None
|
|
6243
|
-
except Exception as e:
|
|
6244
|
-
typer.echo(f"Error: {e}", err=True)
|
|
6245
|
-
raise typer.Exit(1) from None
|
|
6246
|
-
|
|
6247
|
-
|
|
6248
|
-
# =============================================================================
|
|
6249
|
-
# Kernel Scope Commands (wafer amd kernel-scope ...)
|
|
6250
|
-
# =============================================================================
|
|
6251
|
-
|
|
6252
|
-
|
|
6253
|
-
@kernel_scope_app.command("analyze")
|
|
6254
|
-
def kernel_scope_analyze(
|
|
6255
7144
|
path: Path = typer.Argument(..., help="Path to file or directory to analyze"),
|
|
6256
7145
|
json_output: bool = typer.Option(False, "--json", "-j", help="Output as JSON"),
|
|
6257
7146
|
csv_output: bool = typer.Option(False, "--csv", help="Output as CSV"),
|
|
@@ -6264,24 +7153,32 @@ def kernel_scope_analyze(
|
|
|
6264
7153
|
output_file: Path | None = typer.Option(None, "--output", "-o", help="Write output to file"),
|
|
6265
7154
|
kernel_index: int = typer.Option(0, "--kernel", "-k", help="Kernel index if multiple in file"),
|
|
6266
7155
|
) -> None:
|
|
6267
|
-
"""Analyze
|
|
7156
|
+
"""Analyze AMD GPU ISA files (.co, .s, .ll, .ttgir).
|
|
6268
7157
|
|
|
6269
7158
|
Performs static analysis to extract performance metrics like register
|
|
6270
7159
|
pressure, spills, MFMA density, and occupancy limits.
|
|
6271
7160
|
|
|
6272
7161
|
Supports:
|
|
6273
|
-
-
|
|
6274
|
-
-
|
|
6275
|
-
-
|
|
7162
|
+
- AMD GPU code objects (.co) - Requires API authentication
|
|
7163
|
+
- AMDGCN ISA assembly (.s, .gcn, .asm) - Local parsing
|
|
7164
|
+
- LLVM-IR files (.ll) - Local parsing
|
|
7165
|
+
- TTGIR files (.ttgir, .ttir, .mlir) - Local parsing
|
|
6276
7166
|
|
|
6277
7167
|
Examples:
|
|
6278
|
-
wafer amd
|
|
6279
|
-
wafer amd
|
|
6280
|
-
wafer amd
|
|
6281
|
-
wafer amd
|
|
7168
|
+
wafer amd isa analyze kernel.co # Code object (needs login)
|
|
7169
|
+
wafer amd isa analyze kernel.s # ISA assembly
|
|
7170
|
+
wafer amd isa analyze kernel.s --json # Output as JSON
|
|
7171
|
+
wafer amd isa analyze ~/.triton/cache/ --filter 'spills > 0'
|
|
7172
|
+
wafer amd isa analyze . -r --csv -o metrics.csv
|
|
6282
7173
|
"""
|
|
7174
|
+
from .auth import get_auth_headers
|
|
7175
|
+
from .global_config import get_api_url
|
|
6283
7176
|
from .kernel_scope import analyze_command
|
|
6284
7177
|
|
|
7178
|
+
# Get API credentials for .co files
|
|
7179
|
+
api_url = get_api_url()
|
|
7180
|
+
auth_headers = get_auth_headers()
|
|
7181
|
+
|
|
6285
7182
|
try:
|
|
6286
7183
|
output = analyze_command(
|
|
6287
7184
|
path=str(path),
|
|
@@ -6291,6 +7188,8 @@ def kernel_scope_analyze(
|
|
|
6291
7188
|
filter_expr=filter_expr,
|
|
6292
7189
|
output_file=str(output_file) if output_file else None,
|
|
6293
7190
|
kernel_index=kernel_index,
|
|
7191
|
+
api_url=api_url,
|
|
7192
|
+
auth_headers=auth_headers,
|
|
6294
7193
|
)
|
|
6295
7194
|
typer.echo(output)
|
|
6296
7195
|
|
|
@@ -6305,15 +7204,15 @@ def kernel_scope_analyze(
|
|
|
6305
7204
|
raise typer.Exit(1) from None
|
|
6306
7205
|
|
|
6307
7206
|
|
|
6308
|
-
@
|
|
6309
|
-
def
|
|
6310
|
-
"""List available metrics for
|
|
7207
|
+
@isa_app.command("metrics")
|
|
7208
|
+
def isa_metrics() -> None:
|
|
7209
|
+
"""List available metrics for ISA analysis.
|
|
6311
7210
|
|
|
6312
|
-
Shows all metrics that can be extracted from
|
|
6313
|
-
|
|
7211
|
+
Shows all metrics that can be extracted from AMD GPU ISA files,
|
|
7212
|
+
along with their derivation.
|
|
6314
7213
|
|
|
6315
7214
|
Examples:
|
|
6316
|
-
wafer amd
|
|
7215
|
+
wafer amd isa metrics
|
|
6317
7216
|
"""
|
|
6318
7217
|
from .kernel_scope import metrics_command
|
|
6319
7218
|
|
|
@@ -6321,15 +7220,15 @@ def kernel_scope_metrics() -> None:
|
|
|
6321
7220
|
typer.echo(output)
|
|
6322
7221
|
|
|
6323
7222
|
|
|
6324
|
-
@
|
|
6325
|
-
def
|
|
7223
|
+
@isa_app.command("targets")
|
|
7224
|
+
def isa_targets() -> None:
|
|
6326
7225
|
"""List supported GPU targets and their specifications.
|
|
6327
7226
|
|
|
6328
7227
|
Shows hardware specs (VGPRs, SGPRs, LDS, etc.) for each supported
|
|
6329
7228
|
AMD GPU architecture.
|
|
6330
7229
|
|
|
6331
7230
|
Examples:
|
|
6332
|
-
wafer amd
|
|
7231
|
+
wafer amd isa targets
|
|
6333
7232
|
"""
|
|
6334
7233
|
from .kernel_scope import targets_command
|
|
6335
7234
|
|