wafer-cli 0.2.29__py3-none-any.whl → 0.2.31__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- wafer/baseline.py +661 -0
- wafer/cli.py +41 -321
- wafer/evaluate.py +81 -143
- wafer/templates/optimize_kernel.py +4 -2
- {wafer_cli-0.2.29.dist-info → wafer_cli-0.2.31.dist-info}/METADATA +1 -1
- {wafer_cli-0.2.29.dist-info → wafer_cli-0.2.31.dist-info}/RECORD +9 -8
- {wafer_cli-0.2.29.dist-info → wafer_cli-0.2.31.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.29.dist-info → wafer_cli-0.2.31.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.29.dist-info → wafer_cli-0.2.31.dist-info}/top_level.txt +0 -0
wafer/cli.py
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
Core commands:
|
|
9
9
|
agent AI assistant for GPU kernel development
|
|
10
10
|
evaluate Test kernel correctness and performance
|
|
11
|
+
baseline Discover what kernel PyTorch uses for an op
|
|
11
12
|
corpus Download GPU documentation for local access
|
|
12
13
|
workspaces Manage cloud GPU environments
|
|
13
14
|
|
|
@@ -279,19 +280,19 @@ from wafer.targets_cli import (
|
|
|
279
280
|
targets_list as _targets_list_cmd,
|
|
280
281
|
)
|
|
281
282
|
from wafer.targets_cli import (
|
|
282
|
-
|
|
283
|
+
targets_pools as _targets_pools_cmd,
|
|
283
284
|
)
|
|
284
285
|
from wafer.targets_cli import (
|
|
285
|
-
|
|
286
|
+
targets_probe as _targets_probe_cmd,
|
|
286
287
|
)
|
|
287
288
|
from wafer.targets_cli import (
|
|
288
|
-
|
|
289
|
+
targets_provision as _targets_provision_cmd,
|
|
289
290
|
)
|
|
290
291
|
from wafer.targets_cli import (
|
|
291
|
-
|
|
292
|
+
targets_reconcile as _targets_reconcile_cmd,
|
|
292
293
|
)
|
|
293
294
|
from wafer.targets_cli import (
|
|
294
|
-
|
|
295
|
+
targets_terminate as _targets_terminate_cmd,
|
|
295
296
|
)
|
|
296
297
|
|
|
297
298
|
# Billing management - nested under config
|
|
@@ -323,6 +324,11 @@ gpumode_app = typer.Typer(
|
|
|
323
324
|
)
|
|
324
325
|
evaluate_app.add_typer(gpumode_app, name="gpumode")
|
|
325
326
|
|
|
327
|
+
# Baseline discovery (what kernel does PyTorch use?)
|
|
328
|
+
from wafer.baseline import baseline_app
|
|
329
|
+
|
|
330
|
+
app.add_typer(baseline_app, name="baseline", rich_help_panel="Kernel Development")
|
|
331
|
+
|
|
326
332
|
# =============================================================================
|
|
327
333
|
# Dev commands (internal, used by web app proxy)
|
|
328
334
|
# =============================================================================
|
|
@@ -1592,7 +1598,9 @@ def evaluate( # noqa: PLR0913
|
|
|
1592
1598
|
benchmark: bool = typer.Option(False, "--benchmark", help="Run performance benchmarks"),
|
|
1593
1599
|
profile: bool = typer.Option(False, "--profile", help="Enable profiling"),
|
|
1594
1600
|
defensive: bool = typer.Option(
|
|
1595
|
-
|
|
1601
|
+
True,
|
|
1602
|
+
"--defense/--no-defense",
|
|
1603
|
+
help="Run reward hack defense checks after benchmarking. Enabled by default.",
|
|
1596
1604
|
),
|
|
1597
1605
|
sync_artifacts: bool = typer.Option(
|
|
1598
1606
|
True, "--sync-artifacts/--no-sync-artifacts", help="Download artifacts"
|
|
@@ -1606,19 +1614,19 @@ def evaluate( # noqa: PLR0913
|
|
|
1606
1614
|
The evaluation checks:
|
|
1607
1615
|
1. Correctness: Does the kernel produce the same output as the reference?
|
|
1608
1616
|
2. Performance (--benchmark): How fast is it compared to the reference?
|
|
1609
|
-
3. Defense
|
|
1617
|
+
3. Defense: Detects reward hacking (runs automatically with benchmark, disable with --no-defense)
|
|
1610
1618
|
|
|
1611
1619
|
Examples:
|
|
1612
1620
|
# Basic correctness check
|
|
1613
1621
|
wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json
|
|
1614
1622
|
|
|
1615
|
-
# With benchmarking
|
|
1623
|
+
# With benchmarking (defense checks run automatically)
|
|
1616
1624
|
wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json \\
|
|
1617
1625
|
--target vultr-b200 --benchmark
|
|
1618
1626
|
|
|
1619
|
-
#
|
|
1627
|
+
# Benchmarking without defense checks
|
|
1620
1628
|
wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json \\
|
|
1621
|
-
--benchmark --
|
|
1629
|
+
--benchmark --no-defense
|
|
1622
1630
|
|
|
1623
1631
|
Subcommands:
|
|
1624
1632
|
gpumode Use GPUMode format (functional) - RECOMMENDED
|
|
@@ -1863,7 +1871,9 @@ def _resolve_pool_query(pool: str, collector) -> tuple[str, object]:
|
|
|
1863
1871
|
spec_targets = [t for t in matched_targets if t.spec_name]
|
|
1864
1872
|
if not spec_targets:
|
|
1865
1873
|
collector.set_error(
|
|
1866
|
-
"pool",
|
|
1874
|
+
"pool",
|
|
1875
|
+
"NoSpecTargets",
|
|
1876
|
+
pool=pool,
|
|
1867
1877
|
message="Matched targets have no spec binding — evaluator needs spec fields",
|
|
1868
1878
|
)
|
|
1869
1879
|
collector.finalize()
|
|
@@ -1963,7 +1973,9 @@ def kernelbench_evaluate( # noqa: PLR0913, PLR0915
|
|
|
1963
1973
|
),
|
|
1964
1974
|
seed: int = typer.Option(42, "--seed", help="Random seed for weight initialization"),
|
|
1965
1975
|
defensive: bool = typer.Option(
|
|
1966
|
-
|
|
1976
|
+
True,
|
|
1977
|
+
"--defense/--no-defense",
|
|
1978
|
+
help="Run reward hack defense checks after benchmarking. Enabled by default.",
|
|
1967
1979
|
),
|
|
1968
1980
|
backend: str | None = typer.Option(
|
|
1969
1981
|
None,
|
|
@@ -2003,16 +2015,20 @@ def kernelbench_evaluate( # noqa: PLR0913, PLR0915
|
|
|
2003
2015
|
The evaluation checks:
|
|
2004
2016
|
1. Correctness: Does ModelNew.forward() produce same output as Model.forward()?
|
|
2005
2017
|
2. Performance (--benchmark): How fast is it compared to the reference?
|
|
2006
|
-
3. Defense
|
|
2018
|
+
3. Defense: Detects reward hacking (runs automatically with benchmark, disable with --no-defense)
|
|
2007
2019
|
|
|
2008
2020
|
Examples:
|
|
2009
2021
|
# Basic correctness check
|
|
2010
2022
|
wafer evaluate kernelbench --impl my_kernel.py --reference problem.py
|
|
2011
2023
|
|
|
2012
|
-
# With benchmarking
|
|
2024
|
+
# With benchmarking (defense checks run automatically)
|
|
2013
2025
|
wafer evaluate kernelbench --impl my_kernel.py --reference problem.py \\
|
|
2014
2026
|
--target vultr-b200 --benchmark
|
|
2015
2027
|
|
|
2028
|
+
# Benchmarking without defense checks
|
|
2029
|
+
wafer evaluate kernelbench --impl my_kernel.py --reference problem.py \\
|
|
2030
|
+
--target vultr-b200 --benchmark --no-defense
|
|
2031
|
+
|
|
2016
2032
|
Subcommands:
|
|
2017
2033
|
make-template Extract a KernelBench problem as template
|
|
2018
2034
|
"""
|
|
@@ -2072,12 +2088,15 @@ def kernelbench_evaluate( # noqa: PLR0913, PLR0915
|
|
|
2072
2088
|
if stages == "all":
|
|
2073
2089
|
resolved_stages = "compile,correctness,benchmark,defense"
|
|
2074
2090
|
|
|
2075
|
-
# Handle
|
|
2091
|
+
# Handle --benchmark and --defense/--no-defense flags
|
|
2076
2092
|
stage_set = set(resolved_stages.split(","))
|
|
2077
2093
|
if benchmark and "benchmark" not in stage_set:
|
|
2078
2094
|
stage_set.add("benchmark")
|
|
2079
|
-
|
|
2095
|
+
# Defense runs automatically when benchmarking, unless --no-defense
|
|
2096
|
+
if defensive and "benchmark" in stage_set and "defense" not in stage_set:
|
|
2080
2097
|
stage_set.add("defense")
|
|
2098
|
+
if not defensive:
|
|
2099
|
+
stage_set.discard("defense")
|
|
2081
2100
|
resolved_stages = ",".join(
|
|
2082
2101
|
sorted(
|
|
2083
2102
|
stage_set,
|
|
@@ -2411,7 +2430,9 @@ def gpumode_evaluate( # noqa: PLR0913, PLR0915
|
|
|
2411
2430
|
benchmark: bool = typer.Option(False, "--benchmark", help="Run performance benchmarks"),
|
|
2412
2431
|
profile: bool = typer.Option(False, "--profile", help="Enable profiling"),
|
|
2413
2432
|
defensive: bool = typer.Option(
|
|
2414
|
-
|
|
2433
|
+
True,
|
|
2434
|
+
"--defense/--no-defense",
|
|
2435
|
+
help="Run reward hack defense checks after benchmarking. Enabled by default.",
|
|
2415
2436
|
),
|
|
2416
2437
|
sync_artifacts: bool = typer.Option(
|
|
2417
2438
|
True, "--sync-artifacts/--no-sync-artifacts", help="Download artifacts"
|
|
@@ -2567,307 +2588,6 @@ def gpumode_evaluate( # noqa: PLR0913, PLR0915
|
|
|
2567
2588
|
else:
|
|
2568
2589
|
typer.echo(f"Error: {result.error_message}", err=True)
|
|
2569
2590
|
raise typer.Exit(1)
|
|
2570
|
-
|
|
2571
|
-
|
|
2572
|
-
# =============================================================================
|
|
2573
|
-
# Push and Remote-Run commands
|
|
2574
|
-
# =============================================================================
|
|
2575
|
-
|
|
2576
|
-
|
|
2577
|
-
@app.command("push", hidden=True)
|
|
2578
|
-
def push(
|
|
2579
|
-
local_path: Path = typer.Argument(..., help="Local directory to upload"),
|
|
2580
|
-
workspace: str | None = typer.Option(None, "--workspace", "-w", help="Workspace name override"),
|
|
2581
|
-
direct: bool = typer.Option(False, "--direct", "-d", help="Use direct SSH instead of API"),
|
|
2582
|
-
target_name: str | None = typer.Option(
|
|
2583
|
-
None,
|
|
2584
|
-
"--target",
|
|
2585
|
-
"-t",
|
|
2586
|
-
help="Target for --direct mode. See 'wafer config targets list'.",
|
|
2587
|
-
autocompletion=complete_target_name,
|
|
2588
|
-
),
|
|
2589
|
-
) -> None:
|
|
2590
|
-
"""Push directory to remote GPU.
|
|
2591
|
-
|
|
2592
|
-
By default, uses wafer-api. Use --direct for direct SSH mode.
|
|
2593
|
-
|
|
2594
|
-
Examples:
|
|
2595
|
-
wafer push ./my_project
|
|
2596
|
-
wafer push . --workspace my-kernel
|
|
2597
|
-
wafer push ./my_project --direct --target vultr-b200
|
|
2598
|
-
"""
|
|
2599
|
-
# Validate path
|
|
2600
|
-
if not local_path.exists():
|
|
2601
|
-
typer.echo(f"Error: Path not found: {local_path}", err=True)
|
|
2602
|
-
raise typer.Exit(1)
|
|
2603
|
-
|
|
2604
|
-
if not local_path.is_dir():
|
|
2605
|
-
typer.echo(f"Error: Not a directory: {local_path}", err=True)
|
|
2606
|
-
raise typer.Exit(1)
|
|
2607
|
-
|
|
2608
|
-
# Resolve to absolute path
|
|
2609
|
-
local_path = local_path.resolve()
|
|
2610
|
-
|
|
2611
|
-
if direct:
|
|
2612
|
-
# Direct SSH mode (requires target)
|
|
2613
|
-
if not target_name:
|
|
2614
|
-
typer.echo("Error: --target required for --direct mode", err=True)
|
|
2615
|
-
raise typer.Exit(1)
|
|
2616
|
-
|
|
2617
|
-
from wafer_core.utils.kernel_utils.targets.config import ModalTarget
|
|
2618
|
-
|
|
2619
|
-
from .gpu_run import push_directory as push_direct
|
|
2620
|
-
from .targets import load_target
|
|
2621
|
-
|
|
2622
|
-
try:
|
|
2623
|
-
target = load_target(target_name)
|
|
2624
|
-
except FileNotFoundError:
|
|
2625
|
-
typer.echo(f"Error: Target not found: {target_name}", err=True)
|
|
2626
|
-
typer.echo("List targets with: wafer config targets list", err=True)
|
|
2627
|
-
raise typer.Exit(1) from None
|
|
2628
|
-
|
|
2629
|
-
if isinstance(target, ModalTarget):
|
|
2630
|
-
typer.echo(
|
|
2631
|
-
f"Error: Target '{target_name}' is a Modal target. Direct push requires SSH.",
|
|
2632
|
-
err=True,
|
|
2633
|
-
)
|
|
2634
|
-
raise typer.Exit(1) from None
|
|
2635
|
-
|
|
2636
|
-
typer.echo(f"Connecting to {target.ssh_target}...")
|
|
2637
|
-
try:
|
|
2638
|
-
result = push_direct(local_path, target)
|
|
2639
|
-
except Exception as e:
|
|
2640
|
-
typer.echo(f"Error: {e}", err=True)
|
|
2641
|
-
raise typer.Exit(1) from None
|
|
2642
|
-
|
|
2643
|
-
typer.echo(f"Uploading {len(result.files_uploaded)} files to {result.workspace_path}")
|
|
2644
|
-
for f in result.files_uploaded:
|
|
2645
|
-
typer.echo(f" ✓ {f}")
|
|
2646
|
-
typer.echo(f"Pushed to: {result.workspace_path}")
|
|
2647
|
-
else:
|
|
2648
|
-
# API mode (default)
|
|
2649
|
-
from .api_client import push_directory as push_api
|
|
2650
|
-
|
|
2651
|
-
workspace_name = workspace or local_path.name
|
|
2652
|
-
typer.echo(f"Pushing {local_path.name} to wafer-api...")
|
|
2653
|
-
|
|
2654
|
-
try:
|
|
2655
|
-
result = push_api(local_path, workspace_name)
|
|
2656
|
-
except Exception as e:
|
|
2657
|
-
typer.echo(f"Error: {e}", err=True)
|
|
2658
|
-
raise typer.Exit(1) from None
|
|
2659
|
-
|
|
2660
|
-
typer.echo(f"Uploaded {len(result.files_uploaded)} files")
|
|
2661
|
-
for f in result.files_uploaded:
|
|
2662
|
-
typer.echo(f" ✓ {f}")
|
|
2663
|
-
typer.echo(f"Workspace ID: {result.workspace_id}")
|
|
2664
|
-
|
|
2665
|
-
|
|
2666
|
-
def _run_direct_mode(
|
|
2667
|
-
cmd_str: str,
|
|
2668
|
-
target_name: str,
|
|
2669
|
-
upload_dir: Path | None,
|
|
2670
|
-
workspace_id: str | None,
|
|
2671
|
-
gpu_id: int | None,
|
|
2672
|
-
) -> int:
|
|
2673
|
-
"""Run command via direct SSH mode. Returns exit code."""
|
|
2674
|
-
from wafer_core.utils.kernel_utils.targets.config import ModalTarget
|
|
2675
|
-
|
|
2676
|
-
from .gpu_run import push_directory as push_direct
|
|
2677
|
-
from .gpu_run import run_command as run_direct
|
|
2678
|
-
from .targets import load_target
|
|
2679
|
-
|
|
2680
|
-
try:
|
|
2681
|
-
target = load_target(target_name)
|
|
2682
|
-
except FileNotFoundError:
|
|
2683
|
-
typer.echo(f"Error: Target not found: {target_name}", err=True)
|
|
2684
|
-
typer.echo("List targets with: wafer config targets list", err=True)
|
|
2685
|
-
raise typer.Exit(1) from None
|
|
2686
|
-
|
|
2687
|
-
if isinstance(target, ModalTarget):
|
|
2688
|
-
typer.echo(
|
|
2689
|
-
f"Error: Target '{target_name}' is a Modal target. Direct mode requires SSH.", err=True
|
|
2690
|
-
)
|
|
2691
|
-
raise typer.Exit(1) from None
|
|
2692
|
-
|
|
2693
|
-
if not target.docker_image:
|
|
2694
|
-
typer.echo(f"Error: Target '{target_name}' has no docker_image configured", err=True)
|
|
2695
|
-
raise typer.Exit(1)
|
|
2696
|
-
|
|
2697
|
-
# If upload_dir provided, push first
|
|
2698
|
-
workspace_name = workspace_id
|
|
2699
|
-
if upload_dir:
|
|
2700
|
-
typer.echo(f"Uploading {upload_dir.name}...")
|
|
2701
|
-
try:
|
|
2702
|
-
push_result = push_direct(upload_dir, target)
|
|
2703
|
-
workspace_name = push_result.workspace_name
|
|
2704
|
-
typer.echo(f"Uploaded {len(push_result.files_uploaded)} files")
|
|
2705
|
-
except Exception as e:
|
|
2706
|
-
typer.echo(f"Error uploading: {e}", err=True)
|
|
2707
|
-
raise typer.Exit(1) from None
|
|
2708
|
-
elif not workspace_name:
|
|
2709
|
-
workspace_name = "tmp"
|
|
2710
|
-
|
|
2711
|
-
effective_gpu = gpu_id if gpu_id is not None else target.gpu_ids[0]
|
|
2712
|
-
typer.echo(f"Target: {target_name} (docker: {target.docker_image})")
|
|
2713
|
-
typer.echo(f"Workspace: {workspace_name}")
|
|
2714
|
-
typer.echo(f"GPU: {effective_gpu}")
|
|
2715
|
-
typer.echo(f"Command: {cmd_str}")
|
|
2716
|
-
typer.echo("-" * 60)
|
|
2717
|
-
|
|
2718
|
-
try:
|
|
2719
|
-
return run_direct(cmd_str, workspace_name, target, gpu_id)
|
|
2720
|
-
except KeyboardInterrupt:
|
|
2721
|
-
typer.echo("\nInterrupted by user", err=True)
|
|
2722
|
-
raise typer.Exit(130) from None
|
|
2723
|
-
except Exception as e:
|
|
2724
|
-
typer.echo(f"Error: {e}", err=True)
|
|
2725
|
-
raise typer.Exit(1) from None
|
|
2726
|
-
|
|
2727
|
-
|
|
2728
|
-
def _run_api_mode( # noqa: PLR0913
|
|
2729
|
-
cmd_str: str,
|
|
2730
|
-
upload_dir: Path | None,
|
|
2731
|
-
workspace_id: str | None,
|
|
2732
|
-
gpu_id: int | None,
|
|
2733
|
-
gpu_count: int,
|
|
2734
|
-
docker_image: str | None,
|
|
2735
|
-
docker_entrypoint: str | None,
|
|
2736
|
-
pull_image: bool,
|
|
2737
|
-
require_hwc: bool,
|
|
2738
|
-
) -> int:
|
|
2739
|
-
"""Run command via wafer-api. Returns exit code."""
|
|
2740
|
-
from .api_client import run_command_stream
|
|
2741
|
-
|
|
2742
|
-
if upload_dir:
|
|
2743
|
-
typer.echo(f"Uploading: {upload_dir}")
|
|
2744
|
-
elif workspace_id:
|
|
2745
|
-
typer.echo(f"Workspace: {workspace_id}")
|
|
2746
|
-
if gpu_id is not None:
|
|
2747
|
-
typer.echo(f"GPU: {gpu_id}")
|
|
2748
|
-
if gpu_count > 1:
|
|
2749
|
-
typer.echo(f"GPU count: {gpu_count}")
|
|
2750
|
-
if docker_image:
|
|
2751
|
-
typer.echo(f"Image: {docker_image}")
|
|
2752
|
-
if docker_entrypoint:
|
|
2753
|
-
typer.echo(f"Entrypoint: {docker_entrypoint}")
|
|
2754
|
-
if pull_image:
|
|
2755
|
-
typer.echo("Pull image: yes")
|
|
2756
|
-
typer.echo(f"Command: {cmd_str}")
|
|
2757
|
-
if require_hwc:
|
|
2758
|
-
typer.echo("Hardware counters: required (baremetal)")
|
|
2759
|
-
typer.echo("-" * 60)
|
|
2760
|
-
|
|
2761
|
-
try:
|
|
2762
|
-
return run_command_stream(
|
|
2763
|
-
command=cmd_str,
|
|
2764
|
-
upload_dir=upload_dir,
|
|
2765
|
-
workspace_id=workspace_id,
|
|
2766
|
-
gpu_id=gpu_id,
|
|
2767
|
-
gpu_count=gpu_count,
|
|
2768
|
-
docker_image=docker_image,
|
|
2769
|
-
docker_entrypoint=docker_entrypoint,
|
|
2770
|
-
pull_image=pull_image,
|
|
2771
|
-
require_hardware_counters=require_hwc,
|
|
2772
|
-
)
|
|
2773
|
-
except KeyboardInterrupt:
|
|
2774
|
-
typer.echo("\nInterrupted by user", err=True)
|
|
2775
|
-
raise typer.Exit(130) from None
|
|
2776
|
-
except Exception as e:
|
|
2777
|
-
typer.echo(f"Error: {e}", err=True)
|
|
2778
|
-
raise typer.Exit(1) from None
|
|
2779
|
-
|
|
2780
|
-
|
|
2781
|
-
@app.command("remote-run", hidden=True)
|
|
2782
|
-
def remote_run( # noqa: PLR0913
|
|
2783
|
-
command: list[str] = typer.Argument(..., help="Command to run"),
|
|
2784
|
-
upload_dir: Path | None = typer.Option(
|
|
2785
|
-
None, "--upload-dir", "-u", help="Directory to upload (stateless mode)"
|
|
2786
|
-
),
|
|
2787
|
-
workspace_id: str | None = typer.Option(
|
|
2788
|
-
None, "--workspace-id", "-w", help="Workspace ID (from wafer push)"
|
|
2789
|
-
),
|
|
2790
|
-
gpu_id: int | None = typer.Option(None, "--gpu", "-g", help="GPU ID"),
|
|
2791
|
-
gpu_count: int = typer.Option(1, "--gpu-count", "-n", help="Number of GPUs (1-8)"),
|
|
2792
|
-
docker_image: str | None = typer.Option(None, "--image", "-i", help="Docker image override"),
|
|
2793
|
-
docker_entrypoint: str | None = typer.Option(
|
|
2794
|
-
None, "--docker-entrypoint", help="Override Docker entrypoint (e.g., 'bash')"
|
|
2795
|
-
),
|
|
2796
|
-
pull_image: bool = typer.Option(
|
|
2797
|
-
False, "--pull-image", help="Pull image if not available on target"
|
|
2798
|
-
),
|
|
2799
|
-
require_hwc: bool = typer.Option(
|
|
2800
|
-
False, "--require-hwc", help="Require hardware counters (baremetal)"
|
|
2801
|
-
),
|
|
2802
|
-
direct: bool = typer.Option(False, "--direct", "-d", help="Use direct SSH instead of API"),
|
|
2803
|
-
target_name: str | None = typer.Option(
|
|
2804
|
-
None,
|
|
2805
|
-
"--target",
|
|
2806
|
-
"-t",
|
|
2807
|
-
help="Target for --direct mode. See 'wafer config targets list'.",
|
|
2808
|
-
autocompletion=complete_target_name,
|
|
2809
|
-
),
|
|
2810
|
-
) -> None:
|
|
2811
|
-
"""Run command on remote GPU in Docker.
|
|
2812
|
-
|
|
2813
|
-
Two modes:
|
|
2814
|
-
- High-level (stateless): --upload-dir uploads files and runs command
|
|
2815
|
-
- Low-level: --workspace-id uses existing workspace from 'wafer push'
|
|
2816
|
-
|
|
2817
|
-
By default, uses wafer-api. Use --direct for direct SSH mode.
|
|
2818
|
-
|
|
2819
|
-
Examples:
|
|
2820
|
-
# Stateless: upload and run
|
|
2821
|
-
wafer remote-run --upload-dir ./my_project -- python train.py
|
|
2822
|
-
|
|
2823
|
-
# Run without files
|
|
2824
|
-
wafer remote-run -- nvidia-smi
|
|
2825
|
-
|
|
2826
|
-
# Low-level: use existing workspace
|
|
2827
|
-
wafer remote-run --workspace-id ws_abc123 -- python train.py
|
|
2828
|
-
|
|
2829
|
-
# Direct SSH mode
|
|
2830
|
-
wafer remote-run --upload-dir ./my_project --direct --target vultr-b200 -- python train.py
|
|
2831
|
-
"""
|
|
2832
|
-
cmd_str = " ".join(command)
|
|
2833
|
-
if not cmd_str.strip():
|
|
2834
|
-
typer.echo("Error: Empty command", err=True)
|
|
2835
|
-
raise typer.Exit(1)
|
|
2836
|
-
|
|
2837
|
-
if upload_dir and workspace_id:
|
|
2838
|
-
typer.echo("Error: --upload-dir and --workspace-id are mutually exclusive", err=True)
|
|
2839
|
-
raise typer.Exit(1)
|
|
2840
|
-
|
|
2841
|
-
if upload_dir:
|
|
2842
|
-
if not upload_dir.exists():
|
|
2843
|
-
typer.echo(f"Error: Directory not found: {upload_dir}", err=True)
|
|
2844
|
-
raise typer.Exit(1)
|
|
2845
|
-
if not upload_dir.is_dir():
|
|
2846
|
-
typer.echo(f"Error: Not a directory: {upload_dir}", err=True)
|
|
2847
|
-
raise typer.Exit(1)
|
|
2848
|
-
upload_dir = upload_dir.resolve()
|
|
2849
|
-
|
|
2850
|
-
if direct:
|
|
2851
|
-
if not target_name:
|
|
2852
|
-
typer.echo("Error: --target required for --direct mode", err=True)
|
|
2853
|
-
raise typer.Exit(1)
|
|
2854
|
-
exit_code = _run_direct_mode(cmd_str, target_name, upload_dir, workspace_id, gpu_id)
|
|
2855
|
-
else:
|
|
2856
|
-
exit_code = _run_api_mode(
|
|
2857
|
-
cmd_str,
|
|
2858
|
-
upload_dir,
|
|
2859
|
-
workspace_id,
|
|
2860
|
-
gpu_id,
|
|
2861
|
-
gpu_count,
|
|
2862
|
-
docker_image,
|
|
2863
|
-
docker_entrypoint,
|
|
2864
|
-
pull_image,
|
|
2865
|
-
require_hwc,
|
|
2866
|
-
)
|
|
2867
|
-
|
|
2868
|
-
raise typer.Exit(exit_code)
|
|
2869
|
-
|
|
2870
|
-
|
|
2871
2591
|
# =============================================================================
|
|
2872
2592
|
# Authentication commands
|
|
2873
2593
|
# =============================================================================
|
|
@@ -6114,7 +5834,7 @@ def ncu_analyze(
|
|
|
6114
5834
|
By default, uses local NCU if available, otherwise runs analysis
|
|
6115
5835
|
remotely via wafer-api (requires authentication: wafer auth login).
|
|
6116
5836
|
|
|
6117
|
-
Use --target for direct SSH mode
|
|
5837
|
+
Use --target for direct SSH mode.
|
|
6118
5838
|
Use --include-source to fetch SASS assembly with register/instruction data.
|
|
6119
5839
|
|
|
6120
5840
|
Examples:
|
|
@@ -7988,7 +7708,7 @@ def compare_fusion_cmd(
|
|
|
7988
7708
|
wafer compare fusion amd_trace.json nvidia_trace.json --format csv -o fusion.csv
|
|
7989
7709
|
"""
|
|
7990
7710
|
from .trace_compare import compare_align
|
|
7991
|
-
|
|
7711
|
+
|
|
7992
7712
|
compare_align(
|
|
7993
7713
|
trace1=trace1,
|
|
7994
7714
|
trace2=trace2,
|
|
@@ -8042,7 +7762,7 @@ def compare_align_cmd(
|
|
|
8042
7762
|
wafer compare align amd_trace.json nvidia_trace.json --layer 5
|
|
8043
7763
|
"""
|
|
8044
7764
|
from .trace_compare import compare_align
|
|
8045
|
-
|
|
7765
|
+
|
|
8046
7766
|
compare_align(
|
|
8047
7767
|
trace1=trace1,
|
|
8048
7768
|
trace2=trace2,
|