wafer-cli 0.2.26.tar.gz → 0.2.27.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/PKG-INFO +1 -1
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/pyproject.toml +1 -1
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/cli.py +133 -33
- wafer_cli-0.2.27/wafer/specs_cli.py +157 -0
- wafer_cli-0.2.27/wafer/targets_cli.py +472 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/targets_ops.py +29 -2
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer_cli.egg-info/PKG-INFO +1 -1
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer_cli.egg-info/SOURCES.txt +2 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/README.md +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/setup.cfg +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_analytics.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_auth.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_billing.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_cli_coverage.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_cli_parity_integration.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_config_integration.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_file_operations_integration.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_kernel_scope_cli.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_nsys_analyze.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_nsys_profile.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_output.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_rocprof_compute_integration.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_skill_commands.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_ssh_integration.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_targets_ops.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_wevin_cli.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/tests/test_workflow_integration.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/GUIDE.md +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/__init__.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/agent_defaults.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/analytics.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/api_client.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/auth.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/autotuner.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/billing.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/cli_instructions.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/config.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/corpus.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/evaluate.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/global_config.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/gpu_run.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/inference.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/kernel_scope.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/ncu_analyze.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/nsys_analyze.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/nsys_profile.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/output.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/problems.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/rocprof_compute.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/rocprof_sdk.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/rocprof_systems.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/skills/wafer-guide/SKILL.md +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/ssh_keys.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/target_lock.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/targets.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/templates/__init__.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/templates/ask_docs.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/templates/optimize_kernel.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/templates/optimize_kernelbench.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/templates/trace_analyze.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/tests/test_eval_cli_parity.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/trace_compare.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/tracelens.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/wevin_cli.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/workspaces.py +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer_cli.egg-info/dependency_links.txt +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer_cli.egg-info/entry_points.txt +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer_cli.egg-info/requires.txt +0 -0
- {wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer_cli.egg-info/top_level.txt +0 -0
{wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/cli.py

```diff
@@ -268,6 +268,32 @@ Configure targets with: wafer config targets init ..."""
 )
 app.add_typer(targets_ops_app, name="targets", rich_help_panel="Infrastructure")
 
+# Specs management (new: local TOML configs)
+from wafer.specs_cli import specs_app
+
+app.add_typer(specs_app, name="specs", rich_help_panel="Configuration")
+
+# Live resource management (new: API-backed commands on `wafer targets`)
+# These become: wafer targets list, wafer targets terminate, etc.
+from wafer.targets_cli import (
+    targets_list as _targets_list_cmd,
+)
+from wafer.targets_cli import (
+    targets_provision as _targets_provision_cmd,
+)
+from wafer.targets_cli import (
+    targets_reconcile as _targets_reconcile_cmd,
+)
+from wafer.targets_cli import (
+    targets_terminate as _targets_terminate_cmd,
+)
+from wafer.targets_cli import (
+    targets_pools as _targets_pools_cmd,
+)
+from wafer.targets_cli import (
+    targets_probe as _targets_probe_cmd,
+)
+
 # Billing management - nested under config
 billing_app = typer.Typer(help="Manage billing, credits, and subscription")
 config_app.add_typer(billing_app, name="billing")
```
```diff
@@ -612,7 +638,9 @@ def skill_status() -> None:
 auth_app = typer.Typer(help="Authenticate with Wafer and cloud GPU providers")
 app.add_typer(auth_app, name="auth", rich_help_panel="Configuration")
 
-providers_app = typer.Typer(help="Manage API keys for cloud GPU providers (RunPod, DigitalOcean, etc.)")
+providers_app = typer.Typer(
+    help="Manage API keys for cloud GPU providers (RunPod, DigitalOcean, etc.)"
+)
 auth_app.add_typer(providers_app, name="providers")
 
 
```
```diff
@@ -1813,6 +1841,93 @@ def kernelbench_list_problems() -> None:
     raise typer.Exit(1) from None
 
 
+def _resolve_pool_query(pool: str, collector) -> tuple[str, object]:
+    """Resolve a PoolQuery pool to a target spec name + lock context.
+
+    Queries live providers, matches by pool query, locks one target,
+    returns (spec_name, lock_context) for the evaluator.
+    """
+    import trio
+    from wafer_core.targets.pool import resolve_pool
+
+    from .target_lock import acquire_from_pool
+
+    matched_targets = trio.run(resolve_pool, pool)
+
+    if not matched_targets:
+        collector.set_error("pool", "NoMatchingTargets", pool=pool)
+        collector.finalize()
+        raise typer.Exit(1)
+
+    # Filter to targets with a spec (evaluator needs spec fields)
+    spec_targets = [t for t in matched_targets if t.spec_name]
+    if not spec_targets:
+        collector.set_error(
+            "pool", "NoSpecTargets", pool=pool,
+            message="Matched targets have no spec binding — evaluator needs spec fields",
+        )
+        collector.finalize()
+        raise typer.Exit(1)
+
+    # Lock one by resource_id
+    resource_ids = [t.resource_id for t in spec_targets]
+    collector.emit("pool_acquire", pool=pool, count=len(resource_ids))
+
+    lock_ctx = acquire_from_pool(resource_ids)
+    acquired_id = lock_ctx.__enter__()
+
+    if acquired_id is None:
+        lock_ctx.__exit__(None, None, None)
+        collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=resource_ids)
+        collector.finalize()
+        raise typer.Exit(1)
+
+    # Map resource_id back to spec_name
+    acquired_target = next(t for t in spec_targets if t.resource_id == acquired_id)
+    spec_name = acquired_target.spec_name
+
+    collector.emit("pool_acquired", target=spec_name, resource_id=acquired_id)
+    return spec_name, lock_ctx
+
+
+def _resolve_pool_legacy(pool: str, collector) -> tuple[str, object]:
+    """Resolve an old-style pool (static target name list) to a target name + lock context.
+
+    Old format: [pools.name] targets = ["t1", "t2"]
+    """
+    from .target_lock import acquire_from_pool
+    from .targets import filter_pool_by_auth, get_pool
+
+    try:
+        pool_targets = get_pool(pool)
+    except FileNotFoundError as e:
+        collector.set_error("pool", "PoolNotFound", pool=pool, message=str(e))
+        collector.finalize()
+        raise typer.Exit(1) from None
+
+    usable_targets, skipped = filter_pool_by_auth(pool_targets)
+    if skipped:
+        collector.emit("pool_auth_skip", targets=skipped)
+
+    if not usable_targets:
+        collector.set_error("pool", "NoUsableTargets", pool=pool)
+        collector.finalize()
+        raise typer.Exit(1) from None
+
+    collector.emit("pool_acquire", pool=pool, count=len(usable_targets))
+    lock_ctx = acquire_from_pool(usable_targets)
+    acquired_target = lock_ctx.__enter__()
+
+    if acquired_target is None:
+        lock_ctx.__exit__(None, None, None)
+        collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=usable_targets)
+        collector.finalize()
+        raise typer.Exit(1)
+
+    collector.emit("pool_acquired", target=acquired_target)
+    return acquired_target, lock_ctx
+
+
 @kernelbench_app.callback(invoke_without_command=True)
 def kernelbench_evaluate(  # noqa: PLR0913, PLR0915
     ctx: typer.Context,
```
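A note on the lock lifecycle in the two resolvers above: both call `lock_ctx.__enter__()` themselves and hand the still-open context manager back to the caller, so the caller owns the release. A minimal standalone sketch of that handoff, where `fake_lock` is an illustrative stand-in for `target_lock.acquire_from_pool` (not part of wafer-cli):

```python
from contextlib import contextmanager

@contextmanager
def fake_lock(names):
    # Stand-in for acquire_from_pool: yields one acquired name
    # (the real lock yields None when every target is busy).
    yield names[0]

lock_ctx = fake_lock(["t1", "t2"])
acquired = lock_ctx.__enter__()  # the resolver performs this step itself
try:
    print(f"evaluating on {acquired}")
finally:
    lock_ctx.__exit__(None, None, None)  # the caller must release the lock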
```diff
@@ -1943,39 +2058,12 @@ def kernelbench_evaluate(  # noqa: PLR0913, PLR0915
     pool_lock_context = None
 
     if pool:
-        from .target_lock import acquire_from_pool
-        from .targets import filter_pool_by_auth, get_pool
-
-        try:
-            pool_targets = get_pool(pool)
-        except FileNotFoundError as e:
-            collector.set_error("pool", "PoolNotFound", pool=pool, message=str(e))
-            collector.finalize()
-            raise typer.Exit(1) from None
+        from wafer_core.targets.pool import is_query_pool
 
-        usable_targets, skipped = filter_pool_by_auth(pool_targets)
-        if skipped:
-            collector.emit("pool_auth_skip", targets=skipped)
-
-
-        if not usable_targets:
-            collector.set_error("pool", "NoUsableTargets", pool=pool)
-            collector.finalize()
-            raise typer.Exit(1) from None
-
-        collector.emit("pool_acquire", pool=pool, count=len(usable_targets))
-        pool_lock_context = acquire_from_pool(usable_targets)
-        acquired_target = pool_lock_context.__enter__()
-
-        if acquired_target is None:
-            # Exit context manager before raising to avoid resource leak
-            pool_lock_context.__exit__(None, None, None)
-            collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=usable_targets)
-            collector.finalize()
-            raise typer.Exit(1)
-
-        collector.emit("pool_acquired", target=acquired_target)
-        resolved_target = acquired_target
+        if is_query_pool(pool):
+            resolved_target, pool_lock_context = _resolve_pool_query(pool, collector)
+        else:
+            resolved_target, pool_lock_context = _resolve_pool_legacy(pool, collector)
 
     collector.target = resolved_target
 
```
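The `is_query_pool` dispatch above separates the two pool styles documented elsewhere in this diff: the legacy static list (see the `_resolve_pool_legacy` docstring) and the newer query form (see the `wafer targets pools` help). A sketch of the two shapes parsed with the standard library; how `is_query_pool` actually distinguishes them is not shown in this diff (presumably by the presence of a static `targets` list, which is an assumption):

```python
import tomllib

config = tomllib.loads("""
[pools.legacy-pool]              # old style: static target-name list
targets = ["t1", "t2"]

[pools.mi300x-rocm7]             # new style: query matched against live targets
gpu_type = "MI300X"
[pools.mi300x-rocm7.labels]
rocm_version = "7.0.2"
""")
print(config["pools"]["legacy-pool"]["targets"])    # ['t1', 't2']
print(config["pools"]["mi300x-rocm7"]["gpu_type"])  # MI300X
```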
|
|
@@ -5254,6 +5342,18 @@ def workspaces_pull(
|
|
|
5254
5342
|
raise typer.Exit(1) from None
|
|
5255
5343
|
|
|
5256
5344
|
|
|
5345
|
+
# =============================================================================
|
|
5346
|
+
# Live resource commands (list/terminate/reconcile/provision)
|
|
5347
|
+
# =============================================================================
|
|
5348
|
+
|
|
5349
|
+
targets_ops_app.command("list")(_targets_list_cmd)
|
|
5350
|
+
targets_ops_app.command("terminate")(_targets_terminate_cmd)
|
|
5351
|
+
targets_ops_app.command("reconcile")(_targets_reconcile_cmd)
|
|
5352
|
+
targets_ops_app.command("provision")(_targets_provision_cmd)
|
|
5353
|
+
targets_ops_app.command("pools")(_targets_pools_cmd)
|
|
5354
|
+
targets_ops_app.command("probe")(_targets_probe_cmd)
|
|
5355
|
+
|
|
5356
|
+
|
|
5257
5357
|
# =============================================================================
|
|
5258
5358
|
# Target operations commands (exec/ssh/sync)
|
|
5259
5359
|
# =============================================================================
|
|
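The registrations above use Typer's decorator factory as a plain call: `app.command(name)(fn)` is equivalent to decorating `fn` with `@app.command(name)`, which lets the command functions live in `targets_cli.py` while being attached to the existing `targets` app here. A minimal standalone illustration (the `hello` app is hypothetical, not part of wafer-cli):

```python
import typer

app = typer.Typer()

def hello(name: str) -> None:
    typer.echo(f"Hello {name}")

# Same effect as defining hello under @app.command("hello"):
app.command("hello")(hello)

if __name__ == "__main__":
    app()
```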
wafer_cli-0.2.27/wafer/specs_cli.py

```diff
@@ -0,0 +1,157 @@
+"""CLI commands for wafer specs — TargetSpec TOML management.
+
+These are the local config commands (no API calls).
+Registered as: wafer specs list|show|add|remove|default|init
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import typer
+
+specs_app = typer.Typer(
+    help="""Manage GPU target specs (provisioning blueprints).
+
+Specs define how to access or provision GPUs. They are TOML files in ~/.wafer/specs/.
+
+    wafer specs list                    # List all specs
+    wafer specs show runpod-mi300x      # Show one spec
+    wafer specs add /path/to/spec.toml  # Add from file
+    wafer specs remove old-target       # Remove a spec
+    wafer specs default runpod-mi300x   # Set default
+
+To create a new spec interactively:
+    wafer config targets init ssh       # (legacy, still works)
+    wafer config targets init runpod
+"""
+)
+
+
+@specs_app.command("list")
+def specs_list() -> None:
+    """List all configured specs.
+
+    Example:
+        wafer specs list
+    """
+    from wafer_core.targets.spec_store import list_spec_names, load_spec
+
+    from .targets import get_default_target
+
+    names = list_spec_names()
+    default = get_default_target()
+
+    if not names:
+        typer.echo("No specs configured.")
+        typer.echo("Add one with: wafer specs add <path/to/spec.toml>")
+        typer.echo("Or interactively: wafer config targets init ssh")
+        return
+
+    typer.echo("Configured specs:")
+    for name in names:
+        marker = " (default)" if name == default else ""
+        try:
+            spec = load_spec(name)
+            type_name = type(spec).__name__.replace("Target", "")
+            typer.echo(f"  {name}{marker} [{type_name}] gpu={spec.gpu_type}")
+        except Exception as e:
+            typer.echo(f"  {name}{marker} [error: {e}]")
+
+
+@specs_app.command("show")
+def specs_show(
+    name: str = typer.Argument(..., help="Spec name"),
+) -> None:
+    """Show details for a spec.
+
+    Example:
+        wafer specs show runpod-mi300x
+    """
+    from wafer_core.targets.spec_store import load_spec
+
+    from .targets import get_target_info
+
+    try:
+        spec = load_spec(name)
+    except FileNotFoundError:
+        typer.echo(f"Spec not found: {name}", err=True)
+        raise typer.Exit(1) from None
+
+    typer.echo(f"Spec: {name}")
+    for key, value in get_target_info(spec).items():
+        typer.echo(f"  {key}: {value}")
+
+
+@specs_app.command("add")
+def specs_add(
+    file_path: Path = typer.Argument(..., help="Path to TOML spec file"),
+) -> None:
+    """Add a spec from a TOML file.
+
+    Example:
+        wafer specs add ./my-target.toml
+    """
+    import tomllib
+
+    from wafer_core.targets.spec_store import parse_spec, save_spec
+
+    if not file_path.exists():
+        typer.echo(f"File not found: {file_path}", err=True)
+        raise typer.Exit(1) from None
+
+    try:
+        with open(file_path, "rb") as f:
+            data = tomllib.load(f)
+        spec = parse_spec(data)
+        save_spec(spec)
+        typer.echo(f"Added spec: {spec.name}")
+    except Exception as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+
+
+@specs_app.command("remove")
+def specs_remove(
+    name: str = typer.Argument(..., help="Spec name to remove"),
+    force: bool = typer.Option(False, "--force", "-f", help="Skip confirmation"),
+) -> None:
+    """Remove a spec.
+
+    Example:
+        wafer specs remove old-target
+    """
+    from wafer_core.targets.spec_store import remove_spec
+
+    if not force:
+        confirm = typer.confirm(f"Remove spec '{name}'?")
+        if not confirm:
+            return
+
+    try:
+        remove_spec(name)
+        typer.echo(f"Removed spec: {name}")
+    except FileNotFoundError:
+        typer.echo(f"Spec not found: {name}", err=True)
+        raise typer.Exit(1) from None
+
+
+@specs_app.command("default")
+def specs_default(
+    name: str = typer.Argument(..., help="Spec name to set as default"),
+) -> None:
+    """Set the default spec.
+
+    Example:
+        wafer specs default runpod-mi300x
+    """
+    from wafer_core.targets.spec_store import list_spec_names
+
+    from .targets import set_default_target
+
+    if name not in list_spec_names():
+        typer.echo(f"Spec not found: {name}", err=True)
+        raise typer.Exit(1) from None
+
+    set_default_target(name)
+    typer.echo(f"Default spec set to: {name}")
```
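`specs_add` above reads the file in binary mode (as `tomllib` requires) and delegates validation to `wafer_core.targets.spec_store.parse_spec`, whose schema is not shown in this diff. Purely for illustration, a spec file might carry fields like the ones below; every key here is an assumption except `name` and `gpu_type`, which the commands above read off the parsed spec:

```python
import tomllib

# Hypothetical spec contents — the real schema lives in wafer_core's parse_spec.
example = tomllib.loads("""
name = "runpod-mi300x"   # read by specs_add via spec.name
gpu_type = "MI300X"      # read by specs_list via spec.gpu_type
type = "runpod"          # assumed discriminator field, not confirmed by this diff
""")
print(example["name"])  # runpod-mi300x
```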
wafer_cli-0.2.27/wafer/targets_cli.py

```diff
@@ -0,0 +1,472 @@
+"""CLI commands for wafer targets — live resource management.
+
+These commands always hit provider APIs to show real state.
+Registered as: wafer targets list|show|terminate|sync|provision
+"""
+
+from __future__ import annotations
+
+from datetime import UTC, datetime
+
+import typer
+
+targets_live_app = typer.Typer(
+    name="targets",
+    help="""Manage live GPU resources across cloud providers.
+
+Unlike 'wafer specs' (local config files), these commands query provider APIs
+to show what's actually running.
+
+    wafer targets list                     # All running resources
+    wafer targets list --unbound           # Orphans (no matching spec)
+    wafer targets list --provider runpod   # Filter by provider
+    wafer targets terminate <resource-id>  # Kill a resource
+    wafer targets terminate --unbound      # Kill all orphans
+    wafer targets sync                     # Refresh bindings
+    wafer targets provision <spec-name>    # Provision from a spec
+""",
+)
+
+
+@targets_live_app.command("list")
+def targets_list(
+    provider: str | None = typer.Option(None, "--provider", "-p", help="Filter by provider"),
+    pool: str | None = typer.Option(None, "--pool", help="Filter by pool query from config.toml"),
+) -> None:
+    """List all running GPU resources across providers.
+
+    Queries RunPod and DigitalOcean APIs to show live state.
+
+    Examples:
+        wafer targets list
+        wafer targets list --provider runpod
+        wafer targets list --pool mi300x-rocm7
+    """
+    import trio
+    from wafer_core.targets.providers import get_all_cloud_providers, get_provider
+    from wafer_core.targets.types import Target, TargetProvider
+
+    async def _list() -> list[Target]:
+        all_targets: list[Target] = []
+
+        if provider:
+            prov = get_provider(provider)
+            all_targets = await prov.list_targets()
+        else:
+            providers = get_all_cloud_providers()
+
+            async def _fetch(prov_impl: TargetProvider, results: list[Target]) -> None:
+                try:
+                    targets = await prov_impl.list_targets()
+                    results.extend(targets)
+                except Exception as e:
+                    typer.echo(
+                        f"  Warning: failed to query {type(prov_impl).__name__}: {e}", err=True
+                    )
+
+            async with trio.open_nursery() as nursery:
+                for _, prov_impl in providers:
+                    nursery.start_soon(_fetch, prov_impl, all_targets)
+
+        return all_targets
+
+    all_targets = trio.run(_list)
+
+    # Hydrate targets with cached labels
+    from dataclasses import replace
+    from wafer_core.targets.state_cache import load_all_labels
+
+    cached_labels = load_all_labels()
+    all_targets = [
+        replace(t, labels=cached_labels[t.resource_id])
+        if t.resource_id in cached_labels
+        else t
+        for t in all_targets
+    ]
+
+    # Apply pool filter if specified
+    if pool:
+        from wafer_core.targets.pool import load_pool_query, match_targets
+
+        try:
+            query = load_pool_query(pool)
+        except KeyError as e:
+            typer.echo(str(e), err=True)
+            raise typer.Exit(1) from None
+
+        all_targets = match_targets(query, all_targets)
+        typer.echo(f"Pool {pool!r}: {len(all_targets)} matching target(s)\n")
+
+    if not all_targets:
+        typer.echo("No running resources found.")
+        return
+
+    typer.echo(f"{len(all_targets)} resource(s):\n")
+    for target in all_targets:
+        _print_target(target)
+
+
+def _print_target(target: Target) -> None:
+    """Print a single target's info."""
+    ssh_info = ""
+    if target.public_ip and target.ssh_port:
+        ssh_info = f" ssh={target.ssh_username}@{target.public_ip}:{target.ssh_port}"
+
+    name_part = f" name={target.name}" if target.name else ""
+    spec_part = f" spec={target.spec_name}" if target.spec_name else ""
+    price_part = f" ${target.price_per_hour:.2f}/hr" if target.price_per_hour else ""
+
+    # Show interesting labels (skip 'image' — too long)
+    label_keys = sorted(k for k in target.labels if k != "image")
+    labels_part = ""
+    if label_keys:
+        labels_part = " " + " ".join(f"{k}={target.labels[k]}" for k in label_keys)
+
+    typer.echo(
+        f"  {target.resource_id} [{target.provider}] "
+        f"status={target.status} gpu={target.gpu_type}"
+        f"{spec_part}{name_part}{ssh_info}{price_part}{labels_part}"
+    )
+    typer.echo()
+
+
+@targets_live_app.command("terminate")
+def targets_terminate(
+    resource_id: str | None = typer.Argument(None, help="Resource ID to terminate"),
+    pool_name: str | None = typer.Option(
+        None, "--pool", help="Terminate all targets matching a pool query"
+    ),
+    provider_name: str | None = typer.Option(
+        None, "--provider", "-p", help="Provider hint (avoids querying all providers)"
+    ),
+    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
+) -> None:
+    """Terminate a running resource by ID, or all targets matching a pool query.
+
+    Examples:
+        wafer targets terminate tkru24z7npcgth
+        wafer targets terminate --pool mi300x --yes
+        wafer targets terminate --pool runpod-only --provider runpod
+    """
+    import trio
+    from wafer_core.targets.providers import get_all_cloud_providers, get_provider
+    from wafer_core.targets.state_cache import remove_binding
+
+    if pool_name:
+        _terminate_pool(pool_name, provider_name, yes)
+        return
+
+    if not resource_id:
+        typer.echo("Provide a resource ID or use --pool <name>.", err=True)
+        raise typer.Exit(1)
+
+    async def _terminate() -> bool:
+        if provider_name:
+            prov = get_provider(provider_name)
+            return await prov.terminate(resource_id)
+
+        for name, prov in get_all_cloud_providers():
+            target = await prov.get_target(resource_id)
+            if target is not None:
+                success = await prov.terminate(resource_id)
+                if success:
+                    remove_binding(resource_id)
+                    typer.echo(f"Terminated {resource_id} ({name})")
+                return success
+
+        typer.echo(f"Resource {resource_id} not found on any provider.", err=True)
+        return False
+
+    success = trio.run(_terminate)
+    if not success:
+        raise typer.Exit(1)
+
+
+def _terminate_pool(pool_name: str, provider_name: str | None, yes: bool) -> None:
+    """Terminate all targets matching a pool query."""
+    import trio
+    from wafer_core.targets.pool import load_pool_query, match_targets
+    from wafer_core.targets.providers import get_all_cloud_providers, get_provider
+    from wafer_core.targets.state_cache import remove_binding
+    from wafer_core.targets.types import Target
+
+    try:
+        query = load_pool_query(pool_name)
+    except KeyError as e:
+        typer.echo(str(e), err=True)
+        raise typer.Exit(1) from None
+
+    async def _do_terminate() -> int:
+        all_targets: list[Target] = []
+        if provider_name:
+            prov = get_provider(provider_name)
+            all_targets = await prov.list_targets()
+        else:
+            for _, prov in get_all_cloud_providers():
+                try:
+                    all_targets.extend(await prov.list_targets())
+                except Exception:
+                    pass
+
+        matched = match_targets(query, all_targets)
+
+        if not matched:
+            typer.echo(f"No targets match pool {pool_name!r}.")
+            return 0
+
+        typer.echo(f"Found {len(matched)} target(s) matching pool {pool_name!r}:")
+        for t in matched:
+            name_part = f" name={t.name}" if t.name else ""
+            typer.echo(f"  {t.resource_id} [{t.provider}] gpu={t.gpu_type}{name_part}")
+
+        if not yes:
+            confirm = typer.confirm("Terminate all?")
+            if not confirm:
+                return 0
+
+        count = 0
+        for t in matched:
+            prov = get_provider(t.provider)
+            if await prov.terminate(t.resource_id):
+                remove_binding(t.resource_id)
+                typer.echo(f"  Terminated {t.resource_id}")
+                count += 1
+            else:
+                typer.echo(f"  Failed to terminate {t.resource_id}", err=True)
+
+        return count
+
+    count = trio.run(_do_terminate)
+    typer.echo(f"\nTerminated {count} resource(s).")
+
+
+@targets_live_app.command("reconcile")
+def targets_reconcile() -> None:
+    """Refresh local binding cache from provider APIs.
+
+    Queries all cloud providers, matches resources to specs, and updates
+    the local state cache. Reports any drift.
+
+    Example:
+        wafer targets reconcile
+    """
+    import trio
+    from wafer_core.targets.providers import get_all_cloud_providers
+    from wafer_core.targets.reconcile import reconcile
+    from wafer_core.targets.spec_store import load_all_specs
+    from wafer_core.targets.state_cache import (
+        BindingEntry,
+        get_binding_hints,
+        save_bindings,
+    )
+    from wafer_core.targets.types import Target
+
+    async def _sync() -> None:
+        specs = load_all_specs()
+
+        all_targets: list[Target] = []
+        for name, prov in get_all_cloud_providers():
+            typer.echo(f"Querying {name}...")
+            try:
+                targets = await prov.list_targets()
+                typer.echo(f"  Found {len(targets)} resource(s)")
+                all_targets.extend(targets)
+            except Exception as e:
+                typer.echo(f"  Failed: {e}", err=True)
+
+        hints = get_binding_hints()
+        result = reconcile(specs, all_targets, binding_hints=hints)
+
+        # Update binding cache with bound results
+        new_bindings = {}
+        now = datetime.now(UTC).isoformat()
+        for spec, target in result.bound:
+            new_bindings[target.resource_id] = BindingEntry(
+                spec_name=spec.name,
+                provider=target.provider,
+                bound_at=now,
+            )
+        save_bindings(new_bindings)
+
+        typer.echo("\nSync complete:")
+        typer.echo(f"  Total resources: {len(all_targets)}")
+        typer.echo(f"  Matched to specs: {len(result.bound)}")
+        typer.echo(f"  No matching spec: {len(result.unbound)}")
+
+    trio.run(_sync)
+
+
+@targets_live_app.command("provision")
+def targets_provision(
+    spec_name: str = typer.Argument(..., help="Spec name to provision from"),
+) -> None:
+    """Explicitly provision a resource from a spec.
+
+    Creates a new cloud resource and binds it to the spec.
+
+    Example:
+        wafer targets provision runpod-mi300x
+    """
+    import trio
+    from wafer_core.targets.providers import get_provider
+    from wafer_core.targets.spec_store import load_spec
+    from wafer_core.targets.state_cache import BindingEntry, add_binding
+    from wafer_core.utils.kernel_utils.targets.config import (
+        DigitalOceanTarget,
+        RunPodTarget,
+    )
+
+    try:
+        spec = load_spec(spec_name)
+    except FileNotFoundError:
+        typer.echo(f"Spec not found: {spec_name}", err=True)
+        raise typer.Exit(1) from None
+
+    if isinstance(spec, RunPodTarget):
+        provider_name = "runpod"
+    elif isinstance(spec, DigitalOceanTarget):
+        provider_name = "digitalocean"
+    else:
+        typer.echo(f"Spec type {type(spec).__name__} cannot be provisioned.", err=True)
+        raise typer.Exit(1) from None
+
+    async def _provision() -> None:
+        from wafer_core.targets.probe import probe_target_labels
+        from wafer_core.targets.state_cache import save_labels
+
+        prov = get_provider(provider_name)
+        typer.echo(f"Provisioning {spec_name} via {provider_name}...")
+        target = await prov.provision(spec)
+
+        # Cache the binding
+        add_binding(
+            target.resource_id,
+            BindingEntry(
+                spec_name=spec_name,
+                provider=provider_name,
+                bound_at=datetime.now(UTC).isoformat(),
+            ),
+        )
+
+        typer.echo(f"\nProvisioned: {target.resource_id}")
+        if target.public_ip:
+            typer.echo(f"  SSH: {target.ssh_username}@{target.public_ip}:{target.ssh_port}")
+
+        # Probe software labels (sync — runs subprocess ssh)
+        if target.public_ip and target.ssh_port:
+            typer.echo("  Probing software versions...")
+            try:
+                ssh_key = spec.ssh_key if hasattr(spec, "ssh_key") else None
+                labels = probe_target_labels(
+                    host=target.public_ip,
+                    port=target.ssh_port,
+                    username=target.ssh_username,
+                    ssh_key_path=ssh_key,
+                )
+                save_labels(target.resource_id, labels)
+                if labels:
+                    typer.echo(f"  Labels: {' '.join(f'{k}={v}' for k, v in sorted(labels.items()))}")
+            except Exception as e:
+                typer.echo(f"  Warning: probe failed: {e}", err=True)
+
+    trio.run(_provision)
+
+
+@targets_live_app.command("pools")
+def targets_pools() -> None:
+    """List configured pool queries from config.toml.
+
+    Example:
+        wafer targets pools
+    """
+    from wafer_core.targets.pool import list_pool_names, load_pool_query
+
+    names = list_pool_names()
+    if not names:
+        typer.echo("No pools configured in ~/.wafer/config.toml.")
+        typer.echo("\nAdd a pool:\n")
+        typer.echo("  [pools.mi300x]")
+        typer.echo('  gpu_type = "MI300X"')
+        typer.echo("")
+        typer.echo("  [pools.mi300x-rocm7]")
+        typer.echo('  gpu_type = "MI300X"')
+        typer.echo("  [pools.mi300x-rocm7.labels]")
+        typer.echo('  rocm_version = "7.0.2"')
+        return
+
+    typer.echo(f"{len(names)} pool(s):\n")
+    for name in names:
+        query = load_pool_query(name)
+        parts = []
+        if query.gpu_type:
+            parts.append(f"gpu_type={query.gpu_type}")
+        if query.provider:
+            parts.append(f"provider={query.provider}")
+        if query.status and query.status != "running":
+            parts.append(f"status={query.status}")
+        for k, v in sorted(query.labels.items()):
+            parts.append(f"{k}={v}")
+        criteria = " ".join(parts) if parts else "(match all)"
+        typer.echo(f"  {name}: {criteria}")
+
+
+@targets_live_app.command("probe")
+def targets_probe(
+    resource_id: str = typer.Argument(..., help="Resource ID to probe"),
+    provider_name: str | None = typer.Option(
+        None, "--provider", "-p", help="Provider hint (avoids querying all providers)"
+    ),
+) -> None:
+    """Probe a running target's software versions via SSH.
+
+    Results are cached in ~/.wafer/target_state.json and shown
+    by wafer targets list. Used for targets not provisioned by wafer
+    (e.g. dashboard-created pods).
+
+    Examples:
+        wafer targets probe ewfo5ckpxlg7y2
+        wafer targets probe 543538453 --provider digitalocean
+    """
+    import trio
+    from wafer_core.targets.probe import probe_target_labels
+    from wafer_core.targets.providers import get_all_cloud_providers, get_provider
+    from wafer_core.targets.state_cache import save_labels
+
+    # Find the target (async — needs provider API)
+    async def _find_target():
+        if provider_name:
+            prov = get_provider(provider_name)
+            return await prov.get_target(resource_id)
+
+        for _, prov in get_all_cloud_providers():
+            target = await prov.get_target(resource_id)
+            if target is not None:
+                return target
+        return None
+
+    target = trio.run(_find_target)
+
+    if target is None:
+        typer.echo(f"Resource {resource_id} not found.", err=True)
+        raise typer.Exit(1)
+
+    if not target.public_ip or not target.ssh_port:
+        typer.echo(f"Resource {resource_id} has no SSH info (status={target.status}).", err=True)
+        raise typer.Exit(1)
+
+    typer.echo(f"Probing {resource_id} ({target.ssh_username}@{target.public_ip}:{target.ssh_port})...")
+
+    labels = probe_target_labels(
+        host=target.public_ip,
+        port=target.ssh_port,
+        username=target.ssh_username,
+    )
+
+    save_labels(resource_id, labels)
+
+    if labels:
+        typer.echo(f"Labels cached for {resource_id}:")
+        for k, v in sorted(labels.items()):
+            typer.echo(f"  {k}={v}")
+    else:
+        typer.echo("Probe returned no labels.")
```
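`targets_list` above fans out one task per provider under a `trio` nursery, with every task appending into a shared list. Because trio tasks are cooperatively scheduled on a single thread, the shared list needs no lock, and leaving the nursery block guarantees all fetches have finished before the list is read. A self-contained sketch of the same pattern with dummy fetchers (illustrative only, not wafer-cli code):

```python
import trio

async def fetch(i: int, results: list[int]) -> None:
    await trio.sleep(0.01)  # stand-in for a provider API call
    results.append(i)

async def main() -> list[int]:
    results: list[int] = []
    async with trio.open_nursery() as nursery:
        for i in range(3):
            nursery.start_soon(fetch, i, results)
    # The nursery block only exits once every task has completed.
    return results

print(sorted(trio.run(main)))  # [0, 1, 2]
```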
{wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer/targets_ops.py

```diff
@@ -15,6 +15,7 @@ import logging
 import subprocess
 from collections.abc import Callable
 from dataclasses import dataclass, replace
+from datetime import UTC
 from pathlib import Path
 from typing import TYPE_CHECKING
 
```
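One portability note on this import: `datetime.UTC` (used by the new `_update_binding_cache` in the next hunk) only exists on Python 3.11+; older interpreters spell it `datetime.timezone.utc`. For example:

```python
from datetime import UTC, datetime  # UTC requires Python 3.11+

# Timezone-aware ISO-8601 timestamp, as stored in bound_at below.
print(datetime.now(UTC).isoformat())  # e.g. 2026-01-01T12:00:00.123456+00:00
```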
```diff
@@ -30,6 +31,26 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 
+def _update_binding_cache(resource_id: str, spec_name: str, provider: str) -> None:
+    """Update the new target state cache when provisioning through the legacy path.
+
+    This bridges the old per-provider state files with the new unified cache
+    so that `wafer targets list` can see resources provisioned via the old flow.
+    """
+    from datetime import datetime
+
+    from wafer_core.targets.state_cache import BindingEntry, add_binding
+
+    add_binding(
+        resource_id,
+        BindingEntry(
+            spec_name=spec_name,
+            provider=provider,
+            bound_at=datetime.now(UTC).isoformat(),
+        ),
+    )
+
+
 @dataclass(frozen=True)
 class TargetSSHInfo:
     """SSH connection info for a target."""
@@ -135,7 +156,8 @@ async def _get_runpod_ssh_info(target: RunPodTarget) -> TargetSSHInfo:
     # Check if pod already exists and is running
     existing = get_pod_state(target.name)
     if existing and await check_pod_running(existing.pod_id):
-        # Reuse existing pod
+        # Reuse existing pod — also update the new state cache
+        _update_binding_cache(existing.pod_id, target.name, "runpod")
         return TargetSSHInfo(
             host=existing.public_ip,
             port=existing.ssh_port,
@@ -151,6 +173,8 @@ async def _get_runpod_ssh_info(target: RunPodTarget) -> TargetSSHInfo:
     target_keep_alive = replace(target, keep_alive=True)
 
     async with runpod_ssh_context(target_keep_alive) as ssh_info:
+        # Update new state cache with provisioned pod
+        _update_binding_cache(ssh_info.pod_id, target.name, "runpod")
         return TargetSSHInfo(
             host=ssh_info.host,
             port=ssh_info.port,
@@ -172,7 +196,8 @@ async def _get_digitalocean_ssh_info(target: DigitalOceanTarget) -> TargetSSHInfo:
     # Check if droplet already exists and is running
     existing = get_droplet_state(target.name)
     if existing and await check_droplet_running(existing.droplet_id):
-        # Reuse existing droplet
+        # Reuse existing droplet — also update the new state cache
+        _update_binding_cache(existing.droplet_id, target.name, "digitalocean")
         return TargetSSHInfo(
             host=existing.public_ip,
             port=22,  # DigitalOcean uses standard SSH port
@@ -184,6 +209,8 @@ async def _get_digitalocean_ssh_info(target: DigitalOceanTarget) -> TargetSSHInfo:
     target_keep_alive = replace(target, keep_alive=True)
 
     async with digitalocean_ssh_context(target_keep_alive) as ssh_info:
+        # Update new state cache with provisioned droplet
+        _update_binding_cache(ssh_info.droplet_id, target.name, "digitalocean")
         return TargetSSHInfo(
             host=ssh_info.host,
             port=ssh_info.port,
```
{wafer_cli-0.2.26 → wafer_cli-0.2.27}/wafer_cli.egg-info/SOURCES.txt

```diff
@@ -42,9 +42,11 @@ wafer/problems.py
 wafer/rocprof_compute.py
 wafer/rocprof_sdk.py
 wafer/rocprof_systems.py
+wafer/specs_cli.py
 wafer/ssh_keys.py
 wafer/target_lock.py
 wafer/targets.py
+wafer/targets_cli.py
 wafer/targets_ops.py
 wafer/trace_compare.py
 wafer/tracelens.py
```