wafer-cli 0.2.26__py3-none-any.whl → 0.2.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/cli.py CHANGED
@@ -268,6 +268,32 @@ Configure targets with: wafer config targets init ..."""
 )
 app.add_typer(targets_ops_app, name="targets", rich_help_panel="Infrastructure")
 
+# Specs management (new: local TOML configs)
+from wafer.specs_cli import specs_app
+
+app.add_typer(specs_app, name="specs", rich_help_panel="Configuration")
+
+# Live resource management (new: API-backed commands on `wafer targets`)
+# These become: wafer targets list, wafer targets terminate, etc.
+from wafer.targets_cli import (
+    targets_list as _targets_list_cmd,
+)
+from wafer.targets_cli import (
+    targets_provision as _targets_provision_cmd,
+)
+from wafer.targets_cli import (
+    targets_reconcile as _targets_reconcile_cmd,
+)
+from wafer.targets_cli import (
+    targets_terminate as _targets_terminate_cmd,
+)
+from wafer.targets_cli import (
+    targets_pools as _targets_pools_cmd,
+)
+from wafer.targets_cli import (
+    targets_probe as _targets_probe_cmd,
+)
+
 # Billing management - nested under config
 billing_app = typer.Typer(help="Manage billing, credits, and subscription")
 config_app.add_typer(billing_app, name="billing")
@@ -612,7 +638,9 @@ def skill_status() -> None:
 auth_app = typer.Typer(help="Authenticate with Wafer and cloud GPU providers")
 app.add_typer(auth_app, name="auth", rich_help_panel="Configuration")
 
-providers_app = typer.Typer(help="Manage API keys for cloud GPU providers (RunPod, DigitalOcean, etc.)")
+providers_app = typer.Typer(
+    help="Manage API keys for cloud GPU providers (RunPod, DigitalOcean, etc.)"
+)
 auth_app.add_typer(providers_app, name="providers")
 
 
@@ -1813,6 +1841,93 @@ def kernelbench_list_problems() -> None:
         raise typer.Exit(1) from None
 
 
+def _resolve_pool_query(pool: str, collector) -> tuple[str, object]:
+    """Resolve a PoolQuery pool to a target spec name + lock context.
+
+    Queries live providers, matches by pool query, locks one target,
+    returns (spec_name, lock_context) for the evaluator.
+    """
+    import trio
+    from wafer_core.targets.pool import resolve_pool
+
+    from .target_lock import acquire_from_pool
+
+    matched_targets = trio.run(resolve_pool, pool)
+
+    if not matched_targets:
+        collector.set_error("pool", "NoMatchingTargets", pool=pool)
+        collector.finalize()
+        raise typer.Exit(1)
+
+    # Filter to targets with a spec (evaluator needs spec fields)
+    spec_targets = [t for t in matched_targets if t.spec_name]
+    if not spec_targets:
+        collector.set_error(
+            "pool", "NoSpecTargets", pool=pool,
+            message="Matched targets have no spec binding — evaluator needs spec fields",
+        )
+        collector.finalize()
+        raise typer.Exit(1)
+
+    # Lock one by resource_id
+    resource_ids = [t.resource_id for t in spec_targets]
+    collector.emit("pool_acquire", pool=pool, count=len(resource_ids))
+
+    lock_ctx = acquire_from_pool(resource_ids)
+    acquired_id = lock_ctx.__enter__()
+
+    if acquired_id is None:
+        lock_ctx.__exit__(None, None, None)
+        collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=resource_ids)
+        collector.finalize()
+        raise typer.Exit(1)
+
+    # Map resource_id back to spec_name
+    acquired_target = next(t for t in spec_targets if t.resource_id == acquired_id)
+    spec_name = acquired_target.spec_name
+
+    collector.emit("pool_acquired", target=spec_name, resource_id=acquired_id)
+    return spec_name, lock_ctx
+
+
+def _resolve_pool_legacy(pool: str, collector) -> tuple[str, object]:
+    """Resolve an old-style pool (static target name list) to a target name + lock context.
+
+    Old format: [pools.name] targets = ["t1", "t2"]
+    """
+    from .target_lock import acquire_from_pool
+    from .targets import filter_pool_by_auth, get_pool
+
+    try:
+        pool_targets = get_pool(pool)
+    except FileNotFoundError as e:
+        collector.set_error("pool", "PoolNotFound", pool=pool, message=str(e))
+        collector.finalize()
+        raise typer.Exit(1) from None
+
+    usable_targets, skipped = filter_pool_by_auth(pool_targets)
+    if skipped:
+        collector.emit("pool_auth_skip", targets=skipped)
+
+    if not usable_targets:
+        collector.set_error("pool", "NoUsableTargets", pool=pool)
+        collector.finalize()
+        raise typer.Exit(1) from None
+
+    collector.emit("pool_acquire", pool=pool, count=len(usable_targets))
+    lock_ctx = acquire_from_pool(usable_targets)
+    acquired_target = lock_ctx.__enter__()
+
+    if acquired_target is None:
+        lock_ctx.__exit__(None, None, None)
+        collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=usable_targets)
+        collector.finalize()
+        raise typer.Exit(1)
+
+    collector.emit("pool_acquired", target=acquired_target)
+    return acquired_target, lock_ctx
+
+
 @kernelbench_app.callback(invoke_without_command=True)
 def kernelbench_evaluate(  # noqa: PLR0913, PLR0915
     ctx: typer.Context,
@@ -1943,39 +2058,12 @@ def kernelbench_evaluate( # noqa: PLR0913, PLR0915
     pool_lock_context = None
 
     if pool:
-        from .target_lock import acquire_from_pool
-        from .targets import filter_pool_by_auth, get_pool
-
-        try:
-            pool_targets = get_pool(pool)
-        except FileNotFoundError as e:
-            collector.set_error("pool", "PoolNotFound", pool=pool, message=str(e))
-            collector.finalize()
-            raise typer.Exit(1) from None
+        from wafer_core.targets.pool import is_query_pool
 
-        # Filter to only targets with valid auth
-        usable_targets, skipped = filter_pool_by_auth(pool_targets)
-        if skipped:
-            collector.emit("pool_auth_skip", targets=skipped)
-
-        if not usable_targets:
-            collector.set_error("pool", "NoUsableTargets", pool=pool)
-            collector.finalize()
-            raise typer.Exit(1) from None
-
-        collector.emit("pool_acquire", pool=pool, count=len(usable_targets))
-        pool_lock_context = acquire_from_pool(usable_targets)
-        acquired_target = pool_lock_context.__enter__()
-
-        if acquired_target is None:
-            # Exit context manager before raising to avoid resource leak
-            pool_lock_context.__exit__(None, None, None)
-            collector.set_error("pool", "AllTargetsBusy", pool=pool, targets=usable_targets)
-            collector.finalize()
-            raise typer.Exit(1)
-
-        collector.emit("pool_acquired", target=acquired_target)
-        resolved_target = acquired_target
+        if is_query_pool(pool):
+            resolved_target, pool_lock_context = _resolve_pool_query(pool, collector)
+        else:
+            resolved_target, pool_lock_context = _resolve_pool_legacy(pool, collector)
 
     collector.target = resolved_target
 
@@ -5254,6 +5342,18 @@ def workspaces_pull(
         raise typer.Exit(1) from None
 
 
+# =============================================================================
+# Live resource commands (list/terminate/reconcile/provision)
+# =============================================================================
+
+targets_ops_app.command("list")(_targets_list_cmd)
+targets_ops_app.command("terminate")(_targets_terminate_cmd)
+targets_ops_app.command("reconcile")(_targets_reconcile_cmd)
+targets_ops_app.command("provision")(_targets_provision_cmd)
+targets_ops_app.command("pools")(_targets_pools_cmd)
+targets_ops_app.command("probe")(_targets_probe_cmd)
+
+
 # =============================================================================
 # Target operations commands (exec/ssh/sync)
 # =============================================================================
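The registration block above attaches the new standalone functions to the existing `targets_ops_app` rather than mounting `targets_live_app` as a second sub-app, so `wafer targets <cmd>` keeps a single command tree. A generic sketch of that Typer pattern (names are placeholders, not wafer's):

    # Standalone functions become commands on an existing sub-app, so no
    # second "targets" Typer app needs to be mounted.
    import typer

    app = typer.Typer()
    targets_ops_app = typer.Typer(help="Target operations")
    app.add_typer(targets_ops_app, name="targets")

    def targets_list() -> None:
        """List resources (stub)."""
        typer.echo("listing...")

    targets_ops_app.command("list")(targets_list)  # same call style as the diff

    if __name__ == "__main__":
        app()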
wafer/specs_cli.py ADDED
@@ -0,0 +1,157 @@
+"""CLI commands for wafer specs — TargetSpec TOML management.
+
+These are the local config commands (no API calls).
+Registered as: wafer specs list|show|add|remove|default
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import typer
+
+specs_app = typer.Typer(
+    help="""Manage GPU target specs (provisioning blueprints).
+
+Specs define how to access or provision GPUs. They are TOML files in ~/.wafer/specs/.
+
+    wafer specs list                    # List all specs
+    wafer specs show runpod-mi300x      # Show one spec
+    wafer specs add /path/to/spec.toml  # Add from file
+    wafer specs remove old-target       # Remove a spec
+    wafer specs default runpod-mi300x   # Set default
+
+To create a new spec interactively:
+    wafer config targets init ssh       # (legacy, still works)
+    wafer config targets init runpod
+"""
+)
+
+
+@specs_app.command("list")
+def specs_list() -> None:
+    """List all configured specs.
+
+    Example:
+        wafer specs list
+    """
+    from wafer_core.targets.spec_store import list_spec_names, load_spec
+
+    from .targets import get_default_target
+
+    names = list_spec_names()
+    default = get_default_target()
+
+    if not names:
+        typer.echo("No specs configured.")
+        typer.echo("Add one with: wafer specs add <path/to/spec.toml>")
+        typer.echo("Or interactively: wafer config targets init ssh")
+        return
+
+    typer.echo("Configured specs:")
+    for name in names:
+        marker = " (default)" if name == default else ""
+        try:
+            spec = load_spec(name)
+            type_name = type(spec).__name__.replace("Target", "")
+            typer.echo(f"  {name}{marker} [{type_name}] gpu={spec.gpu_type}")
+        except Exception as e:
+            typer.echo(f"  {name}{marker} [error: {e}]")
+
+
+@specs_app.command("show")
+def specs_show(
+    name: str = typer.Argument(..., help="Spec name"),
+) -> None:
+    """Show details for a spec.
+
+    Example:
+        wafer specs show runpod-mi300x
+    """
+    from wafer_core.targets.spec_store import load_spec
+
+    from .targets import get_target_info
+
+    try:
+        spec = load_spec(name)
+    except FileNotFoundError:
+        typer.echo(f"Spec not found: {name}", err=True)
+        raise typer.Exit(1) from None
+
+    typer.echo(f"Spec: {name}")
+    for key, value in get_target_info(spec).items():
+        typer.echo(f"  {key}: {value}")
+
+
+@specs_app.command("add")
+def specs_add(
+    file_path: Path = typer.Argument(..., help="Path to TOML spec file"),
+) -> None:
+    """Add a spec from a TOML file.
+
+    Example:
+        wafer specs add ./my-target.toml
+    """
+    import tomllib
+
+    from wafer_core.targets.spec_store import parse_spec, save_spec
+
+    if not file_path.exists():
+        typer.echo(f"File not found: {file_path}", err=True)
+        raise typer.Exit(1) from None
+
+    try:
+        with open(file_path, "rb") as f:
+            data = tomllib.load(f)
+        spec = parse_spec(data)
+        save_spec(spec)
+        typer.echo(f"Added spec: {spec.name}")
+    except Exception as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+
+
+@specs_app.command("remove")
+def specs_remove(
+    name: str = typer.Argument(..., help="Spec name to remove"),
+    force: bool = typer.Option(False, "--force", "-f", help="Skip confirmation"),
+) -> None:
+    """Remove a spec.
+
+    Example:
+        wafer specs remove old-target
+    """
+    from wafer_core.targets.spec_store import remove_spec
+
+    if not force:
+        confirm = typer.confirm(f"Remove spec '{name}'?")
+        if not confirm:
+            return
+
+    try:
+        remove_spec(name)
+        typer.echo(f"Removed spec: {name}")
+    except FileNotFoundError:
+        typer.echo(f"Spec not found: {name}", err=True)
+        raise typer.Exit(1) from None
+
+
+@specs_app.command("default")
+def specs_default(
+    name: str = typer.Argument(..., help="Spec name to set as default"),
+) -> None:
+    """Set the default spec.
+
+    Example:
+        wafer specs default runpod-mi300x
+    """
+    from wafer_core.targets.spec_store import list_spec_names
+
+    from .targets import set_default_target
+
+    if name not in list_spec_names():
+        typer.echo(f"Spec not found: {name}", err=True)
+        raise typer.Exit(1) from None
+
+    set_default_target(name)
+    typer.echo(f"Default spec set to: {name}")
wafer/targets_cli.py ADDED
@@ -0,0 +1,472 @@
+"""CLI commands for wafer targets — live resource management.
+
+These commands always hit provider APIs to show real state.
+Registered as: wafer targets list|terminate|reconcile|provision|pools|probe
+"""
+
+from __future__ import annotations
+
+from datetime import UTC, datetime
+
+import typer
+
+targets_live_app = typer.Typer(
+    name="targets",
+    help="""Manage live GPU resources across cloud providers.
+
+Unlike 'wafer specs' (local config files), these commands query provider APIs
+to show what's actually running.
+
+    wafer targets list                    # All running resources
+    wafer targets list --provider runpod  # Filter by provider
+    wafer targets list --pool <name>      # Filter by pool query
+    wafer targets terminate <resource-id> # Kill a resource
+    wafer targets terminate --pool <name> # Kill everything matching a pool
+    wafer targets reconcile               # Refresh bindings
+    wafer targets provision <spec-name>   # Provision from a spec
+""",
+)
+
+
+@targets_live_app.command("list")
+def targets_list(
+    provider: str | None = typer.Option(None, "--provider", "-p", help="Filter by provider"),
+    pool: str | None = typer.Option(None, "--pool", help="Filter by pool query from config.toml"),
+) -> None:
+    """List all running GPU resources across providers.
+
+    Queries RunPod and DigitalOcean APIs to show live state.
+
+    Examples:
+        wafer targets list
+        wafer targets list --provider runpod
+        wafer targets list --pool mi300x-rocm7
+    """
+    import trio
+    from wafer_core.targets.providers import get_all_cloud_providers, get_provider
+    from wafer_core.targets.types import Target, TargetProvider
+
+    async def _list() -> list[Target]:
+        all_targets: list[Target] = []
+
+        if provider:
+            prov = get_provider(provider)
+            all_targets = await prov.list_targets()
+        else:
+            providers = get_all_cloud_providers()
+
+            async def _fetch(prov_impl: TargetProvider, results: list[Target]) -> None:
+                try:
+                    targets = await prov_impl.list_targets()
+                    results.extend(targets)
+                except Exception as e:
+                    typer.echo(
+                        f"  Warning: failed to query {type(prov_impl).__name__}: {e}", err=True
+                    )
+
+            async with trio.open_nursery() as nursery:
+                for _, prov_impl in providers:
+                    nursery.start_soon(_fetch, prov_impl, all_targets)
+
+        return all_targets
+
+    all_targets = trio.run(_list)
+
+    # Hydrate targets with cached labels
+    from dataclasses import replace
+
+    from wafer_core.targets.state_cache import load_all_labels
+
+    cached_labels = load_all_labels()
+    all_targets = [
+        replace(t, labels=cached_labels[t.resource_id])
+        if t.resource_id in cached_labels
+        else t
+        for t in all_targets
+    ]
+
+    # Apply pool filter if specified
+    if pool:
+        from wafer_core.targets.pool import load_pool_query, match_targets
+
+        try:
+            query = load_pool_query(pool)
+        except KeyError as e:
+            typer.echo(str(e), err=True)
+            raise typer.Exit(1) from None
+
+        all_targets = match_targets(query, all_targets)
+        typer.echo(f"Pool {pool!r}: {len(all_targets)} matching target(s)\n")
+
+    if not all_targets:
+        typer.echo("No running resources found.")
+        return
+
+    typer.echo(f"{len(all_targets)} resource(s):\n")
+    for target in all_targets:
+        _print_target(target)
+
+
+def _print_target(target: Target) -> None:
+    """Print a single target's info."""
+    ssh_info = ""
+    if target.public_ip and target.ssh_port:
+        ssh_info = f" ssh={target.ssh_username}@{target.public_ip}:{target.ssh_port}"
+
+    name_part = f" name={target.name}" if target.name else ""
+    spec_part = f" spec={target.spec_name}" if target.spec_name else ""
+    price_part = f" ${target.price_per_hour:.2f}/hr" if target.price_per_hour else ""
+
+    # Show interesting labels (skip 'image' — too long)
+    label_keys = sorted(k for k in target.labels if k != "image")
+    labels_part = ""
+    if label_keys:
+        labels_part = " " + " ".join(f"{k}={target.labels[k]}" for k in label_keys)
+
+    typer.echo(
+        f"  {target.resource_id} [{target.provider}] "
+        f"status={target.status} gpu={target.gpu_type}"
+        f"{spec_part}{name_part}{ssh_info}{price_part}{labels_part}"
+    )
+    typer.echo()
+
+
+@targets_live_app.command("terminate")
+def targets_terminate(
+    resource_id: str | None = typer.Argument(None, help="Resource ID to terminate"),
+    pool_name: str | None = typer.Option(
+        None, "--pool", help="Terminate all targets matching a pool query"
+    ),
+    provider_name: str | None = typer.Option(
+        None, "--provider", "-p", help="Provider hint (avoids querying all providers)"
+    ),
+    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
+) -> None:
+    """Terminate a running resource by ID, or all targets matching a pool query.
+
+    Examples:
+        wafer targets terminate tkru24z7npcgth
+        wafer targets terminate --pool mi300x --yes
+        wafer targets terminate --pool runpod-only --provider runpod
+    """
+    import trio
+    from wafer_core.targets.providers import get_all_cloud_providers, get_provider
+    from wafer_core.targets.state_cache import remove_binding
+
+    if pool_name:
+        _terminate_pool(pool_name, provider_name, yes)
+        return
+
+    if not resource_id:
+        typer.echo("Provide a resource ID or use --pool <name>.", err=True)
+        raise typer.Exit(1)
+
+    async def _terminate() -> bool:
+        if provider_name:
+            prov = get_provider(provider_name)
+            return await prov.terminate(resource_id)
+
+        for name, prov in get_all_cloud_providers():
+            target = await prov.get_target(resource_id)
+            if target is not None:
+                success = await prov.terminate(resource_id)
+                if success:
+                    remove_binding(resource_id)
+                    typer.echo(f"Terminated {resource_id} ({name})")
+                return success
+
+        typer.echo(f"Resource {resource_id} not found on any provider.", err=True)
+        return False
+
+    success = trio.run(_terminate)
+    if not success:
+        raise typer.Exit(1)
+
+
+def _terminate_pool(pool_name: str, provider_name: str | None, yes: bool) -> None:
+    """Terminate all targets matching a pool query."""
+    import trio
+    from wafer_core.targets.pool import load_pool_query, match_targets
+    from wafer_core.targets.providers import get_all_cloud_providers, get_provider
+    from wafer_core.targets.state_cache import remove_binding
+    from wafer_core.targets.types import Target
+
+    try:
+        query = load_pool_query(pool_name)
+    except KeyError as e:
+        typer.echo(str(e), err=True)
+        raise typer.Exit(1) from None
+
+    async def _do_terminate() -> int:
+        all_targets: list[Target] = []
+        if provider_name:
+            prov = get_provider(provider_name)
+            all_targets = await prov.list_targets()
+        else:
+            for _, prov in get_all_cloud_providers():
+                try:
+                    all_targets.extend(await prov.list_targets())
+                except Exception:
+                    pass
+
+        matched = match_targets(query, all_targets)
+
+        if not matched:
+            typer.echo(f"No targets match pool {pool_name!r}.")
+            return 0
+
+        typer.echo(f"Found {len(matched)} target(s) matching pool {pool_name!r}:")
+        for t in matched:
+            name_part = f" name={t.name}" if t.name else ""
+            typer.echo(f"  {t.resource_id} [{t.provider}] gpu={t.gpu_type}{name_part}")
+
+        if not yes:
+            confirm = typer.confirm("Terminate all?")
+            if not confirm:
+                return 0
+
+        count = 0
+        for t in matched:
+            prov = get_provider(t.provider)
+            if await prov.terminate(t.resource_id):
+                remove_binding(t.resource_id)
+                typer.echo(f"  Terminated {t.resource_id}")
+                count += 1
+            else:
+                typer.echo(f"  Failed to terminate {t.resource_id}", err=True)
+
+        return count
+
+    count = trio.run(_do_terminate)
+    typer.echo(f"\nTerminated {count} resource(s).")
+
+
+@targets_live_app.command("reconcile")
+def targets_reconcile() -> None:
+    """Refresh local binding cache from provider APIs.
+
+    Queries all cloud providers, matches resources to specs, and updates
+    the local state cache. Reports any drift.
+
+    Example:
+        wafer targets reconcile
+    """
+    import trio
+    from wafer_core.targets.providers import get_all_cloud_providers
+    from wafer_core.targets.reconcile import reconcile
+    from wafer_core.targets.spec_store import load_all_specs
+    from wafer_core.targets.state_cache import (
+        BindingEntry,
+        get_binding_hints,
+        save_bindings,
+    )
+    from wafer_core.targets.types import Target
+
+    async def _sync() -> None:
+        specs = load_all_specs()
+
+        all_targets: list[Target] = []
+        for name, prov in get_all_cloud_providers():
+            typer.echo(f"Querying {name}...")
+            try:
+                targets = await prov.list_targets()
+                typer.echo(f"  Found {len(targets)} resource(s)")
+                all_targets.extend(targets)
+            except Exception as e:
+                typer.echo(f"  Failed: {e}", err=True)
+
+        hints = get_binding_hints()
+        result = reconcile(specs, all_targets, binding_hints=hints)
+
+        # Update binding cache with bound results
+        new_bindings = {}
+        now = datetime.now(UTC).isoformat()
+        for spec, target in result.bound:
+            new_bindings[target.resource_id] = BindingEntry(
+                spec_name=spec.name,
+                provider=target.provider,
+                bound_at=now,
+            )
+        save_bindings(new_bindings)
+
+        typer.echo("\nSync complete:")
+        typer.echo(f"  Total resources: {len(all_targets)}")
+        typer.echo(f"  Matched to specs: {len(result.bound)}")
+        typer.echo(f"  No matching spec: {len(result.unbound)}")
+
+    trio.run(_sync)
+
+
+@targets_live_app.command("provision")
+def targets_provision(
+    spec_name: str = typer.Argument(..., help="Spec name to provision from"),
+) -> None:
+    """Explicitly provision a resource from a spec.
+
+    Creates a new cloud resource and binds it to the spec.
+
+    Example:
+        wafer targets provision runpod-mi300x
+    """
+    import trio
+    from wafer_core.targets.providers import get_provider
+    from wafer_core.targets.spec_store import load_spec
+    from wafer_core.targets.state_cache import BindingEntry, add_binding
+    from wafer_core.utils.kernel_utils.targets.config import (
+        DigitalOceanTarget,
+        RunPodTarget,
+    )
+
+    try:
+        spec = load_spec(spec_name)
+    except FileNotFoundError:
+        typer.echo(f"Spec not found: {spec_name}", err=True)
+        raise typer.Exit(1) from None
+
+    if isinstance(spec, RunPodTarget):
+        provider_name = "runpod"
+    elif isinstance(spec, DigitalOceanTarget):
+        provider_name = "digitalocean"
+    else:
+        typer.echo(f"Spec type {type(spec).__name__} cannot be provisioned.", err=True)
+        raise typer.Exit(1) from None
+
+    async def _provision() -> None:
+        from wafer_core.targets.probe import probe_target_labels
+        from wafer_core.targets.state_cache import save_labels
+
+        prov = get_provider(provider_name)
+        typer.echo(f"Provisioning {spec_name} via {provider_name}...")
+        target = await prov.provision(spec)
+
+        # Cache the binding
+        add_binding(
+            target.resource_id,
+            BindingEntry(
+                spec_name=spec_name,
+                provider=provider_name,
+                bound_at=datetime.now(UTC).isoformat(),
+            ),
+        )
+
+        typer.echo(f"\nProvisioned: {target.resource_id}")
+        if target.public_ip:
+            typer.echo(f"  SSH: {target.ssh_username}@{target.public_ip}:{target.ssh_port}")
+
+        # Probe software labels (sync — runs subprocess ssh)
+        if target.public_ip and target.ssh_port:
+            typer.echo("  Probing software versions...")
+            try:
+                ssh_key = spec.ssh_key if hasattr(spec, "ssh_key") else None
+                labels = probe_target_labels(
+                    host=target.public_ip,
+                    port=target.ssh_port,
+                    username=target.ssh_username,
+                    ssh_key_path=ssh_key,
+                )
+                save_labels(target.resource_id, labels)
+                if labels:
+                    typer.echo(f"  Labels: {' '.join(f'{k}={v}' for k, v in sorted(labels.items()))}")
+            except Exception as e:
+                typer.echo(f"  Warning: probe failed: {e}", err=True)
+
+    trio.run(_provision)
+
+
+@targets_live_app.command("pools")
+def targets_pools() -> None:
+    """List configured pool queries from config.toml.
+
+    Example:
+        wafer targets pools
+    """
+    from wafer_core.targets.pool import list_pool_names, load_pool_query
+
+    names = list_pool_names()
+    if not names:
+        typer.echo("No pools configured in ~/.wafer/config.toml.")
+        typer.echo("\nAdd a pool:\n")
+        typer.echo("  [pools.mi300x]")
+        typer.echo('  gpu_type = "MI300X"')
+        typer.echo("")
+        typer.echo("  [pools.mi300x-rocm7]")
+        typer.echo('  gpu_type = "MI300X"')
+        typer.echo("  [pools.mi300x-rocm7.labels]")
+        typer.echo('  rocm_version = "7.0.2"')
+        return
+
+    typer.echo(f"{len(names)} pool(s):\n")
+    for name in names:
+        query = load_pool_query(name)
+        parts = []
+        if query.gpu_type:
+            parts.append(f"gpu_type={query.gpu_type}")
+        if query.provider:
+            parts.append(f"provider={query.provider}")
+        if query.status and query.status != "running":
+            parts.append(f"status={query.status}")
+        for k, v in sorted(query.labels.items()):
+            parts.append(f"{k}={v}")
+        criteria = " ".join(parts) if parts else "(match all)"
+        typer.echo(f"  {name}: {criteria}")
+
+
+@targets_live_app.command("probe")
+def targets_probe(
+    resource_id: str = typer.Argument(..., help="Resource ID to probe"),
+    provider_name: str | None = typer.Option(
+        None, "--provider", "-p", help="Provider hint (avoids querying all providers)"
+    ),
+) -> None:
+    """Probe a running target's software versions via SSH.
+
+    Results are cached in ~/.wafer/target_state.json and shown
+    by wafer targets list. Used for targets not provisioned by wafer
+    (e.g. dashboard-created pods).
+
+    Examples:
+        wafer targets probe ewfo5ckpxlg7y2
+        wafer targets probe 543538453 --provider digitalocean
+    """
+    import trio
+    from wafer_core.targets.probe import probe_target_labels
+    from wafer_core.targets.providers import get_all_cloud_providers, get_provider
+    from wafer_core.targets.state_cache import save_labels
+
+    # Find the target (async — needs provider API)
+    async def _find_target():
+        if provider_name:
+            prov = get_provider(provider_name)
+            return await prov.get_target(resource_id)
+
+        for _, prov in get_all_cloud_providers():
+            target = await prov.get_target(resource_id)
+            if target is not None:
+                return target
+        return None
+
+    target = trio.run(_find_target)
+
+    if target is None:
+        typer.echo(f"Resource {resource_id} not found.", err=True)
+        raise typer.Exit(1)
+
+    if not target.public_ip or not target.ssh_port:
+        typer.echo(f"Resource {resource_id} has no SSH info (status={target.status}).", err=True)
+        raise typer.Exit(1)
+
+    typer.echo(f"Probing {resource_id} ({target.ssh_username}@{target.public_ip}:{target.ssh_port})...")
+
+    labels = probe_target_labels(
+        host=target.public_ip,
+        port=target.ssh_port,
+        username=target.ssh_username,
+    )
+
+    save_labels(resource_id, labels)
+
+    if labels:
+        typer.echo(f"Labels cached for {resource_id}:")
+        for k, v in sorted(labels.items()):
+            typer.echo(f"  {k}={v}")
+    else:
+        typer.echo("Probe returned no labels.")
wafer/targets_ops.py CHANGED
@@ -15,6 +15,7 @@ import logging
 import subprocess
 from collections.abc import Callable
 from dataclasses import dataclass, replace
+from datetime import UTC
 from pathlib import Path
 from typing import TYPE_CHECKING
 
@@ -30,6 +31,26 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 
+def _update_binding_cache(resource_id: str, spec_name: str, provider: str) -> None:
+    """Update the new target state cache when provisioning through the legacy path.
+
+    This bridges the old per-provider state files with the new unified cache
+    so that `wafer targets list` can see resources provisioned via the old flow.
+    """
+    from datetime import datetime
+
+    from wafer_core.targets.state_cache import BindingEntry, add_binding
+
+    add_binding(
+        resource_id,
+        BindingEntry(
+            spec_name=spec_name,
+            provider=provider,
+            bound_at=datetime.now(UTC).isoformat(),
+        ),
+    )
+
+
 @dataclass(frozen=True)
 class TargetSSHInfo:
     """SSH connection info for a target."""
@@ -135,7 +156,8 @@ async def _get_runpod_ssh_info(target: RunPodTarget) -> TargetSSHInfo:
     # Check if pod already exists and is running
     existing = get_pod_state(target.name)
     if existing and await check_pod_running(existing.pod_id):
-        # Reuse existing pod
+        # Reuse existing pod — also update the new state cache
+        _update_binding_cache(existing.pod_id, target.name, "runpod")
         return TargetSSHInfo(
             host=existing.public_ip,
             port=existing.ssh_port,
@@ -151,6 +173,8 @@ async def _get_runpod_ssh_info(target: RunPodTarget) -> TargetSSHInfo:
     target_keep_alive = replace(target, keep_alive=True)
 
     async with runpod_ssh_context(target_keep_alive) as ssh_info:
+        # Update new state cache with provisioned pod
+        _update_binding_cache(ssh_info.pod_id, target.name, "runpod")
         return TargetSSHInfo(
             host=ssh_info.host,
             port=ssh_info.port,
@@ -172,7 +196,8 @@ async def _get_digitalocean_ssh_info(target: DigitalOceanTarget) -> TargetSSHInfo:
     # Check if droplet already exists and is running
     existing = get_droplet_state(target.name)
     if existing and await check_droplet_running(existing.droplet_id):
-        # Reuse existing droplet
+        # Reuse existing droplet — also update the new state cache
+        _update_binding_cache(existing.droplet_id, target.name, "digitalocean")
         return TargetSSHInfo(
             host=existing.public_ip,
             port=22,  # DigitalOcean uses standard SSH port
@@ -184,6 +209,8 @@ async def _get_digitalocean_ssh_info(target: DigitalOceanTarget) -> TargetSSHInf
     target_keep_alive = replace(target, keep_alive=True)
 
     async with digitalocean_ssh_context(target_keep_alive) as ssh_info:
+        # Update new state cache with provisioned droplet
+        _update_binding_cache(ssh_info.droplet_id, target.name, "digitalocean")
         return TargetSSHInfo(
             host=ssh_info.host,
             port=ssh_info.port,
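Taken together, these hooks mean pods and droplets reused or provisioned through the legacy per-provider flow now show up in `wafer targets list`. A sketch of the kind of entry `_update_binding_cache` records, assuming a JSON-backed cache (the real layout belongs to `wafer_core.targets.state_cache` and may differ):

    # Assumed JSON shape for the unified binding cache; illustration only.
    import json
    from datetime import UTC, datetime

    cache = {
        "tkru24z7npcgth": {                # resource_id reported by the provider
            "spec_name": "runpod-mi300x",  # spec this resource is bound to
            "provider": "runpod",
            "bound_at": datetime.now(UTC).isoformat(),
        }
    }
    print(json.dumps(cache, indent=2))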
wafer_cli-0.2.28.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wafer-cli
-Version: 0.2.26
+Version: 0.2.28
 Summary: CLI for running GPU workloads, managing remote workspaces, and evaluating/optimizing kernels
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
wafer_cli-0.2.28.dist-info/RECORD CHANGED
@@ -6,7 +6,7 @@ wafer/api_client.py,sha256=i_Az2b2llC3DSW8yOL-BKqa7LSKuxOr8hSN40s-oQXY,6313
 wafer/auth.py,sha256=dwss_se5P-FFc9IN38q4kh_dBrA6k-CguDBkivgcdj0,14003
 wafer/autotuner.py,sha256=41WYP41pTDvMijv2h42vm89bcHtDMJXObDlWmn6xpFU,44416
 wafer/billing.py,sha256=hEEwtrtIsbPQ3lLJNcyTLMsapUbcuvcVW_e9_0SxzVo,7199
-wafer/cli.py,sha256=s3m6SJzK1vRJxaQCrd_I4rcxrt3skty0GBdFHzIBc6U,279424
+wafer/cli.py,sha256=zuVZhPdML5AOBtLUqLwAwjl8XMNe9EwQkffZxtBGLx4,282748
 wafer/cli_instructions.py,sha256=bziUKDNDAXABVMvKPLEMXm-hFSD2TcFSh-FKRYa949k,4693
 wafer/config.py,sha256=h5Eo9_yfWqWGoPNdVQikI9GoZVUeysunSYiixf1mKcw,3411
 wafer/corpus.py,sha256=CY9T7wXENNDJxnrtI-XsQmXeptrFfKG4x-lngrc9_3s,24748
@@ -23,10 +23,12 @@ wafer/problems.py,sha256=ce2sy10A1nnNUG3VGsseTS8jL7LZsku4dE8zVf9JHQ4,11296
 wafer/rocprof_compute.py,sha256=n_yOGZaFbOXna_ghhmYWXeyUoSabgH4KkjlYq38DlHo,19888
 wafer/rocprof_sdk.py,sha256=0Q7Ye6dUfa1anFZbqKc21rItgqva8V8VIZoSB7wqbmA,10085
 wafer/rocprof_systems.py,sha256=4IWbMcbYk1x_8iS7P3FC_u5sgH6EXADCtR2lV9id80M,18629
+wafer/specs_cli.py,sha256=frMEKwMflxVNpFlAuxprmr33ZZ1Oeh2lB0KWZ4oZWzw,4360
 wafer/ssh_keys.py,sha256=MxiHlSm6wuDUFzkOQtx5K7OIbx_a6bXxE-m8OpwLx98,8130
 wafer/target_lock.py,sha256=SDKhNzv2N7gsphGflcNni9FE5YYuAMuEthngAJEo4Gs,7809
 wafer/targets.py,sha256=9r-iRWoKSH5cQl1LcamaX-T7cNVOg99ngIm_hlRk-qU,26922
-wafer/targets_ops.py,sha256=jN1oIBx0mutxRNE9xpIc7SaBxPkVmOyus2eqn0kEKNI,21475
+wafer/targets_cli.py,sha256=Oe3e02rSXeNrMbe_Qv9DNfQ8dEOKodtU7BbQQWxlNwA,16348
+wafer/targets_ops.py,sha256=wLPyq55H_wz0wEAEg8KFLYs9LIIyiVIphcsXD2NLa-E,22623
 wafer/trace_compare.py,sha256=COuxxKY874DteOSLUvJuJFREPMBSybq9dtANi3ATsg4,10803
 wafer/tracelens.py,sha256=g9ZIeFyNojZn4uTd3skPqIrRiL7aMJOz_-GOd3aiyy4,7998
 wafer/wevin_cli.py,sha256=eo1ETsXIsCftXSG5AxEYYZipNGcXayKyIevs5F6MjXg,26140
@@ -38,8 +40,8 @@ wafer/templates/optimize_kernel.py,sha256=4-MaKm_C9BQHQEllrNLLYkcdhJpcj6D-8zbJ4F
 wafer/templates/optimize_kernelbench.py,sha256=T3co9Y9eSLWDrZG66gwQVFMdnGVoyUQos-TxnMMBLL8,3747
 wafer/templates/trace_analyze.py,sha256=B7CiRlsokERzBjLL-k49kGjpU2zlJZqzTE05xbRS1WI,2878
 wafer/tests/test_eval_cli_parity.py,sha256=SGmaj2NGBZ7GdDF53bXsECvQbV21iHZw8YeL_MJOLk0,7206
-wafer_cli-0.2.26.dist-info/METADATA,sha256=IM8Eatar1KYIBo1hHEBjvpX6J272f0PWfV4mwhV1jIY,2799
-wafer_cli-0.2.26.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-wafer_cli-0.2.26.dist-info/entry_points.txt,sha256=WqB7hB__WhtPY8y1cO2sZiUz7fCq6Ik-usAigpeFvWE,41
-wafer_cli-0.2.26.dist-info/top_level.txt,sha256=2MK1IVMWfpLL8BZCQ3E9aG6L6L666gSA_teYlwan4fs,6
-wafer_cli-0.2.26.dist-info/RECORD,,
+wafer_cli-0.2.28.dist-info/METADATA,sha256=2AsfmsBcnVfG8UTxileEvOjpEYOO1OF5YcpXpm9Mf2w,2799
+wafer_cli-0.2.28.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+wafer_cli-0.2.28.dist-info/entry_points.txt,sha256=WqB7hB__WhtPY8y1cO2sZiUz7fCq6Ik-usAigpeFvWE,41
+wafer_cli-0.2.28.dist-info/top_level.txt,sha256=2MK1IVMWfpLL8BZCQ3E9aG6L6L666gSA_teYlwan4fs,6
+wafer_cli-0.2.28.dist-info/RECORD,,