wafer-core 0.1.27__py3-none-any.whl → 0.1.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer_core/lib/trace_compare/aligner.py +13 -6
- wafer_core/lib/trace_compare/analyzer.py +12 -3
- wafer_core/lib/trace_compare/fusion_analyzer.py +392 -284
- wafer_core/targets/__init__.py +47 -21
- wafer_core/targets/pool.py +181 -0
- wafer_core/targets/probe.py +113 -0
- wafer_core/targets/providers/__init__.py +46 -0
- wafer_core/targets/providers/baremetal.py +72 -0
- wafer_core/targets/providers/digitalocean.py +164 -0
- wafer_core/targets/providers/runpod.py +250 -0
- wafer_core/targets/reconcile.py +90 -0
- wafer_core/targets/spec_store.py +200 -0
- wafer_core/targets/state_cache.py +150 -0
- wafer_core/targets/types.py +141 -0
- wafer_core/utils/kernel_utils/targets/config.py +8 -24
- {wafer_core-0.1.27.dist-info → wafer_core-0.1.28.dist-info}/METADATA +1 -1
- {wafer_core-0.1.27.dist-info → wafer_core-0.1.28.dist-info}/RECORD +18 -8
- {wafer_core-0.1.27.dist-info → wafer_core-0.1.28.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Target and TargetSpec: the two core concepts for GPU resource management.
|
|
2
|
+
|
|
3
|
+
TargetSpec = provisioning blueprint (TOML config, "how to get a GPU")
|
|
4
|
+
Target = live running resource (from provider API, "what's actually running")
|
|
5
|
+
|
|
6
|
+
TargetSpec is the existing union of provider-specific frozen dataclasses
|
|
7
|
+
(RunPodTarget, DigitalOceanTarget, BaremetalTarget, etc.), re-exported here
|
|
8
|
+
under the name TargetSpec for clarity.
|
|
9
|
+
|
|
10
|
+
Target is always fetched from provider APIs. The spec_name field links a
|
|
11
|
+
live resource back to the spec that created it (None = orphan/unbound).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from typing import TYPE_CHECKING, Protocol, runtime_checkable
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
pass
|
|
21
|
+
|
|
22
|
+
# TargetSpec is the existing union type, re-exported under a clearer name.
|
|
23
|
+
# Each variant is a frozen dataclass with provider-specific provisioning params.
|
|
24
|
+
from wafer_core.utils.kernel_utils.targets.config import ( # noqa: E402
|
|
25
|
+
TargetConfig,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
# TargetSpec = TargetConfig (same union, better name for the new API)
|
|
29
|
+
TargetSpec = TargetConfig
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
|
|
33
|
+
class Target:
|
|
34
|
+
"""A live running GPU resource, fetched from a provider API.
|
|
35
|
+
|
|
36
|
+
This is the runtime counterpart to TargetSpec. A TargetSpec describes
|
|
37
|
+
*how* to provision a GPU; a Target describes *what's actually running*.
|
|
38
|
+
|
|
39
|
+
The provider API is the source of truth for Target state. Local caches
|
|
40
|
+
(target_state.json) are performance hints only.
|
|
41
|
+
|
|
42
|
+
Fields:
|
|
43
|
+
resource_id: Provider's unique ID (pod_id, droplet_id, or
|
|
44
|
+
"baremetal:{host}:{port}" for SSH targets with no cloud lifecycle).
|
|
45
|
+
provider: Which cloud provider owns this resource.
|
|
46
|
+
status: Current state from provider API.
|
|
47
|
+
public_ip: SSH-reachable IP address (None if not yet assigned).
|
|
48
|
+
ssh_port: SSH port (None if not yet assigned).
|
|
49
|
+
ssh_username: SSH user (typically "root" for cloud providers).
|
|
50
|
+
gpu_type: GPU model name (e.g., "MI300X", "B200").
|
|
51
|
+
name: Provider-side resource name (e.g., "wafer-runpod-mi300x-1706000000",
|
|
52
|
+
"kernelbench-pool-0"). Used for spec_name inference.
|
|
53
|
+
created_at: ISO timestamp of resource creation (None if unknown).
|
|
54
|
+
spec_name: Name of the TargetSpec that owns this resource.
|
|
55
|
+
None means unbound (orphan) — running but no spec claims it.
|
|
56
|
+
price_per_hour: Cost in $/hr (None if unknown or baremetal).
|
|
57
|
+
labels: Software metadata not available from the provider API's
|
|
58
|
+
structured fields. Examples: {"rocm_version": "7.0.2",
|
|
59
|
+
"cuda_version": "12.4", "image": "rocm/pytorch:rocm7.0.2_..."}.
|
|
60
|
+
Populated from the container image string at provision time,
|
|
61
|
+
or from SSH probe on demand. Pool queries filter on these.
|
|
62
|
+
"""
|
|
63
|
+
|
|
64
|
+
resource_id: str
|
|
65
|
+
provider: str
|
|
66
|
+
status: str
|
|
67
|
+
public_ip: str | None
|
|
68
|
+
ssh_port: int | None
|
|
69
|
+
ssh_username: str
|
|
70
|
+
gpu_type: str
|
|
71
|
+
name: str | None = None
|
|
72
|
+
created_at: str | None = None
|
|
73
|
+
spec_name: str | None = None
|
|
74
|
+
price_per_hour: float | None = None
|
|
75
|
+
labels: dict[str, str] = field(default_factory=dict)
|
|
76
|
+
|
|
77
|
+
def __post_init__(self) -> None:
|
|
78
|
+
assert self.resource_id, "resource_id cannot be empty"
|
|
79
|
+
assert self.provider, "provider cannot be empty"
|
|
80
|
+
assert self.status, "status cannot be empty"
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass(frozen=True)
|
|
84
|
+
class ReconcileResult:
|
|
85
|
+
"""Result of comparing TargetSpecs to live Targets.
|
|
86
|
+
|
|
87
|
+
Pure data — no side effects. The caller decides what to do:
|
|
88
|
+
- Display bound/unbound/unprovisioned in CLI
|
|
89
|
+
- Terminate unbound targets
|
|
90
|
+
- Provision from unprovisioned specs
|
|
91
|
+
|
|
92
|
+
Fields:
|
|
93
|
+
bound: Specs matched to live targets (spec, target) pairs.
|
|
94
|
+
unbound: Live targets with no matching spec (orphans).
|
|
95
|
+
unprovisioned: Specs with no live target running.
|
|
96
|
+
"""
|
|
97
|
+
|
|
98
|
+
bound: list[tuple[TargetSpec, Target]]
|
|
99
|
+
unbound: list[Target]
|
|
100
|
+
unprovisioned: list[TargetSpec]
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@runtime_checkable
|
|
104
|
+
class TargetProvider(Protocol):
|
|
105
|
+
"""Interface for querying and managing live GPU resources from a cloud provider.
|
|
106
|
+
|
|
107
|
+
Each cloud provider (RunPod, DigitalOcean, etc.) implements this protocol.
|
|
108
|
+
Methods are async because they hit external APIs.
|
|
109
|
+
|
|
110
|
+
Baremetal is a degenerate case: list_targets returns a Target built from
|
|
111
|
+
the spec's ssh_target, provision/terminate are no-ops.
|
|
112
|
+
"""
|
|
113
|
+
|
|
114
|
+
async def list_targets(self) -> list[Target]:
|
|
115
|
+
"""List all running resources on the provider account.
|
|
116
|
+
|
|
117
|
+
Always hits the provider API — never reads from local cache.
|
|
118
|
+
"""
|
|
119
|
+
...
|
|
120
|
+
|
|
121
|
+
async def get_target(self, resource_id: str) -> Target | None:
|
|
122
|
+
"""Get a specific resource by provider ID.
|
|
123
|
+
|
|
124
|
+
Returns None if the resource doesn't exist or is terminated.
|
|
125
|
+
"""
|
|
126
|
+
...
|
|
127
|
+
|
|
128
|
+
async def provision(self, spec: TargetSpec) -> Target:
|
|
129
|
+
"""Provision a new resource from a spec.
|
|
130
|
+
|
|
131
|
+
Blocks until the resource is SSH-ready.
|
|
132
|
+
Raises on failure (no silent None returns).
|
|
133
|
+
"""
|
|
134
|
+
...
|
|
135
|
+
|
|
136
|
+
async def terminate(self, resource_id: str) -> bool:
|
|
137
|
+
"""Terminate a resource by provider ID.
|
|
138
|
+
|
|
139
|
+
Returns True if terminated, False if resource not found.
|
|
140
|
+
"""
|
|
141
|
+
...
|
|
@@ -346,25 +346,17 @@ class RunPodTarget:
|
|
|
346
346
|
ncu_available: bool = False
|
|
347
347
|
|
|
348
348
|
def __post_init__(self) -> None:
|
|
349
|
-
"""Validate configuration.
|
|
350
|
-
from wafer_core.auth import get_api_key
|
|
349
|
+
"""Validate configuration fields.
|
|
351
350
|
|
|
351
|
+
API key availability is checked at provision/query time, not here —
|
|
352
|
+
loading a spec from TOML should not require credentials.
|
|
353
|
+
"""
|
|
352
354
|
assert self.name, "name cannot be empty"
|
|
353
355
|
assert self.ssh_key, "ssh_key cannot be empty"
|
|
354
356
|
assert self.gpu_count > 0, "gpu_count must be positive"
|
|
355
357
|
assert self.provision_timeout > 0, "provision_timeout must be positive"
|
|
356
358
|
assert self.eval_timeout > 0, "eval_timeout must be positive"
|
|
357
359
|
|
|
358
|
-
# Check for API key (env var or ~/.wafer/auth.json)
|
|
359
|
-
api_key = get_api_key("runpod")
|
|
360
|
-
if not api_key:
|
|
361
|
-
raise ValueError(
|
|
362
|
-
"RunPod API key not found.\n"
|
|
363
|
-
"Set WAFER_RUNPOD_API_KEY environment variable, or run:\n"
|
|
364
|
-
" wafer auth login runpod\n"
|
|
365
|
-
"Get your API key from: https://runpod.io/console/user/settings"
|
|
366
|
-
)
|
|
367
|
-
|
|
368
360
|
|
|
369
361
|
@dataclass(frozen=True)
|
|
370
362
|
class LocalTarget:
|
|
@@ -468,24 +460,16 @@ class DigitalOceanTarget:
|
|
|
468
460
|
ncu_available: bool = False
|
|
469
461
|
|
|
470
462
|
def __post_init__(self) -> None:
|
|
471
|
-
"""Validate configuration.
|
|
472
|
-
from wafer_core.auth import get_api_key
|
|
463
|
+
"""Validate configuration fields.
|
|
473
464
|
|
|
465
|
+
API key availability is checked at provision/query time, not here —
|
|
466
|
+
loading a spec from TOML should not require credentials.
|
|
467
|
+
"""
|
|
474
468
|
assert self.name, "name cannot be empty"
|
|
475
469
|
assert self.ssh_key, "ssh_key cannot be empty"
|
|
476
470
|
assert self.provision_timeout > 0, "provision_timeout must be positive"
|
|
477
471
|
assert self.eval_timeout > 0, "eval_timeout must be positive"
|
|
478
472
|
|
|
479
|
-
# Check for API key (env var or ~/.wafer/auth.json)
|
|
480
|
-
api_key = get_api_key("digitalocean")
|
|
481
|
-
if not api_key:
|
|
482
|
-
raise ValueError(
|
|
483
|
-
"DigitalOcean API key not found.\n"
|
|
484
|
-
"Set WAFER_AMD_DIGITALOCEAN_API_KEY environment variable, or run:\n"
|
|
485
|
-
" wafer auth login digitalocean\n"
|
|
486
|
-
"Get your API key from: https://cloud.digitalocean.com/account/api/tokens"
|
|
487
|
-
)
|
|
488
|
-
|
|
489
473
|
|
|
490
474
|
# Union type for target configs
|
|
491
475
|
TargetConfig = (
|
|
@@ -320,13 +320,13 @@ wafer_core/lib/rocprofiler/systems/sample/__init__.py,sha256=31rNmLPQ7OVhvlOEEOw
|
|
|
320
320
|
wafer_core/lib/rocprofiler/systems/sample/profiler.py,sha256=CYZPTzNXd48LoCfmY6h_5RSYEdWYccuv3-t4YncHJLE,7384
|
|
321
321
|
wafer_core/lib/trace_compare/PERFORMANCE.md,sha256=jkJh7ApZi8H7NKTcz8v0LNtwSFtIUqY88e3QbL749ww,3823
|
|
322
322
|
wafer_core/lib/trace_compare/__init__.py,sha256=CyUPbPQDYhVLCFFA7S_jNSilG3OgqYjmHSKfR5X11go,1377
|
|
323
|
-
wafer_core/lib/trace_compare/aligner.py,sha256=
|
|
324
|
-
wafer_core/lib/trace_compare/analyzer.py,sha256=
|
|
323
|
+
wafer_core/lib/trace_compare/aligner.py,sha256=1S8Ob3RaEsIjN0HdqEx0yGsW5uf_lMrJVSH_MnZhKok,13788
|
|
324
|
+
wafer_core/lib/trace_compare/analyzer.py,sha256=YkuOPA3HFX_7mNUEhE9CMOtEMGLQd12lvUkvqqeQF14,29698
|
|
325
325
|
wafer_core/lib/trace_compare/api.py,sha256=JSRTcd7eZK1Z8l18TFEiA5A8ENJS1TMz7oIiw1KBbAs,8796
|
|
326
326
|
wafer_core/lib/trace_compare/architecture.py,sha256=8bqlAJQeJLBHblyXvFV-w55PIKiVQDPjDQZ8Jx4tuGg,2110
|
|
327
327
|
wafer_core/lib/trace_compare/classifier.py,sha256=CDGzY9TY-I5wRuEGsu4mTCdljqVTOnLWyFLyNgmkGXI,16864
|
|
328
328
|
wafer_core/lib/trace_compare/formatter.py,sha256=GNrCZ45ueBN05CEXjOtTuKvTI8z-g-ZZFil-ni3sWVY,37962
|
|
329
|
-
wafer_core/lib/trace_compare/fusion_analyzer.py,sha256=
|
|
329
|
+
wafer_core/lib/trace_compare/fusion_analyzer.py,sha256=ZbFXUuPOt8ezT08WfjlDx7XaUNoUgg9hlFTJb68-eo0,17433
|
|
330
330
|
wafer_core/lib/trace_compare/kernel_registry.yaml,sha256=0-knXwsF3pR1x1JdIz-aWaH-5xDgTylh53E47Kf6nHo,9808
|
|
331
331
|
wafer_core/lib/trace_compare/layer_segmentation.py,sha256=kI_Y1e9nrKZfdwfcrGo4h7gpMxqXI_xkgXk46zuFen4,4642
|
|
332
332
|
wafer_core/lib/trace_compare/loader.py,sha256=zBHI0r7CX_wJ2mz0_-s0lm9KGSdaVaq7OKyxUL6KIlw,23997
|
|
@@ -597,9 +597,19 @@ wafer_core/sessions/__init__.py,sha256=Ybps5QclZShAELoW9bva4w6OCNrcBf8vd9nGDjYfQ
|
|
|
597
597
|
wafer_core/sessions/agent.py,sha256=4-Q-NG_xm07FFq7hB8mjxW38nt2_S0QpwCYkPOoGRxA,5946
|
|
598
598
|
wafer_core/sessions/dtypes.py,sha256=K6nOjvL6sjCGY7GTtdEygf1IZY_18R9YkHGqFyMd8wY,589
|
|
599
599
|
wafer_core/sessions/hooks.py,sha256=A-txm6ufnRGQCdtP3vwh7oEOdlLN9Tv0XsjORMihuAI,4295
|
|
600
|
-
wafer_core/targets/__init__.py,sha256=
|
|
600
|
+
wafer_core/targets/__init__.py,sha256=N4lTf9MjZ5dzAShObweZzyBfPMSzwjD5qBFWnM5lczM,2800
|
|
601
601
|
wafer_core/targets/digitalocean.py,sha256=cvoYpYjtSyy5t2lQAPi7ERruuuibronah_ivOiduAHQ,16550
|
|
602
|
+
wafer_core/targets/pool.py,sha256=TeNE9rpr67OsGtbxniYpr9Cb3wosnf_e3kTLBbwtDok,5434
|
|
603
|
+
wafer_core/targets/probe.py,sha256=rzF8tiq5GxkMR3jhryTOW0GMcoHtrN67wmHlGJuBTv8,3038
|
|
604
|
+
wafer_core/targets/reconcile.py,sha256=Hftd7LyqkcTOP0Qpa_cdYpxGW2I3bkSlkQrnYjU5lns,3091
|
|
602
605
|
wafer_core/targets/runpod.py,sha256=LrVmNvA6qjzL5nbGSWvtw7CHrK6bDu7_o3vKIek00Tc,20286
|
|
606
|
+
wafer_core/targets/spec_store.py,sha256=uNpMdo7ASeq7_RhgAqj8CFIK39rGEbaYtYtqt--FXO0,6455
|
|
607
|
+
wafer_core/targets/state_cache.py,sha256=oji4APL_tjOty_u0CJzHaP59jJAIJWQTjYvD4pCdQ3g,4479
|
|
608
|
+
wafer_core/targets/types.py,sha256=MQ7ECcBAwSoWsJfGxycJoLBeoTXSYtGeXEg5ZNxfs4c,5217
|
|
609
|
+
wafer_core/targets/providers/__init__.py,sha256=u6OCCgyPRymrnZmIYPLF0hdkr6aTCF301K9gSgcFWvc,1355
|
|
610
|
+
wafer_core/targets/providers/baremetal.py,sha256=L0KAiTkRH_fQvCbtaEa5wlJBqsvNaY56Zq6ovBhk2YY,2452
|
|
611
|
+
wafer_core/targets/providers/digitalocean.py,sha256=_TnGi9Otzsn2T_vSv40T_3HFLT559WS_ljGsrWr7j0s,5281
|
|
612
|
+
wafer_core/targets/providers/runpod.py,sha256=jCA7ENFRwbTKyToGa7fw2VS3coY61ggK1m0F17-rvng,7388
|
|
603
613
|
wafer_core/tools/__init__.py,sha256=wBQD45GdSfkxcT6NHzIv0IMeXCc0enwwkpm3T_9j1X8,3341
|
|
604
614
|
wafer_core/tools/bash_tool.py,sha256=daoKOVGSgL0x9X_3l8Apd6-wFH4VMXMGJwVemw2FIfc,16828
|
|
605
615
|
wafer_core/tools/glob_tool.py,sha256=9X5PdOjQJj7kiVNqqCZC0-1LmnE6wHx3Zc9zfMjtXdc,3533
|
|
@@ -679,7 +689,7 @@ wafer_core/utils/kernel_utils/static_checker.py,sha256=XIQkzAOkGH5xtrOuZM4tNUqVJ
|
|
|
679
689
|
wafer_core/utils/kernel_utils/task.py,sha256=XcmKxKUWh5It6nX3zGqj77tWgA32uPfQMqNOqyD5T48,2682
|
|
680
690
|
wafer_core/utils/kernel_utils/utils.py,sha256=uDZoJDxh07hJeLNlPdKN2vgB15pqIr1LbXf0YIBHU4E,43056
|
|
681
691
|
wafer_core/utils/kernel_utils/targets/__init__.py,sha256=4NwRLsuJ__S4xKAfda4Ag82C5MQ3Qio-4xA5S-mQGlU,2067
|
|
682
|
-
wafer_core/utils/kernel_utils/targets/config.py,sha256=
|
|
692
|
+
wafer_core/utils/kernel_utils/targets/config.py,sha256=DJPPyV7yGmyvS7cavdDENC5PQsia1dQeQYlWCTE7iUo,19975
|
|
683
693
|
wafer_core/utils/kernel_utils/targets/execution.py,sha256=bZuNXCo0sIdD6hFhetLPrtDC-zMSiIsAx_aml49VVL0,15033
|
|
684
694
|
wafer_core/utils/kernel_utils/targets/selection.py,sha256=5I_RG_7cfhq7uaeR28meC2EeNNKssFsK-Tc3QFG6Ze0,3590
|
|
685
695
|
wafer_core/utils/modal_execution/__init__.py,sha256=jkVqYOLzCT5K73N9Od0UIUsx-99A0m6bpDrxfyXxQZ8,945
|
|
@@ -687,6 +697,6 @@ wafer_core/utils/modal_execution/modal_app.py,sha256=VfS2cX8gHtnlPXemmMcEwDPeQdh
|
|
|
687
697
|
wafer_core/utils/modal_execution/modal_config.py,sha256=7cGX9TGqilQ3qxI3OFGXV5orjtyRU-PEDOJ4vP2oxno,4421
|
|
688
698
|
wafer_core/utils/modal_execution/modal_execution.py,sha256=gChjnV6jqA3A7IRP3DfvV5cSfm_MN0X4f7JZufXgdZE,24594
|
|
689
699
|
wafer_core/utils/modal_execution/test_modal.py,sha256=_jqou_hrLs1Daf1590Pnb0a_lXMMa2rczAPpW9HpoNQ,8153
|
|
690
|
-
wafer_core-0.1.
|
|
691
|
-
wafer_core-0.1.
|
|
692
|
-
wafer_core-0.1.
|
|
700
|
+
wafer_core-0.1.28.dist-info/METADATA,sha256=0x6opc3zOlxGhlZNJDVDY2LPnBZHYP5K4U0I6ZDl0Os,1477
|
|
701
|
+
wafer_core-0.1.28.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
702
|
+
wafer_core-0.1.28.dist-info/RECORD,,
|
|
File without changes
|