wafer-core 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer_core/lib/trace_compare/PERFORMANCE.md +148 -0
- wafer_core/lib/trace_compare/__init__.py +22 -9
- wafer_core/lib/trace_compare/aligner.py +376 -0
- wafer_core/lib/trace_compare/analyzer.py +558 -159
- wafer_core/lib/trace_compare/api.py +225 -0
- wafer_core/lib/trace_compare/architecture.py +77 -0
- wafer_core/lib/trace_compare/classifier.py +307 -13
- wafer_core/lib/trace_compare/fusion_analyzer.py +280 -706
- wafer_core/lib/trace_compare/kernel_registry.yaml +349 -0
- wafer_core/lib/trace_compare/layer_segmentation.py +114 -0
- wafer_core/lib/trace_compare/loader.py +526 -227
- wafer_core/lib/trace_compare/same_kernel_analyzer.py +119 -0
- wafer_core/lib/trace_compare/warnings.py +99 -0
- wafer_core/targets/__init__.py +47 -21
- wafer_core/targets/pool.py +181 -0
- wafer_core/targets/probe.py +113 -0
- wafer_core/targets/providers/__init__.py +46 -0
- wafer_core/targets/providers/baremetal.py +72 -0
- wafer_core/targets/providers/digitalocean.py +164 -0
- wafer_core/targets/providers/runpod.py +250 -0
- wafer_core/targets/reconcile.py +90 -0
- wafer_core/targets/spec_store.py +200 -0
- wafer_core/targets/state_cache.py +150 -0
- wafer_core/targets/types.py +141 -0
- wafer_core/utils/kernel_utils/targets/config.py +8 -24
- {wafer_core-0.1.26.dist-info → wafer_core-0.1.28.dist-info}/METADATA +3 -1
- {wafer_core-0.1.26.dist-info → wafer_core-0.1.28.dist-info}/RECORD +28 -10
- {wafer_core-0.1.26.dist-info → wafer_core-0.1.28.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""Spec store: CRUD for TargetSpec TOML files.
|
|
2
|
+
|
|
3
|
+
Specs live in ~/.wafer/specs/{name}.toml. On first access, auto-migrates
|
|
4
|
+
from the old ~/.wafer/targets/ directory if specs/ doesn't exist yet.
|
|
5
|
+
|
|
6
|
+
This module provides the same operations as the old targets.py but under
|
|
7
|
+
the "spec" vocabulary. The CLI-layer targets.py still works and delegates
|
|
8
|
+
here where needed.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import shutil
|
|
15
|
+
from dataclasses import asdict
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
import tomllib
|
|
20
|
+
|
|
21
|
+
from wafer_core.utils.kernel_utils.targets.config import (
|
|
22
|
+
BaremetalTarget,
|
|
23
|
+
DigitalOceanTarget,
|
|
24
|
+
LocalTarget,
|
|
25
|
+
ModalTarget,
|
|
26
|
+
RunPodTarget,
|
|
27
|
+
TargetConfig,
|
|
28
|
+
VMTarget,
|
|
29
|
+
WorkspaceTarget,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
# Root directory for wafer's per-user files.
WAFER_DIR = Path.home().joinpath(".wafer")
# Current home of spec TOML files.
SPECS_DIR = WAFER_DIR.joinpath("specs")
# Legacy spec location: migration source and read-only fallback.
OLD_TARGETS_DIR = WAFER_DIR.joinpath("targets")
CONFIG_FILE = WAFER_DIR.joinpath("config.toml")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _ensure_specs_dir() -> None:
    """Create ~/.wafer/specs/ on first use, seeding it from targets/ when possible.

    Does nothing when the specs directory already exists. Otherwise, if the
    legacy targets directory holds at least one ``*.toml`` file, the whole
    directory is copied to the new location; if not, an empty specs directory
    is created.
    """
    if SPECS_DIR.exists():
        return

    has_legacy_specs = OLD_TARGETS_DIR.exists() and any(OLD_TARGETS_DIR.glob("*.toml"))
    if not has_legacy_specs:
        SPECS_DIR.mkdir(parents=True, exist_ok=True)
        return

    logger.info(
        f"Migrating {OLD_TARGETS_DIR} -> {SPECS_DIR} (target configs are now called 'specs')"
    )
    # Copy rather than move: the old directory is left in place because
    # other code may still read from it.
    shutil.copytree(OLD_TARGETS_DIR, SPECS_DIR)
    logger.info(
        f"Migration complete. Old directory preserved at {OLD_TARGETS_DIR}. "
        "You can safely delete it once 'wafer specs list' works."
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _spec_path(name: str) -> Path:
    """Return the path of the TOML file for spec *name* inside SPECS_DIR."""
    filename = f"{name}.toml"
    return SPECS_DIR.joinpath(filename)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ── Parsing ──────────────────────────────────────────────────────────────────
|
|
65
|
+
|
|
66
|
+
# TOML ``type`` tag -> dataclass used to build a spec of that kind.
_TYPE_MAP: dict[str, type] = dict(
    baremetal=BaremetalTarget,
    vm=VMTarget,
    modal=ModalTarget,
    workspace=WorkspaceTarget,
    runpod=RunPodTarget,
    digitalocean=DigitalOceanTarget,
    local=LocalTarget,
)

# Inverse lookup (dataclass -> tag), used when serializing a spec.
_TYPE_REVERSE: dict[type, str] = {cls: tag for tag, cls in _TYPE_MAP.items()}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def parse_spec(data: dict[str, Any]) -> TargetConfig:
    """Build a spec dataclass from a parsed TOML table.

    Args:
        data: Mapping read from a spec file. Its ``"type"`` entry selects the
            dataclass; every other entry is passed to that class as a keyword
            argument.

    Returns:
        An instance of the dataclass registered for the given type tag.

    Raises:
        ValueError: If ``"type"`` is missing/empty or names an unknown type.
    """
    kind = data.get("type")
    if not kind:
        raise ValueError("Spec must have 'type' field")

    if kind not in _TYPE_MAP:
        known = ", ".join(sorted(_TYPE_MAP))
        raise ValueError(f"Unknown spec type: {kind}. Must be one of: {known}")
    spec_cls = _TYPE_MAP[kind]

    kwargs = {key: value for key, value in data.items() if key != "type"}

    # TOML yields lists; convert these two fields to tuples for the dataclasses.
    for key in ("pip_packages", "gpu_ids"):
        if isinstance(kwargs.get(key), list):
            kwargs[key] = tuple(kwargs[key])

    return spec_cls(**kwargs)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def serialize_spec(spec: TargetConfig) -> dict[str, Any]:
    """Turn a spec dataclass into a dict suitable for the TOML writer.

    The result holds the dataclass fields plus a ``"type"`` tag looked up from
    the spec's class (``"unknown"`` when the class is not registered). The
    ``pip_packages`` and ``gpu_ids`` tuples become lists, and an empty
    ``pip_packages`` entry is left out.
    """
    payload = asdict(spec)
    payload["type"] = _TYPE_REVERSE.get(type(spec), "unknown")

    # TOML arrays are written from lists, so convert tuple-valued fields.
    for field_name in ("pip_packages", "gpu_ids"):
        current = payload.get(field_name)
        if isinstance(current, tuple):
            payload[field_name] = list(current)

    # An empty package list carries no information; omit it from the file.
    if not payload.get("pip_packages", True):
        del payload["pip_packages"]

    return payload
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ── CRUD ─────────────────────────────────────────────────────────────────────
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def load_spec(name: str) -> TargetConfig:
    """Load the spec called *name*.

    Looks in ~/.wafer/specs/{name}.toml first and falls back to
    ~/.wafer/targets/{name}.toml for backwards compatibility.

    Raises:
        FileNotFoundError: If neither location contains the spec file.
    """
    _ensure_specs_dir()

    current = _spec_path(name)
    if current.exists():
        source = current
    else:
        # Fallback to old location
        legacy = OLD_TARGETS_DIR / f"{name}.toml"
        if not legacy.exists():
            raise FileNotFoundError(f"Spec not found: {name} (looked in {SPECS_DIR})")
        source = legacy

    with open(source, "rb") as handle:
        raw = tomllib.load(handle)

    return parse_spec(raw)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def save_spec(spec: TargetConfig) -> None:
    """Write *spec* to ~/.wafer/specs/{spec.name}.toml, replacing any existing file."""
    _ensure_specs_dir()

    payload = serialize_spec(spec)
    destination = _spec_path(spec.name)
    _write_toml(destination, payload)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def list_spec_names() -> list[str]:
    """Return the sorted names (file stems) of all ``*.toml`` files in ~/.wafer/specs/."""
    _ensure_specs_dir()
    names = [toml_file.stem for toml_file in SPECS_DIR.glob("*.toml")]
    names.sort()
    return names
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def remove_spec(name: str) -> None:
    """Delete the spec file for *name* from the specs directory.

    Raises:
        FileNotFoundError: If no spec file with that name exists.
    """
    target_file = _spec_path(name)
    if target_file.exists():
        target_file.unlink()
        return
    raise FileNotFoundError(f"Spec not found: {name}")
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def load_all_specs() -> list[TargetConfig]:
    """Load every spec returned by ``list_spec_names``.

    A spec that raises while loading is logged as a warning and skipped, so
    one broken file does not hide the others.
    """
    loaded = []
    for spec_name in list_spec_names():
        try:
            spec = load_spec(spec_name)
        except Exception as e:  # deliberate best-effort: report and continue
            logger.warning(f"Failed to load spec {spec_name}: {e}")
        else:
            loaded.append(spec)
    return loaded
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# ── TOML writer ──────────────────────────────────────────────────────────────
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
# Characters that have a short escape form inside a TOML basic string.
_TOML_ESCAPES = {
    "\\": "\\\\",
    '"': '\\"',
    "\b": "\\b",
    "\t": "\\t",
    "\n": "\\n",
    "\f": "\\f",
    "\r": "\\r",
}


def _toml_value(value: Any) -> str | None:
    """Render *value* as a TOML literal, or return None if it cannot be written.

    Supports bool, int, float, str, and (possibly nested) lists/tuples of
    those. Strings are emitted as escaped basic strings so quotes,
    backslashes and control characters cannot corrupt the file.
    """
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(value, bool):
        return "true" if value else "false"
    if isinstance(value, (int, float)):
        return str(value)
    if isinstance(value, str):
        pieces = []
        for ch in value:
            if ch in _TOML_ESCAPES:
                pieces.append(_TOML_ESCAPES[ch])
            elif ch < " " or ch == "\x7f":
                # Remaining control characters need the \uXXXX form.
                pieces.append(f"\\u{ord(ch):04X}")
            else:
                pieces.append(ch)
        return '"' + "".join(pieces) + '"'
    if isinstance(value, (list, tuple)):
        items = [_toml_value(item) for item in value]
        if any(item is None for item in items):
            return None
        return "[" + ", ".join(items) + "]"
    return None


def _write_toml(path: Path, data: dict[str, Any]) -> None:
    """Write dict as flat TOML file.

    Args:
        path: Destination file; overwritten if it exists.
        data: Flat mapping of key -> value. Values may be bool, int, float,
            str, or lists/tuples of those. ``None`` values and values of any
            other type are omitted from the output.

    The file is written as UTF-8 with one ``key = value`` line per entry, in
    the mapping's iteration order.
    """
    lines = []
    for key, value in data.items():
        rendered = _toml_value(value)
        if rendered is None:
            continue
        lines.append(f"{key} = {rendered}")

    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Target state cache: bindings and labels for live resources.
|
|
2
|
+
|
|
3
|
+
Cache file: ~/.wafer/target_state.json
|
|
4
|
+
|
|
5
|
+
Bindings map resource_id -> spec_name (performance hint for reconciliation).
|
|
6
|
+
Labels map resource_id -> {key: value} (probed software versions).
|
|
7
|
+
|
|
8
|
+
The provider API is always the source of truth for whether a resource exists.
|
|
9
|
+
This cache stores metadata that's expensive to recompute (SSH probes, name inference).
|
|
10
|
+
|
|
11
|
+
Format:
|
|
12
|
+
{
|
|
13
|
+
"bindings": {
|
|
14
|
+
"<resource_id>": {
|
|
15
|
+
"spec_name": "<spec_name>",
|
|
16
|
+
"provider": "<provider>",
|
|
17
|
+
"bound_at": "<ISO timestamp>"
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
"labels": {
|
|
21
|
+
"<resource_id>": {
|
|
22
|
+
"rocm_version": "7.0.2",
|
|
23
|
+
"python_version": "3.12",
|
|
24
|
+
...
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import json
|
|
33
|
+
import logging
|
|
34
|
+
from dataclasses import asdict, dataclass
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
# Root directory for wafer's per-user files.
WAFER_DIR = Path.home().joinpath(".wafer")
# JSON cache holding the "bindings" and "labels" sections.
STATE_FILE = WAFER_DIR.joinpath("target_state.json")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass(frozen=True)
class BindingEntry:
    """Immutable cache record tying one resource to the spec it is bound to.

    Stored under the resource's id in the ``"bindings"`` section of the
    state file.
    """

    # Name of the spec this resource is bound to.
    spec_name: str
    # Provider recorded for the resource.
    provider: str
    # When the binding was recorded, as an ISO timestamp string.
    bound_at: str
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
# Raw file I/O
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
def _load_state() -> dict:
    """Load the full state file. Returns empty dict if missing/corrupted.

    "Corrupted" covers content that is not valid JSON, bytes that cannot be
    decoded as text, and valid JSON whose top level is not an object (callers
    use ``.get`` / item assignment on the result, so anything but a dict
    would crash them). Corruption is logged as a warning and ignored.
    """
    try:
        # Read directly instead of checking exists() first, so a file that
        # disappears between the check and the read cannot raise.
        state = json.loads(STATE_FILE.read_text())
    except FileNotFoundError:
        return {}
    except (ValueError, TypeError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.warning(f"Corrupted state cache, ignoring: {e}")
        return {}

    if not isinstance(state, dict):
        logger.warning(
            "Corrupted state cache, ignoring: "
            f"expected a JSON object, got {type(state).__name__}"
        )
        return {}
    return state
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _save_state(data: dict) -> None:
    """Serialize *data* as indented JSON and overwrite the state file.

    The parent directory is created first if it does not exist.
    """
    cache_dir = STATE_FILE.parent
    cache_dir.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, indent=2)
    STATE_FILE.write_text(f"{payload}\n")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
# Bindings
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
|
|
78
|
+
def load_bindings() -> dict[str, BindingEntry]:
    """Load binding cache from disk.

    Returns:
        Mapping of resource_id -> BindingEntry. Entries whose stored fields
        do not match BindingEntry are skipped with a warning. If the
        ``"bindings"`` section itself is not a JSON object, it is treated as
        corrupt and an empty mapping is returned.
    """
    bindings_raw = _load_state().get("bindings", {})
    if not isinstance(bindings_raw, dict):
        # e.g. "bindings": null — without this guard .items() would raise.
        logger.warning("Corrupted state cache: 'bindings' is not an object, ignoring")
        return {}

    result = {}
    for rid, entry in bindings_raw.items():
        try:
            result[rid] = BindingEntry(**entry)
        except TypeError:
            # Raised for missing/unexpected fields or a non-mapping entry.
            logger.warning(f"Skipping malformed binding for {rid}")
    return result
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def save_bindings(bindings: dict[str, BindingEntry]) -> None:
    """Replace the ``"bindings"`` section on disk with *bindings*.

    The rest of the state file (such as labels) is read back and kept as is.
    """
    state = _load_state()
    serialized = {}
    for rid, entry in bindings.items():
        serialized[rid] = asdict(entry)
    state["bindings"] = serialized
    _save_state(state)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def add_binding(resource_id: str, entry: BindingEntry) -> None:
    """Store *entry* under *resource_id*, replacing any binding already cached for it."""
    save_bindings({**load_bindings(), resource_id: entry})
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def remove_binding(resource_id: str) -> None:
    """Drop the cached binding for *resource_id*.

    When no such binding exists, nothing is written to disk.
    """
    current = load_bindings()
    if resource_id not in current:
        return
    del current[resource_id]
    save_bindings(current)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def get_binding_hints() -> dict[str, str]:
    """Return the cached resource_id -> spec_name mapping used for reconciliation."""
    hints = {}
    for rid, entry in load_bindings().items():
        hints[rid] = entry.spec_name
    return hints
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
# Labels
|
|
121
|
+
# ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
def load_all_labels() -> dict[str, dict[str, str]]:
    """Return the ``"labels"`` section of the state file (resource_id -> labels).

    An empty dict is returned when the section is absent.
    """
    state = _load_state()
    return state.get("labels", {})
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def load_labels(resource_id: str) -> dict[str, str]:
    """Return the cached labels for *resource_id*, or an empty dict if there are none."""
    all_labels = load_all_labels()
    return all_labels.get(resource_id, {})
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def save_labels(resource_id: str, labels: dict[str, str]) -> None:
    """Store *labels* for *resource_id*, replacing that resource's previous labels.

    Bindings and the labels of other resources are read back and kept.
    """
    state = _load_state()
    # Create the section on first use, then overwrite this resource's entry.
    state.setdefault("labels", {})[resource_id] = labels
    _save_state(state)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def remove_labels(resource_id: str) -> None:
    """Delete the cached labels for *resource_id*.

    When the resource has no cached labels, nothing is written to disk.
    """
    state = _load_state()
    cached = state.get("labels", {})
    if resource_id not in cached:
        return
    del cached[resource_id]
    state["labels"] = cached
    _save_state(state)
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Target and TargetSpec: the two core concepts for GPU resource management.
|
|
2
|
+
|
|
3
|
+
TargetSpec = provisioning blueprint (TOML config, "how to get a GPU")
|
|
4
|
+
Target = live running resource (from provider API, "what's actually running")
|
|
5
|
+
|
|
6
|
+
TargetSpec is the existing union of provider-specific frozen dataclasses
|
|
7
|
+
(RunPodTarget, DigitalOceanTarget, BaremetalTarget, etc.), re-exported here
|
|
8
|
+
under the name TargetSpec for clarity.
|
|
9
|
+
|
|
10
|
+
Target is always fetched from provider APIs. The spec_name field links a
|
|
11
|
+
live resource back to the spec that created it (None = orphan/unbound).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from typing import TYPE_CHECKING, Protocol, runtime_checkable
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
pass
|
|
21
|
+
|
|
22
|
+
# TargetSpec is the existing union type, re-exported under a clearer name.
|
|
23
|
+
# Each variant is a frozen dataclass with provider-specific provisioning params.
|
|
24
|
+
from wafer_core.utils.kernel_utils.targets.config import ( # noqa: E402
|
|
25
|
+
TargetConfig,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
# TargetSpec = TargetConfig (same union, better name for the new API)
|
|
29
|
+
TargetSpec = TargetConfig
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
class Target:
    """A live, running GPU resource as reported by a provider API.

    Where a TargetSpec says *how* to provision a GPU, a Target says *what is
    running right now*. The provider API is authoritative for this state;
    local caches (target_state.json) are only performance hints.

    Fields:
        resource_id: The provider's unique ID (pod_id, droplet_id, or
            "baremetal:{host}:{port}" for SSH targets with no cloud lifecycle).
        provider: The cloud provider that owns the resource.
        status: Current state as reported by the provider API.
        public_ip: SSH-reachable IP address (None if not yet assigned).
        ssh_port: SSH port (None if not yet assigned).
        ssh_username: SSH user (typically "root" for cloud providers).
        gpu_type: GPU model name (e.g., "MI300X", "B200").
        name: Provider-side resource name (e.g.,
            "wafer-runpod-mi300x-1706000000", "kernelbench-pool-0").
            Used for spec_name inference.
        created_at: ISO timestamp of resource creation (None if unknown).
        spec_name: Name of the TargetSpec that owns this resource. None means
            unbound (orphan): running, but no spec claims it.
        price_per_hour: Cost in $/hr (None if unknown or baremetal).
        labels: Software metadata the provider API does not expose as
            structured fields, e.g. {"rocm_version": "7.0.2",
            "cuda_version": "12.4", "image": "rocm/pytorch:rocm7.0.2_..."}.
            Populated from the container image string at provision time, or
            from an SSH probe on demand. Pool queries filter on these.
    """

    resource_id: str
    provider: str
    status: str
    public_ip: str | None
    ssh_port: int | None
    ssh_username: str
    gpu_type: str
    name: str | None = None
    created_at: str | None = None
    spec_name: str | None = None
    price_per_hour: float | None = None
    labels: dict[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Check that the three identifying fields are non-empty."""
        for required in ("resource_id", "provider", "status"):
            assert getattr(self, required), f"{required} cannot be empty"
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass(frozen=True)
class ReconcileResult:
    """Outcome of matching TargetSpecs against live Targets.

    This is plain data with no side effects; the caller chooses what to do
    with it, for example:
        - show bound/unbound/unprovisioned in the CLI
        - terminate unbound targets
        - provision from unprovisioned specs

    Fields:
        bound: (spec, target) pairs where a spec matched a live target.
        unbound: Live targets that no spec matched (orphans).
        unprovisioned: Specs that have no live target running.
    """

    bound: list[tuple[TargetSpec, Target]]
    unbound: list[Target]
    unprovisioned: list[TargetSpec]
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@runtime_checkable
class TargetProvider(Protocol):
    """Protocol for listing and managing live GPU resources at one cloud provider.

    Every provider (RunPod, DigitalOcean, etc.) supplies an implementation.
    All methods are async because they call external APIs.

    Baremetal is the degenerate case: list_targets returns a Target built
    from the spec's ssh_target, and provision/terminate are no-ops.
    """

    async def list_targets(self) -> list[Target]:
        """Return every running resource on the provider account.

        Implementations always query the provider API and never answer from
        the local cache.
        """
        ...

    async def get_target(self, resource_id: str) -> Target | None:
        """Look up one resource by its provider ID.

        Returns None when the resource does not exist or has been terminated.
        """
        ...

    async def provision(self, spec: TargetSpec) -> Target:
        """Create a new resource from *spec*.

        Blocks until the resource is SSH-ready. Failure is reported by
        raising; None is never returned silently.
        """
        ...

    async def terminate(self, resource_id: str) -> bool:
        """Terminate the resource with the given provider ID.

        Returns True if it was terminated, False if no such resource was found.
        """
        ...
|
|
@@ -346,25 +346,17 @@ class RunPodTarget:
|
|
|
346
346
|
ncu_available: bool = False
|
|
347
347
|
|
|
348
348
|
def __post_init__(self) -> None:
|
|
349
|
-
"""Validate configuration.
|
|
350
|
-
from wafer_core.auth import get_api_key
|
|
349
|
+
"""Validate configuration fields.
|
|
351
350
|
|
|
351
|
+
API key availability is checked at provision/query time, not here —
|
|
352
|
+
loading a spec from TOML should not require credentials.
|
|
353
|
+
"""
|
|
352
354
|
assert self.name, "name cannot be empty"
|
|
353
355
|
assert self.ssh_key, "ssh_key cannot be empty"
|
|
354
356
|
assert self.gpu_count > 0, "gpu_count must be positive"
|
|
355
357
|
assert self.provision_timeout > 0, "provision_timeout must be positive"
|
|
356
358
|
assert self.eval_timeout > 0, "eval_timeout must be positive"
|
|
357
359
|
|
|
358
|
-
# Check for API key (env var or ~/.wafer/auth.json)
|
|
359
|
-
api_key = get_api_key("runpod")
|
|
360
|
-
if not api_key:
|
|
361
|
-
raise ValueError(
|
|
362
|
-
"RunPod API key not found.\n"
|
|
363
|
-
"Set WAFER_RUNPOD_API_KEY environment variable, or run:\n"
|
|
364
|
-
" wafer auth login runpod\n"
|
|
365
|
-
"Get your API key from: https://runpod.io/console/user/settings"
|
|
366
|
-
)
|
|
367
|
-
|
|
368
360
|
|
|
369
361
|
@dataclass(frozen=True)
|
|
370
362
|
class LocalTarget:
|
|
@@ -468,24 +460,16 @@ class DigitalOceanTarget:
|
|
|
468
460
|
ncu_available: bool = False
|
|
469
461
|
|
|
470
462
|
def __post_init__(self) -> None:
|
|
471
|
-
"""Validate configuration.
|
|
472
|
-
from wafer_core.auth import get_api_key
|
|
463
|
+
"""Validate configuration fields.
|
|
473
464
|
|
|
465
|
+
API key availability is checked at provision/query time, not here —
|
|
466
|
+
loading a spec from TOML should not require credentials.
|
|
467
|
+
"""
|
|
474
468
|
assert self.name, "name cannot be empty"
|
|
475
469
|
assert self.ssh_key, "ssh_key cannot be empty"
|
|
476
470
|
assert self.provision_timeout > 0, "provision_timeout must be positive"
|
|
477
471
|
assert self.eval_timeout > 0, "eval_timeout must be positive"
|
|
478
472
|
|
|
479
|
-
# Check for API key (env var or ~/.wafer/auth.json)
|
|
480
|
-
api_key = get_api_key("digitalocean")
|
|
481
|
-
if not api_key:
|
|
482
|
-
raise ValueError(
|
|
483
|
-
"DigitalOcean API key not found.\n"
|
|
484
|
-
"Set WAFER_AMD_DIGITALOCEAN_API_KEY environment variable, or run:\n"
|
|
485
|
-
" wafer auth login digitalocean\n"
|
|
486
|
-
"Get your API key from: https://cloud.digitalocean.com/account/api/tokens"
|
|
487
|
-
)
|
|
488
|
-
|
|
489
473
|
|
|
490
474
|
# Union type for target configs
|
|
491
475
|
TargetConfig = (
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: wafer-core
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.28
|
|
4
4
|
Summary: Core utilities and environments for Wafer GPU kernel optimization
|
|
5
5
|
Requires-Python: >=3.10
|
|
6
6
|
Requires-Dist: aiohttp>=3.9.0
|
|
@@ -15,12 +15,14 @@ Requires-Dist: dash-svg>=0.0.11
|
|
|
15
15
|
Requires-Dist: dash>=3.0.0
|
|
16
16
|
Requires-Dist: dnspython>=2.8.0
|
|
17
17
|
Requires-Dist: httpx>=0.25.0
|
|
18
|
+
Requires-Dist: ijson>=3.2.0
|
|
18
19
|
Requires-Dist: kaleido==0.2.1
|
|
19
20
|
Requires-Dist: markdownify>=0.11.0
|
|
20
21
|
Requires-Dist: matplotlib>=3.0.0
|
|
21
22
|
Requires-Dist: modal>=0.64.0
|
|
22
23
|
Requires-Dist: numpy>=1.17.5
|
|
23
24
|
Requires-Dist: openai>=1.0.0
|
|
25
|
+
Requires-Dist: orjson>=3.9.0
|
|
24
26
|
Requires-Dist: pandas~=3.0.0
|
|
25
27
|
Requires-Dist: paramiko>=3.0.0
|
|
26
28
|
Requires-Dist: paramiko>=3.4.0
|
|
@@ -318,12 +318,20 @@ wafer_core/lib/rocprofiler/systems/run/analyzer.py,sha256=Qg3M8-kCKdV82ehn6Ta20N
|
|
|
318
318
|
wafer_core/lib/rocprofiler/systems/run/profiler.py,sha256=aiQLsDnfQHSeCM5zLnO4VlbTmREYnAtiuT50Eq6uWfg,8387
|
|
319
319
|
wafer_core/lib/rocprofiler/systems/sample/__init__.py,sha256=31rNmLPQ7OVhvlOEEOwPKgk8_qrCidj6AmzDXexQJ_o,288
|
|
320
320
|
wafer_core/lib/rocprofiler/systems/sample/profiler.py,sha256=CYZPTzNXd48LoCfmY6h_5RSYEdWYccuv3-t4YncHJLE,7384
|
|
321
|
-
wafer_core/lib/trace_compare/
|
|
322
|
-
wafer_core/lib/trace_compare/
|
|
323
|
-
wafer_core/lib/trace_compare/
|
|
321
|
+
wafer_core/lib/trace_compare/PERFORMANCE.md,sha256=jkJh7ApZi8H7NKTcz8v0LNtwSFtIUqY88e3QbL749ww,3823
|
|
322
|
+
wafer_core/lib/trace_compare/__init__.py,sha256=CyUPbPQDYhVLCFFA7S_jNSilG3OgqYjmHSKfR5X11go,1377
|
|
323
|
+
wafer_core/lib/trace_compare/aligner.py,sha256=1S8Ob3RaEsIjN0HdqEx0yGsW5uf_lMrJVSH_MnZhKok,13788
|
|
324
|
+
wafer_core/lib/trace_compare/analyzer.py,sha256=YkuOPA3HFX_7mNUEhE9CMOtEMGLQd12lvUkvqqeQF14,29698
|
|
325
|
+
wafer_core/lib/trace_compare/api.py,sha256=JSRTcd7eZK1Z8l18TFEiA5A8ENJS1TMz7oIiw1KBbAs,8796
|
|
326
|
+
wafer_core/lib/trace_compare/architecture.py,sha256=8bqlAJQeJLBHblyXvFV-w55PIKiVQDPjDQZ8Jx4tuGg,2110
|
|
327
|
+
wafer_core/lib/trace_compare/classifier.py,sha256=CDGzY9TY-I5wRuEGsu4mTCdljqVTOnLWyFLyNgmkGXI,16864
|
|
324
328
|
wafer_core/lib/trace_compare/formatter.py,sha256=GNrCZ45ueBN05CEXjOtTuKvTI8z-g-ZZFil-ni3sWVY,37962
|
|
325
|
-
wafer_core/lib/trace_compare/fusion_analyzer.py,sha256=
|
|
326
|
-
wafer_core/lib/trace_compare/
|
|
329
|
+
wafer_core/lib/trace_compare/fusion_analyzer.py,sha256=ZbFXUuPOt8ezT08WfjlDx7XaUNoUgg9hlFTJb68-eo0,17433
|
|
330
|
+
wafer_core/lib/trace_compare/kernel_registry.yaml,sha256=0-knXwsF3pR1x1JdIz-aWaH-5xDgTylh53E47Kf6nHo,9808
|
|
331
|
+
wafer_core/lib/trace_compare/layer_segmentation.py,sha256=kI_Y1e9nrKZfdwfcrGo4h7gpMxqXI_xkgXk46zuFen4,4642
|
|
332
|
+
wafer_core/lib/trace_compare/loader.py,sha256=zBHI0r7CX_wJ2mz0_-s0lm9KGSdaVaq7OKyxUL6KIlw,23997
|
|
333
|
+
wafer_core/lib/trace_compare/same_kernel_analyzer.py,sha256=sp81NJGVJeYdAfRQRgMbB5HcGTOneF1Rau3rbLPfpv4,3489
|
|
334
|
+
wafer_core/lib/trace_compare/warnings.py,sha256=B1HxFt-v1mDqLT2aD5bSm1Yn88bfPYnM-wui0WBF3xM,3548
|
|
327
335
|
wafer_core/lib/tracelens/__init__.py,sha256=AkHdmOnKlBO4RpsAqVVGe7MOfv6E6uhEaC_iKrYeMPI,2002
|
|
328
336
|
wafer_core/lib/tracelens/comparator.py,sha256=71YEPfjBi7_24u1oQuPerNtFsN0sDQ5CT_uBi0XLllw,3460
|
|
329
337
|
wafer_core/lib/tracelens/finder.py,sha256=HpbN8TuRNbbBytPYOmkBkfsFVBReQqVgsvFX-mBrln4,2459
|
|
@@ -589,9 +597,19 @@ wafer_core/sessions/__init__.py,sha256=Ybps5QclZShAELoW9bva4w6OCNrcBf8vd9nGDjYfQ
|
|
|
589
597
|
wafer_core/sessions/agent.py,sha256=4-Q-NG_xm07FFq7hB8mjxW38nt2_S0QpwCYkPOoGRxA,5946
|
|
590
598
|
wafer_core/sessions/dtypes.py,sha256=K6nOjvL6sjCGY7GTtdEygf1IZY_18R9YkHGqFyMd8wY,589
|
|
591
599
|
wafer_core/sessions/hooks.py,sha256=A-txm6ufnRGQCdtP3vwh7oEOdlLN9Tv0XsjORMihuAI,4295
|
|
592
|
-
wafer_core/targets/__init__.py,sha256=
|
|
600
|
+
wafer_core/targets/__init__.py,sha256=N4lTf9MjZ5dzAShObweZzyBfPMSzwjD5qBFWnM5lczM,2800
|
|
593
601
|
wafer_core/targets/digitalocean.py,sha256=cvoYpYjtSyy5t2lQAPi7ERruuuibronah_ivOiduAHQ,16550
|
|
602
|
+
wafer_core/targets/pool.py,sha256=TeNE9rpr67OsGtbxniYpr9Cb3wosnf_e3kTLBbwtDok,5434
|
|
603
|
+
wafer_core/targets/probe.py,sha256=rzF8tiq5GxkMR3jhryTOW0GMcoHtrN67wmHlGJuBTv8,3038
|
|
604
|
+
wafer_core/targets/reconcile.py,sha256=Hftd7LyqkcTOP0Qpa_cdYpxGW2I3bkSlkQrnYjU5lns,3091
|
|
594
605
|
wafer_core/targets/runpod.py,sha256=LrVmNvA6qjzL5nbGSWvtw7CHrK6bDu7_o3vKIek00Tc,20286
|
|
606
|
+
wafer_core/targets/spec_store.py,sha256=uNpMdo7ASeq7_RhgAqj8CFIK39rGEbaYtYtqt--FXO0,6455
|
|
607
|
+
wafer_core/targets/state_cache.py,sha256=oji4APL_tjOty_u0CJzHaP59jJAIJWQTjYvD4pCdQ3g,4479
|
|
608
|
+
wafer_core/targets/types.py,sha256=MQ7ECcBAwSoWsJfGxycJoLBeoTXSYtGeXEg5ZNxfs4c,5217
|
|
609
|
+
wafer_core/targets/providers/__init__.py,sha256=u6OCCgyPRymrnZmIYPLF0hdkr6aTCF301K9gSgcFWvc,1355
|
|
610
|
+
wafer_core/targets/providers/baremetal.py,sha256=L0KAiTkRH_fQvCbtaEa5wlJBqsvNaY56Zq6ovBhk2YY,2452
|
|
611
|
+
wafer_core/targets/providers/digitalocean.py,sha256=_TnGi9Otzsn2T_vSv40T_3HFLT559WS_ljGsrWr7j0s,5281
|
|
612
|
+
wafer_core/targets/providers/runpod.py,sha256=jCA7ENFRwbTKyToGa7fw2VS3coY61ggK1m0F17-rvng,7388
|
|
595
613
|
wafer_core/tools/__init__.py,sha256=wBQD45GdSfkxcT6NHzIv0IMeXCc0enwwkpm3T_9j1X8,3341
|
|
596
614
|
wafer_core/tools/bash_tool.py,sha256=daoKOVGSgL0x9X_3l8Apd6-wFH4VMXMGJwVemw2FIfc,16828
|
|
597
615
|
wafer_core/tools/glob_tool.py,sha256=9X5PdOjQJj7kiVNqqCZC0-1LmnE6wHx3Zc9zfMjtXdc,3533
|
|
@@ -671,7 +689,7 @@ wafer_core/utils/kernel_utils/static_checker.py,sha256=XIQkzAOkGH5xtrOuZM4tNUqVJ
|
|
|
671
689
|
wafer_core/utils/kernel_utils/task.py,sha256=XcmKxKUWh5It6nX3zGqj77tWgA32uPfQMqNOqyD5T48,2682
|
|
672
690
|
wafer_core/utils/kernel_utils/utils.py,sha256=uDZoJDxh07hJeLNlPdKN2vgB15pqIr1LbXf0YIBHU4E,43056
|
|
673
691
|
wafer_core/utils/kernel_utils/targets/__init__.py,sha256=4NwRLsuJ__S4xKAfda4Ag82C5MQ3Qio-4xA5S-mQGlU,2067
|
|
674
|
-
wafer_core/utils/kernel_utils/targets/config.py,sha256=
|
|
692
|
+
wafer_core/utils/kernel_utils/targets/config.py,sha256=DJPPyV7yGmyvS7cavdDENC5PQsia1dQeQYlWCTE7iUo,19975
|
|
675
693
|
wafer_core/utils/kernel_utils/targets/execution.py,sha256=bZuNXCo0sIdD6hFhetLPrtDC-zMSiIsAx_aml49VVL0,15033
|
|
676
694
|
wafer_core/utils/kernel_utils/targets/selection.py,sha256=5I_RG_7cfhq7uaeR28meC2EeNNKssFsK-Tc3QFG6Ze0,3590
|
|
677
695
|
wafer_core/utils/modal_execution/__init__.py,sha256=jkVqYOLzCT5K73N9Od0UIUsx-99A0m6bpDrxfyXxQZ8,945
|
|
@@ -679,6 +697,6 @@ wafer_core/utils/modal_execution/modal_app.py,sha256=VfS2cX8gHtnlPXemmMcEwDPeQdh
|
|
|
679
697
|
wafer_core/utils/modal_execution/modal_config.py,sha256=7cGX9TGqilQ3qxI3OFGXV5orjtyRU-PEDOJ4vP2oxno,4421
|
|
680
698
|
wafer_core/utils/modal_execution/modal_execution.py,sha256=gChjnV6jqA3A7IRP3DfvV5cSfm_MN0X4f7JZufXgdZE,24594
|
|
681
699
|
wafer_core/utils/modal_execution/test_modal.py,sha256=_jqou_hrLs1Daf1590Pnb0a_lXMMa2rczAPpW9HpoNQ,8153
|
|
682
|
-
wafer_core-0.1.
|
|
683
|
-
wafer_core-0.1.
|
|
684
|
-
wafer_core-0.1.
|
|
700
|
+
wafer_core-0.1.28.dist-info/METADATA,sha256=0x6opc3zOlxGhlZNJDVDY2LPnBZHYP5K4U0I6ZDl0Os,1477
|
|
701
|
+
wafer_core-0.1.28.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
702
|
+
wafer_core-0.1.28.dist-info/RECORD,,
|
|
File without changes
|