wafer-core 0.1.27__py3-none-any.whl → 0.1.28__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- wafer_core/lib/trace_compare/aligner.py +13 -6
- wafer_core/lib/trace_compare/analyzer.py +12 -3
- wafer_core/lib/trace_compare/fusion_analyzer.py +392 -284
- wafer_core/targets/__init__.py +47 -21
- wafer_core/targets/pool.py +181 -0
- wafer_core/targets/probe.py +113 -0
- wafer_core/targets/providers/__init__.py +46 -0
- wafer_core/targets/providers/baremetal.py +72 -0
- wafer_core/targets/providers/digitalocean.py +164 -0
- wafer_core/targets/providers/runpod.py +250 -0
- wafer_core/targets/reconcile.py +90 -0
- wafer_core/targets/spec_store.py +200 -0
- wafer_core/targets/state_cache.py +150 -0
- wafer_core/targets/types.py +141 -0
- wafer_core/utils/kernel_utils/targets/config.py +8 -24
- {wafer_core-0.1.27.dist-info → wafer_core-0.1.28.dist-info}/METADATA +1 -1
- {wafer_core-0.1.27.dist-info → wafer_core-0.1.28.dist-info}/RECORD +18 -8
- {wafer_core-0.1.27.dist-info → wafer_core-0.1.28.dist-info}/WHEEL +0 -0
wafer_core/targets/__init__.py
CHANGED
@@ -1,5 +1,43 @@
-"""
+"""Target system: specs (config) + targets (live resources) + reconciliation.
 
+New API (preferred):
+    from wafer_core.targets import Target, TargetSpec, ReconcileResult
+    from wafer_core.targets.providers import get_provider
+    from wafer_core.targets.reconcile import reconcile
+    from wafer_core.targets.spec_store import load_spec, list_spec_names
+    from wafer_core.targets.state_cache import get_binding_hints
+
+Legacy API (still works, will be deprecated):
+    from wafer_core.targets import RunPodTarget, runpod_ssh_context, ...
+"""
+
+# ── New types ────────────────────────────────────────────────────────────────
+from wafer_core.targets.digitalocean import (
+    DigitalOceanError,
+    DigitalOceanSSHInfo,
+    cleanup_all_droplets,
+    cleanup_digitalocean_target,
+    digitalocean_ssh_context,
+    get_droplet_state,
+    list_running_droplets,
+)
+from wafer_core.targets.runpod import (
+    RunPodError,
+    RunPodSSHInfo,
+    cleanup_all_pods,
+    cleanup_target,
+    get_pod_state,
+    list_running_pods,
+    runpod_ssh_context,
+)
+from wafer_core.targets.types import (
+    ReconcileResult,
+    Target,
+    TargetProvider,
+    TargetSpec,
+)
+
+# ── Legacy re-exports (unchanged, for backwards compatibility) ───────────────
 from wafer_core.utils.kernel_utils.targets import (
     BaremetalTarget,
     DigitalOceanTarget,
@@ -18,26 +56,14 @@ from wafer_core.utils.kernel_utils.targets import (
     select_target_for_operation,
     target_to_deployment_config,
 )
-from wafer_core.targets.runpod import (
-    RunPodError,
-    RunPodSSHInfo,
-    cleanup_all_pods,
-    cleanup_target,
-    get_pod_state,
-    list_running_pods,
-    runpod_ssh_context,
-)
-from wafer_core.targets.digitalocean import (
-    DigitalOceanError,
-    DigitalOceanSSHInfo,
-    cleanup_all_droplets,
-    cleanup_digitalocean_target,
-    digitalocean_ssh_context,
-    get_droplet_state,
-    list_running_droplets,
-)
 
 __all__ = [
+    # New API
+    "Target",
+    "TargetSpec",
+    "TargetProvider",
+    "ReconcileResult",
+    # Legacy: target config types
     "BaremetalTarget",
     "VMTarget",
     "ModalTarget",
@@ -54,7 +80,7 @@ __all__ = [
     "check_target_available",
     "find_free_gpu",
     "run_operation_on_target",
-    # RunPod provisioning
+    # Legacy: RunPod provisioning
     "RunPodError",
     "RunPodSSHInfo",
     "runpod_ssh_context",
@@ -62,7 +88,7 @@ __all__ = [
     "cleanup_all_pods",
     "list_running_pods",
     "get_pod_state",
-    # DigitalOcean provisioning
+    # Legacy: DigitalOcean provisioning
     "DigitalOceanError",
     "DigitalOceanSSHInfo",
     "digitalocean_ssh_context",
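Usage sketch for the new surface (not part of the diff): `get_provider` and the provider methods come from the files added below, while the spec name and the exact `load_spec` signature are assumptions, since `spec_store.py` is not shown in this section.

```python
import trio

from wafer_core.targets.providers import get_provider
from wafer_core.targets.spec_store import load_spec  # signature assumed: load_spec(name)


async def main() -> None:
    spec = load_spec("mi300x-dev")            # hypothetical spec name
    provider = get_provider("digitalocean")   # KeyError on unknown provider names
    target = await provider.provision(spec)   # blocks until SSH is ready
    print(target.resource_id, target.public_ip)
    await provider.terminate(target.resource_id)


trio.run(main)
```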
wafer_core/targets/pool.py
ADDED
@@ -0,0 +1,181 @@
+"""Pool queries: filter live targets by GPU type, provider, and labels.
+
+A pool is a predicate over live targets, not a hardcoded list.
+Pool queries are defined in ~/.wafer/config.toml:
+
+    [pools.mi300x]
+    gpu_type = "MI300X"
+
+    [pools.mi300x-rocm7]
+    gpu_type = "MI300X"
+    labels.rocm_version = "7.0.2"
+
+    [pools.runpod-only]
+    provider = "runpod"
+
+Matching: a target matches a pool query if all specified fields match.
+Fields not specified in the query are ignored (match anything).
+Label matching is AND — all required labels must be present and equal.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from wafer_core.targets.types import Target
+
+WAFER_DIR = Path.home() / ".wafer"
+CONFIG_FILE = WAFER_DIR / "config.toml"
+
+# Fields on PoolQuery that map directly to Target fields
+_TARGET_FIELDS = ("gpu_type", "provider", "status")
+
+
+@dataclass(frozen=True)
+class PoolQuery:
+    """Predicate for filtering live targets.
+
+    All specified fields must match (AND semantics).
+    None means "don't care" for that field.
+    """
+
+    gpu_type: str | None = None
+    provider: str | None = None
+    status: str | None = "running"
+    labels: dict[str, str] = field(default_factory=dict)
+
+
+def match_targets(query: PoolQuery, targets: list[Target]) -> list[Target]:
+    """Filter targets that satisfy the pool query. Pure function."""
+    matched = []
+    for target in targets:
+        if not _matches(query, target):
+            continue
+        matched.append(target)
+    return matched
+
+
+def _matches(query: PoolQuery, target: Target) -> bool:
+    """Check if a single target satisfies the query."""
+    if query.gpu_type is not None and target.gpu_type != query.gpu_type:
+        return False
+    if query.provider is not None and target.provider != query.provider:
+        return False
+    if query.status is not None and target.status != query.status:
+        return False
+
+    # All required labels must be present and equal
+    for key, value in query.labels.items():
+        if target.labels.get(key) != value:
+            return False
+
+    return True
+
+
+def load_pool_query(name: str) -> PoolQuery:
+    """Load a pool query from ~/.wafer/config.toml.
+
+    Raises KeyError if the pool is not defined.
+    """
+    pools = _load_pools_section()
+    if name not in pools:
+        available = ", ".join(sorted(pools)) if pools else "(none)"
+        raise KeyError(f"Pool {name!r} not found. Available: {available}")
+
+    raw = pools[name]
+    assert isinstance(raw, dict), f"Pool {name!r} must be a table, got {type(raw).__name__}"
+
+    labels_raw = raw.get("labels", {})
+    assert isinstance(labels_raw, dict), (
+        f"Pool {name!r} labels must be a table, got {type(labels_raw).__name__}"
+    )
+
+    return PoolQuery(
+        gpu_type=raw.get("gpu_type"),
+        provider=raw.get("provider"),
+        status=raw.get("status", "running"),
+        labels={str(k): str(v) for k, v in labels_raw.items()},
+    )
+
+
+def list_pool_names() -> list[str]:
+    """List all pool names from config.toml."""
+    pools = _load_pools_section()
+    return sorted(pools.keys())
+
+
+def is_query_pool(name: str) -> bool:
+    """Check if a pool is defined as a PoolQuery (new format) vs target list (old format).
+
+    Old format: [pools.name] targets = ["t1", "t2"]
+    New format: [pools.name] gpu_type = "MI300X"
+
+    Returns False if pool doesn't exist or is old format.
+    """
+    pools = _load_pools_section()
+    if name not in pools:
+        return False
+    raw = pools[name]
+    if not isinstance(raw, dict):
+        return False
+    # Old format has a "targets" key with a list of names
+    return "targets" not in raw
+
+
+async def resolve_pool(name: str) -> list[Target]:
+    """Resolve a pool query to live targets.
+
+    Queries all cloud providers, hydrates cached labels, filters by pool query.
+    Returns matching Target objects sorted by resource_id for determinism.
+
+    Raises KeyError if pool not found.
+    """
+    from dataclasses import replace
+
+    from wafer_core.targets.providers import get_all_cloud_providers
+    from wafer_core.targets.state_cache import load_all_labels
+    from wafer_core.targets.types import TargetProvider
+
+    import trio
+
+    query = load_pool_query(name)
+
+    # Fetch all live targets
+    all_targets: list[Target] = []
+
+    async def _fetch(prov_impl: TargetProvider, results: list[Target]) -> None:
+        try:
+            targets = await prov_impl.list_targets()
+            results.extend(targets)
+        except Exception:
+            pass  # Skip providers that fail (missing API key, etc.)
+
+    async with trio.open_nursery() as nursery:
+        for _, prov_impl in get_all_cloud_providers():
+            nursery.start_soon(_fetch, prov_impl, all_targets)
+
+    # Hydrate labels from cache
+    cached_labels = load_all_labels()
+    all_targets = [
+        replace(t, labels=cached_labels[t.resource_id])
+        if t.resource_id in cached_labels
+        else t
+        for t in all_targets
+    ]
+
+    # Filter and sort
+    matched = match_targets(query, all_targets)
+    matched.sort(key=lambda t: t.resource_id)
+    return matched
+
+
+def _load_pools_section() -> dict:
+    """Read the [pools] section from config.toml. Returns empty dict if missing."""
+    if not CONFIG_FILE.exists():
+        return {}
+
+    import tomllib
+
+    data = tomllib.loads(CONFIG_FILE.read_text())
+    return data.get("pools", {})
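The matching semantics are pure functions and can be exercised without any provider. A minimal sketch, assuming `Target`'s optional fields (`name`, `created_at`, `spec_name`) have defaults and `labels` defaults to an empty dict; `types.py` is not shown in this section, so the constructor call is illustrative.

```python
from wafer_core.targets.pool import PoolQuery, match_targets
from wafer_core.targets.types import Target

# Hypothetical live target; all field values are illustrative.
t = Target(
    resource_id="droplet-123",
    provider="digitalocean",
    status="running",
    public_ip="203.0.113.7",
    ssh_port=22,
    ssh_username="root",
    gpu_type="MI300X",
    labels={"rocm_version": "7.0.2"},
)

# Equivalent of [pools.mi300x-rocm7] from the docstring above.
query = PoolQuery(gpu_type="MI300X", labels={"rocm_version": "7.0.2"})
assert match_targets(query, [t]) == [t]

# Unspecified fields match anything; a mismatched provider filters it out.
assert match_targets(PoolQuery(provider="runpod"), [t]) == []
```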
wafer_core/targets/probe.py
ADDED
@@ -0,0 +1,113 @@
+"""SSH probe: detect software labels on a live target.
+
+Runs a Python script on the target via SSH that reports installed
+software versions. Returns a flat dict[str, str] of labels.
+
+Only called at provision time or manually via `wafer targets probe`.
+Results are cached in target_state.json — probe is never implicit.
+
+Uses subprocess ssh (not asyncssh) to match existing codebase patterns.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import subprocess
+
+logger = logging.getLogger(__name__)
+
+# Probe script runs on the target machine via SSH.
+# Prints a JSON dict to stdout. Must work with stock Python 3.10+.
+_PROBE_SCRIPT = r"""
+import json, shutil, subprocess, sys
+
+def probe():
+    result = {}
+
+    # Python version
+    result["python_version"] = ".".join(map(str, sys.version_info[:2]))
+
+    # ROCm version from filesystem
+    try:
+        with open("/opt/rocm/.info/version") as f:
+            result["rocm_version"] = f.read().strip().split("-")[0]
+    except Exception:
+        pass
+
+    # CUDA version from nvcc
+    nvcc = shutil.which("nvcc")
+    if nvcc:
+        try:
+            out = subprocess.check_output([nvcc, "--version"], text=True)
+            for line in out.split("\n"):
+                if "release" in line.lower():
+                    parts = line.split("release")
+                    if len(parts) > 1:
+                        result["cuda_version"] = parts[1].split(",")[0].strip()
+                    break
+        except Exception:
+            pass
+
+    # PyTorch version
+    try:
+        import torch
+        result["pytorch_version"] = torch.__version__.split("+")[0]
+    except ImportError:
+        pass
+
+    # Triton version
+    try:
+        import triton
+        result["triton_version"] = triton.__version__
+    except ImportError:
+        pass
+
+    print(json.dumps(result))
+
+probe()
+"""
+
+
+def probe_target_labels(
+    host: str,
+    port: int,
+    username: str,
+    ssh_key_path: str | None = None,
+    timeout: int = 60,
+) -> dict[str, str]:
+    """SSH into a target and probe installed software. Returns labels dict.
+
+    Raises on SSH failure — caller decides how to handle.
+    """
+    ssh_args = [
+        "ssh",
+        "-p", str(port),
+        "-o", "StrictHostKeyChecking=no",
+        "-o", "UserKnownHostsFile=/dev/null",
+        "-o", "LogLevel=ERROR",
+        "-o", "ConnectTimeout=10",
+    ]
+    if ssh_key_path:
+        ssh_args.extend(["-i", ssh_key_path])
+
+    ssh_args.append(f"{username}@{host}")
+    ssh_args.append("python3")
+
+    result = subprocess.run(
+        ssh_args,
+        input=_PROBE_SCRIPT,
+        capture_output=True,
+        text=True,
+        timeout=timeout,
+    )
+
+    if result.returncode != 0:
+        stderr = result.stderr.strip()
+        raise RuntimeError(f"Probe failed (exit {result.returncode}): {stderr}")
+
+    stdout = result.stdout.strip()
+    labels = json.loads(stdout)
+    assert isinstance(labels, dict), f"Probe returned {type(labels).__name__}, expected dict"
+
+    return {str(k): str(v) for k, v in labels.items()}
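Calling the probe directly looks like this; the host values are illustrative and would normally come from a resolved `Target`:

```python
from pathlib import Path

from wafer_core.targets.probe import probe_target_labels

labels = probe_target_labels(
    host="203.0.113.7",  # illustrative
    port=22,
    username="root",
    ssh_key_path=str(Path.home() / ".ssh" / "id_ed25519"),  # optional
)
# e.g. {"python_version": "3.10", "rocm_version": "7.0.2", "pytorch_version": "2.4.0"}
print(labels)
```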
wafer_core/targets/providers/__init__.py
ADDED
@@ -0,0 +1,46 @@
+"""Provider registry for GPU resource management.
+
+Each provider implements TargetProvider protocol: list, get, provision, terminate.
+"""
+
+from __future__ import annotations
+
+from wafer_core.targets.providers.baremetal import BaremetalProvider
+from wafer_core.targets.providers.digitalocean import DigitalOceanProvider
+from wafer_core.targets.providers.runpod import RunPodProvider
+from wafer_core.targets.types import TargetProvider
+
+_PROVIDERS: dict[str, type] = {
+    "runpod": RunPodProvider,
+    "digitalocean": DigitalOceanProvider,
+    "baremetal": BaremetalProvider,
+}
+
+
+def get_provider(name: str) -> TargetProvider:
+    """Get a provider instance by name.
+
+    Raises KeyError if provider is not registered.
+    """
+    cls = _PROVIDERS.get(name)
+    if cls is None:
+        raise KeyError(f"Unknown provider: {name!r}. Available: {', '.join(sorted(_PROVIDERS))}")
+    return cls()
+
+
+def get_all_cloud_providers() -> list[tuple[str, TargetProvider]]:
+    """Get all cloud providers that can list remote resources.
+
+    Excludes baremetal (no remote API to query).
+    Returns list of (name, provider) tuples.
+    """
+    return [(name, cls()) for name, cls in _PROVIDERS.items() if name != "baremetal"]
+
+
+__all__ = [
+    "BaremetalProvider",
+    "DigitalOceanProvider",
+    "RunPodProvider",
+    "get_all_cloud_providers",
+    "get_provider",
+]
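A registry usage sketch (assumes provider API credentials are configured; the codebase's own async runtime is trio, as seen in pool.py):

```python
import trio

from wafer_core.targets.providers import get_all_cloud_providers, get_provider


async def main() -> None:
    provider = get_provider("runpod")  # KeyError names the registered providers on a typo
    print(type(provider).__name__)     # RunPodProvider

    # Baremetal is excluded here by design: it has no remote API to enumerate.
    for name, prov in get_all_cloud_providers():
        targets = await prov.list_targets()
        print(f"{name}: {len(targets)} live target(s)")


trio.run(main)
```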
wafer_core/targets/providers/baremetal.py
ADDED
@@ -0,0 +1,72 @@
+"""Baremetal provider — degenerate case with no cloud API.
+
+Baremetal targets have no provisioning lifecycle. The "resource" is just the
+SSH endpoint from the spec. list_targets returns nothing (no API to query),
+and provision/terminate are errors.
+"""
+
+from __future__ import annotations
+
+from wafer_core.targets.types import Target, TargetSpec
+from wafer_core.utils.kernel_utils.targets.config import BaremetalTarget, VMTarget
+
+
+def target_from_ssh_spec(spec: BaremetalTarget | VMTarget) -> Target:
+    """Build a Target from a baremetal/VM spec's SSH info.
+
+    Since there's no cloud API, the resource_id is synthetic:
+    "baremetal:{host}:{port}" to make it unique and stable.
+    """
+    # Parse user@host:port
+    ssh_target = spec.ssh_target
+    assert ":" in ssh_target, f"ssh_target must include port, got: {ssh_target}"
+
+    user_host, port_str = ssh_target.rsplit(":", 1)
+    if "@" in user_host:
+        user, host = user_host.split("@", 1)
+    else:
+        user = "root"
+        host = user_host
+
+    port = int(port_str)
+
+    return Target(
+        resource_id=f"baremetal:{host}:{port}",
+        provider="baremetal",
+        status="running",  # Assumed running; TCP check happens elsewhere
+        public_ip=host,
+        ssh_port=port,
+        ssh_username=user,
+        gpu_type=spec.gpu_type,
+        name=spec.name,
+        spec_name=spec.name,
+    )
+
+
+class BaremetalProvider:
+    """Baremetal implementation of TargetProvider.
+
+    Baremetal has no cloud API. list_targets returns empty (no remote state
+    to query). Use target_from_ssh_spec() to build a Target from a spec
+    when you already know which spec you want.
+    """
+
+    async def list_targets(self) -> list[Target]:
+        """Baremetal has no API to list. Returns empty."""
+        return []
+
+    async def get_target(self, resource_id: str) -> Target | None:
+        """Baremetal has no API to query. Returns None."""
+        return None
+
+    async def provision(self, spec: TargetSpec) -> Target:
+        """Baremetal targets cannot be provisioned — they already exist."""
+        assert isinstance(spec, (BaremetalTarget, VMTarget)), (
+            f"BaremetalProvider.provision requires BaremetalTarget or VMTarget, "
+            f"got {type(spec).__name__}"
+        )
+        return target_from_ssh_spec(spec)
+
+    async def terminate(self, resource_id: str) -> bool:
+        """Baremetal targets cannot be terminated via API."""
+        return False
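The `user@host:port` parsing and the synthetic `resource_id` can be seen end to end below. The `BaremetalTarget` constructor call is illustrative: only the `name`, `gpu_type`, and `ssh_target` fields that `target_from_ssh_spec` reads are shown, and the real dataclass may require more.

```python
from wafer_core.targets.providers.baremetal import target_from_ssh_spec
from wafer_core.utils.kernel_utils.targets.config import BaremetalTarget

# Illustrative spec values; ssh_target must include a port.
spec = BaremetalTarget(name="lab-box", gpu_type="MI300X", ssh_target="admin@10.0.0.5:2222")

t = target_from_ssh_spec(spec)
assert t.resource_id == "baremetal:10.0.0.5:2222"
assert (t.ssh_username, t.ssh_port) == ("admin", 2222)
# Without a user prefix ("10.0.0.5:2222"), the username falls back to "root".
```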
wafer_core/targets/providers/digitalocean.py
ADDED
@@ -0,0 +1,164 @@
+"""DigitalOcean provider — adapts existing DO REST API to TargetProvider protocol."""
+
+from __future__ import annotations
+
+import logging
+import time
+from datetime import datetime, timezone
+
+from wafer_core.targets.digitalocean import (
+    DigitalOceanError,
+    _api_request_async,
+    _get_ssh_key_ids,
+    _wait_for_ssh,
+)
+from wafer_core.targets.digitalocean import (
+    terminate_droplet as _terminate_droplet,
+)
+from wafer_core.targets.types import Target, TargetSpec
+from wafer_core.utils.kernel_utils.targets.config import DigitalOceanTarget
+
+logger = logging.getLogger(__name__)
+
+
+def _parse_droplet_to_target(droplet: dict) -> Target:
+    """Parse a DigitalOcean API droplet response into a Target."""
+    droplet_id = str(droplet.get("id", ""))
+    droplet_name = droplet.get("name", "")
+    status_raw = droplet.get("status", "").lower()
+
+    # Map DO statuses to our values
+    # DO: new, active, off, archive
+    status_map = {
+        "new": "pending",
+        "active": "running",
+        "off": "stopped",
+        "archive": "terminated",
+    }
+    status = status_map.get(status_raw, status_raw)
+
+    # Extract public IP
+    public_ip = None
+    networks = droplet.get("networks", {})
+    for net in networks.get("v4", []):
+        if net.get("type") == "public":
+            public_ip = net.get("ip_address")
+            break
+
+    # Infer spec_name from naming convention: wafer-{spec_name}-{timestamp}
+    spec_name = None
+    if droplet_name.startswith("wafer-"):
+        parts = droplet_name.split("-")
+        if len(parts) >= 3:
+            spec_name = "-".join(parts[1:-1])
+
+    created_at = droplet.get("created_at")
+
+    # Extract GPU type from size slug
+    size = droplet.get("size", {})
+    size_slug = (
+        size.get("slug", "") if isinstance(size, dict) else str(droplet.get("size_slug", ""))
+    )
+    gpu_type = "MI300X" if "mi300x" in size_slug.lower() else "unknown"
+
+    return Target(
+        resource_id=droplet_id,
+        provider="digitalocean",
+        status=status,
+        public_ip=public_ip,
+        ssh_port=22,
+        ssh_username="root",
+        gpu_type=gpu_type,
+        name=droplet_name or None,
+        created_at=created_at,
+        spec_name=spec_name,
+    )
+
+
+class DigitalOceanProvider:
+    """DigitalOcean implementation of TargetProvider.
+
+    Wraps existing REST API calls for droplet management.
+    """
+
+    async def list_targets(self) -> list[Target]:
+        """List all droplets on the DigitalOcean account."""
+        try:
+            response = await _api_request_async("GET", "/droplets", params={"per_page": "200"})
+        except DigitalOceanError:
+            raise
+        except Exception as e:
+            logger.warning(f"Failed to list DigitalOcean droplets: {e}")
+            return []
+
+        droplets = response.get("droplets", [])
+        return [_parse_droplet_to_target(d) for d in droplets]
+
+    async def get_target(self, resource_id: str) -> Target | None:
+        """Get a specific droplet by ID."""
+        try:
+            response = await _api_request_async("GET", f"/droplets/{resource_id}")
+        except Exception as e:
+            logger.warning(f"Failed to get DigitalOcean droplet {resource_id}: {e}")
+            return None
+
+        droplet = response.get("droplet")
+        if not droplet:
+            return None
+
+        return _parse_droplet_to_target(droplet)
+
+    async def provision(self, spec: TargetSpec) -> Target:
+        """Provision a new DigitalOcean droplet from a spec.
+
+        Blocks until SSH is ready.
+        """
+        assert isinstance(spec, DigitalOceanTarget), (
+            f"DigitalOceanProvider.provision requires DigitalOceanTarget, got {type(spec).__name__}"
+        )
+
+        droplet_name = f"wafer-{spec.name}-{int(time.time())}"
+
+        ssh_key_ids = await _get_ssh_key_ids()
+        if not ssh_key_ids:
+            logger.warning("No SSH keys found - droplet may not be accessible")
+
+        create_data = {
+            "name": droplet_name,
+            "region": spec.region,
+            "size": spec.size_slug,
+            "image": spec.image,
+            "ssh_keys": ssh_key_ids,
+            "backups": False,
+            "ipv6": True,
+            "monitoring": True,
+        }
+
+        logger.info(f"Provisioning DigitalOcean droplet: {droplet_name}")
+        response = await _api_request_async("POST", "/droplets", data=create_data)
+
+        if not response or "droplet" not in response:
+            raise DigitalOceanError(f"Failed to create droplet: {response}")
+
+        droplet = response["droplet"]
+        droplet_id = str(droplet["id"])
+        logger.info(f"Droplet created: {droplet_id}")
+
+        public_ip = await _wait_for_ssh(droplet_id, spec.provision_timeout)
+
+        return Target(
+            resource_id=droplet_id,
+            provider="digitalocean",
+            status="running",
+            public_ip=public_ip,
+            ssh_port=22,
+            ssh_username="root",
+            gpu_type=spec.gpu_type,
+            name=droplet_name,
+            created_at=datetime.now(timezone.utc).isoformat(),
+            spec_name=spec.name,
+        )
+
+    async def terminate(self, resource_id: str) -> bool:
+        """Terminate a DigitalOcean droplet."""
+        return await _terminate_droplet(resource_id)
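One convention worth calling out: `provision` names droplets `wafer-{spec_name}-{timestamp}`, and `_parse_droplet_to_target` recovers `spec_name` by dropping the first and last dash-separated parts, so spec names containing dashes survive the round trip. A standalone sketch of that inference:

```python
def infer_spec_name(droplet_name: str) -> str | None:
    """Mirrors the spec_name inference in _parse_droplet_to_target."""
    if not droplet_name.startswith("wafer-"):
        return None
    parts = droplet_name.split("-")
    if len(parts) < 3:
        return None
    # Drop "wafer" and the trailing timestamp; rejoin the middle.
    return "-".join(parts[1:-1])


assert infer_spec_name("wafer-mi300x-dev-1717000000") == "mi300x-dev"
assert infer_spec_name("web-1") is None
```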