wafer-core 0.1.27__py3-none-any.whl → 0.1.29__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to their public registry. It is provided for informational purposes only.
- wafer_core/lib/trace_compare/aligner.py +13 -6
- wafer_core/lib/trace_compare/analyzer.py +12 -3
- wafer_core/lib/trace_compare/classifier.py +18 -9
- wafer_core/lib/trace_compare/fusion_analyzer.py +424 -275
- wafer_core/targets/__init__.py +47 -21
- wafer_core/targets/pool.py +181 -0
- wafer_core/targets/probe.py +113 -0
- wafer_core/targets/providers/__init__.py +46 -0
- wafer_core/targets/providers/baremetal.py +72 -0
- wafer_core/targets/providers/digitalocean.py +164 -0
- wafer_core/targets/providers/runpod.py +250 -0
- wafer_core/targets/reconcile.py +90 -0
- wafer_core/targets/spec_store.py +200 -0
- wafer_core/targets/state_cache.py +150 -0
- wafer_core/targets/types.py +141 -0
- wafer_core/utils/kernel_utils/targets/config.py +8 -24
- {wafer_core-0.1.27.dist-info → wafer_core-0.1.29.dist-info}/METADATA +1 -1
- {wafer_core-0.1.27.dist-info → wafer_core-0.1.29.dist-info}/RECORD +19 -9
- {wafer_core-0.1.27.dist-info → wafer_core-0.1.29.dist-info}/WHEEL +0 -0
wafer_core/targets/providers/runpod.py
@@ -0,0 +1,250 @@
```python
"""RunPod provider — adapts existing RunPod GraphQL API to TargetProvider protocol."""

from __future__ import annotations

import logging
import time
from datetime import datetime, timezone

from wafer_core.targets.runpod import (
    RunPodError,
    _graphql_request_async,
    _wait_for_ssh,
)
from wafer_core.targets.runpod import (
    terminate_pod as _terminate_pod,
)
from wafer_core.targets.types import Target, TargetSpec
from wafer_core.utils.kernel_utils.targets.config import RunPodTarget

logger = logging.getLogger(__name__)


def _parse_pod_to_target(pod: dict) -> Target | None:
    """Parse a RunPod API pod response into a Target.

    Returns None if the pod has no usable SSH info.
    """
    pod_id = pod.get("id", "")
    pod_name = pod.get("name", "")
    status_raw = pod.get("desiredStatus", "").lower()

    # Map RunPod statuses to our status values
    status = status_raw if status_raw else "unknown"

    # Extract SSH info from runtime ports
    public_ip = None
    ssh_port = None
    runtime = pod.get("runtime")
    if runtime:
        for port in runtime.get("ports") or []:
            if port.get("privatePort") == 22 and port.get("isIpPublic"):
                ip = port.get("ip")
                # Skip proxy SSH (ssh.runpod.io), want direct IP
                if ip and ip != "ssh.runpod.io":
                    public_ip = ip
                    ssh_port = port.get("publicPort")
                    break

    # Infer spec_name from pod naming convention: wafer-{spec_name}-{timestamp}
    spec_name = None
    if pod_name.startswith("wafer-"):
        parts = pod_name.split("-")
        if len(parts) >= 3:
            spec_name = "-".join(parts[1:-1])

    # Extract GPU type
    gpu_type = ""
    machine = pod.get("machine")
    if machine:
        gpu_type_info = machine.get("gpuType")
        if gpu_type_info:
            gpu_type = gpu_type_info.get("displayName", "")

    cost = pod.get("costPerHr")

    return Target(
        resource_id=pod_id,
        provider="runpod",
        status=status,
        public_ip=public_ip,
        ssh_port=ssh_port,
        ssh_username="root",
        gpu_type=gpu_type,
        name=pod_name or None,
        created_at=None,  # RunPod API doesn't expose creation time in list query
        spec_name=spec_name,
        price_per_hour=float(cost) if cost else None,
    )


class RunPodProvider:
    """RunPod implementation of TargetProvider.

    Wraps existing GraphQL API calls:
    - list_targets: myself { pods { ... } }
    - get_target: pod(input: { podId }) { ... }
    - provision: podFindAndDeployOnDemand
    - terminate: podTerminate
    """

    async def list_targets(self) -> list[Target]:
        """List all running pods on the RunPod account."""
        query = """
        query {
          myself {
            pods {
              id
              name
              desiredStatus
              costPerHr
              machine {
                podHostId
                gpuType {
                  displayName
                }
              }
              runtime {
                ports {
                  ip
                  isIpPublic
                  privatePort
                  publicPort
                  type
                }
              }
            }
          }
        }
        """

        try:
            data = await _graphql_request_async(query)
        except RunPodError:
            raise
        except Exception as e:
            logger.warning(f"Failed to list RunPod pods: {e}")
            return []

        pods = data.get("myself", {}).get("pods", [])
        targets = []

        for pod in pods:
            target = _parse_pod_to_target(pod)
            if target is not None:
                targets.append(target)

        return targets

    async def get_target(self, resource_id: str) -> Target | None:
        """Get a specific pod by ID."""
        query = """
        query pod($input: PodFilter!) {
          pod(input: $input) {
            id
            name
            desiredStatus
            costPerHr
            machine {
              podHostId
              gpuType {
                displayName
              }
            }
            runtime {
              ports {
                ip
                isIpPublic
                privatePort
                publicPort
                type
              }
            }
          }
        }
        """
        variables = {"input": {"podId": resource_id}}

        try:
            data = await _graphql_request_async(query, variables)
        except Exception as e:
            logger.warning(f"Failed to get RunPod pod {resource_id}: {e}")
            return None

        pod = data.get("pod")
        if not pod:
            return None

        return _parse_pod_to_target(pod)

    async def provision(self, spec: TargetSpec) -> Target:
        """Provision a new RunPod pod from a spec.

        Blocks until SSH is ready.
        """
        assert isinstance(spec, RunPodTarget), (
            f"RunPodProvider.provision requires RunPodTarget, got {type(spec).__name__}"
        )

        pod_name = f"wafer-{spec.name}-{int(time.time())}"

        mutation = """
        mutation podFindAndDeployOnDemand($input: PodFindAndDeployOnDemandInput!) {
          podFindAndDeployOnDemand(input: $input) {
            id
            machineId
            machine {
              podHostId
            }
          }
        }
        """

        pod_input: dict = {
            "gpuTypeId": spec.gpu_type_id,
            "gpuCount": spec.gpu_count,
            "cloudType": "SECURE",
            "name": pod_name,
            "supportPublicIp": True,
            "containerDiskInGb": spec.container_disk_gb,
            "minVcpuCount": 1,
            "minMemoryInGb": 4,
            "ports": "22/tcp",
            "startSsh": True,
            "startJupyter": False,
            "env": [],
        }

        if spec.template_id:
            pod_input["templateId"] = spec.template_id
        else:
            pod_input["imageName"] = spec.image

        logger.info(f"Provisioning RunPod pod: {pod_name}")
        data = await _graphql_request_async(mutation, {"input": pod_input})

        pod_data = data.get("podFindAndDeployOnDemand")
        if not pod_data:
            raise RunPodError("No pod returned from deployment")

        pod_id = pod_data["id"]
        logger.info(f"Pod created: {pod_id}")

        public_ip, ssh_port, ssh_username = await _wait_for_ssh(pod_id, spec.provision_timeout)

        return Target(
            resource_id=pod_id,
            provider="runpod",
            status="running",
            public_ip=public_ip,
            ssh_port=ssh_port,
            ssh_username=ssh_username,
            gpu_type=spec.gpu_type,
            name=pod_name,
            created_at=datetime.now(timezone.utc).isoformat(),
            spec_name=spec.name,
        )

    async def terminate(self, resource_id: str) -> bool:
        """Terminate a RunPod pod."""
        return await _terminate_pod(resource_id)
```
wafer_core/targets/reconcile.py
@@ -0,0 +1,90 @@
```python
"""Reconciliation: compare TargetSpecs to live Targets.

Pure function — no API calls, no side effects. Takes specs and targets as
inputs, returns a ReconcileResult describing what's bound, what's orphaned,
and what's unprovisioned.
"""

from __future__ import annotations

from wafer_core.targets.types import ReconcileResult, Target, TargetSpec
from wafer_core.utils.kernel_utils.targets.config import (
    BaremetalTarget,
    DigitalOceanTarget,
    RunPodTarget,
    VMTarget,
)


def _is_cloud_spec(spec: TargetSpec) -> bool:
    """Check if a spec represents a cloud-provisioned resource.

    Baremetal and VM specs don't have cloud-managed lifecycles,
    so they're excluded from "unprovisioned" checks.
    """
    return isinstance(spec, (RunPodTarget, DigitalOceanTarget))


def _spec_provider(spec: TargetSpec) -> str | None:
    """Get the provider name for a spec, or None if not cloud-managed."""
    if isinstance(spec, RunPodTarget):
        return "runpod"
    if isinstance(spec, DigitalOceanTarget):
        return "digitalocean"
    if isinstance(spec, (BaremetalTarget, VMTarget)):
        return "baremetal"
    return None


def reconcile(
    specs: list[TargetSpec],
    targets: list[Target],
    binding_hints: dict[str, str] | None = None,
) -> ReconcileResult:
    """Compare specs to live targets and classify each.

    Matching rules (in priority order):
    1. Target.spec_name matches Spec.name exactly (set by naming convention
       or explicit binding).
    2. binding_hints maps resource_id → spec_name (from local cache).
    3. No match → target is unbound (orphan).

    A cloud spec with no matching target is "unprovisioned".
    Baremetal/VM specs are never "unprovisioned" (they don't have a cloud
    lifecycle — the machine is always there or it isn't).

    Args:
        specs: All known TargetSpecs (loaded from TOML files).
        targets: All live Targets (fetched from provider APIs).
        binding_hints: Optional resource_id → spec_name cache for targets
            whose spec_name can't be inferred from naming convention.

    Returns:
        ReconcileResult with bound, unbound, and unprovisioned lists.
    """
    hints = binding_hints or {}
    spec_by_name = {s.name: s for s in specs}
    claimed_spec_names: set[str] = set()

    bound: list[tuple[TargetSpec, Target]] = []
    unbound: list[Target] = []

    for target in targets:
        # Try to find the spec this target belongs to
        resolved_spec_name = target.spec_name or hints.get(target.resource_id)

        if resolved_spec_name and resolved_spec_name in spec_by_name:
            spec = spec_by_name[resolved_spec_name]
            bound.append((spec, target))
            claimed_spec_names.add(resolved_spec_name)
        else:
            unbound.append(target)

    # Cloud specs with no bound target are unprovisioned
    unprovisioned = [s for s in specs if s.name not in claimed_spec_names and _is_cloud_spec(s)]

    return ReconcileResult(
        bound=bound,
        unbound=unbound,
        unprovisioned=unprovisioned,
    )
```
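Because `reconcile` is pure, it composes directly with the other new modules. A minimal sketch (not part of the diff), using `load_all_specs` and `get_binding_hints` from the `spec_store` and `state_cache` modules shown later in this diff:

```python
# Hypothetical usage sketch, not part of the package diff.
from wafer_core.targets.reconcile import reconcile
from wafer_core.targets.spec_store import load_all_specs      # defined later in this diff
from wafer_core.targets.state_cache import get_binding_hints  # defined later in this diff
from wafer_core.targets.types import Target


def show_status(targets: list[Target]) -> None:
    """Print the bound / orphan / unprovisioned classification for live targets."""
    result = reconcile(load_all_specs(), targets, binding_hints=get_binding_hints())
    for spec, target in result.bound:
        print(f"bound:         {spec.name} -> {target.resource_id} ({target.status})")
    for target in result.unbound:
        print(f"orphan:        {target.resource_id} ({target.provider})")
    for spec in result.unprovisioned:
        print(f"unprovisioned: {spec.name}")
```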
wafer_core/targets/spec_store.py
@@ -0,0 +1,200 @@
```python
"""Spec store: CRUD for TargetSpec TOML files.

Specs live in ~/.wafer/specs/{name}.toml. On first access, auto-migrates
from the old ~/.wafer/targets/ directory if specs/ doesn't exist yet.

This module provides the same operations as the old targets.py but under
the "spec" vocabulary. The CLI-layer targets.py still works and delegates
here where needed.
"""

from __future__ import annotations

import logging
import shutil
from dataclasses import asdict
from pathlib import Path
from typing import Any

import tomllib

from wafer_core.utils.kernel_utils.targets.config import (
    BaremetalTarget,
    DigitalOceanTarget,
    LocalTarget,
    ModalTarget,
    RunPodTarget,
    TargetConfig,
    VMTarget,
    WorkspaceTarget,
)

logger = logging.getLogger(__name__)

WAFER_DIR = Path.home() / ".wafer"
SPECS_DIR = WAFER_DIR / "specs"
OLD_TARGETS_DIR = WAFER_DIR / "targets"
CONFIG_FILE = WAFER_DIR / "config.toml"


def _ensure_specs_dir() -> None:
    """Ensure ~/.wafer/specs/ exists, migrating from targets/ if needed."""
    if SPECS_DIR.exists():
        return

    if OLD_TARGETS_DIR.exists() and any(OLD_TARGETS_DIR.glob("*.toml")):
        logger.info(
            f"Migrating {OLD_TARGETS_DIR} -> {SPECS_DIR} (target configs are now called 'specs')"
        )
        shutil.copytree(OLD_TARGETS_DIR, SPECS_DIR)
        # Don't delete old dir yet — other code may still read from it.
        # It becomes a dead symlink target once all callers migrate.
        logger.info(
            f"Migration complete. Old directory preserved at {OLD_TARGETS_DIR}. "
            "You can safely delete it once 'wafer specs list' works."
        )
    else:
        SPECS_DIR.mkdir(parents=True, exist_ok=True)


def _spec_path(name: str) -> Path:
    return SPECS_DIR / f"{name}.toml"


# ── Parsing ──────────────────────────────────────────────────────────────────

_TYPE_MAP: dict[str, type] = {
    "baremetal": BaremetalTarget,
    "vm": VMTarget,
    "modal": ModalTarget,
    "workspace": WorkspaceTarget,
    "runpod": RunPodTarget,
    "digitalocean": DigitalOceanTarget,
    "local": LocalTarget,
}

_TYPE_REVERSE: dict[type, str] = {v: k for k, v in _TYPE_MAP.items()}


def parse_spec(data: dict[str, Any]) -> TargetConfig:
    """Parse TOML dict into TargetSpec (TargetConfig union)."""
    target_type = data.get("type")
    if not target_type:
        raise ValueError("Spec must have 'type' field")

    cls = _TYPE_MAP.get(target_type)
    if cls is None:
        raise ValueError(
            f"Unknown spec type: {target_type}. Must be one of: {', '.join(sorted(_TYPE_MAP))}"
        )

    fields = {k: v for k, v in data.items() if k != "type"}

    # TOML parses lists; dataclasses may want tuples
    if "pip_packages" in fields and isinstance(fields["pip_packages"], list):
        fields["pip_packages"] = tuple(fields["pip_packages"])
    if "gpu_ids" in fields and isinstance(fields["gpu_ids"], list):
        fields["gpu_ids"] = tuple(fields["gpu_ids"])

    return cls(**fields)


def serialize_spec(spec: TargetConfig) -> dict[str, Any]:
    """Serialize TargetSpec to TOML-compatible dict."""
    data = asdict(spec)
    data["type"] = _TYPE_REVERSE.get(type(spec), "unknown")

    # Tuples -> lists for TOML
    for key in ("pip_packages", "gpu_ids"):
        if key in data and isinstance(data[key], tuple):
            data[key] = list(data[key])

    # Drop empty pip_packages
    if "pip_packages" in data and not data["pip_packages"]:
        del data["pip_packages"]

    return data


# ── CRUD ─────────────────────────────────────────────────────────────────────


def load_spec(name: str) -> TargetConfig:
    """Load spec by name from ~/.wafer/specs/{name}.toml.

    Falls back to ~/.wafer/targets/{name}.toml for backwards compatibility.
    """
    _ensure_specs_dir()

    path = _spec_path(name)
    if not path.exists():
        # Fallback to old location
        old_path = OLD_TARGETS_DIR / f"{name}.toml"
        if old_path.exists():
            path = old_path
        else:
            raise FileNotFoundError(f"Spec not found: {name} (looked in {SPECS_DIR})")

    with open(path, "rb") as f:
        data = tomllib.load(f)

    return parse_spec(data)


def save_spec(spec: TargetConfig) -> None:
    """Save spec to ~/.wafer/specs/{name}.toml."""
    _ensure_specs_dir()

    data = serialize_spec(spec)
    path = _spec_path(spec.name)
    _write_toml(path, data)


def list_spec_names() -> list[str]:
    """List all spec names from ~/.wafer/specs/."""
    _ensure_specs_dir()
    return sorted(p.stem for p in SPECS_DIR.glob("*.toml"))


def remove_spec(name: str) -> None:
    """Remove a spec by name."""
    path = _spec_path(name)
    if not path.exists():
        raise FileNotFoundError(f"Spec not found: {name}")
    path.unlink()


def load_all_specs() -> list[TargetConfig]:
    """Load all specs. Skips specs that fail to parse (logs warning)."""
    specs = []
    for name in list_spec_names():
        try:
            specs.append(load_spec(name))
        except Exception as e:
            logger.warning(f"Failed to load spec {name}: {e}")
    return specs


# ── TOML writer ──────────────────────────────────────────────────────────────


def _write_toml(path: Path, data: dict[str, Any]) -> None:
    """Write dict as flat TOML file."""
    lines = []
    for key, value in data.items():
        if value is None:
            continue
        if isinstance(value, bool):
            lines.append(f"{key} = {str(value).lower()}")
        elif isinstance(value, int | float):
            lines.append(f"{key} = {value}")
        elif isinstance(value, str):
            lines.append(f'{key} = "{value}"')
        elif isinstance(value, list):
            if all(isinstance(v, int) for v in value):
                lines.append(f"{key} = {value}")
            else:
                formatted = ", ".join(f'"{v}"' if isinstance(v, str) else str(v) for v in value)
                lines.append(f"{key} = [{formatted}]")

    path.write_text("\n".join(lines) + "\n")
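A minimal round-trip sketch for the parsing layer (not part of the diff). The `type` values come from `_TYPE_MAP`; the field names come from the attributes the rest of this diff reads off `RunPodTarget`; the field values are hypothetical, and the happy path assumes `RunPodTarget` provides defaults for any fields omitted here.

```python
# Hypothetical usage sketch, not part of the package diff.
# Assumes RunPodTarget defaults any fields omitted from the dict;
# otherwise a real spec file would need those keys too.
from wafer_core.targets.spec_store import parse_spec, serialize_spec

data = {
    "type": "runpod",         # must be a key of _TYPE_MAP
    "name": "mi300x-dev",     # hypothetical spec name
    "gpu_type_id": "MI300X",  # hypothetical RunPod GPU type id
    "gpu_count": 1,
}
spec = parse_spec(data)  # -> RunPodTarget instance
assert serialize_spec(spec)["type"] == "runpod"

# Unknown types fail loudly:
try:
    parse_spec({"type": "ec2"})
except ValueError as e:
    print(e)  # Unknown spec type: ec2. Must be one of: baremetal, digitalocean, ...
```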
wafer_core/targets/state_cache.py
@@ -0,0 +1,150 @@
```python
"""Target state cache: bindings and labels for live resources.

Cache file: ~/.wafer/target_state.json

Bindings map resource_id -> spec_name (performance hint for reconciliation).
Labels map resource_id -> {key: value} (probed software versions).

The provider API is always the source of truth for whether a resource exists.
This cache stores metadata that's expensive to recompute (SSH probes, name inference).

Format:
    {
      "bindings": {
        "<resource_id>": {
          "spec_name": "<spec_name>",
          "provider": "<provider>",
          "bound_at": "<ISO timestamp>"
        }
      },
      "labels": {
        "<resource_id>": {
          "rocm_version": "7.0.2",
          "python_version": "3.12",
          ...
        }
      }
    }
"""

from __future__ import annotations

import json
import logging
from dataclasses import asdict, dataclass
from pathlib import Path

logger = logging.getLogger(__name__)

WAFER_DIR = Path.home() / ".wafer"
STATE_FILE = WAFER_DIR / "target_state.json"


@dataclass(frozen=True)
class BindingEntry:
    """A cached binding from resource_id to spec_name."""

    spec_name: str
    provider: str
    bound_at: str  # ISO timestamp


# ---------------------------------------------------------------------------
# Raw file I/O
# ---------------------------------------------------------------------------

def _load_state() -> dict:
    """Load the full state file. Returns empty dict if missing/corrupted."""
    if not STATE_FILE.exists():
        return {}

    try:
        return json.loads(STATE_FILE.read_text())
    except (json.JSONDecodeError, TypeError) as e:
        logger.warning(f"Corrupted state cache, ignoring: {e}")
        return {}


def _save_state(data: dict) -> None:
    """Write the full state file."""
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    STATE_FILE.write_text(json.dumps(data, indent=2) + "\n")


# ---------------------------------------------------------------------------
# Bindings
# ---------------------------------------------------------------------------

def load_bindings() -> dict[str, BindingEntry]:
    """Load binding cache from disk."""
    data = _load_state()
    bindings_raw = data.get("bindings", {})
    result = {}
    for rid, entry in bindings_raw.items():
        try:
            result[rid] = BindingEntry(**entry)
        except TypeError:
            logger.warning(f"Skipping malformed binding for {rid}")
    return result


def save_bindings(bindings: dict[str, BindingEntry]) -> None:
    """Write bindings to disk (preserves labels)."""
    data = _load_state()
    data["bindings"] = {rid: asdict(entry) for rid, entry in bindings.items()}
    _save_state(data)


def add_binding(resource_id: str, entry: BindingEntry) -> None:
    """Add a single binding to the cache."""
    bindings = load_bindings()
    bindings[resource_id] = entry
    save_bindings(bindings)


def remove_binding(resource_id: str) -> None:
    """Remove a binding from the cache. No-op if not found."""
    bindings = load_bindings()
    if resource_id in bindings:
        del bindings[resource_id]
        save_bindings(bindings)


def get_binding_hints() -> dict[str, str]:
    """Get resource_id -> spec_name map for reconciliation."""
    bindings = load_bindings()
    return {rid: entry.spec_name for rid, entry in bindings.items()}


# ---------------------------------------------------------------------------
# Labels
# ---------------------------------------------------------------------------

def load_all_labels() -> dict[str, dict[str, str]]:
    """Load all cached labels. Returns resource_id -> labels dict."""
    data = _load_state()
    return data.get("labels", {})


def load_labels(resource_id: str) -> dict[str, str]:
    """Load cached labels for a single resource. Returns empty dict if none."""
    return load_all_labels().get(resource_id, {})


def save_labels(resource_id: str, labels: dict[str, str]) -> None:
    """Save labels for a resource (preserves bindings and other labels)."""
    data = _load_state()
    if "labels" not in data:
        data["labels"] = {}
    data["labels"][resource_id] = labels
    _save_state(data)


def remove_labels(resource_id: str) -> None:
    """Remove cached labels for a resource. No-op if not found."""
    data = _load_state()
    labels = data.get("labels", {})
    if resource_id in labels:
        del labels[resource_id]
        data["labels"] = labels
        _save_state(data)
```
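A minimal sketch of the binding lifecycle (not part of the diff; the resource id and spec name are hypothetical, and these helpers read and write ~/.wafer/target_state.json):

```python
# Hypothetical usage sketch, not part of the package diff.
# Note: these helpers persist state to ~/.wafer/target_state.json.
from datetime import datetime, timezone

from wafer_core.targets.state_cache import (
    BindingEntry,
    add_binding,
    get_binding_hints,
    remove_binding,
)

add_binding(
    "pod-abc123",  # hypothetical RunPod resource id
    BindingEntry(
        spec_name="mi300x-dev",  # hypothetical spec name
        provider="runpod",
        bound_at=datetime.now(timezone.utc).isoformat(),
    ),
)
print(get_binding_hints())  # {"pod-abc123": "mi300x-dev", ...}
remove_binding("pod-abc123")
```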