wafer-core 0.1.24__py3-none-any.whl → 0.1.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,336 @@
+ """Trace loading and parsing logic.
+
+ Loads JSON trace files from AMD/NVIDIA profilers and extracts kernel execution data,
+ Python call stacks, CPU operator mappings, and layer correlations.
+ """
+
+ import bisect
+ import json
+ from collections import defaultdict
+ from pathlib import Path
+ from typing import Any
+
+ import pandas as pd
+
+ from .classifier import classify
+
+
+ def extract_layer_mapping(events: list[dict[str, Any]], platform: str) -> dict[int, int]:
+     """Extract correlation ID to layer number mapping.
+
+     vLLM's execution graph creates large correlation groups for full transformer layers.
+     Each layer's forward pass (norm + attention + FFN) gets grouped under one correlation ID,
+     containing 200-400 kernels depending on batch size and sequence length.
+
+     We identify layers as correlation groups with many kernels (70+), which filters out
+     individual operations like sampling, logit processing, etc.
+
+     Args:
+         events: List of trace events
+         platform: 'AMD' or 'NVIDIA'
+
+     Returns:
+         Dict mapping correlation ID to layer number
+     """
+     # Group kernels by correlation ID
+     correlation_groups = defaultdict(
+         lambda: {"count": 0, "has_attention": False, "has_ffn": False}
+     )
+
+     for ev in events:
+         if ev.get("cat") != "kernel":
+             continue
+
+         corr_id = ev.get("args", {}).get("correlation")
+         if corr_id is None:
+             continue
+
+         kernel_name = ev.get("name", "").lower()
+
+         # Track what operations this correlation contains
+         correlation_groups[corr_id]["count"] += 1
+         if "attention" in kernel_name or "fmha" in kernel_name:
+             correlation_groups[corr_id]["has_attention"] = True
+         if any(x in kernel_name for x in ["cijk_", "nvjet", "wvsplitk", "gemm"]):
+             correlation_groups[corr_id]["has_ffn"] = True
+
+     # Map correlation IDs to layer numbers
+     # Transformer layers have many kernels AND contain both attention and FFN ops
+     correlation_to_layer = {}
+     layer_num = 0
+
+     for corr_id in sorted(correlation_groups.keys()):
+         group = correlation_groups[corr_id]
+
+         # Identify complete transformer layers by their characteristics:
+         # - Has attention operations (self-attention or cross-attention)
+         # - Has FFN operations (feed-forward network)
+         # - Has sufficient kernel count (70+): typical transformer block has ~80-100 kernels
+         #   including attention QKV projections, softmax, output projection, FFN layers,
+         #   normalization, and elementwise ops. This threshold filters out:
+         #   - Individual operations (1-10 kernels)
+         #   - Sampling/generation steps (20-40 kernels)
+         #   - Partial layer executions
+         is_layer = (
+             group["count"] >= 70 and group["has_attention"] and group["has_ffn"]
+         )
+
+         if is_layer:
+             correlation_to_layer[corr_id] = layer_num
+             layer_num += 1
+
+     return correlation_to_layer
+
+
+ def _build_python_stack_index(
+     events: list[dict[str, Any]],
+ ) -> tuple[list[tuple[int, int, int, int | None, str]], dict[int, dict[str, Any]]]:
+     """Build Python call stack index for kernels.
+
+     Args:
+         events: List of trace events
+
+     Returns:
+         Tuple of (python_intervals, python_by_id)
+     """
+     python_by_id: dict[int, dict[str, Any]] = {}
+     python_intervals: list[tuple[int, int, int, int | None, str]] = []
+
+     for ev in events:
+         if ev.get("cat") == "python_function":
+             py_id = ev.get("args", {}).get("Python id")
+             name = ev["name"]
+             ts_start = ev["ts"]
+             ts_end = ts_start + ev.get("dur", 0)
+             duration = ev.get("dur", 0)
+             parent_id = ev.get("args", {}).get("Python parent id")
+
+             python_intervals.append((ts_start, ts_end, duration, py_id, name))
+
+             if py_id is not None:
+                 python_by_id[py_id] = {
+                     "name": name,
+                     "parent_id": parent_id,
+                     "ts_start": ts_start,
+                     "ts_end": ts_end,
+                     "duration": duration,
+                 }
+
+     # Sort by start time for efficient binary search
+     python_intervals.sort()
+
+     return python_intervals, python_by_id
+
+
+ def _get_python_stack_full(
+     timestamp: int,
+     python_intervals: list[tuple[int, int, int, int | None, str]],
+     python_by_id: dict[int, dict[str, Any]],
+ ) -> tuple[str | None, list[str]]:
+     """Get full Python call stack for a kernel launch.
+
+     Args:
+         timestamp: Kernel launch timestamp
+         python_intervals: Sorted list of Python function intervals
+         python_by_id: Mapping of Python ID to function info
+
+     Returns:
+         Tuple of (summary_string, full_stack_list)
+     """
+     # Binary search for Python functions active at this timestamp
+     idx = bisect.bisect_right(
+         python_intervals, (timestamp, float("inf"), float("inf"), None, "")
+     )
+
+     # Find active functions
+     active_funcs = []
+     for i in range(idx - 1, max(0, idx - 1000), -1):
+         ts_start, ts_end, duration, py_id, name = python_intervals[i]
+         if ts_start <= timestamp <= ts_end:
+             active_funcs.append((duration, py_id, name))
+         if ts_end < timestamp - 1000000: # 1 second before
+             break
+
+     if not active_funcs:
+         return None, []
+
+     # Get the innermost (most specific) function
+     active_funcs.sort()
+     leaf_duration, leaf_id, leaf_name = active_funcs[0]
+
+     # Walk up parent chain to get FULL stack
+     full_stack = []
+     current_id = leaf_id
+     visited = set()
+
+     while (
+         current_id is not None
+         and current_id not in visited
+         and current_id in python_by_id
+     ):
+         func = python_by_id[current_id]
+         name = func["name"]
+         full_stack.append(name)
+
+         visited.add(current_id)
+         current_id = func["parent_id"]
+
+         # Safety limit: prevent infinite loops from circular parent references
+         # and bound memory usage. 50 frames is deeper than typical Python stacks.
+         if len(full_stack) >= 50:
+             break
+
+     # Reverse so it's outermost -> innermost
+     full_stack.reverse()
+
+     # Create summary for text output: show the most informative vLLM/model function
+     summary = None
+     vllm_funcs = [
+         f
+         for f in full_stack
+         if any(x in f.lower() for x in ["vllm/", "model", "<eval_with_key>"])
+     ]
+
+     if vllm_funcs:
+         # Get innermost vLLM function (most specific)
+         summary = vllm_funcs[-1]
+
+         # Check if it's a CUDA graph - add annotation
+         if any("torch/cuda/graphs" in f for f in full_stack):
+             # Shorten if too long
+             if len(summary) > 45:
+                 parts = summary.split("/")[-1]
+                 summary = "vllm/..." + parts
+             summary = f"{summary} [CUDA graph]"
+         elif len(summary) > 53:
+             parts = summary.split("/")[-1]
+             summary = "vllm/..." + parts
+     else:
+         # Fallback to innermost function
+         summary = leaf_name
+
+     return summary, full_stack
+
+
+ def load_trace(
+     file_path: str | Path,
+ ) -> tuple[str, str, dict[str, Any], pd.DataFrame, dict[tuple[str, str], set[str]], dict[int, int]]:
+     """Load trace and return platform info, device properties, kernels, patterns, and layer mapping.
+
+     Args:
+         file_path: Path to JSON trace file
+
+     Returns:
+         Tuple of (platform, gpu_name, device_props, kernel_df, kernel_patterns, layer_mapping)
+     """
+     with open(file_path, "rb") as f:
+         trace = json.load(f)
+
+     props = trace.get("deviceProperties", [{}])[0]
+     is_amd = trace.get("roctracer_version") or props.get("warpSize") == 64
+     platform = "AMD" if is_amd else "NVIDIA"
+     gpu_name = props.get("name", "MI300X" if is_amd else "Unknown GPU")
+
+     # Extract relevant device properties
+     device_props = {
+         "name": gpu_name,
+         "compute_capability": f"{props.get('computeMajor', 0)}.{props.get('computeMinor', 0)}",
+         "total_memory_gb": props.get("totalGlobalMem", 0) / (1024**3),
+         "sm_count": props.get("numSms", 0),
+         "warp_size": props.get("warpSize", 32),
+         "max_threads_per_block": props.get("maxThreadsPerBlock", 0),
+         "shared_mem_per_block_kb": props.get("sharedMemPerBlock", 0) / 1024,
+     }
+
+     events = trace.get("traceEvents", [])
+
+     # Build mapping: external_id -> CPU operator name
+     external_to_cpu = {}
+     for ev in events:
+         if ev.get("cat") == "cpu_op":
+             ext_id = ev.get("args", {}).get("External id")
+             cpu_op_name = ev.get("name", "")
+             if ext_id is not None:
+                 external_to_cpu[ext_id] = cpu_op_name
+
+     # Build Python call stack index for kernels without External IDs
+     python_intervals, python_by_id = _build_python_stack_index(events)
+
+     # Extract phases
+     phases = []
+     for ev in events:
+         if ev.get("cat") == "user_annotation" and ev.get("name", "").startswith(
+             "execute_context"
+         ):
+             name = ev["name"]
+             # Parse execute_context_X(TOKENS)_generation_Y(Y)
+             # We want the TOKENS from execute_context, not the generation number
+             tokens = 0
+             parts = name.split("_")
+             for i, p in enumerate(parts):
+                 # Look for execute_context_X(TOKENS) specifically
+                 if i > 0 and parts[i-1] == "context" and "(" in p and ")" in p:
+                     try:
+                         tokens = int(p.split("(")[1].split(")")[0])
+                         break # Stop after finding context tokens
+                     except Exception:
+                         pass
+             is_prefill = tokens >= 1024 and "generation_0" in name
+             phases.append(
+                 {
+                     "type": "prefill" if is_prefill else "decode",
+                     "ts_start": ev["ts"],
+                     "ts_end": ev["ts"] + ev["dur"],
+                 }
+             )
+
+     # Extract layer mapping from correlation IDs
+     layer_mapping = extract_layer_mapping(events, platform)
+
+     kernel_data = []
+     kernel_patterns: dict[tuple[str, str], set[str]] = defaultdict(set)
+
+     for ev in events:
+         if ev.get("cat") != "kernel":
+             continue
+         name, dur, ts = ev["name"], ev.get("dur", 0), ev["ts"]
+         corr_id = ev.get("args", {}).get("correlation")
+         ext_id = ev.get("args", {}).get("External id")
+
+         phase = "decode"
+         for p in phases:
+             if p["ts_start"] <= ts <= p["ts_end"]:
+                 phase = p["type"]
+                 break
+
+         op, pattern = classify(name, platform)
+         kernel_patterns[(op.value, phase)].add(pattern)
+
+         # Assign layer number from correlation ID
+         layer = layer_mapping.get(corr_id) if corr_id is not None else None
+
+         # Get CPU operator name from external ID, or fallback to Python stack
+         cpu_op = external_to_cpu.get(ext_id) if ext_id is not None else None
+         python_stack: list[str] = []
+
+         # If no CPU op via External ID, try Python stack trace
+         if cpu_op is None:
+             cpu_op, python_stack = _get_python_stack_full(
+                 ts, python_intervals, python_by_id
+             )
+
+         kernel_data.append(
+             {
+                 "name": name,
+                 "dur_us": dur,
+                 "phase": phase,
+                 "op": op.value,
+                 "pattern": pattern,
+                 "layer": layer,
+                 "correlation": corr_id,
+                 "cpu_op": cpu_op,
+                 "python_stack": python_stack, # Full stack for JSON output
+             }
+         )
+
+     return platform, gpu_name, device_props, pd.DataFrame(kernel_data), dict(kernel_patterns), layer_mapping
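For orientation, here is a minimal usage sketch of the loader above. The import path and trace file name are assumptions (the diff does not include file headers), and the aggregation is illustrative rather than part of the package. In the phase-extraction loop, a user_annotation named along the lines of execute_context_3(2048)_generation_0(0) would be read as 2048 context tokens and, being generation_0, marked as prefill.

    # Hypothetical usage sketch; the module path is an assumption, not shown in this diff.
    from wafer_core.tracelens.loader import load_trace  # assumed import location

    platform, gpu_name, device_props, kernel_df, kernel_patterns, layer_mapping = load_trace(
        "vllm_profile.json"
    )

    # Sum kernel time per transformer layer, using the layer numbers assigned from
    # large correlation groups (70+ kernels containing both attention and FFN ops).
    per_layer_us = (
        kernel_df.dropna(subset=["layer"]).groupby("layer")["dur_us"].sum().sort_index()
    )
    print(f"{platform} / {gpu_name}: {len(layer_mapping)} layers detected")
    print(per_layer_us.head())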
@@ -84,7 +84,7 @@ class ProblemConfig:
      benchmarks: list[dict[str, Any]]

      # Optional with defaults
-     model: str = "claude-sonnet-4-5-20250929"
+     model: str = "claude-opus-4-5-20251101"
      temperature: float = 0.2
      max_tokens: int = 8192
      max_turns: int = 10
@@ -219,7 +219,7 @@ def _parse_config(data: dict[str, Any], base_dir: Path) -> tuple[ProblemConfig |
          reference_code=reference_code,
          tests=tests,
          benchmarks=benchmarks,
-         model=data.get("model", "claude-sonnet-4-5-20250929"),
+         model=data.get("model", "claude-opus-4-5-20251101"),
          temperature=data.get("temperature", 0.2),
          max_tokens=data.get("max_tokens", 8192),
          max_turns=data.get("max_turns", 10),
@@ -269,7 +269,7 @@ def create_problem_config_from_cli(
          reference_code=reference_code,
          tests=tests,
          benchmarks=benchmarks or tests, # Use tests as benchmarks if not specified
-         model=kwargs.get("model", "claude-sonnet-4-5-20250929"),
+         model=kwargs.get("model", "claude-opus-4-5-20251101"),
          temperature=kwargs.get("temperature", 0.2),
          max_tokens=kwargs.get("max_tokens", 8192),
          max_turns=kwargs.get("max_turns", 10),
@@ -119,7 +119,7 @@ FINAL(42)

  config = AgentPresetConfig(
      name="rlm",
-     model="anthropic/claude-sonnet-4-5-20250929",
+     model="anthropic/claude-opus-4-5-20251101",
      env="repl", # Uses REPLEnvironment
      thinking=True,
      system_prompt=RLM_TOOL_SYSTEM_PROMPT,
@@ -128,7 +128,7 @@ config = AgentPresetConfig(
  # Variant for message-parsing mode
  config_block_mode = AgentPresetConfig(
      name="rlm_blocks",
-     model="anthropic/claude-sonnet-4-5-20250929",
+     model="anthropic/claude-opus-4-5-20251101",
      env="repl_blocks", # Uses MessageParsingREPLEnvironment
      thinking=True,
      system_prompt=RLM_BLOCK_SYSTEM_PROMPT,
@@ -1238,6 +1238,12 @@ class Endpoint(JsonSerializable):
      api_base: str = ""
      api_key: str = ""
      oauth_token: str = "" # OAuth bearer token (takes precedence over api_key for Anthropic)
+     # TODO: Callbacks on a frozen dataclass are a code smell. This exists because wafer-core
+     # can't depend on wafer-cli (where the Supabase refresh logic lives). A cleaner approach
+     # would be a TokenProvider protocol that Endpoint delegates to, keeping the dataclass pure.
+     api_key_refresh: Callable[[], Awaitable[str | None]] | None = field(
+         default=None, repr=False, compare=False
+     )
      is_claude_code_api_key: bool = (
          False # API key created via Claude Code OAuth (requires special headers)
      )
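The TODO above names a TokenProvider protocol as the cleaner alternative to storing a callback on the frozen dataclass. A minimal sketch of that idea follows; every name below is hypothetical and nothing in it exists in this release. Endpoint would hold a provider instead of a bare callable, and the rollout code would call provider.get_api_key() where it currently calls api_key_refresh().

    # Hypothetical sketch of the TokenProvider protocol mentioned in the TODO (not in the package).
    from typing import Protocol


    class TokenProvider(Protocol):
        async def get_api_key(self) -> str | None:
            """Return a currently valid API key, refreshing it if it has expired."""
            ...


    class SupabaseTokenProvider:
        """Would live in wafer-cli, keeping wafer-core free of the Supabase dependency."""

        def __init__(self, refresh_session):
            self._refresh_session = refresh_session  # coroutine returning a fresh proxy JWT

        async def get_api_key(self) -> str | None:
            return await self._refresh_session()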
@@ -1300,6 +1306,7 @@ class Endpoint(JsonSerializable):
              exclude_secrets: If True (default), omits api_key and oauth_token.
          """
          d = asdict(self)
+         d.pop("api_key_refresh", None) # Callable, not serializable
          if exclude_secrets:
              d.pop("api_key", None)
              d.pop("oauth_token", None)
@@ -1307,7 +1314,11 @@ class Endpoint(JsonSerializable):

      @classmethod
      def from_dict(
-         cls, data: dict[str, Any], api_key: str = "", oauth_token: str = ""
+         cls,
+         data: dict[str, Any],
+         api_key: str = "",
+         oauth_token: str = "",
+         api_key_refresh: "Callable[[], Awaitable[str | None]] | None" = None,
      ) -> "Endpoint":
          """Deserialize from dict, injecting secrets at runtime.

@@ -1315,12 +1326,16 @@ class Endpoint(JsonSerializable):
              data: Dict from to_dict()
              api_key: API key to inject (not stored in session)
              oauth_token: OAuth token to inject (not stored in session)
+             api_key_refresh: Callback to refresh api_key mid-session (not stored)
          """
-         # Remove secrets if present (they shouldn't be, but be safe)
+         # Remove secrets/callables if present (they shouldn't be, but be safe)
          data = data.copy()
          data.pop("api_key", None)
          data.pop("oauth_token", None)
-         return cls(**data, api_key=api_key, oauth_token=oauth_token)
+         data.pop("api_key_refresh", None)
+         return cls(
+             **data, api_key=api_key, oauth_token=oauth_token, api_key_refresh=api_key_refresh
+         )


  @dataclass(frozen=True)
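As a usage sketch, the new parameter would be wired in when an Endpoint is re-hydrated from a saved session. The from_dict signature is the one added above; the refresh helper and session variable are hypothetical.

    # Illustrative call site (get_fresh_proxy_token and saved_session are hypothetical).
    import os


    async def refresh_wafer_key() -> str | None:
        return await get_fresh_proxy_token()  # e.g. wafer-cli's Supabase session refresh


    endpoint = Endpoint.from_dict(
        saved_session["endpoint"],
        api_key=os.environ.get("WAFER_API_KEY", ""),
        api_key_refresh=refresh_wafer_key,
    )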
@@ -725,9 +725,16 @@ async def rollout_anthropic(
              oauth_token = fresh_token
          # If refresh failed, continue with existing token - it might still work

+     # Get fresh wafer proxy token if refresh callback is available
+     api_key = actor.endpoint.api_key
+     if actor.endpoint.api_key_refresh:
+         fresh_key = await actor.endpoint.api_key_refresh()
+         if fresh_key:
+             api_key = fresh_key
+
      client = _create_anthropic_client(
          oauth_token=oauth_token,
-         api_key=actor.endpoint.api_key,
+         api_key=api_key,
          api_base=actor.endpoint.api_base,
          max_retries=actor.endpoint.max_retries,
          timeout=actor.endpoint.timeout,
@@ -973,7 +980,7 @@ async def rollout_anthropic(
                  f"Model not found: {e}\nCheck your model ID is correct."
              ) from e

-             # For OAuth: try to refresh token and retry once on auth errors
+             # Try to refresh token and retry once on auth errors
              if isinstance(e, anthropic.AuthenticationError):
                  if oauth_token and attempt == 0:
                      # Emit retry event for OAuth refresh
@@ -993,12 +1000,37 @@ async def rollout_anthropic(
                      await client.close()
                      client = _create_anthropic_client(
                          oauth_token=oauth_token,
-                         api_key=actor.endpoint.api_key,
+                         api_key=api_key,
                          api_base=actor.endpoint.api_base,
                          max_retries=actor.endpoint.max_retries,
                          timeout=actor.endpoint.timeout,
                      )
                      continue
+
+                 # Wafer proxy token refresh (Supabase JWTs expire after ~1hr)
+                 if actor.endpoint.api_key_refresh and attempt == 0:
+                     await on_chunk(
+                         RetryStart(
+                             attempt=1,
+                             max_attempts=2,
+                             delay_seconds=0,
+                             error_message="Wafer proxy token expired, refreshing",
+                             provider="anthropic",
+                         )
+                     )
+                     fresh_key = await actor.endpoint.api_key_refresh()
+                     if fresh_key and fresh_key != api_key:
+                         api_key = fresh_key
+                         await client.close()
+                         client = _create_anthropic_client(
+                             oauth_token=oauth_token,
+                             api_key=api_key,
+                             api_base=actor.endpoint.api_base,
+                             max_retries=actor.endpoint.max_retries,
+                             timeout=actor.endpoint.timeout,
+                         )
+                         continue
+
                  raise FatalEvalError(
                      f"Authentication failed: {e}\nCheck your API key or OAuth token."
                  ) from e
@@ -7,10 +7,13 @@ import logging
  import os
  from pathlib import Path

+ import httpx
+
  logger = logging.getLogger(__name__)

  SUPABASE_URL = "https://hvlpthcnxlywlquiciqe.supabase.co"
  BUCKET_NAME = "traces"
+ API_BASE = os.environ.get("WAFER_API_URL", "https://api.wafer.ai")


  def upload_results_to_supabase(output_dir: Path, log: logging.Logger | None = None) -> bool:
@@ -95,6 +98,12 @@ def upload_results_to_supabase(output_dir: Path, log: logging.Logger | None = No
          )

          log.info(f"Uploaded {len(uploaded)} files to Supabase: {run_name}")
+
+         # Auto-index in database for trace viewer
+         # Fail if indexing fails - user can re-run (everything is idempotent)
+         if not _index_run_in_database(run_name, report_path, log):
+             return False
+
          return True

      except ImportError:
@@ -103,3 +112,39 @@ def upload_results_to_supabase(output_dir: Path, log: logging.Logger | None = No
      except Exception as e:
          log.error(f"Failed to upload to Supabase: {e}")
          return False
+
+
+ def _index_run_in_database(run_name: str, report_path: Path, log: logging.Logger) -> bool:
+     """Index a run in the trace_runs database table for fast querying.
+
+     Calls POST /v1/eval-traces/runs to upsert the run metadata.
+     This enables the trace viewer to show the run immediately without manual sync.
+
+     Args:
+         run_name: Name of the run (folder name)
+         report_path: Path to the report.json file
+         log: Logger instance
+
+     Returns:
+         True if indexing succeeded, False otherwise
+     """
+     try:
+         with open(report_path) as f:
+             report = json.load(f)
+
+         response = httpx.post(
+             f"{API_BASE}/v1/eval-traces/runs",
+             json={"name": run_name, "report": report},
+             timeout=30.0,
+         )
+
+         if response.status_code == 200:
+             log.info(f"Indexed run in database: {run_name}")
+             return True
+         else:
+             log.error(f"Failed to index run {run_name}: {response.status_code} {response.text}")
+             return False
+
+     except Exception as e:
+         log.error(f"Failed to index run {run_name} in database: {e}")
+         return False
@@ -72,10 +72,6 @@ from wafer_core.tools.write_kernel_tool import (
      KernelSubmission,
      exec_write_kernel,
  )
- from wafer_core.tools.search_docs_tool import (
-     SEARCH_DOCS_TOOL,
-     exec_search_docs,
- )

  __all__ = [
      # File tools
@@ -137,7 +133,4 @@ __all__ = [
      "exec_tracelens_report",
      "exec_tracelens_compare",
      "exec_tracelens_collective",
-     # Search docs tool
-     "SEARCH_DOCS_TOOL",
-     "exec_search_docs",
  ]
@@ -12,6 +12,16 @@ Attack types defended against:
  5. Monkey-patching - Replacing CUDA timing functions with fake implementations

  Reference: "Hacks and Defenses in Automatic GPU Kernel Generation" by Jiwei Li (Dec 2025)
+
+ TODO: Memory guard buffers (from CUDA-L2's zero_one_correctness_check.py) — wrap
+ input/output tensors with guard regions and check for out-of-bounds writes after
+ kernel execution. Catches shared memory overflow and buffer overrun at the memory
+ boundary, rather than inferring from output non-determinism.
+
+ TODO: Exact correctness for GEMM kernels (from CUDA-L2) — use {0,1} input matrices
+ where FP16 results ≤2048 are exactly representable, enabling zero-tolerance
+ validation (torch.equal instead of torch.allclose). Eliminates the "bounded garbage
+ passes tolerance check" failure mode for matmul kernels entirely.
  """

  import random
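Both TODOs describe checks that are easy to sketch. With {0,1} inputs, every output element of a matmul is an integer count bounded by K, and FP16 represents integers up to 2048 exactly, so reference and candidate outputs can be compared bit-for-bit with torch.equal. A guard-buffer check allocates sentinel-filled padding around the output and verifies the kernel never wrote into it. The sketch below is illustrative only; names, shapes, and the sentinel value are assumptions, not CUDA-L2's actual code.

    # Illustrative sketches of the two TODOs above (names and shapes are hypothetical).
    import torch

    GUARD = 1234.0  # sentinel exactly representable in FP16


    def exact_gemm_check(candidate_matmul, m=256, n=256, k=1024, device="cuda") -> bool:
        # {0,1} inputs: every partial sum is an integer <= k <= 2048, so FP16 holds it
        # exactly and torch.equal (zero tolerance) replaces torch.allclose.
        a = (torch.rand(m, k, device=device) < 0.5).half()
        b = (torch.rand(k, n, device=device) < 0.5).half()
        expected = (a.float() @ b.float()).half()  # exact integer-valued reference
        return torch.equal(candidate_matmul(a, b), expected)


    def guard_buffer_check(run_kernel, out_shape, pad=1024, device="cuda") -> bool:
        # Surround the output with sentinel-filled guard regions; any out-of-bounds
        # write by the kernel disturbs them.
        n = int(torch.tensor(out_shape).prod())
        buf = torch.full((pad + n + pad,), GUARD, device=device, dtype=torch.float16)
        out = buf[pad:pad + n].view(out_shape)
        run_kernel(out)  # the kernel is expected to write only into `out`
        return bool((buf[:pad] == GUARD).all()) and bool((buf[-pad:] == GUARD).all())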
@@ -21,6 +21,12 @@ if TYPE_CHECKING:
      from wafer_core.utils.kernel_utils.deployment import DeploymentConfig


+ # TODO: Split BaremetalTarget into BaremetalTarget (persistent servers like Vultr,
+ # never auto-removed) and SSHTarget (ephemeral SSH endpoints from providers like
+ # RunPod/DO, safe to auto-clean when unreachable). Currently the pool bridge creates
+ # ephemeral pod endpoints as type="baremetal", losing provenance. SSHTarget should
+ # subclass BaremetalTarget so existing isinstance() checks still work. The `provider`
+ # field is a stopgap until this split happens.
  @dataclass(frozen=True)
  class BaremetalTarget:
      """Configuration for baremetal GPU server.
@@ -59,6 +65,9 @@ class BaremetalTarget:
      gpu_type: str = "B200"
      compute_capability: str = "10.0"
      ncu_available: bool = True # Baremetal typically has NCU
+     provider: str | None = (
+         None # Source provider ("runpod", "digitalocean") — enables auto-cleanup when instance is gone
+     )

      # Docker execution config (Modal-like). If docker_image is set, run in container.
      docker_image: str | None = (
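The provider field added above is the stopgap named in the earlier TODO. A minimal sketch of the BaremetalTarget/SSHTarget split that the TODO proposes, using the names it suggests; SSHTarget is hypothetical and does not exist in this release.

    # Hypothetical sketch of the BaremetalTarget/SSHTarget split described in the TODO.
    from dataclasses import dataclass


    @dataclass(frozen=True)
    class SSHTarget(BaremetalTarget):
        """Ephemeral SSH endpoint (e.g. RunPod/DO pods); safe to auto-clean when unreachable."""

        ephemeral: bool = True


    # isinstance(target, BaremetalTarget) checks keep matching SSHTarget instances,
    # while cleanup code can gate auto-removal on isinstance(target, SSHTarget).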
@@ -314,6 +323,7 @@ class RunPodTarget:
      # apt-get install --reinstall -y rocthrust
      # See docker/rocm7-runpod/README.md for details.
      image: str = "rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.7.1"
+     template_id: str | None = None # RunPod template ID for custom pod configuration

      # RunPod template ID — required for non-RunPod images that need custom
      # dockerArgs (e.g. to install and start sshd). When set, takes priority
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: wafer-core
- Version: 0.1.24
+ Version: 0.1.26
  Summary: Core utilities and environments for Wafer GPU kernel optimization
  Requires-Python: >=3.10
  Requires-Dist: aiohttp>=3.9.0