wafer-core 0.1.27__py3-none-any.whl → 0.1.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,141 @@
1
+ """Target and TargetSpec: the two core concepts for GPU resource management.
2
+
3
+ TargetSpec = provisioning blueprint (TOML config, "how to get a GPU")
4
+ Target = live running resource (from provider API, "what's actually running")
5
+
6
+ TargetSpec is the existing union of provider-specific frozen dataclasses
7
+ (RunPodTarget, DigitalOceanTarget, BaremetalTarget, etc.), re-exported here
8
+ under the name TargetSpec for clarity.
9
+
10
+ Target is always fetched from provider APIs. The spec_name field links a
11
+ live resource back to the spec that created it (None = orphan/unbound).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from dataclasses import dataclass, field
17
+ from typing import TYPE_CHECKING, Protocol, runtime_checkable
18
+
19
+ if TYPE_CHECKING:
20
+ pass
21
+
22
+ # TargetSpec is the existing union type, re-exported under a clearer name.
23
+ # Each variant is a frozen dataclass with provider-specific provisioning params.
24
+ from wafer_core.utils.kernel_utils.targets.config import ( # noqa: E402
25
+ TargetConfig,
26
+ )
27
+
28
+ # TargetSpec = TargetConfig (same union, better name for the new API)
29
+ TargetSpec = TargetConfig
30
+
31
+
32
@dataclass(frozen=True)
class Target:
    """A live running GPU resource, fetched from a provider API.

    This is the runtime counterpart to TargetSpec: a TargetSpec describes
    *how* to provision a GPU, a Target describes *what's actually running*.

    The provider API is the source of truth for Target state. Local caches
    (target_state.json) are performance hints only.

    Instances are immutable and hashable. Equality compares every field,
    including ``labels``; the hash is computed from every field except
    ``labels`` (a dict cannot be hashed), so Targets can be stored in sets
    and used as dict keys.

    Fields:
        resource_id: Provider's unique ID (pod_id, droplet_id, or
            "baremetal:{host}:{port}" for SSH targets with no cloud lifecycle).
            Must be non-empty.
        provider: Which cloud provider owns this resource. Must be non-empty.
        status: Current state from provider API. Must be non-empty.
        public_ip: SSH-reachable IP address (None if not yet assigned).
        ssh_port: SSH port (None if not yet assigned).
        ssh_username: SSH user (typically "root" for cloud providers).
        gpu_type: GPU model name (e.g., "MI300X", "B200").
        name: Provider-side resource name (e.g., "wafer-runpod-mi300x-1706000000",
            "kernelbench-pool-0"). Used for spec_name inference.
        created_at: ISO timestamp of resource creation (None if unknown).
        spec_name: Name of the TargetSpec that owns this resource.
            None means unbound (orphan): running but no spec claims it.
        price_per_hour: Cost in $/hr (None if unknown or baremetal).
        labels: Software metadata not available from the provider API's
            structured fields. Examples: {"rocm_version": "7.0.2",
            "cuda_version": "12.4", "image": "rocm/pytorch:rocm7.0.2_..."}.
            Populated from the container image string at provision time,
            or from SSH probe on demand. Pool queries filter on these.

    Raises:
        AssertionError: If resource_id, provider, or status is empty.
    """

    resource_id: str
    provider: str
    status: str
    public_ip: str | None
    ssh_port: int | None
    ssh_username: str
    gpu_type: str
    name: str | None = None
    created_at: str | None = None
    spec_name: str | None = None
    price_per_hour: float | None = None
    # hash=False keeps the dict out of the generated __hash__; with it
    # included, hash(target) would always raise TypeError. The field still
    # participates in __eq__, so equal Targets still hash equally.
    labels: dict[str, str] = field(default_factory=dict, hash=False)

    def __post_init__(self) -> None:
        """Reject Targets that are missing an identifying field."""
        # Kept as assertions so the exception type callers see is unchanged.
        # Note that assertions are skipped when Python runs with -O.
        assert self.resource_id, "resource_id cannot be empty"
        assert self.provider, "provider cannot be empty"
        assert self.status, "status cannot be empty"
81
+
82
+
83
@dataclass(frozen=True)
class ReconcileResult:
    """Outcome of comparing TargetSpecs against live Targets.

    This is plain data with no side effects; acting on it is left to the
    caller, for example:
        - showing bound/unbound/unprovisioned in the CLI
        - terminating unbound targets
        - provisioning from unprovisioned specs
    """

    # (spec, target) pairs: each spec that was matched to a live target.
    bound: list[tuple[TargetSpec, Target]]
    # Live targets with no matching spec (orphans).
    unbound: list[Target]
    # Specs with no live target running.
    unprovisioned: list[TargetSpec]
101
+
102
+
103
@runtime_checkable
class TargetProvider(Protocol):
    """Protocol for querying and managing live GPU resources on a cloud provider.

    Every cloud provider (RunPod, DigitalOcean, etc.) supplies an
    implementation. All methods are async because they call external APIs.

    Baremetal is the degenerate case: list_targets returns a Target built
    from the spec's ssh_target, and provision/terminate are no-ops.
    """

    async def list_targets(self) -> list[Target]:
        """Return every running resource on the provider account.

        The provider API is always queried; the local cache is never read.
        """
        ...

    async def get_target(self, resource_id: str) -> Target | None:
        """Look up one resource by its provider ID.

        Returns:
            The Target, or None if the resource doesn't exist or is terminated.
        """
        ...

    async def provision(self, spec: TargetSpec) -> Target:
        """Create a new resource from a spec.

        Blocks until the resource is SSH-ready.

        Raises:
            On failure; this method never silently returns None.
        """
        ...

    async def terminate(self, resource_id: str) -> bool:
        """Terminate the resource with the given provider ID.

        Returns:
            True if terminated, False if the resource was not found.
        """
        ...
@@ -346,25 +346,17 @@ class RunPodTarget:
346
346
  ncu_available: bool = False
347
347
 
348
348
  def __post_init__(self) -> None:
349
- """Validate configuration."""
350
- from wafer_core.auth import get_api_key
349
+ """Validate configuration fields.
351
350
 
351
+ API key availability is checked at provision/query time, not here —
352
+ loading a spec from TOML should not require credentials.
353
+ """
352
354
  assert self.name, "name cannot be empty"
353
355
  assert self.ssh_key, "ssh_key cannot be empty"
354
356
  assert self.gpu_count > 0, "gpu_count must be positive"
355
357
  assert self.provision_timeout > 0, "provision_timeout must be positive"
356
358
  assert self.eval_timeout > 0, "eval_timeout must be positive"
357
359
 
358
- # Check for API key (env var or ~/.wafer/auth.json)
359
- api_key = get_api_key("runpod")
360
- if not api_key:
361
- raise ValueError(
362
- "RunPod API key not found.\n"
363
- "Set WAFER_RUNPOD_API_KEY environment variable, or run:\n"
364
- " wafer auth login runpod\n"
365
- "Get your API key from: https://runpod.io/console/user/settings"
366
- )
367
-
368
360
 
369
361
  @dataclass(frozen=True)
370
362
  class LocalTarget:
@@ -468,24 +460,16 @@ class DigitalOceanTarget:
468
460
  ncu_available: bool = False
469
461
 
470
462
  def __post_init__(self) -> None:
471
- """Validate configuration."""
472
- from wafer_core.auth import get_api_key
463
+ """Validate configuration fields.
473
464
 
465
+ API key availability is checked at provision/query time, not here —
466
+ loading a spec from TOML should not require credentials.
467
+ """
474
468
  assert self.name, "name cannot be empty"
475
469
  assert self.ssh_key, "ssh_key cannot be empty"
476
470
  assert self.provision_timeout > 0, "provision_timeout must be positive"
477
471
  assert self.eval_timeout > 0, "eval_timeout must be positive"
478
472
 
479
- # Check for API key (env var or ~/.wafer/auth.json)
480
- api_key = get_api_key("digitalocean")
481
- if not api_key:
482
- raise ValueError(
483
- "DigitalOcean API key not found.\n"
484
- "Set WAFER_AMD_DIGITALOCEAN_API_KEY environment variable, or run:\n"
485
- " wafer auth login digitalocean\n"
486
- "Get your API key from: https://cloud.digitalocean.com/account/api/tokens"
487
- )
488
-
489
473
 
490
474
  # Union type for target configs
491
475
  TargetConfig = (
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wafer-core
3
- Version: 0.1.27
3
+ Version: 0.1.29
4
4
  Summary: Core utilities and environments for Wafer GPU kernel optimization
5
5
  Requires-Python: >=3.10
6
6
  Requires-Dist: aiohttp>=3.9.0
@@ -320,13 +320,13 @@ wafer_core/lib/rocprofiler/systems/sample/__init__.py,sha256=31rNmLPQ7OVhvlOEEOw
320
320
  wafer_core/lib/rocprofiler/systems/sample/profiler.py,sha256=CYZPTzNXd48LoCfmY6h_5RSYEdWYccuv3-t4YncHJLE,7384
321
321
  wafer_core/lib/trace_compare/PERFORMANCE.md,sha256=jkJh7ApZi8H7NKTcz8v0LNtwSFtIUqY88e3QbL749ww,3823
322
322
  wafer_core/lib/trace_compare/__init__.py,sha256=CyUPbPQDYhVLCFFA7S_jNSilG3OgqYjmHSKfR5X11go,1377
323
- wafer_core/lib/trace_compare/aligner.py,sha256=6HplOHCUIb0cMXA-Lu-91T-hKVTMK4bk8Ei-v7HE1G4,13471
324
- wafer_core/lib/trace_compare/analyzer.py,sha256=m-waAiU5S72M9J4kUwIy9fPWUecg_oOUczri8Na6xUY,29360
323
+ wafer_core/lib/trace_compare/aligner.py,sha256=1S8Ob3RaEsIjN0HdqEx0yGsW5uf_lMrJVSH_MnZhKok,13788
324
+ wafer_core/lib/trace_compare/analyzer.py,sha256=YkuOPA3HFX_7mNUEhE9CMOtEMGLQd12lvUkvqqeQF14,29698
325
325
  wafer_core/lib/trace_compare/api.py,sha256=JSRTcd7eZK1Z8l18TFEiA5A8ENJS1TMz7oIiw1KBbAs,8796
326
326
  wafer_core/lib/trace_compare/architecture.py,sha256=8bqlAJQeJLBHblyXvFV-w55PIKiVQDPjDQZ8Jx4tuGg,2110
327
- wafer_core/lib/trace_compare/classifier.py,sha256=CDGzY9TY-I5wRuEGsu4mTCdljqVTOnLWyFLyNgmkGXI,16864
327
+ wafer_core/lib/trace_compare/classifier.py,sha256=cYAmDW8S75N6cE3mJNZM-UKCJSX7rFP-8klVrukBvNQ,17504
328
328
  wafer_core/lib/trace_compare/formatter.py,sha256=GNrCZ45ueBN05CEXjOtTuKvTI8z-g-ZZFil-ni3sWVY,37962
329
- wafer_core/lib/trace_compare/fusion_analyzer.py,sha256=bD_CJ3JoVg_N6vxJJULd6G8l_-O5qnLuXKDEDItcQtg,15489
329
+ wafer_core/lib/trace_compare/fusion_analyzer.py,sha256=ga0sfxx8OCQu9Hq7uJSAMfXhnCvBaAmzVofBN7_gdV8,19843
330
330
  wafer_core/lib/trace_compare/kernel_registry.yaml,sha256=0-knXwsF3pR1x1JdIz-aWaH-5xDgTylh53E47Kf6nHo,9808
331
331
  wafer_core/lib/trace_compare/layer_segmentation.py,sha256=kI_Y1e9nrKZfdwfcrGo4h7gpMxqXI_xkgXk46zuFen4,4642
332
332
  wafer_core/lib/trace_compare/loader.py,sha256=zBHI0r7CX_wJ2mz0_-s0lm9KGSdaVaq7OKyxUL6KIlw,23997
@@ -597,9 +597,19 @@ wafer_core/sessions/__init__.py,sha256=Ybps5QclZShAELoW9bva4w6OCNrcBf8vd9nGDjYfQ
597
597
  wafer_core/sessions/agent.py,sha256=4-Q-NG_xm07FFq7hB8mjxW38nt2_S0QpwCYkPOoGRxA,5946
598
598
  wafer_core/sessions/dtypes.py,sha256=K6nOjvL6sjCGY7GTtdEygf1IZY_18R9YkHGqFyMd8wY,589
599
599
  wafer_core/sessions/hooks.py,sha256=A-txm6ufnRGQCdtP3vwh7oEOdlLN9Tv0XsjORMihuAI,4295
600
- wafer_core/targets/__init__.py,sha256=sHndC7AAOaHXlrmDXFLB53a5Y8DBjuyqS6nwsO2nj-Y,1728
600
+ wafer_core/targets/__init__.py,sha256=N4lTf9MjZ5dzAShObweZzyBfPMSzwjD5qBFWnM5lczM,2800
601
601
  wafer_core/targets/digitalocean.py,sha256=cvoYpYjtSyy5t2lQAPi7ERruuuibronah_ivOiduAHQ,16550
602
+ wafer_core/targets/pool.py,sha256=TeNE9rpr67OsGtbxniYpr9Cb3wosnf_e3kTLBbwtDok,5434
603
+ wafer_core/targets/probe.py,sha256=rzF8tiq5GxkMR3jhryTOW0GMcoHtrN67wmHlGJuBTv8,3038
604
+ wafer_core/targets/reconcile.py,sha256=Hftd7LyqkcTOP0Qpa_cdYpxGW2I3bkSlkQrnYjU5lns,3091
602
605
  wafer_core/targets/runpod.py,sha256=LrVmNvA6qjzL5nbGSWvtw7CHrK6bDu7_o3vKIek00Tc,20286
606
+ wafer_core/targets/spec_store.py,sha256=uNpMdo7ASeq7_RhgAqj8CFIK39rGEbaYtYtqt--FXO0,6455
607
+ wafer_core/targets/state_cache.py,sha256=oji4APL_tjOty_u0CJzHaP59jJAIJWQTjYvD4pCdQ3g,4479
608
+ wafer_core/targets/types.py,sha256=MQ7ECcBAwSoWsJfGxycJoLBeoTXSYtGeXEg5ZNxfs4c,5217
609
+ wafer_core/targets/providers/__init__.py,sha256=u6OCCgyPRymrnZmIYPLF0hdkr6aTCF301K9gSgcFWvc,1355
610
+ wafer_core/targets/providers/baremetal.py,sha256=L0KAiTkRH_fQvCbtaEa5wlJBqsvNaY56Zq6ovBhk2YY,2452
611
+ wafer_core/targets/providers/digitalocean.py,sha256=_TnGi9Otzsn2T_vSv40T_3HFLT559WS_ljGsrWr7j0s,5281
612
+ wafer_core/targets/providers/runpod.py,sha256=jCA7ENFRwbTKyToGa7fw2VS3coY61ggK1m0F17-rvng,7388
603
613
  wafer_core/tools/__init__.py,sha256=wBQD45GdSfkxcT6NHzIv0IMeXCc0enwwkpm3T_9j1X8,3341
604
614
  wafer_core/tools/bash_tool.py,sha256=daoKOVGSgL0x9X_3l8Apd6-wFH4VMXMGJwVemw2FIfc,16828
605
615
  wafer_core/tools/glob_tool.py,sha256=9X5PdOjQJj7kiVNqqCZC0-1LmnE6wHx3Zc9zfMjtXdc,3533
@@ -679,7 +689,7 @@ wafer_core/utils/kernel_utils/static_checker.py,sha256=XIQkzAOkGH5xtrOuZM4tNUqVJ
679
689
  wafer_core/utils/kernel_utils/task.py,sha256=XcmKxKUWh5It6nX3zGqj77tWgA32uPfQMqNOqyD5T48,2682
680
690
  wafer_core/utils/kernel_utils/utils.py,sha256=uDZoJDxh07hJeLNlPdKN2vgB15pqIr1LbXf0YIBHU4E,43056
681
691
  wafer_core/utils/kernel_utils/targets/__init__.py,sha256=4NwRLsuJ__S4xKAfda4Ag82C5MQ3Qio-4xA5S-mQGlU,2067
682
- wafer_core/utils/kernel_utils/targets/config.py,sha256=V587DYkisEFoWwkmLQUW6I0mXkMEwA52sM7ZINslkK8,20625
692
+ wafer_core/utils/kernel_utils/targets/config.py,sha256=DJPPyV7yGmyvS7cavdDENC5PQsia1dQeQYlWCTE7iUo,19975
683
693
  wafer_core/utils/kernel_utils/targets/execution.py,sha256=bZuNXCo0sIdD6hFhetLPrtDC-zMSiIsAx_aml49VVL0,15033
684
694
  wafer_core/utils/kernel_utils/targets/selection.py,sha256=5I_RG_7cfhq7uaeR28meC2EeNNKssFsK-Tc3QFG6Ze0,3590
685
695
  wafer_core/utils/modal_execution/__init__.py,sha256=jkVqYOLzCT5K73N9Od0UIUsx-99A0m6bpDrxfyXxQZ8,945
@@ -687,6 +697,6 @@ wafer_core/utils/modal_execution/modal_app.py,sha256=VfS2cX8gHtnlPXemmMcEwDPeQdh
687
697
  wafer_core/utils/modal_execution/modal_config.py,sha256=7cGX9TGqilQ3qxI3OFGXV5orjtyRU-PEDOJ4vP2oxno,4421
688
698
  wafer_core/utils/modal_execution/modal_execution.py,sha256=gChjnV6jqA3A7IRP3DfvV5cSfm_MN0X4f7JZufXgdZE,24594
689
699
  wafer_core/utils/modal_execution/test_modal.py,sha256=_jqou_hrLs1Daf1590Pnb0a_lXMMa2rczAPpW9HpoNQ,8153
690
- wafer_core-0.1.27.dist-info/METADATA,sha256=NYiI9hCaVd9RCCAfd8Ys0UwTMju6wiyaT7nsk4gsB8A,1477
691
- wafer_core-0.1.27.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
692
- wafer_core-0.1.27.dist-info/RECORD,,
700
+ wafer_core-0.1.29.dist-info/METADATA,sha256=Qjyx92KhI1joutpM8lF0G1zgPou-d8CdzWI80QQqKYg,1477
701
+ wafer_core-0.1.29.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
702
+ wafer_core-0.1.29.dist-info/RECORD,,