runpod-deploy 0.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,74 @@
1
+ """Config-driven RunPod orchestration."""
2
+
3
+ import logging
4
+
5
+ from runpod_deploy.config import (
6
+ DEFAULT_STAGING_EXCLUDES,
7
+ SCHEMA_VERSION,
8
+ STORAGE_EPHEMERAL,
9
+ STORAGE_NETWORK_VOLUME,
10
+ ArtifactPullSpec,
11
+ BudgetSpec,
12
+ CommandSpec,
13
+ JobContext,
14
+ LocalSpec,
15
+ PodSpec,
16
+ RemoteEnvSpec,
17
+ RsyncPushSpec,
18
+ RunpodJobSpec,
19
+ RunSpec,
20
+ SecretSpec,
21
+ SshSpec,
22
+ StopPolicySpec,
23
+ StorageSpec,
24
+ TelemetrySpec,
25
+ build_job_context,
26
+ load_job_spec,
27
+ validate_local_paths,
28
+ )
29
+ from runpod_deploy.metadata import capture_local_git, capture_payload_lockfile
30
+ from runpod_deploy.orchestrator import run_job
31
+ from runpod_deploy.pricing import GpuPrice, fetch_gpu_prices, select_price_for_pod
32
+ from runpod_deploy.provider import PodConnection, resolve_volume, select_gpu_across_datacenters
33
+ from runpod_deploy.transport import RemoteRunError, RemoteRunner, rsync_argv
34
+
35
+ __all__ = [
36
+ "DEFAULT_STAGING_EXCLUDES",
37
+ "SCHEMA_VERSION",
38
+ "STORAGE_EPHEMERAL",
39
+ "STORAGE_NETWORK_VOLUME",
40
+ "ArtifactPullSpec",
41
+ "BudgetSpec",
42
+ "CommandSpec",
43
+ "GpuPrice",
44
+ "JobContext",
45
+ "LocalSpec",
46
+ "PodConnection",
47
+ "PodSpec",
48
+ "RemoteEnvSpec",
49
+ "RemoteRunError",
50
+ "RemoteRunner",
51
+ "RsyncPushSpec",
52
+ "RunSpec",
53
+ "RunpodJobSpec",
54
+ "SecretSpec",
55
+ "SshSpec",
56
+ "StopPolicySpec",
57
+ "StorageSpec",
58
+ "TelemetrySpec",
59
+ "build_job_context",
60
+ "capture_local_git",
61
+ "capture_payload_lockfile",
62
+ "fetch_gpu_prices",
63
+ "load_job_spec",
64
+ "resolve_volume",
65
+ "rsync_argv",
66
+ "run_job",
67
+ "select_gpu_across_datacenters",
68
+ "select_price_for_pod",
69
+ "validate_local_paths",
70
+ ]
71
+
72
+ __version__ = "0.7.3"
73
+
74
+ logging.getLogger(__name__).addHandler(logging.NullHandler())
@@ -0,0 +1,436 @@
1
+ """Internal YAML parsers for the runpod-deploy schema."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from collections.abc import Mapping, Sequence
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from runpod_deploy.config import (
11
+ DEFAULT_FAILURE_MARKERS,
12
+ SCHEMA_VERSION,
13
+ ArtifactPullSpec,
14
+ BudgetSpec,
15
+ CommandSpec,
16
+ LocalSpec,
17
+ PodSpec,
18
+ RemoteEnvSpec,
19
+ RsyncPushSpec,
20
+ RunpodJobSpec,
21
+ RunSpec,
22
+ SecretSpec,
23
+ SshSpec,
24
+ StopPolicySpec,
25
+ StorageSpec,
26
+ TelemetrySpec,
27
+ )
28
+
29
+ __all__ = [
30
+ "parse_job_spec",
31
+ "render_template",
32
+ "resolve_relative_path",
33
+ ]
34
+
35
+ _TEMPLATE_RE = re.compile(r"\{([A-Za-z_][A-Za-z0-9_]*)\}")
36
+
37
+
38
def parse_job_spec(raw: Mapping[str, Any]) -> RunpodJobSpec:
    """Convert the top-level job mapping into a :class:`RunpodJobSpec`.

    Unknown top-level keys are rejected first.  After that each field is
    converted in declaration order, so the first malformed entry is the one
    that gets reported.  ``name``, ``pod``, ``storage`` and ``run`` have no
    fallback value; ``run_id_prefix`` falls back to ``name``.

    Raises:
        KeyError: if ``raw`` contains a key outside the schema.
        TypeError: if a value fails one of the type-checking helpers.
    """
    root_keys = {
        "schema_version",
        "name",
        "run_id_prefix",
        "state_file",
        "local",
        "pod",
        "storage",
        "ssh",
        "budget",
        "remote_env",
        "setup",
        "preflight",
        "staging",
        "secrets",
        "run",
        "artifacts",
        "stop",
        "telemetry",
        "variables",
    }
    _check_keys(raw, "root", root_keys)

    fetch = raw.get

    # Scalars.
    schema_version = _as_int(fetch("schema_version", SCHEMA_VERSION), "schema_version")
    name = _as_str(fetch("name"), "name")
    run_id_prefix = _as_str(fetch("run_id_prefix", fetch("name")), "run_id_prefix")
    state_file = _as_str(fetch("state_file", "~/.runpod-deploy-current"), "state_file")

    # Mapping sections; an absent optional section is treated as empty.
    local = _parse_local(_mapping(fetch("local", {}), "local"))
    pod = _parse_pod(_mapping(fetch("pod"), "pod"))
    storage = _parse_storage(_mapping(fetch("storage"), "storage"))
    ssh = _parse_ssh(_mapping(fetch("ssh", {}), "ssh"))
    budget = _parse_budget(_mapping(fetch("budget", {}), "budget"))
    remote_env = _parse_remote_env(_mapping(fetch("remote_env", {}), "remote_env"))

    # List sections.
    setup = _parse_commands(fetch("setup", ()), "setup")
    preflight = _parse_commands(fetch("preflight", ()), "preflight")
    staging = _parse_rsync_pushes(fetch("staging", ()))
    secrets = _parse_secrets(fetch("secrets", ()))

    run = _parse_run(_mapping(fetch("run"), "run"))
    artifacts = _parse_artifacts(fetch("artifacts", ()))
    stop = _parse_stop(_mapping(fetch("stop", {}), "stop"))
    telemetry = _parse_telemetry(_mapping(fetch("telemetry", {}), "telemetry"))
    variables = _parse_str_dict(fetch("variables", {}), "variables")

    return RunpodJobSpec(
        schema_version=schema_version,
        name=name,
        run_id_prefix=run_id_prefix,
        state_file=state_file,
        local=local,
        pod=pod,
        storage=storage,
        ssh=ssh,
        budget=budget,
        remote_env=remote_env,
        setup=setup,
        preflight=preflight,
        staging=staging,
        secrets=secrets,
        run=run,
        artifacts=artifacts,
        stop=stop,
        telemetry=telemetry,
        variables=variables,
    )
86
+
87
+
88
def _parse_local(raw: Mapping[str, Any]) -> LocalSpec:
    """Build the ``local`` section.

    ``project_root`` defaults to ``"."`` and ``required_paths`` to an empty
    list.  Unknown keys raise ``KeyError``; wrongly typed values raise
    ``TypeError``.
    """
    _check_keys(raw, "local", {"project_root", "required_paths"})
    root = _as_str(raw.get("project_root", "."), "local.project_root")
    required = _tuple_str(raw.get("required_paths", ()), "local.required_paths")
    return LocalSpec(project_root=root, required_paths=required)
94
+
95
+
96
def _parse_pod(raw: Mapping[str, Any]) -> PodSpec:
    """Build the ``pod`` section.

    ``image``, ``datacenters`` and ``gpu_order`` have no default and must be
    supplied.  The remaining keys fall back to the defaults spelled out below,
    or to ``None`` for the optional ones.  Unknown keys raise ``KeyError``;
    wrongly typed values raise ``TypeError``.
    """
    known = {
        "image",
        "datacenters",
        "gpu_order",
        "cloud_type",
        "ports",
        "container_disk_gb",
        "gpu_count",
        "spot",
        "min_vcpu_count",
        "min_memory_gb",
        "python_version",
    }
    _check_keys(raw, "pod", known)

    get = raw.get
    # The dict literal is evaluated top to bottom, so validation order matches
    # the field order of the spec.
    fields = {
        "image": _as_str(get("image"), "pod.image"),
        "datacenters": _tuple_str(get("datacenters"), "pod.datacenters"),
        "gpu_order": _tuple_str(get("gpu_order"), "pod.gpu_order"),
        "cloud_type": _as_str(get("cloud_type", "SECURE"), "pod.cloud_type"),
        "ports": _tuple_str(get("ports", ("22/tcp",)), "pod.ports"),
        "container_disk_gb": _as_int(get("container_disk_gb", 20), "pod.container_disk_gb"),
        "gpu_count": _as_int(get("gpu_count", 1), "pod.gpu_count"),
        "spot": _as_bool(get("spot", False), "pod.spot"),
        "min_vcpu_count": _optional_int(get("min_vcpu_count"), "pod.min_vcpu_count"),
        "min_memory_gb": _optional_int(get("min_memory_gb"), "pod.min_memory_gb"),
        "python_version": _optional_str(get("python_version"), "pod.python_version"),
    }
    return PodSpec(**fields)
127
+
128
+
129
def _parse_storage(raw: Mapping[str, Any]) -> StorageSpec:
    """Build the ``storage`` section.

    ``mode`` is mandatory.  ``volume_mount`` defaults to ``"/workspace"``;
    ``volume_name`` and ``volume_gb`` may be omitted (``None``).  Unknown keys
    raise ``KeyError``; wrongly typed values raise ``TypeError``.
    """
    _check_keys(raw, "storage", {"mode", "volume_mount", "volume_name", "volume_gb"})
    mode = _as_str(raw.get("mode"), "storage.mode")
    mount = _as_str(raw.get("volume_mount", "/workspace"), "storage.volume_mount")
    vol_name = _optional_str(raw.get("volume_name"), "storage.volume_name")
    vol_gb = _optional_int(raw.get("volume_gb"), "storage.volume_gb")
    return StorageSpec(
        mode=mode,
        volume_mount=mount,
        volume_name=vol_name,
        volume_gb=vol_gb,
    )
137
+
138
+
139
def _parse_ssh(raw: Mapping[str, Any]) -> SshSpec:
    """Build the ``ssh`` section; ``key_path`` defaults to ``~/.ssh/id_ed25519``."""
    _check_keys(raw, "ssh", {"key_path"})
    configured = raw.get("key_path", "~/.ssh/id_ed25519")
    return SshSpec(key_path=_as_str(configured, "ssh.key_path"))
142
+
143
+
144
def _parse_budget(raw: Mapping[str, Any]) -> BudgetSpec:
    """Build the ``budget`` section.

    Defaults: ``cost_cap_usd`` 10.0, ``assumed_hourly_rate_usd`` 1.65,
    ``poll_interval_sec`` 60; ``max_runtime_minutes`` is optional (``None``).
    Unknown keys raise ``KeyError``; wrongly typed values raise ``TypeError``.
    """
    known = {
        "cost_cap_usd",
        "assumed_hourly_rate_usd",
        "max_runtime_minutes",
        "poll_interval_sec",
    }
    _check_keys(raw, "budget", known)

    cap = _as_float(raw.get("cost_cap_usd", 10.0), "budget.cost_cap_usd")
    hourly = _as_float(
        raw.get("assumed_hourly_rate_usd", 1.65), "budget.assumed_hourly_rate_usd"
    )
    max_minutes = _optional_int(raw.get("max_runtime_minutes"), "budget.max_runtime_minutes")
    poll = _as_int(raw.get("poll_interval_sec", 60), "budget.poll_interval_sec")

    return BudgetSpec(
        cost_cap_usd=cap,
        assumed_hourly_rate_usd=hourly,
        max_runtime_minutes=max_minutes,
        poll_interval_sec=poll,
    )
162
+
163
+
164
def _parse_remote_env(raw: Mapping[str, Any]) -> RemoteEnvSpec:
    """Build the ``remote_env`` section.

    ``source_files`` defaults to an empty list and ``exports`` to an empty
    string-to-string mapping.  Unknown keys raise ``KeyError``; wrongly typed
    values raise ``TypeError``.
    """
    _check_keys(raw, "remote_env", {"source_files", "exports"})
    files = _tuple_str(raw.get("source_files", ()), "remote_env.source_files")
    exported = _parse_str_dict(raw.get("exports", {}), "remote_env.exports")
    return RemoteEnvSpec(source_files=files, exports=exported)
170
+
171
+
172
def _parse_commands(raw: Any, label: str) -> tuple[CommandSpec, ...]:
    """Parse a list of command mappings (used for ``setup`` and ``preflight``).

    ``None`` yields an empty tuple.  Each entry must be a mapping with a
    ``command`` string and may set ``timeout_sec`` (default 600) and
    ``with_env`` (default ``False``).

    Raises:
        TypeError: if ``raw`` is a string or not a sequence, or an entry or
            one of its values has the wrong type.
        KeyError: if an entry contains an unknown key.
    """
    if raw is None:
        return ()
    if isinstance(raw, str) or not isinstance(raw, Sequence):
        raise TypeError(f"{label} must be a list of command mappings")

    def build(index: int, entry: Any) -> CommandSpec:
        where = f"{label}[{index}]"
        fields = _mapping(entry, where)
        _check_keys(fields, where, {"command", "timeout_sec", "with_env"})
        return CommandSpec(
            command=_as_str(fields.get("command"), f"{where}.command"),
            timeout_sec=_as_int(fields.get("timeout_sec", 600), f"{where}.timeout_sec"),
            with_env=_as_bool(fields.get("with_env", False), f"{where}.with_env"),
        )

    return tuple(build(index, entry) for index, entry in enumerate(raw))
190
+
191
+
192
def _parse_rsync_pushes(raw: Any) -> tuple[RsyncPushSpec, ...]:
    """Parse the ``staging`` list of rsync push mappings.

    ``None`` yields an empty tuple.  Each entry needs ``label``, ``source``
    and ``destination`` strings.  ``excludes`` and ``excludes_extra`` default
    to empty lists, ``delete`` to ``True`` and ``excludes_default`` to
    ``False``.

    Raises:
        TypeError: if ``raw`` is a string or not a sequence, or an entry or
            one of its values has the wrong type.
        KeyError: if an entry contains an unknown key.
    """
    if raw is None:
        return ()
    if isinstance(raw, str) or not isinstance(raw, Sequence):
        raise TypeError("staging must be a list of rsync push mappings")

    known = {
        "label",
        "source",
        "destination",
        "excludes",
        "delete",
        "excludes_default",
        "excludes_extra",
    }
    pushes: list[RsyncPushSpec] = []
    for index, entry in enumerate(raw):
        where = f"staging[{index}]"
        fields = _mapping(entry, where)
        _check_keys(fields, where, known)

        label = _as_str(fields.get("label"), f"{where}.label")
        source = _as_str(fields.get("source"), f"{where}.source")
        destination = _as_str(fields.get("destination"), f"{where}.destination")
        excludes = _tuple_str(fields.get("excludes", ()), f"{where}.excludes")
        delete = _as_bool(fields.get("delete", True), f"{where}.delete")
        use_defaults = _as_bool(
            fields.get("excludes_default", False), f"{where}.excludes_default"
        )
        extra = _tuple_str(fields.get("excludes_extra", ()), f"{where}.excludes_extra")

        pushes.append(
            RsyncPushSpec(
                label=label,
                source=source,
                destination=destination,
                excludes=excludes,
                delete=delete,
                excludes_default=use_defaults,
                excludes_extra=extra,
            )
        )
    return tuple(pushes)
229
+
230
+
231
def _parse_secrets(raw: Any) -> tuple[SecretSpec, ...]:
    """Parse the ``secrets`` list.

    ``None`` yields an empty tuple.  Each entry needs ``name`` and
    ``destination`` strings.  ``env`` defaults to an empty list, ``file`` is
    optional (``None``) and ``mode`` defaults to the string ``"0600"``.

    Raises:
        TypeError: if ``raw`` is a string or not a sequence, or an entry or
            one of its values has the wrong type.
        KeyError: if an entry contains an unknown key.
    """
    if raw is None:
        return ()
    if isinstance(raw, str) or not isinstance(raw, Sequence):
        raise TypeError("secrets must be a list of secret mappings")

    known = {"name", "destination", "env", "file", "mode"}

    def build(index: int, entry: Any) -> SecretSpec:
        where = f"secrets[{index}]"
        fields = _mapping(entry, where)
        _check_keys(fields, where, known)
        return SecretSpec(
            name=_as_str(fields.get("name"), f"{where}.name"),
            destination=_as_str(fields.get("destination"), f"{where}.destination"),
            env=_tuple_str(fields.get("env", ()), f"{where}.env"),
            file=_optional_str(fields.get("file"), f"{where}.file"),
            mode=_as_str(fields.get("mode", "0600"), f"{where}.mode"),
        )

    return tuple(build(index, entry) for index, entry in enumerate(raw))
251
+
252
+
253
def _parse_run(raw: Mapping[str, Any]) -> RunSpec:
    """Build the ``run`` section.

    ``script_path``, ``log_path``, ``success_marker`` and ``body`` are
    mandatory strings.  ``failure_markers`` falls back to
    ``DEFAULT_FAILURE_MARKERS``.  Unknown keys raise ``KeyError``; wrongly
    typed values raise ``TypeError``.
    """
    known = {"script_path", "log_path", "success_marker", "failure_markers", "body"}
    _check_keys(raw, "run", known)

    script_path = _as_str(raw.get("script_path"), "run.script_path")
    log_path = _as_str(raw.get("log_path"), "run.log_path")
    success_marker = _as_str(raw.get("success_marker"), "run.success_marker")
    failure_markers = _tuple_str(
        raw.get("failure_markers", DEFAULT_FAILURE_MARKERS), "run.failure_markers"
    )
    body = _as_str(raw.get("body"), "run.body")

    return RunSpec(
        script_path=script_path,
        log_path=log_path,
        success_marker=success_marker,
        failure_markers=failure_markers,
        body=body,
    )
267
+
268
+
269
def _parse_artifacts(raw: Any) -> tuple[ArtifactPullSpec, ...]:
    """Parse the ``artifacts`` list of pull mappings.

    ``None`` yields an empty tuple.  Each entry needs ``label``,
    ``remote_path`` and ``local_path`` strings.  ``required`` and ``delete``
    default to ``True`` and ``excludes`` to an empty list.

    Raises:
        TypeError: if ``raw`` is a string or not a sequence, or an entry or
            one of its values has the wrong type.
        KeyError: if an entry contains an unknown key.
    """
    if raw is None:
        return ()
    if isinstance(raw, str) or not isinstance(raw, Sequence):
        raise TypeError("artifacts must be a list of pull mappings")

    known = {"label", "remote_path", "local_path", "required", "excludes", "delete"}
    pulls: list[ArtifactPullSpec] = []
    for index, entry in enumerate(raw):
        where = f"artifacts[{index}]"
        fields = _mapping(entry, where)
        _check_keys(fields, where, known)
        # Dict literal keeps the per-field validation order of the spec.
        kwargs = {
            "label": _as_str(fields.get("label"), f"{where}.label"),
            "remote_path": _as_str(fields.get("remote_path"), f"{where}.remote_path"),
            "local_path": _as_str(fields.get("local_path"), f"{where}.local_path"),
            "required": _as_bool(fields.get("required", True), f"{where}.required"),
            "excludes": _tuple_str(fields.get("excludes", ()), f"{where}.excludes"),
            "delete": _as_bool(fields.get("delete", True), f"{where}.delete"),
        }
        pulls.append(ArtifactPullSpec(**kwargs))
    return tuple(pulls)
294
+
295
+
296
def _parse_stop(raw: Mapping[str, Any]) -> StopPolicySpec:
    """Build the ``stop`` section; both ``on_success`` and ``on_failure`` default to ``True``."""
    _check_keys(raw, "stop", {"on_success", "on_failure"})
    after_success = _as_bool(raw.get("on_success", True), "stop.on_success")
    after_failure = _as_bool(raw.get("on_failure", True), "stop.on_failure")
    return StopPolicySpec(on_success=after_success, on_failure=after_failure)
302
+
303
+
304
def _parse_telemetry(raw: Mapping[str, Any]) -> TelemetrySpec:
    """Build the ``telemetry`` section.

    ``enabled`` and every ``capture_*`` flag default to ``True``;
    ``sample_interval_sec`` defaults to 30.  Unknown keys raise ``KeyError``;
    wrongly typed values raise ``TypeError``.
    """
    capture_flags = (
        "capture_nvidia_smi",
        "capture_dmesg",
        "capture_pod_describe",
        "capture_remote_env",
        "capture_local_git",
        "capture_payload_lockfile",
    )
    _check_keys(
        raw,
        "telemetry",
        {
            "enabled",
            "sample_interval_sec",
            "capture_nvidia_smi",
            "capture_dmesg",
            "capture_pod_describe",
            "capture_remote_env",
            "capture_local_git",
            "capture_payload_lockfile",
        },
    )

    fields: dict[str, Any] = {
        "enabled": _as_bool(raw.get("enabled", True), "telemetry.enabled"),
        "sample_interval_sec": _as_int(
            raw.get("sample_interval_sec", 30), "telemetry.sample_interval_sec"
        ),
    }
    # The capture switches all share the same shape: a bool defaulting to True.
    for flag in capture_flags:
        fields[flag] = _as_bool(raw.get(flag, True), f"telemetry.{flag}")
    return TelemetrySpec(**fields)
341
+
342
+
343
+ def _check_keys(raw: Mapping[str, Any], label: str, allowed: set[str]) -> None:
344
+ unknown = set(raw) - allowed
345
+ if unknown:
346
+ raise KeyError(
347
+ f"unknown {label} keys: {sorted(unknown)}; expected subset of {sorted(allowed)}"
348
+ )
349
+
350
+
351
+ def _mapping(raw: Any, label: str) -> Mapping[str, Any]:
352
+ if not isinstance(raw, Mapping):
353
+ raise TypeError(f"{label} must be a mapping, got {type(raw).__name__}")
354
+ return raw
355
+
356
+
357
+ def _as_str(raw: Any, label: str) -> str:
358
+ if not isinstance(raw, str):
359
+ raise TypeError(f"{label} must be str, got {type(raw).__name__}")
360
+ return raw
361
+
362
+
363
+ def _optional_str(raw: Any, label: str) -> str | None:
364
+ if raw is None:
365
+ return None
366
+ return _as_str(raw, label)
367
+
368
+
369
+ def _as_int(raw: Any, label: str) -> int:
370
+ if not isinstance(raw, int) or isinstance(raw, bool):
371
+ raise TypeError(f"{label} must be int, got {type(raw).__name__}")
372
+ return raw
373
+
374
+
375
+ def _optional_int(raw: Any, label: str) -> int | None:
376
+ if raw is None:
377
+ return None
378
+ return _as_int(raw, label)
379
+
380
+
381
+ def _as_float(raw: Any, label: str) -> float:
382
+ if not isinstance(raw, (int, float)) or isinstance(raw, bool):
383
+ raise TypeError(f"{label} must be float, got {type(raw).__name__}")
384
+ return float(raw)
385
+
386
+
387
+ def _as_bool(raw: Any, label: str) -> bool:
388
+ if not isinstance(raw, bool):
389
+ raise TypeError(f"{label} must be bool, got {type(raw).__name__}")
390
+ return raw
391
+
392
+
393
+ def _tuple_str(raw: Any, label: str) -> tuple[str, ...]:
394
+ if raw is None:
395
+ raise TypeError(f"{label} must be a list of strings, got None")
396
+ if not isinstance(raw, Sequence) or isinstance(raw, str):
397
+ raise TypeError(f"{label} must be a list of strings, got {type(raw).__name__}")
398
+ out: list[str] = []
399
+ for i, item in enumerate(raw):
400
+ if not isinstance(item, str):
401
+ raise TypeError(f"{label}[{i}] must be str, got {type(item).__name__}")
402
+ out.append(item)
403
+ return tuple(out)
404
+
405
+
406
+ def _parse_str_dict(raw: Any, label: str) -> dict[str, str]:
407
+ if not isinstance(raw, Mapping):
408
+ raise TypeError(f"{label} must be a mapping, got {type(raw).__name__}")
409
+ out: dict[str, str] = {}
410
+ for key, value in raw.items():
411
+ if not isinstance(key, str):
412
+ raise TypeError(f"{label} key must be str, got {type(key).__name__}")
413
+ if not isinstance(value, str):
414
+ raise TypeError(f"{label}[{key!r}] must be str, got {type(value).__name__}")
415
+ out[key] = value
416
+ return out
417
+
418
+
419
def resolve_relative_path(value: str, *, base: Path) -> Path:
    """Return ``value`` as a resolved path, anchoring relative paths at ``base``.

    ``~`` is expanded first.  An absolute result is resolved as is; anything
    else is joined onto ``base`` before being resolved.
    """
    expanded = Path(value).expanduser()
    anchored = expanded if expanded.is_absolute() else base / expanded
    return anchored.resolve()
425
+
426
+
427
def render_template(value: str, variables: Mapping[str, str]) -> str:
    """Return ``value`` with every ``{name}`` placeholder replaced from ``variables``.

    Placeholders are whatever ``_TEMPLATE_RE`` matches.

    Raises:
        KeyError: if a placeholder names a variable missing from ``variables``.
    """

    def substitute(found: re.Match[str]) -> str:
        key = found[1]
        if key in variables:
            return variables[key]
        raise KeyError(f"unknown template variable {key!r} in {value!r}")

    return _TEMPLATE_RE.sub(substitute, value)