opbdh 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
opbdh/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """OPBDH: a small RunPod launcher for model-backed scripts."""
2
+
3
# Names exported by `from opbdh import *`.
__all__ = ["__version__"]

# Version string of this package.
__version__ = "0.1.0"
opbdh/cli.py ADDED
@@ -0,0 +1,408 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import asdict
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import typer
9
+ from rich.console import Console
10
+ from rich.table import Table
11
+
12
+ from .config import OpbdhConfig, global_config_path, load_config, save_config
13
+ from .gpu import candidate_gpus
14
+ from .hal import QUOTE_OVERSPEND, QUOTE_REFUSAL, QUOTE_SUCCESS, hal_says
15
+ from .hf import estimate_model_size_gb, suggested_network_volume_gb
16
+ from .runpod import MaxSpendReached, make_plan, plan_summary, run_plan
17
+ from .verify import verify_code
18
+
19
+
20
# Root CLI application; the three sub-apps below are mounted on it as command groups.
app = typer.Typer(help="OPBDH: Open the Pod Bay Door, Hal. Run model scripts on RunPod.")
run_app = typer.Typer(help="Plan, launch, or interactively build a RunPod run.")
config_app = typer.Typer(help="Inspect and build OPBDH config.")
models_app = typer.Typer(help="Hugging Face model helpers.")
# Mount each sub-app under its own group name (`run`, `config`, `models`).
app.add_typer(run_app, name="run")
app.add_typer(config_app, name="config")
app.add_typer(models_app, name="models")
# Shared rich console used for the CLI output written by this module.
console = Console()
28
+
29
+
30
def _overrides(**kwargs: Any) -> dict[str, Any]:
    """Collect keyword arguments into an override mapping, skipping unset ones.

    A value counts as unset when it is ``None`` or the empty string; every
    other value (including ``False`` and ``0``) is kept.
    """
    selected: dict[str, Any] = {}
    for name, candidate in kwargs.items():
        if candidate is None or candidate == "":
            continue
        selected[name] = candidate
    return selected
32
+
33
+
34
def _print_plan(payload: dict[str, Any]) -> None:
    """Render a plan summary mapping as a two-column rich table on the console.

    List values are shown one item per line (``[]`` when the joined text is
    empty); every other value is shown via ``str()``.
    """
    plan_table = Table(title="OPBDH plan")
    plan_table.add_column("Field", style="cyan")
    plan_table.add_column("Value")
    for field_name, field_value in payload.items():
        if not isinstance(field_value, list):
            plan_table.add_row(field_name, str(field_value))
            continue
        joined = "\n".join(map(str, field_value))
        plan_table.add_row(field_name, joined if joined else "[]")
    console.print(plan_table)
45
+
46
+
47
def _confirm_launch(plan_payload: dict[str, Any]) -> bool:
    """Warn that the launch is billable and ask the user to confirm it.

    Reads ``estimated_hourly_dollars`` and ``max_spend_dollars`` from the plan
    summary for the warning line. Returns the user's answer; the prompt
    defaults to "no".
    """
    hourly_estimate = plan_payload.get("estimated_hourly_dollars")
    spend_guard = plan_payload.get("max_spend_dollars")
    warning = (
        "[yellow]This can launch billable RunPod compute.[/] "
        f"Estimated first GPU candidate: ${hourly_estimate}/hr, max spend guard: ${spend_guard}."
    )
    console.print(warning)
    return typer.confirm("Launch now?", default=False)
55
+
56
+
57
@app.command()
def plan(
    code: Path | None = typer.Argument(None, help="Code file or directory to upload."),
    config_file: Path | None = typer.Option(None, "--config", "-c", help="Local OPBDH JSON config."),
    model: str | None = typer.Option(None, "--model", "-m", help="Hugging Face model id."),
    command: str | None = typer.Option(None, "--command", help="Remote shell command. Defaults from code path."),
    vram_gb: int | None = typer.Option(None, "--vram-gb", help="Minimum GPU VRAM."),
    max_dollars_per_hour: float | None = typer.Option(None, "--max-dollars-per-hour", help="Estimated hourly cap."),
    max_spend: float | None = typer.Option(None, "--max-spend", help="Spend guard for this run."),
) -> None:
    # Documented with comments rather than a docstring so the generated
    # `--help` text of this command stays exactly as it was.
    #
    # Builds the effective config from the CLI options, makes a plan for the
    # configured code path and prints its summary; `run_plan` is never called.
    cli_overrides = _overrides(
        model_id=model,
        code=None if not code else str(code),
        command=command,
        vram_gb=vram_gb,
        max_dollars_per_hour=max_dollars_per_hour,
        max_spend_dollars=max_spend,
    )
    cfg = load_config(local_config=config_file, overrides=cli_overrides)
    if cfg.code:
        _print_plan(plan_summary(make_plan(cfg, code_path=Path(cfg.code))))
        return
    # Neither the argument nor any config layer supplied a code path.
    raise typer.BadParameter("Code path is required, either as an argument or config.code.")
82
+
83
+
84
@app.command()
def verify(
    code: Path = typer.Argument(..., help="Code file or directory to statically verify."),
    command: str = typer.Option("", "--command", help="Remote command, if the path needs one."),
) -> None:
    # Documented with comments rather than a docstring so the generated
    # `--help` text of this command stays exactly as it was.
    #
    # Runs `verify_code` on the path; prints an OK line on success, otherwise
    # prints every reported error and exits with status 1.
    outcome = verify_code(code, command=command)
    if not outcome.ok:
        hal_says(QUOTE_REFUSAL)
        for problem in outcome.errors:
            console.print(f"[red]{problem}[/]")
        raise typer.Exit(1)
    console.print(f"[green]OK[/] checked {len(outcome.checked)} file(s).")
97
+
98
+
99
def _load_run_config(
    *,
    code: Path | None,
    config_file: Path | None,
    model: str | None,
    command: str | None,
    vram_gb: int | None,
    max_dollars_per_hour: float | None,
    max_spend: float | None,
    network_volume_id: str | None,
    auto_network_volume: bool | None,
    network_volume_data_center_id: str | None,
) -> OpbdhConfig:
    """Load the config for a run, layering the given CLI options on top.

    Options left as ``None`` (or empty strings) are filtered out by
    ``_overrides`` and therefore do not override the loaded config.
    """
    cli_overrides = _overrides(
        model_id=model,
        code=None if not code else str(code),
        command=command,
        vram_gb=vram_gb,
        max_dollars_per_hour=max_dollars_per_hour,
        max_spend_dollars=max_spend,
        network_volume_id=network_volume_id,
        auto_network_volume=auto_network_volume,
        network_volume_data_center_id=network_volume_data_center_id,
    )
    return load_config(local_config=config_file, overrides=cli_overrides)
126
+
127
+
128
def _execute_run(config: OpbdhConfig, *, dry_run: bool, yes: bool) -> None:
    """Plan a run from ``config`` and then dry-run or launch it.

    Raises ``typer.BadParameter`` when the config has no code path, and
    ``typer.Exit(1)`` when the user declines the launch or the run stops with
    ``MaxSpendReached``.
    """
    if not config.code:
        raise typer.BadParameter("Code path is required, either as an argument or config.code.")

    opbdh_plan = make_plan(config, code_path=Path(config.code))
    summary = plan_summary(opbdh_plan)
    _print_plan(summary)

    if dry_run:
        run_plan(opbdh_plan, dry_run=True)
        console.print(f"[green]Dry run written to[/] {opbdh_plan.results_dir}")
        return

    # `--yes` skips the prompt entirely; otherwise the user must confirm.
    confirmed = yes or _confirm_launch(summary)
    if not confirmed:
        hal_says(QUOTE_REFUSAL)
        raise typer.Exit(1)

    try:
        outcome = run_plan(opbdh_plan)
    except MaxSpendReached as exc:
        hal_says(QUOTE_OVERSPEND)
        console.print(f"[red]{exc}[/] Results synced so far are in {opbdh_plan.results_dir}.")
        raise typer.Exit(1) from exc

    if outcome:
        hal_says(QUOTE_SUCCESS)
        console.print(f"[green]Run complete[/] {outcome.results_dir}")
150
+
151
+
152
@run_app.command("now")
def run_now(
    code: Path | None = typer.Argument(None, help="Code file or directory to upload."),
    config_file: Path | None = typer.Option(None, "--config", "-c", help="Local OPBDH JSON config."),
    model: str | None = typer.Option(None, "--model", "-m", help="Hugging Face model id."),
    command: str | None = typer.Option(None, "--command", help="Remote shell command. Defaults from code path."),
    vram_gb: int | None = typer.Option(None, "--vram-gb", help="Minimum GPU VRAM."),
    max_dollars_per_hour: float | None = typer.Option(None, "--max-dollars-per-hour", help="Estimated hourly cap."),
    max_spend: float | None = typer.Option(None, "--max-spend", help="Spend guard for this run."),
    network_volume_id: str | None = typer.Option(None, "--network-volume-id", help="Existing RunPod network volume id."),
    auto_network_volume: bool | None = typer.Option(
        None,
        "--auto-network-volume/--no-auto-network-volume",
        help="Create a network volume if none is configured.",
    ),
    network_volume_data_center_id: str | None = typer.Option(
        None,
        "--network-volume-data-center-id",
        help="RunPod data center id for auto-created volumes, for example EU-RO-1.",
    ),
    dry_run: bool = typer.Option(False, "--dry-run", help="Verify and print the plan without contacting RunPod."),
    yes: bool = typer.Option(False, "--yes", "-y", help="Skip billable-compute confirmation."),
) -> None:
    # Documented with comments rather than a docstring so the generated
    # `--help` text of this command stays exactly as it was.
    #
    # Resolves the run config from the CLI options and hands it to
    # `_execute_run`, which plans and then dry-runs or launches.
    _execute_run(
        _load_run_config(
            code=code,
            config_file=config_file,
            model=model,
            command=command,
            vram_gb=vram_gb,
            max_dollars_per_hour=max_dollars_per_hour,
            max_spend=max_spend,
            network_volume_id=network_volume_id,
            auto_network_volume=auto_network_volume,
            network_volume_data_center_id=network_volume_data_center_id,
        ),
        dry_run=dry_run,
        yes=yes,
    )
188
+
189
+
190
@app.command("launch")
def launch(
    code: Path | None = typer.Argument(None, help="Code file or directory to upload."),
    config_file: Path | None = typer.Option(None, "--config", "-c", help="Local OPBDH JSON config."),
    model: str | None = typer.Option(None, "--model", "-m", help="Hugging Face model id."),
    command: str | None = typer.Option(None, "--command", help="Remote shell command. Defaults from code path."),
    vram_gb: int | None = typer.Option(None, "--vram-gb", help="Minimum GPU VRAM."),
    max_dollars_per_hour: float | None = typer.Option(None, "--max-dollars-per-hour", help="Estimated hourly cap."),
    max_spend: float | None = typer.Option(None, "--max-spend", help="Spend guard for this run."),
    network_volume_id: str | None = typer.Option(None, "--network-volume-id", help="Existing RunPod network volume id."),
    # The two options below carry the same help text as on `opbdh run now`,
    # so the shortcut's `--help` output documents them too.
    auto_network_volume: bool | None = typer.Option(
        None,
        "--auto-network-volume/--no-auto-network-volume",
        help="Create a network volume if none is configured.",
    ),
    network_volume_data_center_id: str | None = typer.Option(
        None,
        "--network-volume-data-center-id",
        help="RunPod data center id for auto-created volumes, for example EU-RO-1.",
    ),
    dry_run: bool = typer.Option(False, "--dry-run", help="Verify and print the plan without contacting RunPod."),
    yes: bool = typer.Option(False, "--yes", "-y", help="Skip billable-compute confirmation."),
) -> None:
    """Shortcut for `opbdh run now`."""
    # Same flow as `run now`: resolve the config from the CLI options, then
    # let `_execute_run` plan and dry-run or launch.
    cfg = _load_run_config(
        code=code,
        config_file=config_file,
        model=model,
        command=command,
        vram_gb=vram_gb,
        max_dollars_per_hour=max_dollars_per_hour,
        max_spend=max_spend,
        network_volume_id=network_volume_id,
        auto_network_volume=auto_network_volume,
        network_volume_data_center_id=network_volume_data_center_id,
    )
    _execute_run(cfg, dry_run=dry_run, yes=yes)
219
+
220
+
221
@run_app.command("wizard")
def run_wizard(
    config_file: Path | None = typer.Option(None, "--config", "-c", help="Local OPBDH JSON config."),
) -> None:
    """Interactively collect run settings, then plan or launch the run."""
    try:
        import questionary
    except Exception as exc:
        raise typer.BadParameter("questionary is required for the run wizard.") from exc

    def _ask_number(prompt: str, default: Any, caster: Any) -> Any:
        # A blank or cancelled answer falls back to the configured default.
        answer = questionary.text(prompt, default=str(default)).ask() or str(default)
        try:
            return caster(answer)
        except ValueError as exc:
            # Report bad input as a CLI error instead of an uncaught traceback.
            raise typer.BadParameter(f"{prompt}: {answer!r} is not a valid number.") from exc

    # Values from the loaded config are used as the prompt defaults.
    base = load_config(local_config=config_file)
    model = _questionary_model(questionary, default=base.model_id or "Qwen")
    model_estimate = estimate_model_size_gb(model)
    code = questionary.path("Code file or directory", default=base.code or str(Path.cwd() / "run.py")).ask() or base.code
    command_answer = questionary.text("Remote command override", default=base.command).ask()
    vram_gb = _ask_number("Minimum VRAM GB", base.vram_gb, int)

    hourly_default = "" if base.max_dollars_per_hour is None else str(base.max_dollars_per_hour)
    hourly_text = questionary.text("Max dollars/hour estimate (blank for no cap)", default=hourly_default).ask() or ""
    try:
        # NOTE(review): a blank answer yields None, which the config merge
        # ignores, so a cap already set in a config file still applies.
        hourly_cap = float(hourly_text) if hourly_text else None
    except ValueError as exc:
        raise typer.BadParameter(f"Max dollars/hour estimate: {hourly_text!r} is not a valid number.") from exc

    spend = _ask_number("Max spend dollars", base.max_spend_dollars, float)
    volume_answer = questionary.text("Existing network volume id (blank for none)", default=base.network_volume_id).ask()
    network_volume_id = volume_answer or ""

    auto_volume = False
    data_center = base.network_volume_data_center_id
    # Tracked in a local instead of being written back onto the loaded config.
    volume_size_gb = base.network_volume_size_gb
    if not network_volume_id:
        suggested_volume = suggested_network_volume_gb(model_estimate, fallback_gb=base.pod_volume_gb)
        auto_volume = bool(questionary.confirm(f"Create a network volume if needed? Suggested size: {suggested_volume} GB", default=base.auto_network_volume).ask())
        if auto_volume:
            data_center = questionary.text("RunPod data center id", default=data_center or "EU-RO-1").ask() or ""
            volume_size_gb = suggested_volume

    dry_run = bool(questionary.confirm("Dry run first?", default=True).ask())
    # A dry run never reaches the launch confirmation, so that question is skipped.
    yes = True if dry_run else bool(questionary.confirm("Skip launch confirmation?", default=False).ask())

    overrides = _overrides(
        model_id=model,
        code=code,
        vram_gb=vram_gb,
        max_dollars_per_hour=hourly_cap,
        max_spend_dollars=spend,
        auto_network_volume=auto_volume,
        network_volume_data_center_id=data_center,
        network_volume_size_gb=volume_size_gb,
    )
    # `_overrides` drops empty strings, which would silently bring back a value
    # from the config file that the user just blanked out. Answers that were
    # actually given (not cancelled) are therefore passed through as-is.
    if command_answer is not None:
        overrides["command"] = command_answer
    if volume_answer is not None:
        overrides["network_volume_id"] = volume_answer

    cfg = load_config(local_config=config_file, overrides=overrides)
    _execute_run(cfg, dry_run=dry_run, yes=yes)
266
+
267
+
268
@config_app.command("show")
def config_show(
    config_file: Path | None = typer.Option(None, "--config", "-c", help="Local OPBDH JSON config."),
) -> None:
    # Documented with comments rather than a docstring so the generated
    # `--help` text of this command stays exactly as it was.
    #
    # Loads the effective config and prints it as key-sorted JSON.
    effective = asdict(load_config(local_config=config_file))
    rendered = json.dumps(effective, indent=2, sort_keys=True)
    console.print_json(rendered)
274
+
275
+
276
@config_app.command("write")
def config_write(
    output: Path | None = typer.Option(None, "--output", "-o", help="Config path. Defaults to global config."),
    model: str = typer.Option(..., "--model", "-m", help="Hugging Face model id."),
    code: str = typer.Option("", "--code", help="Default code path. Supports {cwd}, {model_slug}, and env vars."),
    command: str = typer.Option("", "--command", help="Default remote command."),
    vram_gb: int = typer.Option(24, "--vram-gb"),
    max_dollars_per_hour: float | None = typer.Option(None, "--max-dollars-per-hour"),
    max_spend: float = typer.Option(5.0, "--max-spend"),
    auto_network_volume: bool = typer.Option(False, "--auto-network-volume/--no-auto-network-volume"),
    network_volume_data_center_id: str = typer.Option("", "--network-volume-data-center-id"),
) -> None:
    # Documented with comments rather than a docstring so the generated
    # `--help` text of this command stays exactly as it was.
    #
    # Builds a config from the options only (fields without an option keep
    # their dataclass defaults) and writes it to `--output` or the global path.
    settings = {
        "model_id": model,
        "code": code,
        "command": command,
        "vram_gb": vram_gb,
        "max_dollars_per_hour": max_dollars_per_hour,
        "max_spend_dollars": max_spend,
        "auto_network_volume": auto_network_volume,
        "network_volume_data_center_id": network_volume_data_center_id,
    }
    cfg = OpbdhConfig(**settings)
    destination = output or global_config_path()
    written = save_config(cfg, destination)
    console.print(f"[green]Wrote[/] {written}")
300
+
301
+
302
@config_app.command("wizard")
def config_wizard(
    scope: str = typer.Option("global", "--scope", help="global or local"),
    output: Path | None = typer.Option(None, "--output", "-o"),
) -> None:
    """Interactively build an OPBDH config and write it to disk."""
    try:
        import questionary
    except Exception as exc:
        raise typer.BadParameter("questionary is required for the wizard; use `opbdh config write` instead.") from exc

    # Resolve the destination before prompting, so an invalid --scope fails
    # immediately instead of after every question has been answered.
    # An explicit --output takes precedence and --scope is then not consulted.
    if output:
        target = output
    elif scope == "local":
        target = Path.cwd() / "opbdh.json"
    elif scope == "global":
        target = global_config_path()
    else:
        raise typer.BadParameter("--scope must be global or local")

    def _ask_number(prompt: str, default: str, caster: Any) -> Any:
        # A blank or cancelled answer falls back to the default.
        answer = questionary.text(prompt, default=default).ask() or default
        try:
            return caster(answer)
        except ValueError as exc:
            # Report bad input as a CLI error instead of an uncaught traceback.
            raise typer.BadParameter(f"{prompt}: {answer!r} is not a valid number.") from exc

    model = _questionary_model(questionary)
    model_estimate = estimate_model_size_gb(model)
    suggested_volume = suggested_network_volume_gb(model_estimate)
    code = questionary.text("Default local code path", default="{cwd}/run.py").ask() or ""
    command = questionary.text("Remote command override", default="").ask() or ""
    vram_gb = _ask_number("Minimum VRAM GB", "24", int)
    hourly_text = questionary.text("Max dollars/hour estimate (blank for no cap)", default="").ask() or ""
    try:
        hourly_cap = float(hourly_text) if hourly_text else None
    except ValueError as exc:
        raise typer.BadParameter(f"Max dollars/hour estimate: {hourly_text!r} is not a valid number.") from exc
    spend = _ask_number("Max spend dollars", "5", float)

    auto_volume = bool(questionary.confirm("Create a RunPod network volume when none is configured?", default=False).ask())
    data_center = ""
    if auto_volume:
        data_center = questionary.text("RunPod data center id for the volume", default="EU-RO-1").ask() or ""
        console.print(f"Suggested volume size for {model}: {suggested_volume} GB")

    cfg = OpbdhConfig(
        model_id=model,
        code=code,
        command=command,
        vram_gb=vram_gb,
        max_dollars_per_hour=hourly_cap,
        max_spend_dollars=spend,
        auto_network_volume=auto_volume,
        network_volume_data_center_id=data_center,
        # The suggested size is only stored when auto-creation was chosen.
        network_volume_size_gb=suggested_volume if auto_volume else None,
    )
    save_config(cfg, target)
    console.print(f"[green]Wrote[/] {target}")
346
+
347
+
348
def _questionary_model(questionary: Any, *, default: str = "Qwen") -> str:
    """Prompt for a Hugging Face model id, offering search results when available.

    Asks for a search query, looks up at most 25 matching models and offers
    them through an autocomplete prompt. When the query is blank, the lookup
    fails or returns nothing, or nothing is selected, a plain text prompt
    (pre-filled with the query) is used instead.
    """
    query = questionary.text("Search Hugging Face models", default=default).ask() or ""

    if not query.strip():
        matches: list[str] = []
    else:
        try:
            from huggingface_hub import HfApi

            matches = [hit.modelId for hit in HfApi().list_models(search=query, limit=25) if hit.modelId]
        except Exception:
            # Best-effort lookup: any failure simply means no suggestions.
            matches = []

    if matches:
        picked = questionary.autocomplete("Model", choices=matches, default=matches[0]).ask()
        if picked:
            return str(picked)

    typed = questionary.text("Model id", default=query).ask()
    return typed or query
363
+
364
+
365
@models_app.command("search")
def models_search(query: str, limit: int = typer.Option(10, "--limit", "-n")) -> None:
    # Documented with comments rather than a docstring so the generated
    # `--help` text of this command stays exactly as it was.
    #
    # Lists models matching `query` in a table of model id and download count.
    from huggingface_hub import HfApi

    results = Table(title=f"Hugging Face models: {query}")
    results.add_column("Model")
    results.add_column("Downloads", justify="right")
    for hit in HfApi().list_models(search=query, limit=limit):
        # A missing or falsy download count is rendered as an empty cell.
        downloads = getattr(hit, "downloads", "") or ""
        results.add_row(str(hit.modelId), str(downloads))
    console.print(results)
375
+
376
+
377
@models_app.command("size")
def models_size(model: str) -> None:
    # Documented with comments rather than a docstring so the generated
    # `--help` text of this command stays exactly as it was.
    #
    # Prints the size estimate for `model`, where it came from, and the
    # suggested network volume size, as one JSON object.
    estimate = estimate_model_size_gb(model)
    report = {
        "model": model,
        "size_gb": estimate.size_gb,
        "source": estimate.source,
        "suggested_network_volume_gb": suggested_network_volume_gb(estimate),
    }
    console.print_json(json.dumps(report))
386
+
387
+
388
@app.command("gpus")
def gpus(
    vram_gb: int = typer.Option(24, "--vram-gb"),
    max_dollars_per_hour: float | None = typer.Option(None, "--max-dollars-per-hour"),
    cloud_type: str = typer.Option("SECURE", "--cloud-type"),
) -> None:
    # Documented with comments rather than a docstring so the generated
    # `--help` text of this command stays exactly as it was.
    #
    # Prints the GPU candidates that satisfy the VRAM and price filters.
    listing = Table(title="OPBDH GPU candidates")
    listing.add_column("RunPod GPU id")
    listing.add_column("VRAM", justify="right")
    listing.add_column("$/hr estimate", justify="right")
    for offer in candidate_gpus(vram_gb, max_dollars_per_hour, cloud_type):
        price = offer.hourly(cloud_type)
        listing.add_row(offer.id, str(offer.memory_gb), f"{price:.2f}")
    console.print(listing)
401
+
402
+
403
def main() -> None:
    """Run the OPBDH Typer application."""
    app()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
opbdh/config.py ADDED
@@ -0,0 +1,158 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from dataclasses import asdict, dataclass, fields
6
+ from datetime import UTC, datetime
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+
11
# Defaults for the `container_disk_gb`, `image` and `pod_volume_gb` fields of OpbdhConfig.
DEFAULT_RUNPOD_CONTAINER_DISK_GB = 120
DEFAULT_RUNPOD_IMAGE = "runpod/pytorch:2.8.0-py3.11-cuda12.8.1-cudnn-devel-ubuntu22.04"
DEFAULT_RUNPOD_VOLUME_GB = 160
# File names probed, in this order, in each directory during local config discovery.
LOCAL_CONFIG_NAMES = ("opbdh.json", ".opbdh.json")
15
+
16
+
17
class _SafeFormatDict(dict[str, str]):
    """Mapping for ``str.format_map`` that leaves unknown placeholders in place."""

    def __missing__(self, key: str) -> str:
        # Re-emit the placeholder text so an unknown ``{name}`` survives formatting.
        return f"{{{key}}}"
20
+
21
+
22
@dataclass(slots=True)
class OpbdhConfig:
    """Flat settings record for an OPBDH run.

    Every field has a default, so an empty config is valid. String fields may
    contain ``{placeholder}`` names and environment variables, which are
    expanded by ``interpolate_config``.
    """

    # Hugging Face model id.
    model_id: str = ""
    # Local code file or directory to upload.
    code: str = ""
    # Remote shell command; empty means no explicit command was configured.
    command: str = ""
    # Container image; presumably the image the pod is started from — confirm in the runner.
    image: str = DEFAULT_RUNPOD_IMAGE
    # Cloud tier used for price lookups; anything other than "COMMUNITY"
    # (case-insensitive) is priced with the secure rate.
    cloud_type: str = "SECURE"
    # Minimum GPU VRAM in GB.
    vram_gb: int = 24
    # Estimated hourly price cap; None means no cap.
    max_dollars_per_hour: float | None = None
    # Spend guard for a run, in dollars.
    max_spend_dollars: float = 5.0
    # Disk sizes in GB. NOTE(review): how these are applied is not visible here.
    container_disk_gb: int = DEFAULT_RUNPOD_CONTAINER_DISK_GB
    pod_volume_gb: int = DEFAULT_RUNPOD_VOLUME_GB
    # Existing RunPod network volume id; empty for none.
    network_volume_id: str = ""
    # Whether to create a network volume when none is configured.
    auto_network_volume: bool = False
    # RunPod data center id for auto-created volumes.
    network_volume_data_center_id: str = ""
    # Name template for the volume; `{model_slug}` is filled in during interpolation.
    network_volume_name: str = "opbdh-{model_slug}"
    # Volume size in GB; None until a size is chosen (the wizards store a suggested size).
    network_volume_size_gb: int | None = None
    # NOTE(review): the fields below are not referenced in the code visible
    # here; their exact semantics live in the runner — confirm there.
    pre_download_model: bool = True
    results_dir: str = "runpod_results"
    poll_seconds: int = 20
    failure_keepalive_seconds: int = 120
    keep_pod_on_success: bool = False
    ssh_key: str = "~/.ssh/id_ed25519"
    ssh_public_key: str = "~/.ssh/id_ed25519.pub"
46
+
47
+
48
def config_dir() -> Path:
    """Return the resolved directory that holds OPBDH's global configuration.

    Precedence: ``$OPBDH_CONFIG_DIR``, then ``$XDG_CONFIG_HOME/opbdh``, then
    ``~/.config/opbdh``. Blank or whitespace-only variables count as unset.
    """
    override = os.environ.get("OPBDH_CONFIG_DIR", "").strip()
    xdg_home = os.environ.get("XDG_CONFIG_HOME", "").strip()
    if override:
        location = Path(override).expanduser()
    elif xdg_home:
        location = Path(xdg_home).expanduser() / "opbdh"
    else:
        location = Path.home() / ".config" / "opbdh"
    return location.resolve()
56
+
57
+
58
def global_config_path() -> Path:
    """Return the path of the global config file.

    ``$OPBDH_CONFIG`` (when non-blank) names the file directly; otherwise the
    file is ``config.json`` inside ``config_dir()``.
    """
    override = os.environ.get("OPBDH_CONFIG", "").strip()
    if not override:
        return config_dir() / "config.json"
    return Path(override).expanduser().resolve()
63
+
64
+
65
def discover_local_config(start: Path | None = None) -> Path | None:
    """Find the nearest local config file at or above ``start``.

    ``start`` defaults to the current working directory; when it is a file, the
    search begins in its directory. Each directory up to the filesystem root is
    probed for the names in ``LOCAL_CONFIG_NAMES`` and the first existing path
    is returned, or ``None`` when nothing is found.
    """
    origin = (start or Path.cwd()).expanduser().resolve()
    search_root = origin.parent if origin.is_file() else origin
    probes = (
        directory / name
        for directory in (search_root, *search_root.parents)
        for name in LOCAL_CONFIG_NAMES
    )
    return next((probe for probe in probes if probe.exists()), None)
75
+
76
+
77
def _read_json_object(path: Path | None) -> dict[str, Any]:
    """Read a JSON object from ``path``.

    Returns an empty dict when ``path`` is ``None`` or does not exist. Raises
    ``ValueError`` when the file's top-level JSON value is not an object.
    """
    if not (path and path.exists()):
        return {}
    with path.open(encoding="utf-8") as handle:
        payload = json.load(handle)
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path} must contain a JSON object")
84
+
85
+
86
def _known_fields() -> set[str]:
    """Return the names of all fields declared on ``OpbdhConfig``."""
    names: set[str] = set()
    for declared in fields(OpbdhConfig):
        names.add(declared.name)
    return names
88
+
89
+
90
def _coerce_config(data: dict[str, Any]) -> OpbdhConfig:
    """Build an ``OpbdhConfig`` from a mapping.

    Unknown keys and ``None`` values are dropped, so the corresponding fields
    keep their dataclass defaults.
    """
    allowed = _known_fields()
    kwargs: dict[str, Any] = {}
    for name, value in data.items():
        if value is None or name not in allowed:
            continue
        kwargs[name] = value
    return OpbdhConfig(**kwargs)
94
+
95
+
96
def merge_config(*layers: dict[str, Any]) -> OpbdhConfig:
    """Merge config layers over the defaults; later layers win.

    Only known field names are considered, and a ``None`` value never
    overrides what an earlier layer (or the defaults) provided.
    """
    recognised = _known_fields()
    combined: dict[str, Any] = asdict(OpbdhConfig())
    for layer in layers:
        combined.update(
            {name: value for name, value in layer.items() if name in recognised and value is not None}
        )
    return _coerce_config(combined)
104
+
105
+
106
def save_config(config: OpbdhConfig, path: Path) -> Path:
    """Write ``config`` as indented, key-sorted JSON and return the resolved path.

    Missing parent directories are created. The file ends with a newline.
    """
    destination = path.expanduser().resolve()
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(asdict(config), indent=2, sort_keys=True)
    destination.write_text(f"{serialized}\n", encoding="utf-8")
    return destination
111
+
112
+
113
def model_slug(model_id: str) -> str:
    """Turn a model id into a lowercase slug.

    ``/`` and every character that is not alphanumeric, ``-``, ``_`` or ``.``
    become ``-``; leading and trailing dashes are stripped. An empty result
    falls back to ``"model"``.
    """
    normalized = model_id.strip().lower().replace("/", "-")
    allowed_punctuation = {"-", "_", "."}
    cleaned = "".join(
        ch if (ch.isalnum() or ch in allowed_punctuation) else "-"
        for ch in normalized
    )
    return cleaned.strip("-") or "model"
116
+
117
+
118
def interpolation_context(config: OpbdhConfig, *, cwd: Path | None = None, run_id: str | None = None) -> dict[str, str]:
    """Build the placeholder values available to config string interpolation.

    ``run_id`` defaults to the current UTC time formatted as
    ``YYYYmmdd-HHMMSS``; ``timestamp`` carries the same value. ``cwd`` defaults
    to the current working directory.
    """
    if run_id:
        effective_run_id = run_id
    else:
        effective_run_id = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
    working_dir = (cwd or Path.cwd()).expanduser().resolve()

    context: dict[str, str] = {}
    context["config_dir"] = str(config_dir())
    context["cwd"] = str(working_dir)
    context["model_id"] = config.model_id
    context["model_slug"] = model_slug(config.model_id)
    context["run_id"] = effective_run_id
    context["timestamp"] = effective_run_id
    return context
130
+
131
+
132
def interpolate_value(value: str, context: dict[str, str]) -> str:
    """Expand environment variables and ``{placeholder}`` names in ``value``.

    Environment variables are expanded first, then the text is formatted with
    ``str.format_map``; placeholders missing from ``context`` are left as-is.

    Text that is not a valid format string (a stray brace, ``{}``, a bad format
    spec, attribute access on an unknown placeholder) would make
    ``format_map`` raise. In that case only simple ``{name}`` placeholders that
    exist in ``context`` are substituted and everything else is returned
    unchanged. In this fallback ``{{`` / ``}}`` escapes are not collapsed.
    """
    import re

    expanded = os.path.expandvars(value)
    try:
        return expanded.format_map(_SafeFormatDict(context))
    except (ValueError, IndexError, KeyError, AttributeError, TypeError):
        # Best-effort substitution so strings with literal braces (for example
        # a shell command) do not abort config loading.
        return re.sub(
            r"\{([A-Za-z_][A-Za-z0-9_]*)\}",
            lambda match: context.get(match.group(1), match.group(0)),
            expanded,
        )
134
+
135
+
136
def interpolate_config(config: OpbdhConfig, *, cwd: Path | None = None, run_id: str | None = None) -> OpbdhConfig:
    """Return a copy of ``config`` with every string field interpolated.

    Non-string fields are carried over unchanged.
    """
    context = interpolation_context(config, cwd=cwd, run_id=run_id)
    rendered = {
        name: interpolate_value(value, context) if isinstance(value, str) else value
        for name, value in asdict(config).items()
    }
    return _coerce_config(rendered)
143
+
144
+
145
def load_config(
    *,
    local_config: Path | None = None,
    overrides: dict[str, Any] | None = None,
    cwd: Path | None = None,
    run_id: str | None = None,
) -> OpbdhConfig:
    """Load the effective config and interpolate its string fields.

    Layers are merged in increasing precedence: the global config file, the
    local config file (``local_config`` if given, otherwise discovered from
    ``cwd``), then ``overrides``.
    """
    local_path = discover_local_config(cwd) if local_config is None else local_config
    layers = (
        _read_json_object(global_config_path()),
        _read_json_object(local_path),
        overrides or {},
    )
    return interpolate_config(merge_config(*layers), cwd=cwd, run_id=run_id)
opbdh/gpu.py ADDED
@@ -0,0 +1,55 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
@dataclass(frozen=True, slots=True)
class GpuOffer:
    """A GPU type together with its memory size and estimated hourly prices."""

    id: str
    memory_gb: int
    community_dollars_per_hour: float
    secure_dollars_per_hour: float

    def hourly(self, cloud_type: str) -> float:
        """Return the hourly estimate for ``cloud_type``.

        ``"COMMUNITY"`` (case-insensitive) selects the community price; any
        other value selects the secure price.
        """
        if cloud_type.upper() == "COMMUNITY":
            return self.community_dollars_per_hour
        return self.secure_dollars_per_hour
15
+
16
+
17
# Estimates are intentionally conservative and only used to choose a candidate
# list before RunPod performs the real availability check.
# Each entry is GpuOffer(id, memory_gb, community $/hr, secure $/hr).
GPU_CATALOG: tuple[GpuOffer, ...] = (
    GpuOffer("NVIDIA GeForce RTX 3090", 24, 0.22, 0.70),
    GpuOffer("NVIDIA GeForce RTX 4090", 24, 0.34, 1.10),
    GpuOffer("NVIDIA L4", 24, 0.40, 1.10),
    GpuOffer("NVIDIA A40", 48, 0.74, 1.22),
    GpuOffer("NVIDIA L40", 48, 0.79, 1.90),
    GpuOffer("NVIDIA L40S", 48, 0.79, 1.90),
    GpuOffer("NVIDIA RTX 6000 Ada Generation", 48, 0.74, 1.90),
    GpuOffer("NVIDIA A100 80GB PCIe", 80, 1.19, 1.39),
    GpuOffer("NVIDIA A100-SXM4-80GB", 80, 1.39, 1.49),
    GpuOffer("NVIDIA H100 PCIe", 80, 1.99, 2.39),
    GpuOffer("NVIDIA H100 80GB HBM3", 80, 2.69, 4.00),
    GpuOffer("NVIDIA H100 NVL", 94, 2.69, 4.00),
    GpuOffer("NVIDIA RTX PRO 6000 Blackwell Server Edition", 96, 1.58, 5.58),
    GpuOffer("NVIDIA H200", 141, 3.59, 5.58),
    GpuOffer("NVIDIA B200", 180, 5.98, 8.64),
    GpuOffer("AMD Instinct MI300X OAM", 192, 3.99, 6.50),
)
37
+
38
+
39
def candidate_gpus(min_vram_gb: int, max_dollars_per_hour: float | None, cloud_type: str) -> list[GpuOffer]:
    """Return catalog GPUs that meet the VRAM and price filters, cheapest-small first.

    The price cap applies only when it is a positive number; ``None``, zero
    and negative values mean "no cap". Results are ordered by memory size,
    then hourly price for ``cloud_type``, then id.
    """
    cloud = cloud_type.upper()
    price_capped = max_dollars_per_hour is not None and max_dollars_per_hour > 0

    def _eligible(gpu: GpuOffer) -> bool:
        if gpu.memory_gb < min_vram_gb:
            return False
        return not price_capped or gpu.hourly(cloud) <= max_dollars_per_hour

    return sorted(
        filter(_eligible, GPU_CATALOG),
        key=lambda gpu: (gpu.memory_gb, gpu.hourly(cloud), gpu.id),
    )
45
+
46
+
47
def gpu_type_ids(min_vram_gb: int, max_dollars_per_hour: float | None, cloud_type: str) -> list[str]:
    """Return the ids of the candidate GPUs, in ``candidate_gpus`` order."""
    offers = candidate_gpus(min_vram_gb, max_dollars_per_hour, cloud_type)
    return [offer.id for offer in offers]
49
+
50
+
51
def estimated_hourly(gpu_id: str, cloud_type: str) -> float | None:
    """Return the hourly estimate for ``gpu_id``, or ``None`` if it is not in the catalog."""
    match = next((gpu for gpu in GPU_CATALOG if gpu.id == gpu_id), None)
    return None if match is None else match.hourly(cloud_type)