relaymd-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- relaymd_cli-0.1.0/.gitignore +30 -0
- relaymd_cli-0.1.0/PKG-INFO +11 -0
- relaymd_cli-0.1.0/pyproject.toml +30 -0
- relaymd_cli-0.1.0/relaymd-cli.spec +28 -0
- relaymd_cli-0.1.0/src/relaymd/cli/__init__.py +3 -0
- relaymd_cli-0.1.0/src/relaymd/cli/__main__.py +21 -0
- relaymd_cli-0.1.0/src/relaymd/cli/commands/__init__.py +0 -0
- relaymd_cli-0.1.0/src/relaymd/cli/commands/jobs.py +150 -0
- relaymd_cli-0.1.0/src/relaymd/cli/commands/submit.py +129 -0
- relaymd_cli-0.1.0/src/relaymd/cli/commands/workers.py +65 -0
- relaymd_cli-0.1.0/src/relaymd/cli/config.py +73 -0
- relaymd_cli-0.1.0/tests/test_submit.py +105 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.so
|
|
5
|
+
.pytest_cache/
|
|
6
|
+
.mypy_cache/
|
|
7
|
+
.ruff_cache/
|
|
8
|
+
.coverage
|
|
9
|
+
htmlcov/
|
|
10
|
+
*.egg-info/
|
|
11
|
+
*.egg
|
|
12
|
+
*.whl
|
|
13
|
+
|
|
14
|
+
# Virtual environments
|
|
15
|
+
.venv/
|
|
16
|
+
venv/
|
|
17
|
+
env/
|
|
18
|
+
|
|
19
|
+
# IDE
|
|
20
|
+
.idea/
|
|
21
|
+
.vscode/
|
|
22
|
+
|
|
23
|
+
# Environment files
|
|
24
|
+
.env
|
|
25
|
+
.env.*
|
|
26
|
+
!.env.orchestrator.example
|
|
27
|
+
!.env.worker.example
|
|
28
|
+
|
|
29
|
+
# Local databases
|
|
30
|
+
relaymd.db
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: relaymd-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Requires-Python: >=3.11
|
|
5
|
+
Requires-Dist: httpx
|
|
6
|
+
Requires-Dist: pydantic-settings[yaml]
|
|
7
|
+
Requires-Dist: pyyaml
|
|
8
|
+
Requires-Dist: relaymd-models
|
|
9
|
+
Requires-Dist: relaymd-storage
|
|
10
|
+
Requires-Dist: rich
|
|
11
|
+
Requires-Dist: typer
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "relaymd-cli"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
requires-python = ">=3.11"
|
|
9
|
+
dependencies = [
|
|
10
|
+
"relaymd-models",
|
|
11
|
+
"relaymd-storage",
|
|
12
|
+
"httpx",
|
|
13
|
+
"typer",
|
|
14
|
+
"rich",
|
|
15
|
+
"pydantic-settings[yaml]",
|
|
16
|
+
"pyyaml",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.scripts]
|
|
20
|
+
relaymd = "relaymd.cli.__main__:app"
|
|
21
|
+
|
|
22
|
+
[tool.uv.sources]
|
|
23
|
+
relaymd-models = { workspace = true }
|
|
24
|
+
relaymd-storage = { workspace = true }
|
|
25
|
+
|
|
26
|
+
[tool.pytest.ini_options]
|
|
27
|
+
pythonpath = ["src"]
|
|
28
|
+
|
|
29
|
+
[tool.hatch.build.targets.wheel]
|
|
30
|
+
packages = ["src/relaymd"]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec: builds a single-file, stripped console executable named
# "relaymd" from the CLI entry module.  Analysis/PYZ/EXE are globals injected
# by PyInstaller when it executes this spec.
a = Analysis(
    ['src/relaymd/cli/__main__.py'],
    pathex=['src'],  # so `relaymd.*` resolves from the src/ layout
    binaries=[],
    datas=[],
    # Listed explicitly because PyInstaller's static scan of the entry module
    # apparently does not discover them — presumably imported indirectly.
    hiddenimports=['relaymd.models', 'relaymd.storage'],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
)
pex = PYZ(a.pure)
exe = EXE(
    pex,
    a.scripts,
    a.binaries,
    a.datas,
    [],
    name='relaymd',
    debug=False,
    bootloader_ignore_signals=False,
    strip=True,   # strip symbols for a smaller binary
    upx=False,    # no UPX compression
    console=True,
    onefile=True,
)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations

import sys

import typer

from relaymd.cli.commands.jobs import app as jobs_app
from relaymd.cli.commands.submit import submit
from relaymd.cli.commands.workers import app as workers_app

# Root Typer application: `submit` is a top-level command, while job and
# worker management are grouped under the "jobs" / "workers" sub-apps.
cli = typer.Typer(help="RelayMD operator CLI")
cli.command()(submit)
cli.add_typer(jobs_app, name="jobs")
cli.add_typer(workers_app, name="workers")


def app() -> None:
    """Console-script entry point (referenced by [project.scripts] as
    ``relaymd.cli.__main__:app``)."""
    cli()


if __name__ == "__main__":
    # app() returns None, so this exits 0 unless Typer raised SystemExit first.
    sys.exit(app())
|
|
File without changes
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
import typer
|
|
8
|
+
from relaymd.cli.config import load_settings
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
|
|
12
|
+
app = typer.Typer(help="Manage jobs.")
|
|
13
|
+
console = Console()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _headers() -> dict[str, str]:
    """Build the auth header sent with every orchestrator request."""
    settings = load_settings()
    return {"X-API-Token": settings.api_token}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _orchestrator_base() -> str:
    """Return the configured orchestrator URL without a trailing slash."""
    settings = load_settings()
    return settings.orchestrator_url.rstrip("/")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _status_style(status: str) -> str:
|
|
25
|
+
styles = {
|
|
26
|
+
"queued": "yellow",
|
|
27
|
+
"assigned": "cyan",
|
|
28
|
+
"running": "blue",
|
|
29
|
+
"completed": "green",
|
|
30
|
+
"failed": "red",
|
|
31
|
+
"cancelled": "dim",
|
|
32
|
+
}
|
|
33
|
+
return styles.get(status, "white")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _parse_iso_datetime(value: str | None) -> datetime | None:
|
|
37
|
+
if value is None:
|
|
38
|
+
return None
|
|
39
|
+
normalized = value.replace("Z", "+00:00")
|
|
40
|
+
parsed = datetime.fromisoformat(normalized)
|
|
41
|
+
return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=UTC)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _age_label(timestamp: str | None) -> str:
    """Render a coarse age ("42s", "3m", "5h") for an ISO timestamp, "-" if absent."""
    moment = _parse_iso_datetime(timestamp)
    if moment is None:
        return "-"
    elapsed = int((datetime.now(UTC) - moment).total_seconds())
    if elapsed < 60:
        return f"{elapsed}s"
    if elapsed < 3600:
        return f"{elapsed // 60}m"
    return f"{elapsed // 3600}h"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _request(method: str, path: str, **kwargs: Any) -> httpx.Response:
    """Issue an authenticated request to the orchestrator; raises on HTTP errors.

    Extra keyword arguments are forwarded to httpx.Client.request.
    """
    url = f"{_orchestrator_base()}{path}"
    with httpx.Client(timeout=30.0) as client:
        response = client.request(method, url, headers=_headers(), **kwargs)
    response.raise_for_status()
    return response
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@app.command("list")
def list_jobs() -> None:
    """Print a summary table of every job known to the orchestrator."""
    try:
        jobs = _request("GET", "/jobs").json()
    except Exception as exc:  # noqa: BLE001
        console.print(f"[red]Failed to list jobs:[/red] {exc}")
        raise typer.Exit(code=1) from exc

    table = Table(title="Jobs")
    for heading in ("ID", "Title", "Status", "Assigned Worker", "Last Checkpoint Age"):
        table.add_column(heading)

    for job in jobs:
        status = str(job.get("status", "unknown"))
        style = _status_style(status)
        table.add_row(
            str(job.get("id", "-"))[:8],
            str(job.get("title", "-")),
            f"[{style}]{status}[/{style}]",
            str(job.get("assigned_worker_id") or "-")[:8],
            _age_label(job.get("last_checkpoint_at")),
        )

    console.print(table)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@app.command("status")
def job_status(job_id: str) -> None:
    """Show one job's full record as a two-column field/value table."""
    try:
        job = _request("GET", f"/jobs/{job_id}").json()
    except Exception as exc:  # noqa: BLE001
        console.print(f"[red]Failed to get job status:[/red] {exc}")
        raise typer.Exit(code=1) from exc

    status = str(job.get("status", "unknown"))
    style = _status_style(status)
    rows = [
        ("id", str(job.get("id", "-"))),
        ("title", str(job.get("title", "-"))),
        ("status", f"[{style}]{status}[/{style}]"),
        ("input_bundle_path", str(job.get("input_bundle_path", "-"))),
        ("latest_checkpoint_path", str(job.get("latest_checkpoint_path") or "-")),
        ("last_checkpoint_at", str(job.get("last_checkpoint_at") or "-")),
        ("assigned_worker_id", str(job.get("assigned_worker_id") or "-")),
        ("created_at", str(job.get("created_at") or "-")),
        ("updated_at", str(job.get("updated_at") or "-")),
    ]

    table = Table(title=f"Job {job_id}", show_header=False)
    table.add_column("Field", style="cyan")
    table.add_column("Value")
    for field_name, field_value in rows:
        table.add_row(field_name, field_value)
    console.print(table)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@app.command("cancel")
def cancel_job(
    job_id: str,
    force: bool = typer.Option(False, "--force", help="Cancel running job."),
) -> None:
    """Cancel a job via DELETE /jobs/{job_id}.

    With --force the orchestrator is asked to cancel even a running job
    (sent as the query parameter ``force=true``).  Exits 1 on any failure.
    """
    try:
        # Let httpx build and encode the query string instead of hand-rolling
        # "?force=true" onto the path; _request forwards kwargs to
        # httpx.Client.request, which accepts `params`.
        params = {"force": "true"} if force else None
        _request("DELETE", f"/jobs/{job_id}", params=params)
    except Exception as exc:  # noqa: BLE001
        console.print(f"[red]Failed to cancel job:[/red] {exc}")
        raise typer.Exit(code=1) from exc

    console.print(f"[green]Cancelled job[/green] {job_id}")
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
@app.command("requeue")
def requeue_job(job_id: str) -> None:
    """Put a job back on the queue via POST /jobs/{job_id}/requeue."""
    requeue_path = f"/jobs/{job_id}/requeue"
    try:
        _request("POST", requeue_path)
    except Exception as exc:  # noqa: BLE001
        console.print(f"[red]Failed to requeue job:[/red] {exc}")
        raise typer.Exit(code=1) from exc

    console.print(f"[green]Requeued job[/green] {job_id}")
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import tarfile
|
|
5
|
+
import tempfile
|
|
6
|
+
import uuid
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Annotated, Any
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
import typer
|
|
12
|
+
from relaymd.cli.config import load_settings
|
|
13
|
+
from relaymd.storage.client import StorageClient
|
|
14
|
+
from rich.console import Console
|
|
15
|
+
from rich.panel import Panel
|
|
16
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
|
|
17
|
+
|
|
18
|
+
console = Console()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def ensure_worker_config(input_dir: Path, command: str | None, checkpoint_glob: str | None) -> None:
    """Write relaymd-worker.json from --command, or verify a config file exists.

    When `command` is given, relaymd-worker.json is (re)written with it and
    the optional checkpoint glob.  Otherwise the input directory must already
    contain relaymd-worker.json or relaymd-worker.toml; if neither exists the
    command exits with code 1.
    """
    json_config = input_dir / "relaymd-worker.json"

    if command is not None:
        payload: dict[str, Any] = {
            "command": command,
            "checkpoint_glob_pattern": checkpoint_glob,
        }
        json_config.write_text(f"{json.dumps(payload, indent=2)}\n", encoding="utf-8")
        return

    toml_config = input_dir / "relaymd-worker.toml"
    if json_config.exists() or toml_config.exists():
        return

    console.print(
        "[red]Missing worker configuration:[/red] provide --command or add "
        "relaymd-worker.json / relaymd-worker.toml in the input directory."
    )
    raise typer.Exit(code=1)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def create_bundle_archive(input_dir: Path, archive_path: Path) -> None:
    """Pack every regular file under input_dir into a gzipped tarball.

    Members are stored relative to input_dir (flat archive root) in sorted
    order for deterministic output; directories themselves are not added.
    """
    with tarfile.open(archive_path, "w:gz") as bundle:
        for file_path in sorted(input_dir.rglob("*")):
            if file_path.is_file():
                bundle.add(file_path, arcname=str(file_path.relative_to(input_dir)))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def upload_bundle(local_archive: Path, b2_key: str) -> None:
    """Upload the bundle archive to B2, showing a transient spinner meanwhile."""
    settings = load_settings()
    storage = StorageClient(
        b2_endpoint_url=settings.b2_endpoint_url,
        b2_bucket_name=settings.b2_bucket_name,
        b2_access_key_id=settings.b2_access_key_id,
        b2_secret_access_key=settings.b2_secret_access_key,
        # The CLI talks to B2 directly; the CF-worker fields are unused here.
        cf_worker_url="",
        cf_bearer_token="",
    )
    spinner_columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        TimeElapsedColumn(),
    )
    with Progress(*spinner_columns, console=console, transient=True) as progress:
        task_id = progress.add_task("Uploading bundle to B2...", total=None)
        storage.upload_file(local_archive, b2_key)
        progress.update(task_id, description="Upload complete")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def register_job(title: str, b2_key: str) -> str:
    """POST a new job to the orchestrator and return its assigned id."""
    settings = load_settings()
    url = f"{settings.orchestrator_url.rstrip('/')}/jobs"
    body = {"title": title, "input_bundle_path": b2_key}
    auth = {"X-API-Token": settings.api_token}
    with httpx.Client(timeout=30.0) as client:
        response = client.post(url, json=body, headers=auth)
        response.raise_for_status()
    return str(response.json()["id"])
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def submit(
    input_dir: Annotated[Path, typer.Argument(help="Input directory to pack and submit.")],
    title: Annotated[str, typer.Option("--title", help="Human-readable job title.")],
    command: Annotated[
        str | None,
        typer.Option("--command", help="Command to write to relaymd-worker.json."),
    ] = None,
    checkpoint_glob: Annotated[
        str | None,
        typer.Option(
            "--checkpoint-glob",
            help="Checkpoint glob pattern to write alongside --command.",
        ),
    ] = None,
) -> None:
    """Pack input_dir into a tar.gz bundle, upload it to B2, and register the job.

    Exits with code 1 when the input directory is invalid, the worker config
    is missing (and no --command given), or any step of packaging/upload/
    registration fails.
    """
    if not input_dir.exists() or not input_dir.is_dir():
        console.print(
            "[red]Input directory does not exist or is not a directory:[/red] "
            f"{input_dir}"
        )
        raise typer.Exit(code=1)

    ensure_worker_config(input_dir, command, checkpoint_glob)

    # A client-side UUID namespaces the B2 object key before the orchestrator
    # is contacted.  NOTE(review): register_job returns the orchestrator's own
    # id, which may differ from this local one — confirm that is intended.
    job_id = str(uuid.uuid4())
    b2_key = f"jobs/{job_id}/input/bundle.tar.gz"

    try:
        # The temp dir (and the archive in it) is removed as soon as the
        # upload finishes; registration happens outside the context.
        with tempfile.TemporaryDirectory() as tmpdir:
            archive_path = Path(tmpdir) / "bundle.tar.gz"
            create_bundle_archive(input_dir, archive_path)
            upload_bundle(archive_path, b2_key)

        created_job_id = register_job(title, b2_key)
    except Exception as exc:  # noqa: BLE001
        console.print(f"[red]Failed to submit job:[/red] {exc}")
        raise typer.Exit(code=1) from exc

    console.print(
        Panel.fit(
            f"[bold green]Job submitted[/bold green]\n[bold]{created_job_id}[/bold]",
            title="RelayMD",
        )
    )
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
import typer
|
|
7
|
+
from relaymd.cli.config import load_settings
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
|
|
11
|
+
app = typer.Typer(help="List workers.")
|
|
12
|
+
console = Console()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _parse_iso_datetime(value: str) -> datetime:
|
|
16
|
+
normalized = value.replace("Z", "+00:00")
|
|
17
|
+
parsed = datetime.fromisoformat(normalized)
|
|
18
|
+
return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=UTC)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@app.command("list")
def list_workers() -> None:
    """Fetch registered workers from the orchestrator and print a health table.

    A worker is "healthy" when its last heartbeat is at most 120 seconds old,
    otherwise "stale".  Exits 1 when the orchestrator request fails.
    """
    settings = load_settings()
    headers = {"X-API-Token": settings.api_token}

    try:
        with httpx.Client(timeout=30.0) as client:
            response = client.get(
                f"{settings.orchestrator_url.rstrip('/')}/workers",
                headers=headers,
            )
            response.raise_for_status()
            workers = response.json()
    except Exception as exc:  # noqa: BLE001
        console.print(f"[red]Failed to list workers:[/red] {exc}")
        raise typer.Exit(code=1) from exc

    table = Table(title="Workers")
    table.add_column("ID")
    table.add_column("Platform")
    table.add_column("GPU Model")
    table.add_column("GPU Count")
    table.add_column("VRAM (GB)")
    table.add_column("Last Heartbeat")
    table.add_column("Status")

    now = datetime.now(UTC)
    for worker in workers:
        last_heartbeat_raw = str(worker.get("last_heartbeat") or "")
        # Bug fix: a missing/empty/unparseable heartbeat previously raised
        # ValueError from fromisoformat("") and aborted the whole listing;
        # treat such workers as stale instead.
        last_heartbeat = None
        if last_heartbeat_raw:
            try:
                last_heartbeat = _parse_iso_datetime(last_heartbeat_raw)
            except ValueError:
                last_heartbeat = None
        is_healthy = (
            last_heartbeat is not None
            and (now - last_heartbeat).total_seconds() <= 120
        )
        status = "healthy" if is_healthy else "stale"
        status_color = "green" if is_healthy else "red"

        table.add_row(
            str(worker.get("id", "-"))[:8],
            str(worker.get("platform", "-")),
            str(worker.get("gpu_model", "-")),
            str(worker.get("gpu_count", "-")),
            str(worker.get("vram_gb", "-")),
            last_heartbeat_raw or "-",
            f"[{status_color}]{status}[/{status_color}]",
        )

    console.print(table)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from pydantic import AliasChoices, Field
|
|
7
|
+
from pydantic_settings import (
|
|
8
|
+
BaseSettings,
|
|
9
|
+
EnvSettingsSource,
|
|
10
|
+
SettingsConfigDict,
|
|
11
|
+
YamlConfigSettingsSource,
|
|
12
|
+
)
|
|
13
|
+
from pydantic_settings.sources import DefaultSettingsSource, PydanticBaseSettingsSource
|
|
14
|
+
|
|
15
|
+
RELAYMD_CONFIG_ENV_VAR = "RELAYMD_CONFIG"
|
|
16
|
+
DEFAULT_RELAYMD_CONFIG_PATH = "~/.config/relaymd/config.yaml"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CliSettings(BaseSettings):
    """Settings for the relaymd CLI.

    Sources, in the priority order returned by settings_customise_sources:
    explicit init kwargs, then the YAML config file, then environment
    variables, then field defaults.  Because the YAML source outranks the env
    source, _drop_yaml_keys_with_env_overrides first strips any YAML key that
    has a corresponding environment variable set, so env overrides still win.
    """

    # Base URL of the orchestrator API (callers strip trailing slashes).
    orchestrator_url: str = "http://localhost:8000"
    # Read from "api_token" (YAML/env) or the RELAYMD_API_TOKEN / API_TOKEN
    # environment variables.
    api_token: str = Field(
        default="change-me",
        validation_alias=AliasChoices("api_token", "RELAYMD_API_TOKEN", "API_TOKEN"),
    )
    # Backblaze B2 (S3-compatible) connection details used for bundle upload.
    b2_endpoint_url: str = ""
    b2_bucket_name: str = ""
    b2_access_key_id: str = ""
    b2_secret_access_key: str = ""

    model_config = SettingsConfigDict(env_prefix="", extra="ignore")

    @classmethod
    def config_path(cls) -> Path:
        """Return the YAML config file path ($RELAYMD_CONFIG or the default),
        with ~ expanded."""
        return Path(os.getenv(RELAYMD_CONFIG_ENV_VAR, DEFAULT_RELAYMD_CONFIG_PATH)).expanduser()

    @classmethod
    def _drop_yaml_keys_with_env_overrides(
        cls,
        yaml_source: YamlConfigSettingsSource,
    ) -> None:
        """Remove YAML-loaded keys whose environment-variable counterparts are set.

        The YAML source is listed before the env source in
        settings_customise_sources, so without this step an env override
        would be shadowed by the config file.
        """
        env_override_map = {
            "orchestrator_url": ("ORCHESTRATOR_URL",),
            "api_token": ("RELAYMD_API_TOKEN", "API_TOKEN"),
            "b2_endpoint_url": ("B2_ENDPOINT_URL",),
            "b2_bucket_name": ("B2_BUCKET_NAME",),
            "b2_access_key_id": ("B2_ACCESS_KEY_ID",),
            "b2_secret_access_key": ("B2_SECRET_ACCESS_KEY",),
        }
        for field_name, env_keys in env_override_map.items():
            # os.getenv(...) is not None means "set, even if empty string".
            if any(os.getenv(env_key) is not None for env_key in env_keys):
                yaml_source.init_kwargs.pop(field_name, None)

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        """Order the sources: init > YAML (minus env-overridden keys) > env > defaults."""
        yaml_source = YamlConfigSettingsSource(settings_cls, yaml_file=cls.config_path())
        cls._drop_yaml_keys_with_env_overrides(yaml_source)
        return (
            init_settings,
            yaml_source,
            EnvSettingsSource(settings_cls),
            DefaultSettingsSource(settings_cls),
        )
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def load_settings() -> CliSettings:
    """Build a fresh CliSettings (re-reads the YAML file and environment)."""
    return CliSettings()
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import tarfile
|
|
5
|
+
import uuid
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from unittest.mock import Mock
|
|
8
|
+
|
|
9
|
+
import pytest
|
|
10
|
+
import typer
|
|
11
|
+
from relaymd.cli.commands import submit as submit_cmd
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class _FakeSettings:
    """Stand-in for CliSettings with fixed values (no YAML/env lookup)."""

    orchestrator_url = "http://localhost:8000"
    api_token = "token"
    b2_endpoint_url = "https://b2.example"
    b2_bucket_name = "bucket"
    b2_access_key_id = "key"
    b2_secret_access_key = "secret"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_create_bundle_archive_uses_flat_archive_root(tmp_path: Path) -> None:
    """Archive members are rooted at the input dir itself, not nested under it."""
    input_dir = tmp_path / "input"
    input_dir.mkdir()
    (input_dir / "relaymd-worker.json").write_text('{"command": "run"}\n', encoding="utf-8")
    (input_dir / "input.txt").write_text("data", encoding="utf-8")
    nested = input_dir / "nested"
    nested.mkdir()
    (nested / "checkpoint.dat").write_text("cpt", encoding="utf-8")

    bundle = tmp_path / "bundle.tar.gz"
    submit_cmd.create_bundle_archive(input_dir, bundle)

    with tarfile.open(bundle, "r:gz") as tar:
        members = tar.getnames()

    assert {"relaymd-worker.json", "input.txt", "nested/checkpoint.dat"} <= set(members)
    assert not any(member.startswith("input/") for member in members)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_submit_writes_worker_json_when_command_flag_provided(monkeypatch, tmp_path: Path) -> None:
    """submit() with --command writes relaymd-worker.json and bundles/uploads it.

    Storage and HTTP are replaced with fakes; the storage fake inspects the
    tarball at upload time because the temp archive is deleted before submit
    returns.
    """
    input_dir = tmp_path / "input"
    input_dir.mkdir()
    (input_dir / "hello.txt").write_text("hello", encoding="utf-8")

    uploaded = {}

    class FakeStorageClient:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

        def upload_file(self, local_path: Path, b2_key: str) -> None:
            # Record what submit() tried to upload, and snapshot the archive
            # contents now — the TemporaryDirectory holding it goes away later.
            uploaded["path"] = Path(local_path)
            uploaded["key"] = b2_key
            with tarfile.open(local_path, "r:gz") as tar:
                uploaded["tar_names"] = tar.getnames()
                worker_json = tar.extractfile("relaymd-worker.json")
                assert worker_json is not None
                uploaded["worker_config"] = json.loads(worker_json.read().decode("utf-8"))

    # Fake httpx.Client() context manager whose POST returns a canned job id.
    fake_http_client = Mock()
    fake_response = Mock()
    fake_response.json.return_value = {"id": str(uuid.uuid4())}
    fake_response.raise_for_status.return_value = None
    fake_http_client.post.return_value = fake_response
    fake_http_cm = Mock()
    fake_http_cm.__enter__ = Mock(return_value=fake_http_client)
    fake_http_cm.__exit__ = Mock(return_value=False)

    monkeypatch.setattr(submit_cmd, "StorageClient", FakeStorageClient)
    monkeypatch.setattr(submit_cmd.httpx, "Client", Mock(return_value=fake_http_cm))
    monkeypatch.setattr(submit_cmd, "load_settings", lambda: _FakeSettings())

    submit_cmd.submit(
        input_dir=input_dir,
        title="test-job",
        command="python run.py",
        checkpoint_glob="*.cpt",
    )

    # The config file is written into the caller's input directory...
    worker_json = input_dir / "relaymd-worker.json"
    assert worker_json.exists()
    worker_payload = json.loads(worker_json.read_text(encoding="utf-8"))
    assert worker_payload == {
        "command": "python run.py",
        "checkpoint_glob_pattern": "*.cpt",
    }
    # ...and the identical config ends up inside the uploaded bundle.
    assert "relaymd-worker.json" in uploaded["tar_names"]
    assert uploaded["worker_config"] == worker_payload
    assert uploaded["key"].startswith("jobs/")
    assert uploaded["key"].endswith("/input/bundle.tar.gz")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test_submit_aborts_when_worker_config_missing_and_no_command(tmp_path: Path) -> None:
    """ensure_worker_config exits with code 1 when no config file and no --command."""
    input_dir = tmp_path / "input"
    input_dir.mkdir()
    (input_dir / "hello.txt").write_text("hello", encoding="utf-8")

    with pytest.raises(typer.Exit) as exc_info:
        submit_cmd.ensure_worker_config(input_dir, command=None, checkpoint_glob=None)

    assert exc_info.value.exit_code == 1
|