flowmesh-sdk-stack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowmesh_sdk_stack-0.1.0.dist-info/METADATA +24 -0
- flowmesh_sdk_stack-0.1.0.dist-info/RECORD +14 -0
- flowmesh_sdk_stack-0.1.0.dist-info/WHEEL +5 -0
- flowmesh_sdk_stack-0.1.0.dist-info/licenses/LICENSE +202 -0
- flowmesh_sdk_stack-0.1.0.dist-info/top_level.txt +1 -0
- flowmesh_stack/__init__.py +23 -0
- flowmesh_stack/docker.py +262 -0
- flowmesh_stack/doctor.py +227 -0
- flowmesh_stack/env.py +145 -0
- flowmesh_stack/env_schema.py +238 -0
- flowmesh_stack/images.py +112 -0
- flowmesh_stack/node_client.py +173 -0
- flowmesh_stack/paths.py +21 -0
- flowmesh_stack/workers.py +278 -0
flowmesh_stack/images.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""FlowMesh Docker image reference management.
|
|
2
|
+
|
|
3
|
+
Provides the canonical mapping from build targets to image references,
|
|
4
|
+
used by both the CLI dev commands and programmatic build/deploy scripts.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Image reference templates; ``{registry}`` and ``{version}`` are filled in by
# ``str.format``. CPU/GPU variants share a repository and differ by tag suffix.
BUILD_TARGETS: dict[str, str] = {
    "flowmesh_server": "{registry}/flowmesh_server:{version}",
    "flowmesh_worker_cpu": "{registry}/flowmesh_worker:{version}-cpu",
    "flowmesh_worker_gpu_builder": "{registry}/flowmesh_worker_builder:{version}-gpu",
    "flowmesh_worker_gpu": "{registry}/flowmesh_worker:{version}-gpu",
    "flowmesh_ssh_cpu": "{registry}/flowmesh_ssh:{version}-cpu",
    "flowmesh_ssh_gpu": "{registry}/flowmesh_ssh:{version}-gpu",
}
"""Mapping from build target name to image reference format string."""

# Same keys as BUILD_TARGETS, but the tag is built from ``{scope}`` instead of
# ``{version}``.
CACHE_TARGETS: dict[str, str] = {
    "flowmesh_server": "{registry}/flowmesh_server:{scope}",
    "flowmesh_worker_cpu": "{registry}/flowmesh_worker:{scope}-cpu",
    "flowmesh_worker_gpu_builder": "{registry}/flowmesh_worker_builder:{scope}-gpu",
    "flowmesh_worker_gpu": "{registry}/flowmesh_worker:{scope}-gpu",
    "flowmesh_ssh_cpu": "{registry}/flowmesh_ssh:{scope}-cpu",
    "flowmesh_ssh_gpu": "{registry}/flowmesh_ssh:{scope}-gpu",
}
"""Mapping from build target name to registry cache reference format string."""

BUILD_GROUPS: dict[str, list[str]] = {
    "server": ["flowmesh_server"],
    "workers": [
        "flowmesh_worker_cpu",
        "flowmesh_worker_gpu",
        "flowmesh_ssh_cpu",
        "flowmesh_ssh_gpu",
    ],
    "builders": ["flowmesh_worker_gpu_builder"],
}
"""Mapping from group name to list of build targets."""
# "default" is the server targets followed by the worker targets; the
# "builders" group is not part of it.
BUILD_GROUPS["default"] = [*BUILD_GROUPS["server"], *BUILD_GROUPS["workers"]]

BUILD_DEPENDENCIES: dict[str, list[str]] = {
    "flowmesh_worker_gpu": ["flowmesh_worker_gpu_builder"],
}
"""Auxiliary build targets that must be configured alongside a selected target."""

# Every build target gets the same platform list.
PUSH_PLATFORMS: dict[str, str] = dict.fromkeys(
    BUILD_TARGETS, "linux/amd64,linux/arm64"
)
"""Default platform matrix to publish for each build target."""
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_image_ref(registry: str, version: str, target: str) -> str:
    """Build the Docker image reference for ``target``.

    Args:
        registry: Container registry (e.g. ``ghcr.io/mlsys-io``).
        version: Image version tag (e.g. ``dev``, ``0.1.0``).
        target: Build target name (must be a key in :data:`BUILD_TARGETS`).

    Returns:
        The target's template from :data:`BUILD_TARGETS` with ``registry`` and
        ``version`` substituted.

    Raises:
        ValueError: If the target is unknown.
    """
    template = BUILD_TARGETS.get(target)
    if template is None:
        raise ValueError(f"Unknown build target: {target}")
    return template.format(registry=registry, version=version)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def get_cache_ref(registry: str, scope: str, target: str) -> str:
    """Build the registry cache reference for ``target``.

    The scope is stripped of surrounding whitespace and normalized so the
    resulting tag always starts with ``cache``:

    * empty (or falsy) scope and ``"cache"`` -> ``cache``
    * a scope already starting with ``cache-`` is used unchanged
    * anything else is prefixed with ``cache-``

    Args:
        registry: Container registry substituted into the template.
        scope: Cache scope name; may be empty.
        target: Build target name (must be a key in :data:`CACHE_TARGETS`).

    Raises:
        ValueError: If the target is unknown.
    """
    template = CACHE_TARGETS.get(target)
    if template is None:
        raise ValueError(f"Unknown build target: {target}")

    trimmed = (scope or "").strip()
    needs_prefix = (
        bool(trimmed) and trimmed != "cache" and not trimmed.startswith("cache-")
    )
    cache_scope = f"cache-{trimmed}" if needs_prefix else (trimmed or "cache")
    return template.format(registry=registry, scope=cache_scope)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def expand_build_targets(targets: list[str]) -> list[str]:
    """Add the helper targets that the requested targets depend on.

    Dependencies are looked up in :data:`BUILD_DEPENDENCIES`. Each target
    appears once in the result, at the position it was first reached, and a
    target's helpers are always placed before the target itself.

    Args:
        targets: Explicitly requested build target names.

    Returns:
        The de-duplicated, dependency-first list of targets.
    """
    # A dict keeps insertion order, so it serves as both the "seen" set and
    # the ordered result.
    ordered: dict[str, None] = {}

    def _collect(name: str) -> None:
        if name in ordered:
            return
        for helper in BUILD_DEPENDENCIES.get(name, []):
            _collect(helper)
        # Recorded only after its helpers, so helpers come first.
        ordered[name] = None

    for requested in targets:
        _collect(requested)
    return list(ordered)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_push_platforms(target: str) -> str:
    """Return the default push platform list for ``target``.

    Args:
        target: Build target name (must be a key in :data:`PUSH_PLATFORMS`).

    Returns:
        The comma-separated platform string stored for the target.

    Raises:
        ValueError: If the target is unknown.
    """
    platforms = PUSH_PLATFORMS.get(target)
    if platforms is None:
        raise ValueError(f"Unknown build target: {target}")
    return platforms
|
|
flowmesh_stack/node_client.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""Node direct API client."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
from flowmesh.exceptions import FlowMeshConnectionError, FlowMeshError
|
|
8
|
+
|
|
9
|
+
# Default value of NodeClient's ``timeout`` argument (request timeout, seconds).
_DEFAULT_TIMEOUT = 30.0
|
|
10
|
+
# Per-request timeout that NodeClient.create_worker passes for its POST,
# overriding the client-wide timeout for that call.
_WORKER_CREATE_TIMEOUT = 600.0
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _raise_for_status(response: httpx.Response, method: str) -> None:
    """Raise :class:`FlowMeshError` when ``response`` has a status >= 400.

    The error text is taken from the ``detail`` (preferred) or ``message`` key
    of a JSON object body. If the body is not a JSON object, or neither key
    holds a truthy value, the stringified body is used instead.

    Args:
        response: The HTTP response to inspect.
        method: HTTP method of the request, used only in the error message.

    Raises:
        FlowMeshError: If the status code is 400 or above.
    """
    status = response.status_code
    if status < 400:
        return

    # Fall back to the raw text if the body cannot be decoded as JSON.
    try:
        payload = response.json()
    except Exception:
        payload = response.text

    detail = ""
    if isinstance(payload, dict):
        detail = payload.get("detail", "") or payload.get("message", "")
    detail = detail or str(payload)

    raise FlowMeshError(f"{method} {response.url} returned {status}: {detail}")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class NodeClient:
    """Client for a node's worker management HTTP API.

    Used by the CLI stack sub-package for worker lifecycle management. The
    client owns an ``httpx.Client``; call :meth:`close` or use the instance as
    a context manager to release it.

    Args:
        base_url: Node HTTP endpoint (e.g. ``http://localhost:8000``). When
            omitted or empty, ``_default_node_url()`` is used.
        token: Optional bearer token for authentication. When omitted or
            empty, the ``FLOWMESH_API_KEY`` environment variable is used; if
            that is also empty, no ``Authorization`` header is sent.
        timeout: Request timeout in seconds.
    """

    def __init__(
        self,
        base_url: str | None = None,
        token: str | None = None,
        timeout: float = _DEFAULT_TIMEOUT,
    ) -> None:
        resolved_url = base_url or _default_node_url()
        resolved_token = token or os.getenv("FLOWMESH_API_KEY") or None
        headers: dict[str, str] = {"Accept": "application/json"}
        if resolved_token:
            headers["Authorization"] = f"Bearer {resolved_token}"
        # Stored without trailing slashes; request paths start with "/".
        self._base_url = resolved_url.rstrip("/")
        self._http = httpx.Client(
            base_url=self._base_url,
            headers=headers,
            timeout=httpx.Timeout(timeout),
        )

    # -- Workers --------------------------------------------------------- #

    def list_workers(self) -> list[dict[str, Any]]:
        """List all workers managed by this node."""
        return self._request("GET", "/api/v1/stack/workers")

    def create_worker(self, config: str | dict[str, Any]) -> dict[str, Any]:
        """Create a worker from a config.

        Args:
            config: Worker init config. A dict is sent as a JSON body; a
                string is sent verbatim as the request body with
                ``Content-Type: application/json``.

        Returns:
            The decoded response body.
        """
        if isinstance(config, dict):
            return self._request(
                "POST",
                "/api/v1/stack/workers",
                json_body=config,
                timeout=_WORKER_CREATE_TIMEOUT,
            )
        return self._request(
            "POST",
            "/api/v1/stack/workers",
            data=config,
            headers={"Content-Type": "application/json"},
            timeout=_WORKER_CREATE_TIMEOUT,
        )

    def start_worker(self, name: str) -> None:
        """Start a stopped worker."""
        self._request("POST", f"/api/v1/stack/workers/{name}/start")

    def stop_worker(self, name: str) -> None:
        """Stop a running worker."""
        self._request("POST", f"/api/v1/stack/workers/{name}/stop")

    def destroy_worker(self, name: str) -> None:
        """Destroy a single worker, removing its container."""
        self._request("DELETE", f"/api/v1/stack/workers/{name}")

    def destroy_all_workers(self, *, ignore_unreachable: bool = False) -> bool:
        """Destroy all workers managed by this node.

        Returns ``True`` on success, ``False`` when ``ignore_unreachable=True``
        and the FlowMesh server was unreachable. Other errors propagate.
        """
        try:
            self._request(
                "DELETE",
                "/api/v1/stack/workers",
                headers={"Content-Type": "application/json"},
            )
        except FlowMeshConnectionError:
            if not ignore_unreachable:
                raise
            return False
        return True

    def worker_names(self) -> list[str]:
        """Return a list of all worker names.

        Each entry from :meth:`list_workers` may be either a plain string or a
        dict with a non-empty string ``name``; anything else is skipped.
        """
        # ``_request`` yields None for an empty response body; treat that as
        # "no workers" instead of failing on iteration.
        data = self.list_workers() or []
        names: list[str] = []
        for item in data:
            if isinstance(item, str):
                names.append(item)
            elif isinstance(item, dict):
                name = item.get("name")
                if isinstance(name, str) and name:
                    names.append(name)
        return names

    # -- Transport ------------------------------------------------------- #

    def _request(
        self,
        method: str,
        path: str,
        json_body: Any = None,
        data: str | bytes | None = None,
        headers: dict[str, str] | None = None,
        timeout: float | None = None,
    ) -> Any:
        """Send one request and return the decoded JSON body.

        Args:
            method: HTTP method.
            path: Request path passed to the underlying ``httpx.Client``.
            json_body: Object to send as a JSON body, if not ``None``.
            data: Raw body to send as-is, if not ``None``.
            headers: Extra headers for this request only.
            timeout: Per-request timeout in seconds, if not ``None``.

        Returns:
            ``None`` when the response body is empty, otherwise the parsed
            JSON value.

        Raises:
            FlowMeshConnectionError: If ``httpx`` raises ``ConnectError``.
            FlowMeshError: If the response status is 400 or above.
        """
        # Only forward the options that were actually supplied.
        kwargs: dict[str, Any] = {}
        if json_body is not None:
            kwargs["json"] = json_body
        if data is not None:
            kwargs["content"] = data
        if headers:
            kwargs["headers"] = headers
        if timeout is not None:
            kwargs["timeout"] = timeout
        try:
            response = self._http.request(method, path, **kwargs)
        except httpx.ConnectError as exc:
            # Chain explicitly so the transport error is kept as the cause.
            raise FlowMeshConnectionError(
                f"Failed to connect to {self._base_url}{path}: {exc}"
            ) from exc
        _raise_for_status(response, method)
        if not response.content:
            return None
        return response.json()

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._http.close()

    def __enter__(self) -> "NodeClient":
        return self

    def __exit__(self, *args: Any) -> None:
        self.close()
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _default_node_url() -> str:
|
|
172
|
+
port = os.getenv("SERVER_HTTP_PORT", os.getenv("SERVER_APP_PORT", "8000"))
|
|
173
|
+
return f"http://localhost:{port}"
|
flowmesh_stack/paths.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Filesystem path helpers shared by FlowMesh tooling."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def resolve_path(value: str, default: str, base_dir: Path) -> Path:
    """Turn a configured path string into a :class:`~pathlib.Path`.

    Args:
        value: The configured path; surrounding whitespace is ignored.
        default: Path used when ``value`` is blank.
        base_dir: Directory that relative paths are joined onto.

    Returns:
        The path with ``~`` expanded. An absolute path is returned as-is; a
        relative one is joined to ``base_dir`` and resolved.
    """
    candidate = Path(value.strip() or default).expanduser()
    if candidate.is_absolute():
        return candidate
    return (base_dir / candidate).resolve()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def ensure_dir(path: Path) -> None:
    """Make sure ``path`` exists as a directory, creating missing parents.

    An already existing directory is left alone.
    """
    path.mkdir(exist_ok=True, parents=True)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def ensure_file(path: Path) -> None:
    """Make sure ``path`` exists as a file, creating parent directories first.

    An existing file is touched, not truncated.
    """
    parent = path.parent
    parent.mkdir(exist_ok=True, parents=True)
    path.touch(exist_ok=True)
|
|
flowmesh_stack/workers.py
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
"""Worker lifecycle helpers for direct node operations."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import subprocess
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
from flowmesh.exceptions import FlowMeshError
|
|
11
|
+
|
|
12
|
+
from .docker import ensure_docker_available
|
|
13
|
+
from .node_client import NodeClient
|
|
14
|
+
|
|
15
|
+
# Upper bound on the thread-pool size used by operate_workers and
# create_workers; the pool is smaller when there are fewer items to process.
_MAX_PARALLEL_REQUESTS = 16
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def operate_workers(
    client: NodeClient,
    names: list[str],
    operation: str,
) -> list[str]:
    """Run ``start``, ``stop`` or ``destroy`` on the given workers in parallel.

    Args:
        client: Node client used to issue the requests.
        names: Worker names, or the single entry ``"all"`` to operate on every
            name returned by ``client.worker_names()``.
        operation: One of ``"start"``, ``"stop"`` or ``"destroy"``.

    Returns:
        Names of the workers that were processed, in completion order. Empty
        when there is nothing to operate on.

    Raises:
        FlowMeshError: If ``"all"`` is mixed with other names, or if any
            worker failed (including an unsupported ``operation``). Failures
            are collected and reported together as ``name: error`` entries
            joined by ``"; "``.
    """
    if not names:
        return []
    if "all" in names:
        if len(names) != 1:
            raise FlowMeshError("Use either 'all' or worker names, not both.")
        names = client.worker_names()
        if not names:
            return []

    def _run(worker: str) -> str:
        if operation == "start":
            client.start_worker(worker)
        elif operation == "stop":
            client.stop_worker(worker)
        elif operation == "destroy":
            client.destroy_worker(worker)
        else:
            raise FlowMeshError(f"Unsupported worker operation: {operation}")
        return worker

    done: list[str] = []
    failures: list[str] = []
    pool_size = min(len(names), _MAX_PARALLEL_REQUESTS)
    with ThreadPoolExecutor(max_workers=pool_size) as pool:
        pending = {pool.submit(_run, worker): worker for worker in names}
        for finished in as_completed(pending):
            worker = pending[finished]
            try:
                done.append(finished.result())
            except Exception as exc:
                failures.append(f"{worker}: {exc}")

    if failures:
        raise FlowMeshError("; ".join(failures))
    return done
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def create_workers(
    client: NodeClient,
    kind: str = "cpu",
    count: int = 1,
    targets: str = "all",
    config_paths: list[Path] | None = None,
    config_raw: list[str] | None = None,
) -> list[tuple[str, dict[str, Any]]]:
    """Create node workers from configs or built-in cpu/gpu presets.

    When ``kind`` is "gpu", if ``count`` is equal to 1, a single worker with the
    specified GPU targets will be created. If ``count`` is greater than 1, one worker
    will be created per GPU target, and the number of targets must match ``count``.

    The create requests are issued in parallel through ``client.create_worker``.

    Returns:
        ``(label, response)`` pairs in completion order; empty when there is
        nothing to create.

    Raises:
        FlowMeshError: If any request failed. Failures are collected and
            reported together as ``label: error`` entries joined by ``"; "``.
    """
    requests = _payloads_for_worker_create(
        kind=kind,
        count=count,
        targets=targets,
        config_paths=config_paths,
        config_raw=config_raw,
    )
    if not requests:
        return []

    results: list[tuple[str, dict[str, Any]]] = []
    failures: list[str] = []
    pool_size = min(len(requests), _MAX_PARALLEL_REQUESTS)
    with ThreadPoolExecutor(max_workers=pool_size) as pool:
        pending = {}
        for body, label in requests:
            pending[pool.submit(client.create_worker, body)] = label
        for finished in as_completed(pending):
            label = pending[finished]
            try:
                outcome = finished.result()
            except Exception as exc:
                failures.append(f"{label}: {exc}")
            else:
                results.append((label, outcome))

    if failures:
        raise FlowMeshError("; ".join(failures))
    return results
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def select_worker_images(
|
|
105
|
+
kinds: list[str],
|
|
106
|
+
images: dict[str, str],
|
|
107
|
+
builder_images: dict[str, str] | None = None,
|
|
108
|
+
builder: bool = False,
|
|
109
|
+
) -> list[str]:
|
|
110
|
+
"""Resolve requested worker image kinds into concrete image refs."""
|
|
111
|
+
normalized = [kind.strip().lower() for kind in kinds if kind.strip()]
|
|
112
|
+
if not normalized:
|
|
113
|
+
raise FlowMeshError("worker pull expects cpu|gpu|ssh-cpu|ssh-gpu|all")
|
|
114
|
+
|
|
115
|
+
selected_images = builder_images if builder else images
|
|
116
|
+
valid_source = builder_images if builder and builder_images is not None else images
|
|
117
|
+
valid = set([*valid_source, "all"])
|
|
118
|
+
invalid = [kind for kind in normalized if kind not in valid]
|
|
119
|
+
if invalid:
|
|
120
|
+
unique = ", ".join(sorted(set(invalid)))
|
|
121
|
+
raise FlowMeshError(f"Invalid kind(s): {unique}")
|
|
122
|
+
|
|
123
|
+
if "all" in normalized:
|
|
124
|
+
return list((selected_images or {}).values())
|
|
125
|
+
|
|
126
|
+
requested = set(normalized)
|
|
127
|
+
if builder and builder_images is not None:
|
|
128
|
+
invalid_builder = [kind for kind in requested if kind not in builder_images]
|
|
129
|
+
if invalid_builder:
|
|
130
|
+
unique = ", ".join(sorted(invalid_builder))
|
|
131
|
+
raise FlowMeshError(f"No builder image for: {unique}")
|
|
132
|
+
return [builder_images[kind] for kind in builder_images if kind in requested]
|
|
133
|
+
|
|
134
|
+
return [images[kind] for kind in images if kind in requested]
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def pull_images(images: list[str]) -> None:
    """Run ``docker pull`` for each image, stopping at the first failure.

    ``ensure_docker_available()`` is called once before any pull. The docker
    output is not captured.

    Raises:
        FlowMeshError: If a ``docker pull`` exits with a non-zero status.
    """
    ensure_docker_available()
    for ref in images:
        completed = subprocess.run(["docker", "pull", ref], text=True, check=False)
        if completed.returncode == 0:
            continue
        raise FlowMeshError(f"Failed to pull image: {ref}")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def detect_gpu_targets(targets: str) -> list[str]:
    """Turn a GPU target spec into a list of GPU id strings.

    Args:
        targets: Either ``"all"`` or a comma-separated list of GPU ids.

    Returns:
        For a comma-separated spec, the stripped non-empty items (not
        validated). For ``"all"``, the indices reported by ``nvidia-smi``, or
        an empty list when ``nvidia-smi`` is missing or exits non-zero.
    """

    def _non_blank(chunks: list[str]) -> list[str]:
        stripped = (chunk.strip() for chunk in chunks)
        return [chunk for chunk in stripped if chunk]

    if targets != "all":
        return _non_blank(targets.split(","))

    query = ["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"]
    try:
        probe = subprocess.run(query, capture_output=True, text=True, check=False)
    except FileNotFoundError:
        # nvidia-smi is not installed.
        return []
    if probe.returncode == 0:
        return _non_blank(probe.stdout.splitlines())
    return []
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _payloads_for_worker_create(
|
|
166
|
+
kind: str,
|
|
167
|
+
count: int,
|
|
168
|
+
targets: str,
|
|
169
|
+
config_paths: list[Path] | None,
|
|
170
|
+
config_raw: list[str] | None,
|
|
171
|
+
) -> list[tuple[str, str]]:
|
|
172
|
+
if count < 1:
|
|
173
|
+
raise FlowMeshError("Worker count must be at least 1.")
|
|
174
|
+
if config_paths is not None or config_raw is not None:
|
|
175
|
+
payloads: list[tuple[str, str]] = []
|
|
176
|
+
for config_path in config_paths or []:
|
|
177
|
+
if not config_path.exists():
|
|
178
|
+
raise FlowMeshError(f"Config not found: {config_path}")
|
|
179
|
+
_extend_payloads(
|
|
180
|
+
payloads, f"worker from {config_path.name}", config_path.read_text()
|
|
181
|
+
)
|
|
182
|
+
for idx, raw in enumerate(config_raw or []):
|
|
183
|
+
_extend_payloads(payloads, f"worker from raw#{idx}", raw)
|
|
184
|
+
return payloads
|
|
185
|
+
|
|
186
|
+
if kind == "cpu":
|
|
187
|
+
return [
|
|
188
|
+
(
|
|
189
|
+
json.dumps(
|
|
190
|
+
{
|
|
191
|
+
"provider": "docker",
|
|
192
|
+
"init_on_start": True,
|
|
193
|
+
"worker_config": {
|
|
194
|
+
"worker_type": "cpu",
|
|
195
|
+
"worker_alias": f"worker_cpu_{idx}",
|
|
196
|
+
},
|
|
197
|
+
}
|
|
198
|
+
),
|
|
199
|
+
"CPU worker",
|
|
200
|
+
)
|
|
201
|
+
for idx in range(count)
|
|
202
|
+
]
|
|
203
|
+
|
|
204
|
+
if kind == "gpu":
|
|
205
|
+
raw_gpu_ids = detect_gpu_targets(targets)
|
|
206
|
+
gpu_ids: list[int] = []
|
|
207
|
+
for raw_gpu_id in raw_gpu_ids:
|
|
208
|
+
if not raw_gpu_id.isdigit():
|
|
209
|
+
raise FlowMeshError(f"Invalid GPU id '{raw_gpu_id}'")
|
|
210
|
+
gpu_ids.append(int(raw_gpu_id))
|
|
211
|
+
|
|
212
|
+
if count > 1:
|
|
213
|
+
if count != len(gpu_ids):
|
|
214
|
+
raise FlowMeshError(
|
|
215
|
+
f"GPU worker count {count} does not match "
|
|
216
|
+
f"detected GPU targets: {gpu_ids}. "
|
|
217
|
+
f"Consider setting count={len(gpu_ids)} or specifying exactly "
|
|
218
|
+
f"{count} GPU targets."
|
|
219
|
+
)
|
|
220
|
+
gpu_payloads = [
|
|
221
|
+
(
|
|
222
|
+
json.dumps(
|
|
223
|
+
{
|
|
224
|
+
"provider": "docker",
|
|
225
|
+
"init_on_start": True,
|
|
226
|
+
"worker_config": {
|
|
227
|
+
"worker_type": "gpu",
|
|
228
|
+
"cuda_devices": [gpu_id],
|
|
229
|
+
"worker_alias": f"worker_gpu_{gpu_id}",
|
|
230
|
+
},
|
|
231
|
+
}
|
|
232
|
+
),
|
|
233
|
+
f"GPU worker for GPU {gpu_id}",
|
|
234
|
+
)
|
|
235
|
+
for gpu_id in gpu_ids
|
|
236
|
+
]
|
|
237
|
+
else:
|
|
238
|
+
worker_suffix = "all" if targets == "all" else "_".join(raw_gpu_ids)
|
|
239
|
+
gpu_payloads = [
|
|
240
|
+
(
|
|
241
|
+
json.dumps(
|
|
242
|
+
{
|
|
243
|
+
"provider": "docker",
|
|
244
|
+
"init_on_start": True,
|
|
245
|
+
"worker_config": {
|
|
246
|
+
"worker_type": "gpu",
|
|
247
|
+
"cuda_devices": gpu_ids,
|
|
248
|
+
"worker_alias": f"worker_gpu_{worker_suffix}",
|
|
249
|
+
},
|
|
250
|
+
}
|
|
251
|
+
),
|
|
252
|
+
f"GPU worker for GPUs {', '.join(raw_gpu_ids)}",
|
|
253
|
+
)
|
|
254
|
+
]
|
|
255
|
+
|
|
256
|
+
if not gpu_payloads:
|
|
257
|
+
raise FlowMeshError("No GPUs detected or specified.")
|
|
258
|
+
return gpu_payloads
|
|
259
|
+
|
|
260
|
+
raise FlowMeshError("worker up expects kind cpu|gpu or use --config")
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _extend_payloads(
|
|
264
|
+
payloads: list[tuple[str, str]], label_prefix: str, payload_text: str
|
|
265
|
+
) -> None:
|
|
266
|
+
try:
|
|
267
|
+
payload_obj = yaml.safe_load(payload_text)
|
|
268
|
+
except Exception:
|
|
269
|
+
payload_obj = None
|
|
270
|
+
|
|
271
|
+
if isinstance(payload_obj, list):
|
|
272
|
+
for idx, item in enumerate(payload_obj):
|
|
273
|
+
payloads.append((json.dumps(item), f"{label_prefix}#{idx}"))
|
|
274
|
+
return
|
|
275
|
+
if isinstance(payload_obj, dict):
|
|
276
|
+
payloads.append((json.dumps(payload_obj), label_prefix))
|
|
277
|
+
return
|
|
278
|
+
payloads.append((payload_text, label_prefix))
|