flowmesh-sdk-stack 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,112 @@
1
+ """FlowMesh Docker image reference management.
2
+
3
+ Provides the canonical mapping from build targets to image references,
4
+ used by both the CLI dev commands and programmatic build/deploy scripts.
5
+ """
6
+
7
BUILD_TARGETS: dict[str, str] = {
    "flowmesh_server": "{registry}/flowmesh_server:{version}",
    "flowmesh_worker_cpu": "{registry}/flowmesh_worker:{version}-cpu",
    "flowmesh_worker_gpu_builder": "{registry}/flowmesh_worker_builder:{version}-gpu",
    "flowmesh_worker_gpu": "{registry}/flowmesh_worker:{version}-gpu",
    "flowmesh_ssh_cpu": "{registry}/flowmesh_ssh:{version}-cpu",
    "flowmesh_ssh_gpu": "{registry}/flowmesh_ssh:{version}-gpu",
}
"""Image reference template per build target (``{registry}`` / ``{version}`` placeholders)."""

CACHE_TARGETS: dict[str, str] = {
    "flowmesh_server": "{registry}/flowmesh_server:{scope}",
    "flowmesh_worker_cpu": "{registry}/flowmesh_worker:{scope}-cpu",
    "flowmesh_worker_gpu_builder": "{registry}/flowmesh_worker_builder:{scope}-gpu",
    "flowmesh_worker_gpu": "{registry}/flowmesh_worker:{scope}-gpu",
    "flowmesh_ssh_cpu": "{registry}/flowmesh_ssh:{scope}-cpu",
    "flowmesh_ssh_gpu": "{registry}/flowmesh_ssh:{scope}-gpu",
}
"""Registry cache reference template per build target (``{registry}`` / ``{scope}`` placeholders)."""

BUILD_GROUPS: dict[str, list[str]] = {
    "server": ["flowmesh_server"],
    "workers": [
        "flowmesh_worker_cpu",
        "flowmesh_worker_gpu",
        "flowmesh_ssh_cpu",
        "flowmesh_ssh_gpu",
    ],
    "builders": ["flowmesh_worker_gpu_builder"],
}
"""Named groups of build targets."""
# The "default" group is the server targets followed by the worker targets;
# the "builders" group is not part of it.
BUILD_GROUPS["default"] = [*BUILD_GROUPS["server"], *BUILD_GROUPS["workers"]]

BUILD_DEPENDENCIES: dict[str, list[str]] = {
    "flowmesh_worker_gpu": ["flowmesh_worker_gpu_builder"],
}
"""Helper targets that have to be configured together with the keyed target."""

PUSH_PLATFORMS: dict[str, str] = dict.fromkeys(
    BUILD_TARGETS, "linux/amd64,linux/arm64"
)
"""Platform list published by default for every build target."""
51
+
52
+
53
def get_image_ref(registry: str, version: str, target: str) -> str:
    """Build the Docker image reference for ``target``.

    Args:
        registry: Container registry (e.g. ``ghcr.io/mlsys-io``).
        version: Image version tag (e.g. ``dev``, ``0.1.0``).
        target: Build target name; has to be a key of :data:`BUILD_TARGETS`.

    Returns:
        The target's template from :data:`BUILD_TARGETS` with ``registry`` and
        ``version`` substituted.

    Raises:
        ValueError: If the target is unknown.
    """
    try:
        template = BUILD_TARGETS[target]
    except KeyError:
        raise ValueError(f"Unknown build target: {target}") from None
    return template.format(registry=registry, version=version)
67
+
68
+
69
def get_cache_ref(registry: str, scope: str, target: str) -> str:
    """Build the registry cache reference for ``target``.

    The scope is stripped of surrounding whitespace and normalised so the
    resulting tag component is either ``cache`` or starts with ``cache-``:
    an empty scope or ``cache`` maps to ``cache``, a scope already starting
    with ``cache-`` is used as is, anything else gets a ``cache-`` prefix.

    Raises:
        ValueError: If the target is not a key of :data:`CACHE_TARGETS`.
    """
    try:
        template = CACHE_TARGETS[target]
    except KeyError:
        raise ValueError(f"Unknown build target: {target}") from None

    cleaned = (scope or "").strip()
    if cleaned in ("", "cache"):
        tag_scope = "cache"
    else:
        tag_scope = cleaned if cleaned.startswith("cache-") else f"cache-{cleaned}"
    return template.format(registry=registry, scope=tag_scope)
82
+
83
+
84
def expand_build_targets(targets: list[str]) -> list[str]:
    """Expand explicit targets with any dependent helper targets.

    Dependencies are looked up in :data:`BUILD_DEPENDENCIES`. The returned list
    preserves first-seen order, contains each target once, and places helper
    targets ahead of any target that requires them.

    Args:
        targets: Explicitly requested build target names.

    Returns:
        The requested targets plus their (transitive) helper targets.
    """

    expanded: list[str] = []
    seen: set[str] = set()

    def _visit(target: str) -> None:
        if target in seen:
            return
        # Mark on entry rather than on exit: the output order is unchanged for
        # acyclic dependency tables, and a cyclic table can no longer cause
        # unbounded recursion.
        seen.add(target)
        for dep in BUILD_DEPENDENCIES.get(target, []):
            _visit(dep)
        expanded.append(target)

    for target in targets:
        _visit(target)
    return expanded
105
+
106
+
107
def get_push_platforms(target: str) -> str:
    """Return the default push platform list for ``target``.

    Raises:
        ValueError: If the target is not a key of :data:`PUSH_PLATFORMS`.
    """
    try:
        return PUSH_PLATFORMS[target]
    except KeyError:
        raise ValueError(f"Unknown build target: {target}") from None
@@ -0,0 +1,173 @@
1
+ """Node direct API client."""
2
+
3
+ import os
4
+ from typing import Any
5
+
6
+ import httpx
7
+ from flowmesh.exceptions import FlowMeshConnectionError, FlowMeshError
8
+
9
+ _DEFAULT_TIMEOUT = 30.0
10
+ _WORKER_CREATE_TIMEOUT = 600.0
11
+
12
+
13
def _raise_for_status(response: httpx.Response, method: str) -> None:
    """Raise :class:`FlowMeshError` when ``response`` has a status of 400 or above.

    The error text uses the ``detail`` (or, failing that, ``message``) field of
    a JSON object body; when neither is usable the whole body is stringified.
    """
    if response.status_code >= 400:
        try:
            payload = response.json()
        except Exception:
            # Body is not JSON: fall back to the raw text.
            payload = response.text

        detail = ""
        if isinstance(payload, dict):
            detail = payload.get("detail", "") or payload.get("message", "")
        message = detail if detail else str(payload)
        raise FlowMeshError(
            f"{method} {response.url} returned {response.status_code}: {message}"
        )
28
+
29
+
30
class NodeClient:
    """Client for a node's worker management HTTP API.

    Used by the CLI stack sub-package for worker lifecycle management. The
    client can be used as a context manager; leaving the ``with`` block closes
    the underlying HTTP client.

    Args:
        base_url: Node HTTP endpoint (e.g. ``http://localhost:8000``). When
            empty or ``None`` the URL from ``_default_node_url()`` is used.
        token: Optional bearer token for authentication. When empty or ``None``
            the ``FLOWMESH_API_KEY`` environment variable is used, if set.
        timeout: Request timeout in seconds.
    """

    _WORKERS_PATH = "/api/v1/stack/workers"

    def __init__(
        self,
        base_url: str | None = None,
        token: str | None = None,
        timeout: float = _DEFAULT_TIMEOUT,
    ) -> None:
        resolved_url = base_url or _default_node_url()
        resolved_token = token or os.getenv("FLOWMESH_API_KEY") or None
        headers: dict[str, str] = {"Accept": "application/json"}
        if resolved_token:
            headers["Authorization"] = f"Bearer {resolved_token}"
        self._base_url = resolved_url.rstrip("/")
        self._http = httpx.Client(
            base_url=self._base_url,
            headers=headers,
            timeout=httpx.Timeout(timeout),
        )

    # -- Workers --------------------------------------------------------- #

    def list_workers(self) -> list[dict[str, Any]]:
        """List all workers managed by this node."""
        return self._request("GET", self._WORKERS_PATH)

    def create_worker(self, config: str | dict[str, Any]) -> dict[str, Any]:
        """Create a worker from a config.

        Args:
            config: Worker init config. A dict is sent as a JSON body; a string
                is sent verbatim as the request content with a JSON content
                type.

        Returns:
            The decoded response body.
        """
        if isinstance(config, dict):
            return self._request(
                "POST",
                self._WORKERS_PATH,
                json_body=config,
                timeout=_WORKER_CREATE_TIMEOUT,
            )
        return self._request(
            "POST",
            self._WORKERS_PATH,
            data=config,
            headers={"Content-Type": "application/json"},
            timeout=_WORKER_CREATE_TIMEOUT,
        )

    def start_worker(self, name: str) -> None:
        """Start a stopped worker."""
        self._request("POST", f"{self._worker_path(name)}/start")

    def stop_worker(self, name: str) -> None:
        """Stop a running worker."""
        self._request("POST", f"{self._worker_path(name)}/stop")

    def destroy_worker(self, name: str) -> None:
        """Destroy a single worker, removing its container."""
        self._request("DELETE", self._worker_path(name))

    def destroy_all_workers(self, *, ignore_unreachable: bool = False) -> bool:
        """Destroy all workers managed by this node.

        Returns ``True`` on success, ``False`` when ``ignore_unreachable=True``
        and the FlowMesh server was unreachable. Other errors propagate.
        """
        try:
            self._request(
                "DELETE",
                self._WORKERS_PATH,
                headers={"Content-Type": "application/json"},
            )
        except FlowMeshConnectionError:
            if not ignore_unreachable:
                raise
            return False
        return True

    def worker_names(self) -> list[str]:
        """Return the names of all workers reported by :meth:`list_workers`.

        Entries may be plain strings or dicts carrying a non-empty string
        ``name``; anything else is skipped.
        """
        # An empty response body decodes to None; treat it as "no workers".
        data = self.list_workers() or []
        names: list[str] = []
        for item in data:
            if isinstance(item, str):
                names.append(item)
            elif isinstance(item, dict):
                name = item.get("name")
                if isinstance(name, str) and name:
                    names.append(name)
        return names

    # -- Transport ------------------------------------------------------- #

    @classmethod
    def _worker_path(cls, name: str) -> str:
        """Return the URL path of a single worker resource.

        The name is percent-encoded as one path segment so characters such as
        ``/``, ``?`` or ``#`` cannot alter the request target.
        """
        from urllib.parse import quote

        return f"{cls._WORKERS_PATH}/{quote(name, safe='')}"

    def _request(
        self,
        method: str,
        path: str,
        json_body: Any = None,
        data: str | bytes | None = None,
        headers: dict[str, str] | None = None,
        timeout: float | None = None,
    ) -> Any:
        """Send one request and return the decoded JSON body.

        Args:
            method: HTTP method.
            path: Request path, relative to the client's base URL.
            json_body: Optional object sent as the JSON body.
            data: Optional raw request content.
            headers: Optional per-request headers.
            timeout: Optional per-request timeout in seconds.

        Returns:
            The decoded JSON response, or ``None`` for an empty body.

        Raises:
            FlowMeshConnectionError: If the connection cannot be established.
            FlowMeshError: If the response status is 400 or above.
        """
        kwargs: dict[str, Any] = {}
        if json_body is not None:
            kwargs["json"] = json_body
        if data is not None:
            kwargs["content"] = data
        if headers:
            kwargs["headers"] = headers
        if timeout is not None:
            kwargs["timeout"] = timeout
        try:
            response = self._http.request(method, path, **kwargs)
        except httpx.ConnectError as exc:
            # Chain the cause so the underlying transport error stays visible.
            raise FlowMeshConnectionError(
                f"Failed to connect to {self._base_url}{path}: {exc}"
            ) from exc
        _raise_for_status(response, method)
        if not response.content:
            return None
        return response.json()

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._http.close()

    def __enter__(self) -> "NodeClient":
        return self

    def __exit__(self, *args: Any) -> None:
        self.close()
169
+
170
+
171
+ def _default_node_url() -> str:
172
+ port = os.getenv("SERVER_HTTP_PORT", os.getenv("SERVER_APP_PORT", "8000"))
173
+ return f"http://localhost:{port}"
@@ -0,0 +1,21 @@
1
+ """Filesystem path helpers shared by FlowMesh tooling."""
2
+
3
+ from pathlib import Path
4
+
5
+
6
def resolve_path(value: str, default: str, base_dir: Path) -> Path:
    """Turn ``value`` (or ``default`` when ``value`` is blank) into a path.

    ``~`` is expanded. An absolute result is returned unchanged; a relative
    one is joined onto ``base_dir`` and resolved.
    """
    candidate = Path(value.strip() or default).expanduser()
    if candidate.is_absolute():
        return candidate
    return (base_dir / candidate).resolve()
11
+
12
+
13
def ensure_dir(path: Path) -> None:
    """Make sure ``path`` exists as a directory, creating missing parents."""
    path.mkdir(exist_ok=True, parents=True)
16
+
17
+
18
def ensure_file(path: Path) -> None:
    """Make sure ``path`` exists as a file, creating its parent directories."""
    parent_dir = path.parent
    parent_dir.mkdir(exist_ok=True, parents=True)
    # Path.touch() defaults to exist_ok=True, so an existing file is left in place.
    path.touch()
@@ -0,0 +1,278 @@
1
+ """Worker lifecycle helpers for direct node operations."""
2
+
3
+ import json
4
+ import subprocess
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import yaml
10
+ from flowmesh.exceptions import FlowMeshError
11
+
12
+ from .docker import ensure_docker_available
13
+ from .node_client import NodeClient
14
+
15
+ _MAX_PARALLEL_REQUESTS = 16
16
+
17
+
18
def operate_workers(
    client: NodeClient,
    names: list[str],
    operation: str,
) -> list[str]:
    """Apply a start/stop/destroy operation to one or more workers.

    Requests are issued concurrently, using at most ``_MAX_PARALLEL_REQUESTS``
    threads.

    Args:
        client: Node client used to issue the requests.
        names: Worker names, or the single entry ``"all"`` to target every
            worker reported by ``client.worker_names()``.
        operation: One of ``"start"``, ``"stop"`` or ``"destroy"``.

    Returns:
        Names of the workers the operation succeeded for, in completion order.
        Empty when there is nothing to operate on.

    Raises:
        FlowMeshError: If the operation is unsupported, if ``"all"`` is mixed
            with other names, or if any request failed (the per-worker
            messages are joined with ``"; "``).
    """
    if not names:
        return []

    # Validate the operation once, before any network traffic, instead of
    # failing separately inside every worker thread.
    method_names = {
        "start": "start_worker",
        "stop": "stop_worker",
        "destroy": "destroy_worker",
    }
    method_name = method_names.get(operation)
    if method_name is None:
        raise FlowMeshError(f"Unsupported worker operation: {operation}")
    apply = getattr(client, method_name)

    if "all" in names:
        if len(names) != 1:
            raise FlowMeshError("Use either 'all' or worker names, not both.")
        names = client.worker_names()
        if not names:
            return []

    successes: list[str] = []
    errors: list[str] = []
    max_workers = min(len(names), _MAX_PARALLEL_REQUESTS)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(apply, name): name for name in names}
        for future in as_completed(futures):
            name = futures[future]
            try:
                future.result()
            except Exception as exc:
                # Collect every failure so one bad worker does not hide the rest.
                errors.append(f"{name}: {exc}")
            else:
                successes.append(name)
    if errors:
        raise FlowMeshError("; ".join(errors))
    return successes
59
+
60
+
61
def create_workers(
    client: NodeClient,
    kind: str = "cpu",
    count: int = 1,
    targets: str = "all",
    config_paths: list[Path] | None = None,
    config_raw: list[str] | None = None,
) -> list[tuple[str, dict[str, Any]]]:
    """Create node workers from configs or from the built-in cpu/gpu presets.

    For ``kind="gpu"`` with ``count`` equal to 1, a single worker covering the
    specified GPU targets is created. With ``count`` greater than 1, one worker
    is created per GPU target and the number of targets must equal ``count``.

    Returns:
        ``(label, response)`` pairs for the created workers, in completion
        order.

    Raises:
        FlowMeshError: If any creation request failed; the per-worker messages
            are joined with ``"; "``.
    """
    requests = _payloads_for_worker_create(
        kind=kind,
        count=count,
        targets=targets,
        config_paths=config_paths,
        config_raw=config_raw,
    )
    if not requests:
        return []

    pool_size = min(len(requests), _MAX_PARALLEL_REQUESTS)
    created: list[tuple[str, dict[str, Any]]] = []
    failures: list[str] = []
    with ThreadPoolExecutor(max_workers=pool_size) as pool:
        pending = {}
        for body, label in requests:
            pending[pool.submit(client.create_worker, body)] = label
        for done in as_completed(pending):
            label = pending[done]
            try:
                response = done.result()
            except Exception as exc:
                failures.append(f"{label}: {exc}")
            else:
                created.append((label, response))

    if failures:
        raise FlowMeshError("; ".join(failures))
    return created
102
+
103
+
104
+ def select_worker_images(
105
+ kinds: list[str],
106
+ images: dict[str, str],
107
+ builder_images: dict[str, str] | None = None,
108
+ builder: bool = False,
109
+ ) -> list[str]:
110
+ """Resolve requested worker image kinds into concrete image refs."""
111
+ normalized = [kind.strip().lower() for kind in kinds if kind.strip()]
112
+ if not normalized:
113
+ raise FlowMeshError("worker pull expects cpu|gpu|ssh-cpu|ssh-gpu|all")
114
+
115
+ selected_images = builder_images if builder else images
116
+ valid_source = builder_images if builder and builder_images is not None else images
117
+ valid = set([*valid_source, "all"])
118
+ invalid = [kind for kind in normalized if kind not in valid]
119
+ if invalid:
120
+ unique = ", ".join(sorted(set(invalid)))
121
+ raise FlowMeshError(f"Invalid kind(s): {unique}")
122
+
123
+ if "all" in normalized:
124
+ return list((selected_images or {}).values())
125
+
126
+ requested = set(normalized)
127
+ if builder and builder_images is not None:
128
+ invalid_builder = [kind for kind in requested if kind not in builder_images]
129
+ if invalid_builder:
130
+ unique = ", ".join(sorted(invalid_builder))
131
+ raise FlowMeshError(f"No builder image for: {unique}")
132
+ return [builder_images[kind] for kind in builder_images if kind in requested]
133
+
134
+ return [images[kind] for kind in images if kind in requested]
135
+
136
+
137
def pull_images(images: list[str]) -> None:
    """Run ``docker pull`` for each image reference, in order.

    Raises:
        FlowMeshError: As soon as one pull exits with a non-zero status.
    """
    ensure_docker_available()
    for ref in images:
        completed = subprocess.run(["docker", "pull", ref], text=True, check=False)
        if completed.returncode:
            raise FlowMeshError(f"Failed to pull image: {ref}")
144
+
145
+
146
def detect_gpu_targets(targets: str) -> list[str]:
    """Turn a GPU selection string into a list of GPU id strings.

    ``"all"`` asks ``nvidia-smi`` for the GPU indices; an empty list is
    returned when the tool is missing or exits with a non-zero status. Any
    other value is split on commas. Entries are stripped and blank ones
    dropped.
    """
    if targets == "all":
        query = ["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"]
        try:
            probe = subprocess.run(
                query, capture_output=True, text=True, check=False
            )
        except FileNotFoundError:
            return []
        if probe.returncode != 0:
            return []
        entries = probe.stdout.splitlines()
    else:
        entries = targets.split(",")

    return [entry for entry in map(str.strip, entries) if entry]
163
+
164
+
165
def _payloads_for_worker_create(
    kind: str,
    count: int,
    targets: str,
    config_paths: list[Path] | None,
    config_raw: list[str] | None,
) -> list[tuple[str, str]]:
    """Build the ``(payload, label)`` pairs for a worker-create request batch.

    When ``config_paths`` or ``config_raw`` is given, payloads come from those
    configs and ``kind``/``targets`` are ignored. Otherwise a built-in preset
    is used: ``count`` CPU workers, or GPU workers as described below.

    For ``kind="gpu"``, ``count == 1`` produces one worker covering all
    resolved GPU ids; ``count > 1`` produces one worker per GPU id and requires
    ``count`` to equal the number of ids.

    Args:
        kind: Preset kind, ``"cpu"`` or ``"gpu"``.
        count: Number of workers to create; must be at least 1.
        targets: GPU selection string passed to ``detect_gpu_targets``.
        config_paths: Optional config files to read payloads from.
        config_raw: Optional raw config strings.

    Returns:
        A list of ``(payload, label)`` tuples.

    Raises:
        FlowMeshError: On an invalid count, a missing config file, an unknown
            kind, an invalid GPU id, no GPUs, or a count/target mismatch.
    """
    if count < 1:
        raise FlowMeshError("Worker count must be at least 1.")

    if config_paths is not None or config_raw is not None:
        payloads: list[tuple[str, str]] = []
        for config_path in config_paths or []:
            if not config_path.exists():
                raise FlowMeshError(f"Config not found: {config_path}")
            _extend_payloads(
                payloads, f"worker from {config_path.name}", config_path.read_text()
            )
        for idx, raw in enumerate(config_raw or []):
            _extend_payloads(payloads, f"worker from raw#{idx}", raw)
        return payloads

    if kind == "cpu":
        return [
            (
                _preset_worker_payload(
                    {"worker_type": "cpu", "worker_alias": f"worker_cpu_{idx}"}
                ),
                "CPU worker",
            )
            for idx in range(count)
        ]

    if kind == "gpu":
        raw_gpu_ids = detect_gpu_targets(targets)
        # Check for "no GPUs" before building anything: the single-worker
        # branch below always yields one payload, so a check on the payload
        # list would never fire and a worker with no devices would be sent.
        if not raw_gpu_ids:
            raise FlowMeshError("No GPUs detected or specified.")

        gpu_ids: list[int] = []
        for raw_gpu_id in raw_gpu_ids:
            # str.isdigit() alone accepts characters such as "²" that int()
            # rejects, so restrict ids to ASCII digits.
            if not (raw_gpu_id.isascii() and raw_gpu_id.isdigit()):
                raise FlowMeshError(f"Invalid GPU id '{raw_gpu_id}'")
            gpu_ids.append(int(raw_gpu_id))

        if count > 1:
            if count != len(gpu_ids):
                raise FlowMeshError(
                    f"GPU worker count {count} does not match "
                    f"detected GPU targets: {gpu_ids}. "
                    f"Consider setting count={len(gpu_ids)} or specifying exactly "
                    f"{count} GPU targets."
                )
            return [
                (
                    _preset_worker_payload(
                        {
                            "worker_type": "gpu",
                            "cuda_devices": [gpu_id],
                            "worker_alias": f"worker_gpu_{gpu_id}",
                        }
                    ),
                    f"GPU worker for GPU {gpu_id}",
                )
                for gpu_id in gpu_ids
            ]

        worker_suffix = "all" if targets == "all" else "_".join(raw_gpu_ids)
        return [
            (
                _preset_worker_payload(
                    {
                        "worker_type": "gpu",
                        "cuda_devices": gpu_ids,
                        "worker_alias": f"worker_gpu_{worker_suffix}",
                    }
                ),
                f"GPU worker for GPUs {', '.join(raw_gpu_ids)}",
            )
        ]

    raise FlowMeshError("worker up expects kind cpu|gpu or use --config")


def _preset_worker_payload(worker_config: dict[str, Any]) -> str:
    """Serialize a built-in docker worker preset around ``worker_config``."""
    return json.dumps(
        {
            "provider": "docker",
            "init_on_start": True,
            "worker_config": worker_config,
        }
    )
261
+
262
+
263
def _extend_payloads(
    payloads: list[tuple[str, str]], label_prefix: str, payload_text: str
) -> None:
    """Append the payload(s) described by ``payload_text`` to ``payloads``.

    The text is parsed with ``yaml.safe_load``. A list contributes one
    JSON-encoded entry per item, labelled ``<label_prefix>#<index>``; a
    mapping contributes a single JSON-encoded entry. Anything else, including
    text that fails to parse, is appended unchanged.
    """
    try:
        parsed = yaml.safe_load(payload_text)
    except Exception:
        # Best effort: unparseable text is forwarded as-is below.
        parsed = None

    if isinstance(parsed, list):
        for position, entry in enumerate(parsed):
            payloads.append((json.dumps(entry), f"{label_prefix}#{position}"))
    elif isinstance(parsed, dict):
        payloads.append((json.dumps(parsed), label_prefix))
    else:
        payloads.append((payload_text, label_prefix))