uxarray-mcp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,322 @@
1
+ """Configuration management for remote execution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any, Optional
9
+ from uuid import UUID
10
+
11
+ import yaml
12
+
13
# Per-user config file under the home directory. discover_config_path()
# checks it after the cwd-local ``config.yaml`` and before the repo fallback.
USER_CONFIG_PATH = Path.home() / ".config" / "uxarray-mcp" / "config.yaml"
14
+
15
+
16
def discover_config_path() -> Path | None:
    """Locate the config file to load, or return ``None`` if there is none.

    Candidates are probed in this order and the first one that exists wins:

    1. ``$UXARRAY_MCP_CONFIG`` (explicit override, ``~`` expanded)
    2. ``./config.yaml`` (current working directory — project-local)
    3. ``~/.config/uxarray-mcp/config.yaml`` (user install)
    4. ``<repo_root>/config.yaml`` (editable install fallback)

    The cwd-local file is deliberately ranked above the user config so that
    running from inside a checkout picks up the repo's config even when an
    empty user config was previously written by ``uxarray-mcp setup``.

    An override that is set but points at a missing file is skipped, and the
    search continues with the remaining candidates.
    """

    def _candidates():
        # Yield lazily so later locations are only computed when the
        # earlier ones did not exist.
        override = os.environ.get("UXARRAY_MCP_CONFIG")
        if override:
            yield Path(override).expanduser()
        yield Path.cwd() / "config.yaml"
        yield USER_CONFIG_PATH
        yield Path(__file__).resolve().parent.parent.parent.parent / "config.yaml"

    for location in _candidates():
        if location.exists():
            return location
    return None
49
+
50
+
51
def discover_config_search_paths() -> list[Path]:
    """List every location ``discover_config_path`` looks at, in order.

    Intended for diagnostics: ``endpoints list`` prints this when no
    endpoints are configured, so the user can see which file was used or
    which paths were searched. Nothing is checked for existence here.

    The ``$UXARRAY_MCP_CONFIG`` entry is only present when that variable is
    set to a non-empty value.
    """
    override = os.environ.get("UXARRAY_MCP_CONFIG")
    explicit = [Path(override).expanduser()] if override else []
    return [
        *explicit,
        Path.cwd() / "config.yaml",
        USER_CONFIG_PATH,
        Path(__file__).resolve().parent.parent.parent.parent / "config.yaml",
    ]
66
+
67
+
68
# Canonical execution-mode names accepted by normalize_execution_mode().
_VALID_EXECUTION_MODES = {"local", "hpc", "auto"}
# Legacy spellings, rewritten to their canonical name before validation.
_EXECUTION_MODE_ALIASES = {"remote": "hpc"}
70
+
71
+
72
+ def _is_uuid(value: str) -> bool:
73
+ try:
74
+ UUID(value)
75
+ except ValueError:
76
+ return False
77
+ return True
78
+
79
+
80
@dataclass(frozen=True)
class EndpointProfile:
    """Immutable record describing one named Globus Compute endpoint."""

    # Profile name.
    name: str
    # Endpoint identifier string.
    endpoint_id: str
    # Path prefixes associated with this endpoint; empty when none are given.
    path_prefixes: tuple[str, ...] = ()
    # Per-endpoint timeout in seconds; None means no override is set.
    timeout_seconds: Optional[int] = None
88
+
89
+
90
def normalize_execution_mode(execution_mode: str) -> str:
    """Translate *execution_mode* to its canonical spelling.

    ``remote`` was the earlier name of the HPC-only mode; it is still
    accepted and mapped to ``hpc`` so older configs and tests keep working.

    Raises
    ------
    ValueError
        If the (alias-resolved) mode is not one of the valid modes.
    """
    resolved = _EXECUTION_MODE_ALIASES.get(execution_mode, execution_mode)
    if resolved in _VALID_EXECUTION_MODES:
        return resolved

    allowed = ", ".join(sorted(_VALID_EXECUTION_MODES))
    raise ValueError(
        f"Invalid execution mode {execution_mode!r}. "
        f"Must be one of: {allowed}"
    )
103
+
104
+
105
class HPCConfig:
    """Settings that decide whether, and on which endpoint, work runs remotely.

    Parameters
    ----------
    endpoint_id : str | None
        Globus Compute endpoint UUID selected for this config, if any.
    execution_mode : str
        ``"local"``, ``"hpc"`` or ``"auto"``. The legacy alias ``"remote"``
        is accepted and stored as ``"hpc"``.
    timeout_seconds : int
        Timeout for remote execution in seconds.
    endpoints : dict[str, EndpointProfile] | None
        Named endpoint profiles keyed by profile name. A falsy value is
        stored as an empty dict.
    default_endpoint : str | None
        Profile name to fall back on when no endpoint is requested.
    endpoint_name : str | None
        Display name that goes with ``endpoint_id``.

    Raises
    ------
    ValueError
        If ``execution_mode`` is not a valid mode or alias.

    Examples
    --------
    >>> config = HPCConfig(endpoint_id="abc-123", execution_mode="hpc")
    >>> config.has_endpoint
    True
    """

    def __init__(
        self,
        endpoint_id: Optional[str] = None,
        execution_mode: str = "local",
        timeout_seconds: int = 300,
        endpoints: dict[str, EndpointProfile] | None = None,
        default_endpoint: str | None = None,
        endpoint_name: str | None = None,
    ):
        self.endpoints = endpoints or {}
        self.default_endpoint = default_endpoint
        self.endpoint_name = endpoint_name
        self.endpoint_id = endpoint_id
        # Validates the mode and folds legacy aliases into the canonical name.
        self.execution_mode = normalize_execution_mode(execution_mode)
        self.timeout_seconds = timeout_seconds

    @property
    def has_endpoint(self) -> bool:
        """True when an endpoint id is set or at least one profile exists."""
        if self.endpoint_id is not None:
            return True
        return bool(self.endpoints)

    @property
    def should_use_remote(self) -> bool:
        """True when the mode permits remote work and an endpoint is available.

        ``"hpc"`` and ``"auto"`` both require a configured endpoint; every
        other mode (including ``"local"``) never runs remotely.
        """
        return self.execution_mode in ("hpc", "auto") and self.has_endpoint

    @property
    def endpoint_names(self) -> list[str]:
        """Configured endpoint profile names, sorted."""
        return sorted(self.endpoints)

    def _resolve_explicit(self, endpoint: str) -> EndpointProfile:
        """Resolve a caller-supplied endpoint name or UUID, or raise."""
        # 1. A configured profile name.
        if endpoint in self.endpoints:
            return self.endpoints[endpoint]

        # 2. The endpoint id already selected on this config.
        if endpoint == self.endpoint_id:
            return EndpointProfile(
                name=self.endpoint_name or endpoint,
                endpoint_id=endpoint,
                timeout_seconds=self.timeout_seconds,
            )

        # 3. Any other well-formed UUID is taken as an ad-hoc endpoint.
        if _is_uuid(endpoint):
            return EndpointProfile(
                name=endpoint,
                endpoint_id=endpoint,
                timeout_seconds=self.timeout_seconds,
            )

        configured = ", ".join(self.endpoint_names) or "none"
        raise ValueError(
            f"Unknown endpoint {endpoint!r}. "
            f"Configured endpoint names: {configured}. "
            "Pass a configured endpoint name or a Globus Compute endpoint UUID."
        )

    def resolve_endpoint(
        self, endpoint: str | None = None, path: str | None = None
    ) -> EndpointProfile | None:
        """Pick the endpoint profile to use.

        With a non-empty *endpoint*, it is matched against profile names,
        then this config's own ``endpoint_id``, then accepted as a raw UUID.
        Without one, the order is: ``default_endpoint`` (if it names a
        profile), this config's ``endpoint_id``, the only profile when
        exactly one is configured, else ``None``.

        *path* is accepted but not consulted by this method.

        Raises
        ------
        ValueError
            If *endpoint* is given but is neither a known name nor a UUID.
        """
        if endpoint:
            return self._resolve_explicit(endpoint)

        fallback = self.default_endpoint
        if fallback and fallback in self.endpoints:
            return self.endpoints[fallback]

        if self.endpoint_id is not None:
            return EndpointProfile(
                name=self.endpoint_name or "default",
                endpoint_id=self.endpoint_id,
                timeout_seconds=self.timeout_seconds,
            )

        if len(self.endpoints) == 1:
            (only_profile,) = self.endpoints.values()
            return only_profile

        return None

    def for_endpoint(
        self, endpoint: str | None = None, path: str | None = None
    ) -> "HPCConfig":
        """Return a new config bound to the resolved endpoint profile.

        When nothing resolves, the copy has ``endpoint_id=None`` and no
        ``endpoint_name``. Otherwise the profile's id and name are used, and
        the profile's timeout replaces this config's timeout when it is set
        to a truthy value.
        """
        profile = self.resolve_endpoint(endpoint=endpoint, path=path)
        shared = {
            "execution_mode": self.execution_mode,
            "endpoints": self.endpoints,
            "default_endpoint": self.default_endpoint,
        }
        if profile is None:
            return HPCConfig(
                endpoint_id=None,
                timeout_seconds=self.timeout_seconds,
                **shared,
            )

        return HPCConfig(
            endpoint_id=profile.endpoint_id,
            timeout_seconds=profile.timeout_seconds or self.timeout_seconds,
            endpoint_name=profile.name,
            **shared,
        )
225
+
226
+
227
+ def _coerce_prefixes(value: Any) -> tuple[str, ...]:
228
+ if value is None:
229
+ return ()
230
+ if isinstance(value, str):
231
+ return (value,)
232
+ if isinstance(value, list):
233
+ return tuple(str(item) for item in value if item)
234
+ return ()
235
+
236
+
237
+ def _parse_endpoint_profiles(raw_endpoints: Any) -> dict[str, EndpointProfile]:
238
+ if not isinstance(raw_endpoints, dict):
239
+ return {}
240
+
241
+ profiles: dict[str, EndpointProfile] = {}
242
+ for name, raw_profile in raw_endpoints.items():
243
+ if not isinstance(raw_profile, dict):
244
+ continue
245
+ endpoint_id = raw_profile.get("endpoint_id")
246
+ if not endpoint_id:
247
+ continue
248
+ timeout = raw_profile.get("timeout_seconds")
249
+ profiles[str(name)] = EndpointProfile(
250
+ name=str(name),
251
+ endpoint_id=str(endpoint_id),
252
+ path_prefixes=_coerce_prefixes(raw_profile.get("path_prefixes")),
253
+ timeout_seconds=int(timeout) if timeout is not None else None,
254
+ )
255
+ return profiles
256
+
257
+
258
def load_config(config_path: Optional[Path] = None) -> HPCConfig:
    """Load HPC configuration from a YAML file.

    Parameters
    ----------
    config_path : Path | None
        Path to config.yaml. If None, the file is located with
        ``discover_config_path()``.

    Returns
    -------
    HPCConfig
        Loaded configuration object. A default ``HPCConfig()`` is returned
        when no file is found or the document's top level is not a mapping.

    Raises
    ------
    ValueError
        If ``hpc.execution_mode`` is not a valid mode, or a
        ``timeout_seconds`` value cannot be converted to ``int``.

    Notes
    -----
    Keys that are present but empty in YAML (``timeout_seconds:`` with no
    value) load as ``None``. They are treated the same as missing keys, so
    the documented defaults (``"local"``, ``300``) still apply. An empty
    ``globus_compute.endpoint_id`` likewise counts as "not configured",
    matching how empty ids are skipped when parsing endpoint profiles.

    Examples
    --------
    >>> config = load_config()
    >>> config.execution_mode
    'local'
    """
    if config_path is None:
        config_path = discover_config_path()

    if config_path is None or not config_path.exists():
        return HPCConfig()

    with open(config_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}

    if not isinstance(data, dict):
        return HPCConfig()

    # Sections with the wrong type are ignored rather than rejected.
    hpc_config = data.get("hpc", {})
    if not isinstance(hpc_config, dict):
        hpc_config = {}

    globus_config = hpc_config.get("globus_compute", {})
    if not isinstance(globus_config, dict):
        globus_config = {}

    # ``hpc.endpoints`` takes precedence; ``hpc.globus_compute.endpoints`` is
    # only consulted when the former yields no usable profile.
    endpoints = _parse_endpoint_profiles(hpc_config.get("endpoints"))
    if not endpoints:
        endpoints = _parse_endpoint_profiles(globus_config.get("endpoints"))

    # Profile keys are stringified by the parser, so stringify the default
    # name as well; otherwise a numeric name would never match.
    raw_default = hpc_config.get("default_endpoint") or globus_config.get(
        "default_endpoint"
    )
    default_endpoint = str(raw_default) if raw_default else None
    default_profile = endpoints.get(default_endpoint) if default_endpoint else None

    raw_endpoint_id = globus_config.get("endpoint_id")
    endpoint_id = str(raw_endpoint_id) if raw_endpoint_id else None

    if endpoint_id is None and default_profile is not None:
        endpoint_id = default_profile.endpoint_id

    # Only attach the default profile's name when the selected id really is
    # that profile's id.
    endpoint_name = (
        default_endpoint
        if default_profile is not None
        and endpoint_id == default_profile.endpoint_id
        else None
    )

    raw_timeout = hpc_config.get("timeout_seconds")
    timeout_seconds = 300 if raw_timeout is None else int(raw_timeout)

    return HPCConfig(
        endpoint_id=endpoint_id,
        execution_mode=hpc_config.get("execution_mode") or "local",
        timeout_seconds=timeout_seconds,
        endpoints=endpoints,
        default_endpoint=default_endpoint,
        endpoint_name=endpoint_name,
    )
@@ -0,0 +1,372 @@
1
+ """Endpoint manager status checking for HPC execution.
2
+
3
+ ## Status vocabulary
4
+
5
+ Every function in this module returns one of five manager-level statuses:
6
+
7
+ ``"registered"``
8
+ The endpoint manager process is running and visible to Globus Compute.
9
+ Slurm/PBS jobs will be submitted when tasks arrive. Workers are not
10
+ necessarily running yet — the scheduler allocates them on demand.
11
+ This was formerly reported as ``"online"`` by the Globus SDK.
12
+
13
+ ``"active"``
14
+ Manager is registered AND a lightweight probe task ran successfully on a
15
+ real compute node. Use :func:`probe_endpoint_worker` to obtain this status.
16
+
17
+ ``"offline"``
18
+ The endpoint manager is not running. Someone must SSH in and run
19
+ ``globus-compute-endpoint start <name>``.
20
+
21
+ ``"unreachable"``
22
+ The Globus Compute service cannot be contacted (auth error, network issue).
23
+
24
+ ``"no_endpoint"``
25
+ No endpoint UUID is configured for this name.
26
+
27
+ ## Why "registered" not "online"
28
+
29
+ Globus reports ``"online"`` when the manager process is registered. This is
30
+ purely a registration state — it says nothing about whether Slurm has idle
31
+ nodes, whether the worker environment is healthy, or whether submitted tasks
32
+ will actually run. Using ``"online"`` led to misleading status messages.
33
+ ``"registered"`` is honest: the manager is up and accepting submissions.
34
+
35
+ ## Caching
36
+
37
+ The Globus Compute ``Client()`` constructor and ``get_endpoint_status()`` are
38
+ both network round-trips that take 1–5 s. This module keeps a process-level
39
+ Client cache plus a short-TTL per-endpoint status cache so back-to-back checks
40
+ are effectively free.
41
+ """
42
+
43
+ from __future__ import annotations
44
+
45
+ import threading
46
+ import time
47
+ import warnings
48
+ from typing import Any
49
+
50
# ── Process-wide cached Client ────────────────────────────────────────────────
# Created on first use by _get_client(); the lock guards that creation.
_CLIENT: Any = None
_CLIENT_LOCK = threading.Lock()

# ── Per-endpoint status cache ─────────────────────────────────────────────────
# Keyed by endpoint_id; value is (monotonic_ts, status_dict).
# The stored dict never contains "cached"/"cache_age_seconds" — those are
# added on the way out by _cached_entry() (and "cached": False by _store()).
_STATUS_CACHE: dict[str, tuple[float, dict[str, Any]]] = {}
_STATUS_CACHE_LOCK = threading.Lock()

# Registered endpoints stay valid longer; offline/unreachable are re-checked sooner.
_TTL_REGISTERED_SECONDS = 10.0
_TTL_OTHER_SECONDS = 3.0

# ── Globus → our vocabulary mapping ──────────────────────────────────────────
# Keys are lower-case raw status strings; any raw status that is not listed
# here is reported as "unreachable" by _translate_globus_status().
_GLOBUS_TO_STATUS: dict[str, str] = {
    "online": "registered",
    "healthy": "registered",
    "offline": "offline",
    "stopped": "offline",
}
72
+
73
+
74
def _translate_globus_status(raw: str) -> str:
    """Convert a raw Globus status string into this module's vocabulary.

    The lookup is case-insensitive; a status with no mapping is reported as
    ``"unreachable"``.
    """
    normalized = raw.lower()
    if normalized in _GLOBUS_TO_STATUS:
        return _GLOBUS_TO_STATUS[normalized]
    return "unreachable"
77
+
78
+
79
+ def _endpoint_public_fields(config: Any) -> dict[str, Any]:
80
+ """Return non-sensitive endpoint metadata for public tool payloads."""
81
+ return {
82
+ "endpoint_name": getattr(config, "endpoint_name", None) or "configured",
83
+ "endpoint_configured": bool(getattr(config, "endpoint_id", None)),
84
+ }
85
+
86
+
87
def _get_client() -> Any:
    """Return the shared Globus Compute ``Client``, creating it on first use.

    The SDK import happens here, on first use. The ``None`` check is
    repeated under the lock so that only one thread constructs the client.
    """
    global _CLIENT
    if _CLIENT is not None:
        return _CLIENT

    with _CLIENT_LOCK:
        if _CLIENT is None:
            from globus_compute_sdk import Client

            _CLIENT = Client()
    return _CLIENT
97
+
98
+
99
def invalidate_cache(endpoint_id: str | None = None) -> None:
    """Forget cached endpoint statuses.

    Parameters
    ----------
    endpoint_id : str | None
        Endpoint whose entry should be dropped (a no-op when it is not
        cached). With ``None``, every entry is dropped. The cached Client
        is left untouched in both cases.
    """
    with _STATUS_CACHE_LOCK:
        if endpoint_id is not None:
            _STATUS_CACHE.pop(endpoint_id, None)
            return
        _STATUS_CACHE.clear()
112
+
113
+
114
def _cached_entry(endpoint_id: str) -> dict[str, Any] | None:
    """Return a fresh-enough cached status for *endpoint_id*, else ``None``.

    Entries whose status is ``"registered"`` live for
    ``_TTL_REGISTERED_SECONDS``; all others for ``_TTL_OTHER_SECONDS``.
    The returned dict is a copy of the stored payload with ``cached=True``
    and ``cache_age_seconds`` added. Expired entries are not removed here.
    """
    with _STATUS_CACHE_LOCK:
        record = _STATUS_CACHE.get(endpoint_id)
        if record is None:
            return None

        stored_at, status_payload = record
        age = time.monotonic() - stored_at
        if status_payload.get("status") == "registered":
            max_age = _TTL_REGISTERED_SECONDS
        else:
            max_age = _TTL_OTHER_SECONDS
        if age > max_age:
            return None

        return {
            **status_payload,
            "cached": True,
            "cache_age_seconds": round(age, 3),
        }
132
+
133
+
134
def _store(endpoint_id: str, payload: dict[str, Any]) -> dict[str, Any]:
    """Cache *payload* for *endpoint_id* and return a copy marked as fresh.

    The payload is stored as given, with the current monotonic time. The
    ``cached=False`` marker is only added to the returned copy.
    """
    with _STATUS_CACHE_LOCK:
        stamped = (time.monotonic(), payload)
        _STATUS_CACHE[endpoint_id] = stamped
    return {**payload, "cached": False}
140
+
141
+
142
def check_endpoint_manager_status(
    config: Any, *, force: bool = False
) -> dict[str, Any]:
    """Check whether the Globus Compute endpoint manager is registered.

    This is a fast, cached check that queries the Globus cloud about the
    manager process. It does **not** verify that workers are running or that
    submitted tasks will succeed.

    Parameters
    ----------
    config : HPCConfig
        HPC configuration with ``endpoint_id``.
    force : bool, default False
        Bypass the short-TTL cache and re-query the SDK.

    Returns
    -------
    dict
        Keys:

        ``status``
            One of ``"registered"``, ``"offline"``, ``"unreachable"``,
            ``"no_endpoint"``.
        ``endpoint_name`` / ``endpoint_configured``
            Non-sensitive endpoint metadata. Raw UUIDs stay in private config.
            Always taken from *config*, including on a cache hit.
        ``error``
            Error message when ``status`` is ``"unreachable"`` because the
            query raised.
        ``cached``
            ``True`` if this result was served from the in-process cache.
        ``cache_age_seconds``
            Age of the cached entry in seconds (only when ``cached=True``).
        ``message``
            Only present for ``"no_endpoint"``; that result carries no other
            keys besides ``status``.

    Examples
    --------
    >>> from uxarray_mcp.remote.config import load_config
    >>> config = load_config()
    >>> check_endpoint_manager_status(config)
    {"status": "registered", "endpoint_name": "improv",
     "endpoint_configured": True, "cached": False}
    """
    endpoint_id = getattr(config, "endpoint_id", None)
    if not endpoint_id:
        return {"status": "no_endpoint", "message": "No endpoint configured"}

    public_fields = _endpoint_public_fields(config)

    if not force:
        cached = _cached_entry(endpoint_id)
        if cached is not None:
            # The cache is keyed by endpoint_id alone, so the entry may have
            # been stored for a config with a different display name.
            # Overlay this caller's metadata on the returned copy.
            cached.update(public_fields)
            return cached

    try:
        client = _get_client()
        raw = client.get_endpoint_status(endpoint_id)
        raw_status = raw.get("status", "unknown") if isinstance(raw, dict) else str(raw)
        payload: dict[str, Any] = {
            "status": _translate_globus_status(raw_status),
            **public_fields,
        }
    except Exception as exc:
        # Deliberately broad: any failure to reach or query the service
        # (missing SDK, auth, network) is reported as a status, not raised.
        payload = {
            "status": "unreachable",
            **public_fields,
            "error": str(exc),
        }
    return _store(endpoint_id, payload)
205
+
206
+
207
def probe_endpoint_worker(
    config: Any,
    *,
    timeout_seconds: int = 60,
) -> dict[str, Any]:
    """Submit a lightweight task to confirm a real worker responds.

    Unlike :func:`check_endpoint_manager_status`, this function actually
    submits a task via Globus Compute and waits for it to run on a scheduler
    node. This is the only way to confirm that Slurm/PBS allocated a worker
    and the Python environment is intact.

    Parameters
    ----------
    config : HPCConfig
        HPC configuration with ``endpoint_id``.
    timeout_seconds : int, default 60
        How long to wait for the probe task to complete.

    Returns
    -------
    dict
        Keys:

        ``status``
            ``"active"`` if a worker responded, ``"registered"`` if the
            manager is up but the probe timed out, ``"offline"`` or
            ``"unreachable"`` if the manager itself is down or the probe
            failed, ``"no_endpoint"`` if no endpoint id is configured.
        ``endpoint_name`` / ``endpoint_configured``
            Non-sensitive endpoint metadata.
        ``node``
            Hostname of the compute node that ran the task (when active).
        ``python``
            Python version on the worker (when active).
        ``slurm_job_id`` / ``pbs_job_id``
            Scheduler job id read from the worker environment, or ``None``
            when the variable is unset (when active).
        ``pythonpath_set``
            Whether ``PYTHONPATH`` is non-empty on the worker (when active).
        ``warning``
            Present only when ``PYTHONPATH`` is set on the worker.
        ``elapsed_seconds``
            Wall-clock time the probe took.
        ``error``
            Error message on failure.

        When the manager check does not report ``"registered"``, that
        check's result is returned unchanged.

    Examples
    --------
    >>> probe_endpoint_worker(config, timeout_seconds=90)
    {"status": "active", "endpoint_name": "improv", "endpoint_configured": True,
     "node": "chr-0497", "python": "3.13.13", "slurm_job_id": "1228500",
     "pbs_job_id": None, "pythonpath_set": False, "elapsed_seconds": 28.4}
    """
    endpoint_id = getattr(config, "endpoint_id", None)
    if not endpoint_id:
        return {"status": "no_endpoint", "message": "No endpoint configured"}

    # Fast manager check first — no point probing an offline endpoint
    manager = check_endpoint_manager_status(config, force=True)
    if manager["status"] not in ("registered", "active"):
        return manager

    try:
        from globus_compute_sdk import Executor
        from globus_compute_sdk.serialize import AllCodeStrategies, ComputeSerializer
    except ImportError:
        return {
            "status": "unreachable",
            **_endpoint_public_fields(config),
            "error": "globus-compute-sdk not installed. Run: uv sync --extra hpc",
        }

    # Before Python 3.11, ``Future.result(timeout=...)`` raises
    # ``concurrent.futures.TimeoutError``, which is NOT the builtin
    # ``TimeoutError``. Catch both so a slow scheduler is reported as
    # "registered" rather than falling into the generic handler below.
    from concurrent.futures import TimeoutError as FutureTimeoutError

    def _worker_probe() -> dict:
        # Runs on the remote worker, so it must stay self-contained: all
        # imports live inside the function body.
        import os
        import platform

        return {
            "node": platform.node(),
            "python": platform.python_version(),
            "slurm_job_id": os.environ.get("SLURM_JOB_ID", ""),
            "pbs_job_id": os.environ.get("PBS_JOBID", ""),
            "pythonpath": os.environ.get("PYTHONPATH", ""),
        }

    t0 = time.monotonic()
    try:
        with warnings.catch_warnings():
            # The local/remote environment mismatch warning is expected for
            # a probe and would only add noise.
            warnings.filterwarnings(
                "ignore",
                message=r"(?s).*Environment differences detected between local SDK and endpoint.*",
                category=UserWarning,
            )
            # NOTE(review): leaving the Executor context after a timeout may
            # wait for the outstanding task — confirm against the SDK's
            # shutdown semantics.
            with Executor(
                endpoint_id=endpoint_id,
                serializer=ComputeSerializer(strategy_code=AllCodeStrategies()),
            ) as ex:
                fut = ex.submit(_worker_probe)
                result = fut.result(timeout=timeout_seconds)

        elapsed = round(time.monotonic() - t0, 1)
        payload = {
            "status": "active",
            **_endpoint_public_fields(config),
            "node": result.get("node", ""),
            "python": result.get("python", ""),
            "slurm_job_id": result.get("slurm_job_id") or None,
            "pbs_job_id": result.get("pbs_job_id") or None,
            "pythonpath_set": bool(result.get("pythonpath")),
            "elapsed_seconds": elapsed,
        }
        # Warn if PYTHONPATH is set — this is the root cause of most worker crashes
        if result.get("pythonpath"):
            payload["warning"] = (
                "PYTHONPATH is set on the worker. This can cause pydantic/dill "
                "conflicts. Add 'unset PYTHONPATH' to worker_init in the endpoint "
                "config, and set PYTHONPATH: '' in user_environment.yaml."
            )
        return payload

    except (TimeoutError, FutureTimeoutError):
        return {
            "status": "registered",
            **_endpoint_public_fields(config),
            "error": f"Probe timed out after {timeout_seconds}s. Manager is up but "
            "no worker responded. The scheduler may be busy or the worker "
            "environment may be broken.",
            "elapsed_seconds": round(time.monotonic() - t0, 1),
        }
    except Exception as exc:
        # Deliberately broad: any submission or worker failure becomes a
        # status payload. Fall back to the exception type when the message
        # is empty so ``error`` is never blank.
        return {
            "status": "unreachable",
            **_endpoint_public_fields(config),
            "error": str(exc) or type(exc).__name__,
            "elapsed_seconds": round(time.monotonic() - t0, 1),
        }
334
+
335
+
336
def check_all_endpoints_manager_status(
    config: Any, *, force: bool = False
) -> list[dict[str, Any]]:
    """Build one status row per configured endpoint.

    Every row is the result of :func:`check_endpoint_manager_status` with a
    leading ``name`` key. When the config has no named endpoints but does
    carry an ``endpoint_id``, a single row is produced for it, named after
    ``config.endpoint_name`` or ``"default"``. With neither, the result is
    an empty list. The status cache is used unless *force* is true.
    """
    profile_names = list(getattr(config, "endpoint_names", []) or [])

    if profile_names:
        return [
            {
                "name": profile_name,
                **check_endpoint_manager_status(
                    config.for_endpoint(endpoint=profile_name), force=force
                ),
            }
            for profile_name in profile_names
        ]

    if not getattr(config, "endpoint_id", None):
        return []

    status = check_endpoint_manager_status(config, force=force)
    return [{"name": config.endpoint_name or "default", **status}]
357
+
358
+
359
+ # ---------------------------------------------------------------------------
360
+ # Backwards-compatibility aliases — kept so existing callers don't break
361
+ # while we migrate. Will be removed in a future release.
362
+ # ---------------------------------------------------------------------------
363
def check_endpoint_health(config: Any, *, force: bool = False) -> dict[str, Any]:
    """Deprecated alias for :func:`check_endpoint_manager_status`.

    Forwards both arguments unchanged and returns that function's result.
    """
    manager_status = check_endpoint_manager_status(config, force=force)
    return manager_status
366
+
367
+
368
def check_all_endpoints_health(
    config: Any, *, force: bool = False
) -> list[dict[str, Any]]:
    """Deprecated alias for :func:`check_all_endpoints_manager_status`.

    Forwards both arguments unchanged and returns that function's rows.
    """
    status_rows = check_all_endpoints_manager_status(config, force=force)
    return status_rows