uxarray-mcp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uxarray_mcp/__init__.py +7 -0
- uxarray_mcp/__main__.py +16 -0
- uxarray_mcp/cli.py +356 -0
- uxarray_mcp/domain/__init__.py +27 -0
- uxarray_mcp/domain/area.py +32 -0
- uxarray_mcp/domain/mesh.py +26 -0
- uxarray_mcp/domain/plotting.py +499 -0
- uxarray_mcp/domain/variable.py +77 -0
- uxarray_mcp/domain/vector_calc.py +256 -0
- uxarray_mcp/domain/zonal.py +66 -0
- uxarray_mcp/provenance.py +79 -0
- uxarray_mcp/py.typed +0 -0
- uxarray_mcp/remote/__init__.py +6 -0
- uxarray_mcp/remote/agent.py +493 -0
- uxarray_mcp/remote/compute_functions.py +1151 -0
- uxarray_mcp/remote/config.py +322 -0
- uxarray_mcp/remote/health.py +372 -0
- uxarray_mcp/server.py +230 -0
- uxarray_mcp/state.py +521 -0
- uxarray_mcp/tools/__init__.py +115 -0
- uxarray_mcp/tools/advanced.py +1110 -0
- uxarray_mcp/tools/capabilities.py +669 -0
- uxarray_mcp/tools/catalog.py +369 -0
- uxarray_mcp/tools/execution_control.py +763 -0
- uxarray_mcp/tools/inspection.py +557 -0
- uxarray_mcp/tools/orchestration.py +327 -0
- uxarray_mcp/tools/plotting.py +854 -0
- uxarray_mcp/tools/remote_tools.py +702 -0
- uxarray_mcp/tools/scientific_agent.py +367 -0
- uxarray_mcp/tools/stateful.py +402 -0
- uxarray_mcp/tools/vector_calc.py +432 -0
- uxarray_mcp-0.1.0.dist-info/METADATA +468 -0
- uxarray_mcp-0.1.0.dist-info/RECORD +35 -0
- uxarray_mcp-0.1.0.dist-info/WHEEL +4 -0
- uxarray_mcp-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""Configuration management for remote execution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Optional
|
|
9
|
+
from uuid import UUID
|
|
10
|
+
|
|
11
|
+
import yaml
|
|
12
|
+
|
|
13
|
+
# User-level config location (``~/.config/uxarray-mcp/config.yaml``).
USER_CONFIG_PATH = Path.home().joinpath(".config", "uxarray-mcp", "config.yaml")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def discover_config_path() -> Path | None:
    """Locate the config file to load, or ``None`` if there is none.

    Candidates are probed one at a time and the first that exists wins:

    1. ``$UXARRAY_MCP_CONFIG`` (explicit override, ``~`` expanded)
    2. ``./config.yaml`` (current working directory — project-local)
    3. ``~/.config/uxarray-mcp/config.yaml`` (user install)
    4. ``<repo_root>/config.yaml`` (editable install fallback)

    The working-directory file is checked before the user file so that a
    checkout's own config takes precedence over an empty user config left
    behind by ``uxarray-mcp setup``. An override that points at a missing
    file is skipped and the search continues.

    Returns
    -------
    Path | None
        The first existing candidate, or ``None`` when none exist.
    """

    def _candidates():
        # Yield lazily so later locations are only computed when needed.
        override = os.environ.get("UXARRAY_MCP_CONFIG")
        if override:
            yield Path(override).expanduser()
        yield Path.cwd() / "config.yaml"
        yield USER_CONFIG_PATH
        # Four directory levels above this module.
        yield Path(__file__).resolve().parent.parent.parent.parent / "config.yaml"

    for location in _candidates():
        if location.exists():
            return location
    return None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def discover_config_search_paths() -> list[Path]:
    """List every location ``discover_config_path`` would probe, in order.

    Intended for diagnostics: ``endpoints list`` prints these when no
    endpoints are configured, so the user can see which paths were
    searched. Existence is not checked here.

    Returns
    -------
    list[Path]
        The ``$UXARRAY_MCP_CONFIG`` path (only when the variable is set),
        then the working-directory, user and repo-root candidates.
    """
    override = os.environ.get("UXARRAY_MCP_CONFIG")
    explicit = [Path(override).expanduser()] if override else []
    return [
        *explicit,
        Path.cwd() / "config.yaml",
        USER_CONFIG_PATH,
        Path(__file__).resolve().parent.parent.parent.parent / "config.yaml",
    ]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# Canonical execution modes accepted by ``normalize_execution_mode``.
_VALID_EXECUTION_MODES = {"auto", "hpc", "local"}
# Legacy spellings mapped onto their canonical mode.
_EXECUTION_MODE_ALIASES = dict(remote="hpc")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _is_uuid(value: str) -> bool:
    """Return ``True`` when *value* parses as a UUID string.

    Parameters
    ----------
    value : str
        Candidate text, in any form accepted by :class:`uuid.UUID`.

    Returns
    -------
    bool
        ``False`` for malformed text and for non-string input, so callers
        can use this as a plain predicate without guarding the type first.
    """
    try:
        UUID(value)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed hex text. TypeError / AttributeError:
        # ``None`` or another non-string was passed.
        return False
    return True
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass(frozen=True)
class EndpointProfile:
    """Immutable description of one named Globus Compute endpoint.

    Attributes
    ----------
    name : str
        Profile name used to refer to the endpoint.
    endpoint_id : str
        Globus Compute endpoint UUID.
    path_prefixes : tuple[str, ...]
        Path prefixes associated with this endpoint; empty by default.
    timeout_seconds : int | None
        Per-endpoint timeout in seconds; ``None`` means no override.
    """

    name: str
    endpoint_id: str
    path_prefixes: tuple[str, ...] = ()
    timeout_seconds: int | None = None
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def normalize_execution_mode(execution_mode: str) -> str:
    """Map an execution mode (or legacy alias) to its canonical name.

    ``remote`` was the earlier name of the HPC-only mode and is still
    accepted, so older configs and tests keep working.

    Parameters
    ----------
    execution_mode : str
        A canonical mode or a known alias.

    Returns
    -------
    str
        The canonical mode name.

    Raises
    ------
    ValueError
        If the value is neither a valid mode nor a known alias.
    """
    resolved = _EXECUTION_MODE_ALIASES.get(execution_mode, execution_mode)
    if resolved in _VALID_EXECUTION_MODES:
        return resolved

    allowed = ", ".join(sorted(_VALID_EXECUTION_MODES))
    raise ValueError(
        f"Invalid execution mode {execution_mode!r}. "
        f"Must be one of: {allowed}"
    )
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class HPCConfig:
    """HPC execution configuration.

    Parameters
    ----------
    endpoint_id : str | None
        Globus Compute endpoint UUID. A blank value (``""``) is normalised
        to ``None`` so that an unfilled config entry does not count as a
        configured endpoint.
    execution_mode : str
        Execution mode: "local", "hpc", or "auto". The legacy alias
        "remote" is accepted and stored as "hpc".
    timeout_seconds : int
        Timeout for remote execution in seconds
    endpoints : dict[str, EndpointProfile] | None
        Named endpoint profiles, keyed by profile name.
    default_endpoint : str | None
        Name of the profile to use when no endpoint is requested.
    endpoint_name : str | None
        Display name associated with ``endpoint_id``.

    Raises
    ------
    ValueError
        If ``execution_mode`` is not a recognised mode or alias.

    Examples
    --------
    >>> config = HPCConfig(endpoint_id="abc-123", execution_mode="hpc")
    >>> config.has_endpoint
    True
    """

    def __init__(
        self,
        endpoint_id: Optional[str] = None,
        execution_mode: str = "local",
        timeout_seconds: int = 300,
        endpoints: dict[str, EndpointProfile] | None = None,
        default_endpoint: str | None = None,
        endpoint_name: str | None = None,
    ):
        self.endpoints = endpoints or {}
        self.default_endpoint = default_endpoint
        self.endpoint_name = endpoint_name
        # A blank id (e.g. ``endpoint_id: ""`` left in a config file) means
        # "not configured"; store None so every consumer agrees on that.
        self.endpoint_id = endpoint_id or None
        self.execution_mode = normalize_execution_mode(execution_mode)
        self.timeout_seconds = timeout_seconds

    @property
    def has_endpoint(self) -> bool:
        """Check if Globus Compute endpoint is configured.

        True when a non-blank ``endpoint_id`` is set or at least one named
        profile exists.
        """
        # Truthiness rather than ``is not None``: the attribute may have
        # been reassigned to "" after construction.
        return bool(self.endpoint_id) or bool(self.endpoints)

    @property
    def should_use_remote(self) -> bool:
        """Determine if remote execution should be used.

        ``"local"`` never runs remotely; ``"hpc"`` and ``"auto"`` run
        remotely only when an endpoint is configured.
        """
        return self.execution_mode in ("hpc", "auto") and self.has_endpoint

    @property
    def endpoint_names(self) -> list[str]:
        """Return configured endpoint profile names, sorted."""
        return sorted(self.endpoints)

    def resolve_endpoint(
        self, endpoint: str | None = None, path: str | None = None
    ) -> EndpointProfile | None:
        """Resolve an explicit endpoint name, default endpoint, or raw UUID.

        Parameters
        ----------
        endpoint : str | None
            A configured profile name or a Globus Compute endpoint UUID.
            When omitted, the configured defaults are used.
        path : str | None
            Accepted for interface compatibility; the current resolution
            logic does not consult it.

        Returns
        -------
        EndpointProfile | None
            The selected profile, or ``None`` when no endpoint was
            requested and no default can be determined.

        Raises
        ------
        ValueError
            If ``endpoint`` is given but is neither a configured name, the
            configured ``endpoint_id``, nor a syntactically valid UUID.
        """
        if endpoint:
            if endpoint in self.endpoints:
                return self.endpoints[endpoint]
            if endpoint == self.endpoint_id:
                return EndpointProfile(
                    name=self.endpoint_name or endpoint,
                    endpoint_id=endpoint,
                    timeout_seconds=self.timeout_seconds,
                )
            if _is_uuid(endpoint):
                # Ad-hoc UUID that is not in the config: use it as-is.
                return EndpointProfile(
                    name=endpoint,
                    endpoint_id=endpoint,
                    timeout_seconds=self.timeout_seconds,
                )
            configured = ", ".join(self.endpoint_names) or "none"
            raise ValueError(
                f"Unknown endpoint {endpoint!r}. "
                f"Configured endpoint names: {configured}. "
                "Pass a configured endpoint name or a Globus Compute endpoint UUID."
            )

        # No explicit request: named default, then the bare endpoint_id,
        # then the only profile if exactly one exists.
        if self.default_endpoint and self.default_endpoint in self.endpoints:
            return self.endpoints[self.default_endpoint]

        if self.endpoint_id:
            return EndpointProfile(
                name=self.endpoint_name or "default",
                endpoint_id=self.endpoint_id,
                timeout_seconds=self.timeout_seconds,
            )

        if len(self.endpoints) == 1:
            return next(iter(self.endpoints.values()))

        return None

    def for_endpoint(
        self, endpoint: str | None = None, path: str | None = None
    ) -> "HPCConfig":
        """Return a copy configured for the selected endpoint profile.

        Parameters
        ----------
        endpoint : str | None
            Profile name or UUID, forwarded to :meth:`resolve_endpoint`.
        path : str | None
            Forwarded to :meth:`resolve_endpoint`.

        Returns
        -------
        HPCConfig
            A new config sharing this one's profiles, default and mode.
            ``endpoint_id`` / ``endpoint_name`` come from the resolved
            profile, or are ``None`` when nothing resolves.

        Raises
        ------
        ValueError
            Propagated from :meth:`resolve_endpoint` for unknown endpoints.
        """
        profile = self.resolve_endpoint(endpoint=endpoint, path=path)
        if profile is None:
            return HPCConfig(
                endpoint_id=None,
                execution_mode=self.execution_mode,
                timeout_seconds=self.timeout_seconds,
                endpoints=self.endpoints,
                default_endpoint=self.default_endpoint,
            )

        return HPCConfig(
            endpoint_id=profile.endpoint_id,
            execution_mode=self.execution_mode,
            # A profile without its own timeout inherits the global one.
            timeout_seconds=profile.timeout_seconds or self.timeout_seconds,
            endpoints=self.endpoints,
            default_endpoint=self.default_endpoint,
            endpoint_name=profile.name,
        )
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _coerce_prefixes(value: Any) -> tuple[str, ...]:
    """Normalise a raw ``path_prefixes`` config value to a tuple of strings.

    Parameters
    ----------
    value : Any
        ``None``, a single string, or a list/tuple of entries.

    Returns
    -------
    tuple[str, ...]
        A non-blank string becomes a one-element tuple. For a list or
        tuple, falsy entries are dropped and the rest are converted with
        ``str``. Anything else, including ``None`` and a blank string,
        yields an empty tuple.
    """
    if isinstance(value, str):
        # Drop a blank scalar just as blank list entries are dropped.
        return (value,) if value else ()
    if isinstance(value, (list, tuple)):
        return tuple(str(item) for item in value if item)
    return ()
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _parse_endpoint_profiles(raw_endpoints: Any) -> dict[str, EndpointProfile]:
    """Build endpoint profiles from the raw ``endpoints`` config mapping.

    Parameters
    ----------
    raw_endpoints : Any
        Expected to be a mapping of profile name to a mapping of settings
        (``endpoint_id``, optional ``path_prefixes`` and
        ``timeout_seconds``).

    Returns
    -------
    dict[str, EndpointProfile]
        One profile per usable entry, keyed by the stringified name.
        Empty when ``raw_endpoints`` is not a mapping. Entries that are
        not mappings, or that lack a truthy ``endpoint_id``, are skipped.
    """
    parsed: dict[str, EndpointProfile] = {}
    if not isinstance(raw_endpoints, dict):
        return parsed

    for key, spec in raw_endpoints.items():
        if not isinstance(spec, dict):
            continue
        identifier = spec.get("endpoint_id")
        if not identifier:
            continue

        raw_timeout = spec.get("timeout_seconds")
        label = str(key)
        parsed[label] = EndpointProfile(
            name=label,
            endpoint_id=str(identifier),
            path_prefixes=_coerce_prefixes(spec.get("path_prefixes")),
            timeout_seconds=None if raw_timeout is None else int(raw_timeout),
        )
    return parsed
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def load_config(config_path: Optional[Path] = None) -> HPCConfig:
    """Load HPC configuration from YAML file.

    Settings are read from the top-level ``hpc`` mapping. Endpoint profiles
    and ``default_endpoint`` are taken from ``hpc`` first and from
    ``hpc.globus_compute`` as a fallback. The bare ``endpoint_id`` is read
    from ``hpc.globus_compute``. A missing file, a non-mapping document,
    or a non-mapping section falls back to defaults rather than raising.

    Parameters
    ----------
    config_path : Path | None
        Path to config.yaml. If None, the file is located with
        ``discover_config_path``.

    Returns
    -------
    HPCConfig
        Loaded configuration object

    Raises
    ------
    ValueError
        If the file specifies an unrecognised ``execution_mode``.

    Examples
    --------
    >>> config = load_config()
    >>> config.execution_mode
    'local'
    """
    if config_path is None:
        config_path = discover_config_path()
    else:
        # Tolerate plain string paths from callers.
        config_path = Path(config_path)

    if config_path is None or not config_path.exists():
        return HPCConfig()

    with open(config_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}

    if not isinstance(data, dict):
        return HPCConfig()

    hpc_config = data.get("hpc", {})
    if not isinstance(hpc_config, dict):
        hpc_config = {}

    globus_config = hpc_config.get("globus_compute", {})
    if not isinstance(globus_config, dict):
        globus_config = {}

    endpoints = _parse_endpoint_profiles(hpc_config.get("endpoints"))
    if not endpoints:
        endpoints = _parse_endpoint_profiles(globus_config.get("endpoints"))

    default_endpoint = hpc_config.get("default_endpoint") or globus_config.get(
        "default_endpoint"
    )
    # A blank ``endpoint_id: ""`` placeholder means "not set", so it must
    # not block the fallback to the default profile's id below.
    endpoint_id = globus_config.get("endpoint_id") or None

    if endpoint_id is None and default_endpoint in endpoints:
        endpoint_id = endpoints[default_endpoint].endpoint_id
    # Only label the endpoint with the default profile's name when the id
    # in use really is that profile's id.
    endpoint_name = (
        default_endpoint
        if default_endpoint in endpoints
        and endpoint_id == endpoints[default_endpoint].endpoint_id
        else None
    )

    # A key present with a null value is treated like an absent key.
    execution_mode = hpc_config.get("execution_mode")
    if execution_mode is None:
        execution_mode = "local"
    timeout_seconds = hpc_config.get("timeout_seconds")
    if timeout_seconds is None:
        timeout_seconds = 300

    return HPCConfig(
        endpoint_id=endpoint_id,
        execution_mode=execution_mode,
        timeout_seconds=timeout_seconds,
        endpoints=endpoints,
        default_endpoint=default_endpoint,
        endpoint_name=endpoint_name,
    )
|
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
"""Endpoint manager status checking for HPC execution.
|
|
2
|
+
|
|
3
|
+
## Status vocabulary
|
|
4
|
+
|
|
5
|
+
Every function in this module returns one of five manager-level statuses:
|
|
6
|
+
|
|
7
|
+
``"registered"``
|
|
8
|
+
The endpoint manager process is running and visible to Globus Compute.
|
|
9
|
+
Slurm/PBS jobs will be submitted when tasks arrive. Workers are not
|
|
10
|
+
necessarily running yet — the scheduler allocates them on demand.
|
|
11
|
+
This was formerly reported as ``"online"`` by the Globus SDK.
|
|
12
|
+
|
|
13
|
+
``"active"``
|
|
14
|
+
Manager is registered AND a lightweight probe task ran successfully on a
|
|
15
|
+
real compute node. Use :func:`probe_endpoint_worker` to obtain this status.
|
|
16
|
+
|
|
17
|
+
``"offline"``
|
|
18
|
+
The endpoint manager is not running. Someone must SSH in and run
|
|
19
|
+
``globus-compute-endpoint start <name>``.
|
|
20
|
+
|
|
21
|
+
``"unreachable"``
|
|
22
|
+
The Globus Compute service cannot be contacted (auth error, network issue).
|
|
23
|
+
|
|
24
|
+
``"no_endpoint"``
|
|
25
|
+
No endpoint UUID is configured for this name.
|
|
26
|
+
|
|
27
|
+
## Why "registered" not "online"
|
|
28
|
+
|
|
29
|
+
Globus reports ``"online"`` when the manager process is registered. This is
|
|
30
|
+
purely a registration state — it says nothing about whether Slurm has idle
|
|
31
|
+
nodes, whether the worker environment is healthy, or whether submitted tasks
|
|
32
|
+
will actually run. Using ``"online"`` led to misleading status messages.
|
|
33
|
+
``"registered"`` is honest: the manager is up and accepting submissions.
|
|
34
|
+
|
|
35
|
+
## Caching
|
|
36
|
+
|
|
37
|
+
The Globus Compute ``Client()`` constructor and ``get_endpoint_status()`` are
|
|
38
|
+
both network round-trips that take 1–5 s. This module keeps a process-level
|
|
39
|
+
Client cache plus a short-TTL per-endpoint status cache so back-to-back checks
|
|
40
|
+
are effectively free.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
from __future__ import annotations
|
|
44
|
+
|
|
45
|
+
import threading
|
|
46
|
+
import time
|
|
47
|
+
import warnings
|
|
48
|
+
from typing import Any
|
|
49
|
+
|
|
50
|
+
# ── Process-wide cached Client ────────────────────────────────────────────────
# Created on first use by _get_client(); the lock guards that construction.
_CLIENT_LOCK = threading.Lock()
_CLIENT: Any = None

# ── Per-endpoint status cache ─────────────────────────────────────────────────
# Maps endpoint_id -> (monotonic timestamp, status payload).
# Stored payloads never carry "cached"/"cache_age_seconds"; _cached_entry()
# adds those keys to the copy it hands back.
_STATUS_CACHE_LOCK = threading.Lock()
_STATUS_CACHE: dict[str, tuple[float, dict[str, Any]]] = {}

# A "registered" result is trusted for longer; any other status is
# re-checked sooner.
_TTL_OTHER_SECONDS = 3.0
_TTL_REGISTERED_SECONDS = 10.0

# ── Globus → our vocabulary mapping ──────────────────────────────────────────
# Raw status strings not listed here are translated to "unreachable".
_GLOBUS_TO_STATUS: dict[str, str] = {
    **dict.fromkeys(("online", "healthy"), "registered"),
    **dict.fromkeys(("offline", "stopped"), "offline"),
}
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _translate_globus_status(raw: str) -> str:
    """Convert a raw Globus status string to this module's vocabulary.

    The lookup is case-insensitive; an unrecognised status is reported as
    ``"unreachable"``.
    """
    key = raw.lower()
    if key in _GLOBUS_TO_STATUS:
        return _GLOBUS_TO_STATUS[key]
    return "unreachable"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _endpoint_public_fields(config: Any) -> dict[str, Any]:
    """Build the non-sensitive endpoint metadata for public tool payloads.

    Only the display name and a configured flag are exposed; the endpoint
    UUID itself is never included.

    Returns
    -------
    dict
        ``endpoint_name`` (``"configured"`` when the config has no name)
        and ``endpoint_configured`` (whether ``endpoint_id`` is truthy).
    """
    name = getattr(config, "endpoint_name", None)
    identifier = getattr(config, "endpoint_id", None)
    return {
        "endpoint_name": name if name else "configured",
        "endpoint_configured": True if identifier else False,
    }
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _get_client() -> Any:
    """Return the shared Globus Compute ``Client``, creating it on first use.

    ``globus_compute_sdk`` is imported lazily, so this module can be
    imported without the SDK installed.
    """
    global _CLIENT
    if _CLIENT is not None:
        # Fast path: already built, no lock needed.
        return _CLIENT

    with _CLIENT_LOCK:
        # Re-check under the lock; another thread may have built it.
        if _CLIENT is None:
            from globus_compute_sdk import Client

            _CLIENT = Client()
    return _CLIENT
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def invalidate_cache(endpoint_id: str | None = None) -> None:
    """Discard cached status entries.

    Parameters
    ----------
    endpoint_id : str | None
        Endpoint whose entry should be dropped; an unknown id is ignored.
        If ``None``, the whole status cache is emptied. The cached Client
        is left in place either way.
    """
    with _STATUS_CACHE_LOCK:
        if endpoint_id is not None:
            _STATUS_CACHE.pop(endpoint_id, None)
            return
        _STATUS_CACHE.clear()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _cached_entry(endpoint_id: str) -> dict[str, Any] | None:
    """Return a still-fresh cached status for ``endpoint_id``, if any.

    Returns
    -------
    dict | None
        A copy of the stored payload with ``cached=True`` and
        ``cache_age_seconds`` added, or ``None`` when there is no entry or
        the entry is older than the TTL for its status.
    """
    with _STATUS_CACHE_LOCK:
        hit = _STATUS_CACHE.get(endpoint_id)
    if hit is None:
        return None

    stored_at, stored_payload = hit
    age = time.monotonic() - stored_at
    if stored_payload.get("status") == "registered":
        max_age = _TTL_REGISTERED_SECONDS
    else:
        max_age = _TTL_OTHER_SECONDS
    if age > max_age:
        return None

    # Return a copy so the stored payload stays free of cache markers.
    return {**stored_payload, "cached": True, "cache_age_seconds": round(age, 3)}
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _store(endpoint_id: str, payload: dict[str, Any]) -> dict[str, Any]:
    """Record ``payload`` as the latest status for ``endpoint_id``.

    Returns
    -------
    dict
        A copy of ``payload`` with ``cached=False`` added; the stored
        payload itself is left without the marker.
    """
    with _STATUS_CACHE_LOCK:
        _STATUS_CACHE[endpoint_id] = (time.monotonic(), payload)
    return {**payload, "cached": False}
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def check_endpoint_manager_status(
    config: Any, *, force: bool = False
) -> dict[str, Any]:
    """Check whether the Globus Compute endpoint manager is registered.

    This is a fast, cached check that queries the Globus cloud about the
    manager process. It does **not** verify that workers are running or that
    submitted tasks will succeed.

    Parameters
    ----------
    config : HPCConfig
        HPC configuration with ``endpoint_id``.
    force : bool, default False
        Bypass the short-TTL cache and re-query the SDK.

    Returns
    -------
    dict
        Keys:

        ``status``
            One of ``"registered"``, ``"offline"``, ``"unreachable"``,
            ``"no_endpoint"``.
        ``endpoint_name`` / ``endpoint_configured``
            Non-sensitive endpoint metadata. Raw UUIDs stay in private config.
            Always derived from the ``config`` passed in, including on a
            cache hit.
        ``message``
            Present only when ``status`` is ``"no_endpoint"``.
        ``error``
            Error message when ``status`` is ``"unreachable"``.
        ``cached``
            ``True`` if this result was served from the in-process cache.
        ``cache_age_seconds``
            Age of the cached entry in seconds (only when ``cached=True``).

    Examples
    --------
    >>> from uxarray_mcp.remote.config import load_config
    >>> config = load_config()
    >>> check_endpoint_manager_status(config)
    {"status": "registered", "endpoint_name": "improv", "cached": False}
    """
    endpoint_id = getattr(config, "endpoint_id", None)
    if not endpoint_id:
        return {"status": "no_endpoint", "message": "No endpoint configured"}

    if not force:
        cached = _cached_entry(endpoint_id)
        if cached is not None:
            # The cache is keyed by endpoint UUID only, so the stored
            # payload carries the name of whichever config populated it.
            # Re-derive the public fields for this caller's config.
            cached.update(_endpoint_public_fields(config))
            return cached

    try:
        client = _get_client()
        raw = client.get_endpoint_status(endpoint_id)
        raw_status = raw.get("status", "unknown") if isinstance(raw, dict) else str(raw)
        status = _translate_globus_status(raw_status)
        payload: dict[str, Any] = {"status": status, **_endpoint_public_fields(config)}
    except ImportError as exc:
        # Same install hint as probe_endpoint_worker, which otherwise never
        # gets to show it because this check runs first.
        payload = {
            "status": "unreachable",
            **_endpoint_public_fields(config),
            "error": (
                "globus-compute-sdk not installed. Run: uv sync --extra hpc "
                f"({exc})"
            ),
        }
    except Exception as exc:
        # Boundary handler: any SDK, auth or network failure is reported as
        # "unreachable" instead of propagating to the tool caller.
        payload = {
            "status": "unreachable",
            **_endpoint_public_fields(config),
            "error": str(exc),
        }
    return _store(endpoint_id, payload)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def probe_endpoint_worker(
    config: Any,
    *,
    timeout_seconds: int = 60,
) -> dict[str, Any]:
    """Submit a lightweight task to confirm a real worker responds.

    Unlike :func:`check_endpoint_manager_status`, this function actually
    submits a task via Globus Compute and waits for it to run on a scheduler
    node. This is the only way to confirm that Slurm/PBS allocated a worker
    and the Python environment is intact.

    Parameters
    ----------
    config : HPCConfig
        HPC configuration with ``endpoint_id``.
    timeout_seconds : int, default 60
        How long to wait for the probe task to complete.

    Returns
    -------
    dict
        Keys:

        ``status``
            ``"active"`` if a worker responded, ``"registered"`` if the
            manager is up but the probe timed out, ``"offline"`` or
            ``"unreachable"`` if the manager itself is down, and
            ``"no_endpoint"`` if no endpoint is configured.
        ``endpoint_name`` / ``endpoint_configured``
            Non-sensitive endpoint metadata.
        ``node``
            Hostname of the compute node that ran the task (when active).
        ``python``
            Python version on the worker (when active).
        ``slurm_job_id``
            Slurm job ID (when active and running under Slurm), else ``None``.
        ``pbs_job_id``
            PBS job ID (when active and running under PBS), else ``None``.
        ``pythonpath_set``
            Whether ``PYTHONPATH`` was non-empty on the worker (when active).
        ``warning``
            Remediation hint, present only when ``pythonpath_set`` is true.
        ``elapsed_seconds``
            Wall-clock time the probe took.
        ``error``
            Error message on failure.

    Examples
    --------
    >>> probe_endpoint_worker(config, timeout_seconds=90)
    {"status": "active", "node": "chr-0497", "python": "3.13.13",
     "slurm_job_id": "1228500", "elapsed_seconds": 28.4}
    """
    # Imported locally so the timeout handler below can name the futures
    # timeout class without relying on a file-level import.
    import concurrent.futures

    endpoint_id = getattr(config, "endpoint_id", None)
    if not endpoint_id:
        return {"status": "no_endpoint", "message": "No endpoint configured"}

    # Fast manager check first — no point probing an offline endpoint
    manager = check_endpoint_manager_status(config, force=True)
    if manager["status"] not in ("registered", "active"):
        return manager

    try:
        from globus_compute_sdk import Executor
        from globus_compute_sdk.serialize import AllCodeStrategies, ComputeSerializer
    except ImportError:
        return {
            "status": "unreachable",
            **_endpoint_public_fields(config),
            "error": "globus-compute-sdk not installed. Run: uv sync --extra hpc",
        }

    def _worker_probe() -> dict:
        import os
        import platform

        return {
            "node": platform.node(),
            "python": platform.python_version(),
            "slurm_job_id": os.environ.get("SLURM_JOB_ID", ""),
            "pbs_job_id": os.environ.get("PBS_JOBID", ""),
            "pythonpath": os.environ.get("PYTHONPATH", ""),
        }

    t0 = time.monotonic()
    try:
        with warnings.catch_warnings():
            # Silence the SDK's local-vs-endpoint environment mismatch
            # warning for the duration of the probe.
            warnings.filterwarnings(
                "ignore",
                message=r"(?s).*Environment differences detected between local SDK and endpoint.*",
                category=UserWarning,
            )
            # NOTE(review): if the wait below times out, leaving this
            # ``with`` block may itself wait on the outstanding future,
            # depending on the SDK's Executor shutdown semantics — confirm.
            with Executor(
                endpoint_id=endpoint_id,
                serializer=ComputeSerializer(strategy_code=AllCodeStrategies()),
            ) as ex:
                fut = ex.submit(_worker_probe)
                result = fut.result(timeout=timeout_seconds)

        elapsed = round(time.monotonic() - t0, 1)
        payload = {
            "status": "active",
            **_endpoint_public_fields(config),
            "node": result.get("node", ""),
            "python": result.get("python", ""),
            "slurm_job_id": result.get("slurm_job_id") or None,
            "pbs_job_id": result.get("pbs_job_id") or None,
            "pythonpath_set": bool(result.get("pythonpath")),
            "elapsed_seconds": elapsed,
        }
        # Warn if PYTHONPATH is set — this is the root cause of most worker crashes
        if result.get("pythonpath"):
            payload["warning"] = (
                "PYTHONPATH is set on the worker. This can cause pydantic/dill "
                "conflicts. Add 'unset PYTHONPATH' to worker_init in the endpoint "
                "config, and set PYTHONPATH: '' in user_environment.yaml."
            )
        return payload

    except (TimeoutError, concurrent.futures.TimeoutError):
        # Before Python 3.11 ``concurrent.futures.TimeoutError`` is a
        # separate class from the builtin ``TimeoutError``. Catch both so a
        # probe timeout is reported as "registered" on every version.
        return {
            "status": "registered",
            **_endpoint_public_fields(config),
            "error": f"Probe timed out after {timeout_seconds}s. Manager is up but "
            "no worker responded. The scheduler may be busy or the worker "
            "environment may be broken.",
            "elapsed_seconds": round(time.monotonic() - t0, 1),
        }
    except Exception as exc:
        return {
            "status": "unreachable",
            **_endpoint_public_fields(config),
            "error": str(exc),
            "elapsed_seconds": round(time.monotonic() - t0, 1),
        }
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def check_all_endpoints_manager_status(
    config: Any, *, force: bool = False
) -> list[dict[str, Any]]:
    """Collect one manager-status row per configured endpoint.

    Each row holds ``name`` plus the fields returned by
    :func:`check_endpoint_manager_status`. The cache is used unless
    ``force`` is set.

    Returns
    -------
    list[dict]
        One row per named profile, in ``endpoint_names`` order. With no
        named profiles, a single row for the bare ``endpoint_id`` (named
        after ``endpoint_name`` or ``"default"``), or an empty list when
        no endpoint is configured at all.
    """
    names = list(getattr(config, "endpoint_names", []) or [])
    if names:
        return [
            {
                "name": name,
                **check_endpoint_manager_status(
                    config.for_endpoint(endpoint=name), force=force
                ),
            }
            for name in names
        ]

    # No named profiles: fall back to the bare endpoint id, if any.
    if not getattr(config, "endpoint_id", None):
        return []
    status = check_endpoint_manager_status(config, force=force)
    return [{"name": config.endpoint_name or "default", **status}]
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
# ---------------------------------------------------------------------------
|
|
360
|
+
# Backwards-compatibility aliases — kept so existing callers don't break
|
|
361
|
+
# while we migrate. Will be removed in a future release.
|
|
362
|
+
# ---------------------------------------------------------------------------
|
|
363
|
+
def check_endpoint_health(config: Any, *, force: bool = False) -> dict[str, Any]:
    """Deprecated alias for :func:`check_endpoint_manager_status`.

    Forwards both arguments unchanged and returns the result as-is.
    """
    result = check_endpoint_manager_status(config, force=force)
    return result
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def check_all_endpoints_health(
    config: Any, *, force: bool = False
) -> list[dict[str, Any]]:
    """Deprecated alias for :func:`check_all_endpoints_manager_status`.

    Forwards both arguments unchanged and returns the rows as-is.
    """
    rows = check_all_endpoints_manager_status(config, force=force)
    return rows
|