radical.orbit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- radical/orbit/__init__.py +61 -0
- radical/orbit/_prof.py +8 -0
- radical/orbit/_version.py +3 -0
- radical/orbit/batch_system.py +195 -0
- radical/orbit/batch_system_pbs.py +329 -0
- radical/orbit/batch_system_slurm.py +193 -0
- radical/orbit/bridge.py +894 -0
- radical/orbit/bridge_plugin_host.py +179 -0
- radical/orbit/client.py +618 -0
- radical/orbit/data/orbit_explorer.html +2782 -0
- radical/orbit/data/plugins/globus.js +419 -0
- radical/orbit/data/plugins/iri_connect.js +238 -0
- radical/orbit/data/plugins/iri_instance.js +604 -0
- radical/orbit/data/plugins/lucid.js +31 -0
- radical/orbit/data/plugins/psij.js +746 -0
- radical/orbit/data/plugins/queue_info.js +624 -0
- radical/orbit/data/plugins/rhapsody.js +376 -0
- radical/orbit/data/plugins/staging.js +355 -0
- radical/orbit/data/plugins/sysinfo.js +253 -0
- radical/orbit/data/plugins/task_dispatcher.js +188 -0
- radical/orbit/data/plugins/xgfabric.js +567 -0
- radical/orbit/data/xgfabric_resource_default.json +4 -0
- radical/orbit/data/xgfabric_resource_test.json +25 -0
- radical/orbit/data/xgfabric_workflow_default.json +48 -0
- radical/orbit/data/xgfabric_workflow_test.json +24 -0
- radical/orbit/exceptions.py +152 -0
- radical/orbit/http_utils.py +51 -0
- radical/orbit/iri_endpoints.py +25 -0
- radical/orbit/logging_config.py +167 -0
- radical/orbit/models.py +137 -0
- radical/orbit/plugin_base.py +506 -0
- radical/orbit/plugin_globus.py +691 -0
- radical/orbit/plugin_host_base.py +290 -0
- radical/orbit/plugin_iri_connect.py +202 -0
- radical/orbit/plugin_iri_instance.py +499 -0
- radical/orbit/plugin_lucid.py +243 -0
- radical/orbit/plugin_psij.py +1258 -0
- radical/orbit/plugin_queue_info.py +469 -0
- radical/orbit/plugin_rhapsody.py +1546 -0
- radical/orbit/plugin_session_base.py +104 -0
- radical/orbit/plugin_staging.py +480 -0
- radical/orbit/plugin_sysinfo.py +650 -0
- radical/orbit/plugin_task_dispatcher.py +1883 -0
- radical/orbit/plugin_xgfabric.py +1394 -0
- radical/orbit/queue_info.py +321 -0
- radical/orbit/queue_info_none.py +21 -0
- radical/orbit/queue_info_pbs.py +510 -0
- radical/orbit/queue_info_slurm.py +370 -0
- radical/orbit/service.py +831 -0
- radical/orbit/task_dispatcher_config.py +307 -0
- radical/orbit/task_dispatcher_state.py +334 -0
- radical/orbit/task_dispatcher_strategy.py +345 -0
- radical/orbit/task_dispatcher_strategy_conservative.py +208 -0
- radical/orbit/task_dispatcher_strategy_examples.py +173 -0
- radical/orbit/tunnel.py +319 -0
- radical/orbit/ui_schema.py +178 -0
- radical/orbit/utils.py +295 -0
- radical_orbit-0.2.0.data/scripts/radical-orbit-bridge.py +58 -0
- radical_orbit-0.2.0.data/scripts/radical-orbit-endpoint-wrapper.sh +60 -0
- radical_orbit-0.2.0.data/scripts/radical-orbit-endpoint.py +107 -0
- radical_orbit-0.2.0.data/scripts/radical-orbit-iri-tunnel-helper.sh +165 -0
- radical_orbit-0.2.0.data/scripts/radical-orbit-makeflow +117 -0
- radical_orbit-0.2.0.data/scripts/radical-orbit-makeflow-prep +500 -0
- radical_orbit-0.2.0.data/scripts/radical-orbit-run +394 -0
- radical_orbit-0.2.0.dist-info/METADATA +326 -0
- radical_orbit-0.2.0.dist-info/RECORD +70 -0
- radical_orbit-0.2.0.dist-info/WHEEL +5 -0
- radical_orbit-0.2.0.dist-info/entry_points.txt +3 -0
- radical_orbit-0.2.0.dist-info/licenses/LICENSE.md +179 -0
- radical_orbit-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
|
|
2
|
+
# ── Public client surface — bind these BEFORE the plugin imports ─────────────
|
|
3
|
+
|
|
4
|
+
from .plugin_base import Plugin # noqa: F401
|
|
5
|
+
from .plugin_session_base import PluginSession # noqa: F401
|
|
6
|
+
from .ui_schema import (UIConfig, UIForm, UIField, # noqa: F401
|
|
7
|
+
UIMonitor, UINotifications,
|
|
8
|
+
ui_config_to_dict)
|
|
9
|
+
|
|
10
|
+
from .service import EndpointService # noqa: F401
|
|
11
|
+
from .bridge import Bridge # noqa: F401
|
|
12
|
+
from .client import BridgeClient, EndpointClient, PluginClient # noqa: F401
|
|
13
|
+
|
|
14
|
+
# Public alias — ``Endpoint`` is the natural counterpart to ``Bridge``.
|
|
15
|
+
Endpoint = EndpointService
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# ── Plugins ──────────────────────────────────────────────────────────────────
|
|
19
|
+
|
|
20
|
+
from .plugin_xgfabric import PluginXGFabric # noqa: F401
|
|
21
|
+
from .plugin_iri_connect import PluginIRIConnect # noqa: F401
|
|
22
|
+
from .plugin_queue_info import PluginQueueInfo # noqa: F401
|
|
23
|
+
from .plugin_sysinfo import PluginSysInfo # noqa: F401
|
|
24
|
+
from .plugin_staging import PluginStaging # noqa: F401
|
|
25
|
+
from .plugin_task_dispatcher import PluginTaskDispatcher # noqa: F401
|
|
26
|
+
|
|
27
|
+
# Optional plugins with external dependencies.
|
|
28
|
+
try:
|
|
29
|
+
from .plugin_lucid import PluginLucid # noqa: F401
|
|
30
|
+
except ImportError:
|
|
31
|
+
pass
|
|
32
|
+
|
|
33
|
+
try:
|
|
34
|
+
from .plugin_psij import PluginPSIJ # noqa: F401
|
|
35
|
+
except ImportError:
|
|
36
|
+
pass
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
from .plugin_rhapsody import PluginRhapsody # noqa: F401
|
|
40
|
+
except ImportError:
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
try:
|
|
44
|
+
from .plugin_globus import PluginGlobus # noqa: F401
|
|
45
|
+
except ImportError:
|
|
46
|
+
pass
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ── Version (generated at install time) ──────────────────────────────────────
|
|
50
|
+
|
|
51
|
+
# _version.py is generated by setup.py at install time and is
|
|
52
|
+
# git-ignored. When running from an uninstalled source tree the
|
|
53
|
+
# import will fail and we fall back to dev markers.
|
|
54
|
+
try:
    # Only present in installed builds, where the version module was generated.
    from ._version import version, version_detail           # noqa: F401
except ImportError:
    # Uninstalled source tree: no generated module, so use placeholders.
    version = version_detail = 'unknown'

__version__ = version
|
|
61
|
+
|
radical/orbit/_prof.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Batch system abstraction.
|
|
3
|
+
|
|
4
|
+
Encapsulates everything about the local HPC scheduler that callers
|
|
5
|
+
outside of ``queue_info`` need to know: presence detection, in-allocation
|
|
6
|
+
detection, normalized job state, node lookup, cancel, allocation summary.
|
|
7
|
+
|
|
8
|
+
Backend implementations live in ``batch_system_slurm.py`` and
|
|
9
|
+
``batch_system_pbs.py`` and register themselves via ``_REGISTRY`` below.
|
|
10
|
+
``detect_batch_system()`` returns the first backend whose ``detect()`` is
|
|
11
|
+
true; otherwise ``NullBatchSystem``.
|
|
12
|
+
|
|
13
|
+
State vocabulary (used everywhere outside the backend modules):
|
|
14
|
+
PENDING — queued, not yet running
|
|
15
|
+
RUNNING — running on compute nodes
|
|
16
|
+
DONE — completed successfully
|
|
17
|
+
FAILED — completed with non-zero exit, NODE_FAIL, PREEMPTED, TIMEOUT, …
|
|
18
|
+
CANCELLED — user- or admin-cancelled
|
|
19
|
+
HELD — held / suspended
|
|
20
|
+
UNKNOWN — backend reported nothing (job gone, transient error, no scheduler)
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from abc import ABC, abstractmethod
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Normalized state vocabulary.  Backends translate scheduler-native states
# into exactly these strings; code outside the backend modules should compare
# against the constants rather than the literals.
STATE_PENDING   = 'PENDING'     # queued, not yet running
STATE_RUNNING   = 'RUNNING'     # running on compute nodes
STATE_DONE      = 'DONE'        # completed successfully
STATE_FAILED    = 'FAILED'      # completed unsuccessfully
STATE_CANCELLED = 'CANCELLED'   # user- or admin-cancelled
STATE_HELD      = 'HELD'        # held / suspended
STATE_UNKNOWN   = 'UNKNOWN'     # backend reported nothing usable

# States after which a job will not change state any more.
TERMINAL_STATES = frozenset({STATE_DONE, STATE_FAILED, STATE_CANCELLED})
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class BatchSystem(ABC):
    """Abstract interface to the local HPC scheduler.

    One instance per detected backend is cached at module level and handed
    out by :func:`detect_batch_system`.  Backends are expected to keep little
    or no state of their own; anything they do remember lives for the
    lifetime of that cached instance.
    """

    name          : str = 'none'    # short identifier
    psij_executor : str = 'local'   # corresponding PsiJ executor name

    @classmethod
    @abstractmethod
    def detect(cls) -> bool:
        """Tell whether this scheduler is installed on the local machine."""

    @abstractmethod
    def in_allocation(self) -> bool:
        """Tell whether the current process runs inside a batch job."""

    @abstractmethod
    def job_id(self) -> 'str | None':
        """Return the native id of the current allocation.

        ``None`` is returned on a login node, i.e. outside of any job.
        """

    @abstractmethod
    def job_state(self, native_id) -> str:
        """Look up *native_id* and return its normalized state.

        The result is always one of the ``STATE_*`` constants.  Any kind of
        lookup problem (job gone, command failure, timeout, parse error) is
        reported as ``STATE_UNKNOWN`` rather than raised.
        """

    @abstractmethod
    def job_nodes(self, native_id) -> list:
        """Return the compute node hostnames allocated to *native_id*.

        An empty list means the job is not running or the lookup failed.
        """

    @abstractmethod
    def nodelist(self) -> list:
        """Return the expanded hostnames of *this* endpoint's allocation.

        Hostnames come one per node, in the order the scheduler reports
        them.  The list is empty when the endpoint is not running inside a
        job (i.e. on a login node) or when the scheduler does not expose
        the information.
        """

    @abstractmethod
    def cancel(self, native_id) -> None:
        """Cancel *native_id*.

        Raises:
            RuntimeError: the job could not be cancelled.
        """

    @abstractmethod
    def job_allocation(self) -> 'dict | None':
        """Describe the batch job this process is running in.

        Returns ``None`` on a login node.  Inside a batch job the result is
        a dict with the keys ``job_id``, ``partition``, ``n_nodes``,
        ``nodelist``, ``cpus_per_node``, ``gpus_per_node``, ``account``,
        ``job_name`` and ``runtime`` (seconds, ``None`` for unlimited).

        Raises:
            RuntimeError: :meth:`in_allocation` is true but the details
                cannot be collected.
        """

    def terminal_states(self) -> frozenset:
        """Return the normalized states that mean 'job is done'."""
        return TERMINAL_STATES

    def default_custom_attributes(self) -> dict:
        """Return per-site PSIJ ``custom_attributes`` for every submission.

        These apply when the caller submits through the PSIJ executor named
        by ``psij_executor`` on the class.  On key conflicts the attributes
        supplied by the caller take precedence.

        The base implementation contributes nothing.  Site-specific
        subclasses override it to encode hard requirements (e.g. Aurora's
        mandatory ``filesystems=home:flare`` resource for qsub).
        """
        return {}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class NullBatchSystem(BatchSystem):
    """Stand-in backend for machines without a scheduler (e.g. dev laptop).

    Every query answers "nothing here": no allocation, no job id, no nodes,
    unknown job state.  Only :meth:`cancel` raises, because there is no
    scheduler it could talk to.
    """

    name          = 'none'
    psij_executor = 'local'

    @classmethod
    def detect(cls) -> bool:
        # Usable everywhere: this is the fallback when nothing else matches.
        return True

    def in_allocation(self) -> bool:
        return False

    def job_id(self) -> 'str | None':
        return None

    def job_state(self, native_id) -> str:
        return STATE_UNKNOWN

    def job_nodes(self, native_id) -> list:
        return []

    def nodelist(self) -> list:
        return []

    def cancel(self, native_id) -> None:
        message = f"no batch system available to cancel job {native_id}"
        raise RuntimeError(message)

    def job_allocation(self) -> 'dict | None':
        return None
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
# Registry + detection
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
|
|
156
|
+
# Backend classes in registration order; detection probes them in this order.
_REGISTRY : list = []                       # populated by backend modules at import time
# Cached backend instance; stays None until the first detection has run.
_DETECTED : 'BatchSystem | None' = None
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def register_backend(cls) -> None:
    """Add a BatchSystem subclass to the detection registry.

    Backend modules call this at import time.  Registering a class that is
    already present is a no-op, so its position in the probe order is fixed
    by the first registration.
    """
    if cls in _REGISTRY:
        return
    _REGISTRY.append(cls)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def detect_batch_system(force: bool = False) -> BatchSystem:
    """Return the active batch system, probing the registry on first call.

    Registered backends are probed in registration order and the first one
    whose ``detect()`` is true is instantiated.  If none matches, a
    ``NullBatchSystem`` is used.  The result is cached at module level.

    A backend whose probe or constructor raises is skipped; the failure is
    logged at debug level so that a broken backend can be diagnosed instead
    of silently degrading to the next candidate.

    Args:
        force: re-probe even if a cached result exists (mainly for tests).

    Returns:
        The cached (or freshly detected) backend instance.
    """
    global _DETECTED
    if _DETECTED is not None and not force:
        return _DETECTED

    # Function-scope import keeps this block self-contained.
    import logging

    # Import backend modules so they register.  Local import avoids circular
    # imports during package init.
    from . import batch_system_slurm  # noqa: F401
    from . import batch_system_pbs    # noqa: F401

    log = logging.getLogger(__name__)

    for cls in _REGISTRY:
        try:
            if cls.detect():
                _DETECTED = cls()
                return _DETECTED
        except Exception:
            # Best effort: one misbehaving backend must not break detection
            # of the others, but leave a trace of what went wrong.
            log.debug("batch system probe failed for %s",
                      getattr(cls, '__name__', cls), exc_info=True)
            continue

    _DETECTED = NullBatchSystem()
    return _DETECTED
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def reset_detection() -> None:
    """Forget the cached backend so the next detection re-probes (tests only)."""
    global _DETECTED
    _DETECTED = None
|
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
"""PBSPro implementation of BatchSystem.
|
|
2
|
+
|
|
3
|
+
Targets PBS Professional (Altair) as found on Aurora and similar ALCF
|
|
4
|
+
systems. Uses qstat / qdel for job control. Aurora's PBSPro publishes
|
|
5
|
+
state info both as text (qstat -f) and partially as JSON (qstat -f -F json
|
|
6
|
+
on recent versions); this backend only relies on text output, which is
|
|
7
|
+
universal.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import shutil
|
|
12
|
+
import subprocess
|
|
13
|
+
|
|
14
|
+
from .batch_system import (BatchSystem, register_backend,
|
|
15
|
+
STATE_PENDING, STATE_RUNNING, STATE_DONE,
|
|
16
|
+
STATE_FAILED, STATE_CANCELLED,
|
|
17
|
+
STATE_HELD, STATE_UNKNOWN)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# PBS single-letter state → normalized vocabulary.
|
|
21
|
+
# Reference: PBSPro qstat manual (job_state column).
|
|
22
|
+
_STATE_MAP = {
    code: state
    for state, codes in (
        (STATE_PENDING, (
            'Q',    # queued
            'W',    # waiting (begin time / dependency)
            'T',    # being moved
        )),
        (STATE_RUNNING, (
            'R',    # running
            'B',    # array job: at least one subjob running
            'E',    # exiting (still cleaning up)
        )),
        (STATE_DONE, (
            'F',    # finished (PBSPro only with -x)
            'X',    # subjob completed (array)
        )),
        (STATE_HELD, (
            'H',    # held
            'S',    # suspended
            'M',    # moved to another server
            'U',    # cycle-harvesting suspension
        )),
    )
    for code in codes
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _parse_pbs_walltime(s: str) -> 'int | None':
    """Parse a PBS walltime string (``[[HH:]MM:]SS[.fff]``) to whole seconds.

    The seconds field may carry a fractional part (PBS durations allow
    ``seconds.milliseconds``); the fraction is validated and then dropped,
    because the result is a whole number of seconds.

    Args:
        s: walltime as reported by PBS, e.g. ``'01:30:00'``, ``'90'`` or
            ``'00:00:30.500'``.

    Returns:
        The duration in seconds, or None for empty / unset values.

    Raises:
        RuntimeError: the string is not a valid PBS walltime.
    """
    if not s:
        return None
    s = s.strip()
    if not s:
        return None

    parts = s.split(':')
    try:
        if len(parts) > 3:
            raise ValueError('too many fields')
        # Only the last (seconds) field may have a fractional part.
        whole, dot, frac = parts[-1].partition('.')
        if dot and frac and not frac.isdecimal():
            raise ValueError('bad fractional seconds')
        fields = [int(p) for p in parts[:-1]] + [int(whole)]
    except ValueError as e:
        raise RuntimeError(f"Cannot parse PBS walltime: {s!r}") from e

    # Left-pad with zeros so that the fields always read (hours, min, sec).
    h, m, sec = [0] * (3 - len(fields)) + fields
    return h * 3600 + m * 60 + sec
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _parse_qstat_f(stdout: str) -> dict:
    """Parse ``qstat -f <jobid>`` text output into a {key: value} dict.

    PBSPro indents every attribute line (``    key = value``).  Continuation
    lines for long values are indented *more* than attribute lines and may
    themselves contain ``=`` characters (e.g. inside ``Variable_List`` or
    ``Resource_List.select``).  The rule used here: the first attribute line
    sets the *attribute-indent* width; any line indented strictly deeper is
    a continuation of the prior key.

    Indentation is measured in display columns, i.e. after expanding tabs.
    PBSPro wraps long values with a single leading TAB, which is one
    character but sits deeper than the space-indented attribute lines;
    counting characters would misread such a line as a new attribute
    whenever it contains ``=``.

    Section headers like ``Job Id: 12345`` are ignored.  If the output
    describes several jobs, later values overwrite earlier ones per key.

    Args:
        stdout: raw text printed by ``qstat -f``.

    Returns:
        Mapping of attribute name to its value, with continuation lines
        joined without a separator and surrounding whitespace stripped.
    """
    result = {}
    cur_key = None
    cur_val_parts = []
    attr_indent = None      # set by the first attribute line we see

    def _flush():
        # Store the attribute collected so far (if any) under its key.
        if cur_key is not None:
            result[cur_key] = ''.join(cur_val_parts).strip()

    for raw in stdout.splitlines():
        stripped = raw.strip()
        if not stripped:
            continue
        if stripped.startswith('Job Id:'):
            continue

        # Column-based indent: a leading TAB counts as up to 8 columns.
        expanded = raw.expandtabs()
        indent = len(expanded) - len(expanded.lstrip())

        is_continuation = (attr_indent is not None
                           and indent > attr_indent
                           and cur_key is not None)
        if is_continuation:
            cur_val_parts.append(stripped)
            continue

        if attr_indent is None:
            attr_indent = indent

        if '=' in stripped:
            # A new attribute line: finish the previous one first.
            _flush()
            k, v = stripped.split('=', 1)
            cur_key = k.strip()
            cur_val_parts = [v.strip()]
        elif cur_key is not None:
            # No '=' and not deeper indented -> treat as plain continuation.
            cur_val_parts.append(stripped)

    _flush()
    return result
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _parse_exec_host(s: str) -> list:
    """Extract the short hostnames from a PBS ``exec_host`` value.

    The value is a ``+``-separated list of ``host/cpuset*ncpus`` entries,
    e.g. ``host1/0*64+host2/0*64``.  Everything after the first ``/`` and
    any domain suffix after the first ``.`` is dropped.

    Returns:
        Hostnames in order of first appearance, without duplicates; an
        empty list for an empty input.
    """
    if not s:
        return []

    # A dict doubles as an insertion-ordered set.
    ordered = {}
    for entry in s.split('+'):
        short_name = entry.partition('/')[0].partition('.')[0]
        if short_name:
            ordered.setdefault(short_name, None)
    return list(ordered)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _read_pbs_nodefile() -> list:
    """Return the short hostnames listed in the file named by $PBS_NODEFILE.

    Blank lines are skipped, domain suffixes (after the first ``.``) are
    cut off, and duplicates are removed while keeping first-seen order.

    Returns:
        The host list, or an empty list when the variable is unset, the
        file does not exist, or it cannot be read.
    """
    nodefile = os.environ.get('PBS_NODEFILE')
    if not (nodefile and os.path.isfile(nodefile)):
        return []

    try:
        with open(nodefile) as fh:
            entries = [ln.strip() for ln in fh if ln.strip()]
    except OSError:
        return []

    # A dict doubles as an insertion-ordered set.
    unique = {}
    for entry in entries:
        unique.setdefault(entry.split('.', 1)[0], None)
    return list(unique)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class PBSProBatchSystem(BatchSystem):
    """PBSPro scheduler interface.

    Job information comes from ``qstat -f`` text output, and cancellation
    goes through ``qdel``.  The instance remembers which job ids it was
    asked to cancel, so that those jobs can later be reported as cancelled
    (see :meth:`job_state`).
    """

    name          = 'pbs'
    psij_executor = 'pbs'

    # Seconds to wait for a single qstat / qdel invocation.
    _CMD_TIMEOUT = 10

    def __init__(self) -> None:
        super().__init__()
        # Native ids we've been asked to cancel.  PBSPro's qstat letter
        # codes have no dedicated 'cancelled' value -- the job ends up in
        # 'F' (finished) just like a normal exit, so we remember the
        # intent here and map terminal states to STATE_CANCELLED in
        # job_state().
        self._cancelled: set = set()

    @classmethod
    def detect(cls) -> bool:
        """Return True if a ``qstat`` executable is on the PATH."""
        return shutil.which('qstat') is not None

    def in_allocation(self) -> bool:
        """Return True if ``PBS_JOBID`` is set to a non-empty value."""
        return bool(os.environ.get('PBS_JOBID'))

    def job_id(self) -> 'str | None':
        """Return ``$PBS_JOBID``, or None when it is not set."""
        return os.environ.get('PBS_JOBID')

    def _qstat_f(self, native_id, history: bool = False) -> 'dict | None':
        """Run ``qstat [-x] -f <native_id>`` and parse its output.

        Args:
            native_id: job id to look up.
            history: pass ``-x`` so that finished jobs are included.

        Returns:
            The parsed attribute dict, or None if qstat exited non-zero.

        Raises:
            OSError, subprocess.TimeoutExpired: qstat could not be run or
                did not finish in time.
        """
        cmd = ['qstat', '-x', '-f'] if history else ['qstat', '-f']
        cmd.append(str(native_id))
        r = subprocess.run(cmd, capture_output=True, text=True,
                           timeout=self._CMD_TIMEOUT)
        if r.returncode != 0:
            return None
        return _parse_qstat_f(r.stdout)

    @staticmethod
    def _exit_failed(info: dict) -> bool:
        """Return True if *info* carries a non-zero ``Exit_status``.

        A missing or unparseable value counts as 'not failed', so that a
        finished job without exit information keeps being reported as done.
        """
        raw = info.get('Exit_status', '').strip()
        if not raw:
            return False
        try:
            return int(raw) != 0
        except ValueError:
            return False

    @staticmethod
    def _select_resources(select: str) -> tuple:
        """Return ``(ncpus, ngpus)`` from the first chunk of a select spec.

        The spec looks like ``"1:ncpus=64:ngpus=4"`` or ``"2:ncpus=64"``;
        only the part before the first ``+`` is inspected.  Values that
        are absent or not integers are reported as None.
        """
        cpus = None
        gpus = None
        first_chunk = select.split('+', 1)[0]
        for tok in first_chunk.split(':'):
            if tok.startswith('ncpus='):
                try:
                    cpus = int(tok[6:])
                except ValueError:
                    pass
            elif tok.startswith('ngpus='):
                try:
                    gpus = int(tok[6:])
                except ValueError:
                    pass
        return cpus, gpus

    def job_state(self, native_id) -> str:
        """Return the normalized state of *native_id*.

        The job is looked up with ``qstat -f`` first and, if that fails,
        with ``qstat -x -f`` to find finished jobs.  A finished job is
        reported as STATE_CANCELLED when this instance cancelled it, as
        STATE_FAILED when its ``Exit_status`` is non-zero, and as
        STATE_DONE otherwise.  Any lookup problem yields STATE_UNKNOWN.
        """
        try:
            info = self._qstat_f(native_id)
            if info is None:
                # try -x to look up finished jobs
                info = self._qstat_f(native_id, history=True)
        except (OSError, subprocess.TimeoutExpired):
            return STATE_UNKNOWN
        if info is None:
            return STATE_UNKNOWN

        code = info.get('job_state', '').strip()
        if not code:
            return STATE_UNKNOWN

        state = _STATE_MAP.get(code[0].upper(), STATE_UNKNOWN)
        if state == STATE_DONE:
            # 'F' covers normal exit, failure and cancellation alike;
            # tell them apart from what we know about the job.
            if str(native_id) in self._cancelled:
                return STATE_CANCELLED
            if self._exit_failed(info):
                return STATE_FAILED
        return state

    def job_nodes(self, native_id) -> list:
        """Return the hosts in the ``exec_host`` attribute of *native_id*.

        Returns an empty list if qstat fails or reports no ``exec_host``.
        """
        try:
            info = self._qstat_f(native_id)
        except (OSError, subprocess.TimeoutExpired):
            return []
        if info is None:
            return []
        return _parse_exec_host(info.get('exec_host', ''))

    def nodelist(self) -> list:
        """Return the deduplicated hosts listed in ``$PBS_NODEFILE``."""
        # PBS_NODEFILE lists each host once per slot; ``_read_pbs_nodefile``
        # already dedupes and short-circuits on a missing / empty file.
        return _read_pbs_nodefile()

    def cancel(self, native_id) -> None:
        """Cancel *native_id* with ``qdel`` and remember the request.

        Raises:
            RuntimeError: qdel could not be run, timed out, or exited
                non-zero.
        """
        try:
            r = subprocess.run(['qdel', str(native_id)],
                               capture_output=True, text=True,
                               timeout=self._CMD_TIMEOUT)
        except (OSError, subprocess.TimeoutExpired) as exc:
            # The interface promises RuntimeError for every failure mode.
            raise RuntimeError(f"qdel failed: {exc}") from exc
        if r.returncode != 0:
            raise RuntimeError(f"qdel failed: {r.stderr.strip()}")
        self._cancelled.add(str(native_id))

    def job_allocation(self) -> 'dict | None':
        """Return allocation info for the current job, or None.

        Values come from the PBS environment and ``$PBS_NODEFILE`` first;
        ``qstat -f`` fills in whatever is still missing, plus walltime and
        per-node CPU / GPU counts.  A failing qstat is tolerated as long as
        the node count is known.

        Returns:
            None when ``PBS_JOBID`` is unset, otherwise a dict with the
            keys job_id, partition, n_nodes, nodelist, cpus_per_node,
            gpus_per_node, account, job_name and runtime.

        Raises:
            RuntimeError: the node count cannot be determined, or the
                walltime reported by qstat cannot be parsed.
        """
        job_id = os.environ.get('PBS_JOBID')
        if not job_id:
            return None

        # Node count from PBS_NODEFILE first (always present in jobs),
        # fall back to qstat -f Resource_List.nodect if needed.
        nodes   = _read_pbs_nodefile()
        n_nodes = len(nodes) or None

        runtime       = None
        partition     = os.environ.get('PBS_QUEUE') or os.environ.get('PBS_O_QUEUE')
        account       = os.environ.get('PBS_ACCOUNT')
        job_name      = os.environ.get('PBS_JOBNAME')
        nodelist      = ','.join(nodes) if nodes else None
        cpus_per_node = None
        gpus_per_node = None

        # Pull walltime / partition / account from qstat.
        try:
            info = self._qstat_f(job_id)
        except (OSError, subprocess.TimeoutExpired) as exc:
            if not n_nodes:
                raise RuntimeError(
                    f"PBS_JOBID={job_id!r} is set but cannot query qstat: {exc}"
                ) from exc
            info = None

        if info is not None:
            runtime = _parse_pbs_walltime(
                info.get('Resource_List.walltime', ''))
            if not n_nodes:
                nct = info.get('Resource_List.nodect', '')
                try:
                    n_nodes = int(nct) if nct else None
                except ValueError:
                    n_nodes = None
            if not partition:
                partition = info.get('queue', '') or None
            if not account:
                account = info.get('Account_Name', '') or None
            if not job_name:
                job_name = info.get('Job_Name', '') or None
            if not nodelist:
                eh = info.get('exec_host', '')
                if eh:
                    nodelist = ','.join(_parse_exec_host(eh))

            # Extract per-node resources from select=... when possible.
            select = info.get('Resource_List.select', '')
            if select:
                cpus_per_node, gpus_per_node = self._select_resources(select)

        if not n_nodes:
            raise RuntimeError(
                f"PBS_JOBID={job_id!r} is set but node count is unavailable")

        return {
            'job_id'       : job_id,
            'partition'    : partition,
            'n_nodes'      : int(n_nodes),
            'nodelist'     : nodelist,
            'cpus_per_node': cpus_per_node,
            'gpus_per_node': gpus_per_node,
            'account'      : account,
            'job_name'     : job_name,
            'runtime'      : runtime,
        }
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
class AuroraPBSBatchSystem(PBSProBatchSystem):
    """PBSPro backend specialized for Aurora (ALCF).

    On Aurora every submission needs ``#PBS -l filesystems=<list>`` (qsub
    rejects jobs without it).  Users of radical.orbit are not expected to
    know PBS-level resource names, so this class supplies the default;
    that way both the UI and the Python client API work out of the box.
    User-supplied values still win on conflict.

    Detection relies on the vendor-installed ``/opt/aurora`` directory,
    which exists on login and compute nodes alike and is unambiguous (no
    hostname regex, no subprocess calls).
    """

    name = 'pbs-aurora'

    @classmethod
    def detect(cls) -> bool:
        # Require PBSPro itself first, then the Aurora marker directory.
        if not super().detect():
            return False
        return os.path.isdir('/opt/aurora')

    def default_custom_attributes(self) -> dict:
        return {'pbs.l': 'filesystems=home:flare'}
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
# Register Aurora before the generic backend so detect_batch_system()
# picks the specialization on ALCF hosts.  Order matters: the generic
# PBSPro probe succeeds wherever the Aurora one does, and detection stops
# at the first backend whose detect() is true.
register_backend(AuroraPBSBatchSystem)
register_backend(PBSProBatchSystem)
|