radical.orbit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. radical/orbit/__init__.py +61 -0
  2. radical/orbit/_prof.py +8 -0
  3. radical/orbit/_version.py +3 -0
  4. radical/orbit/batch_system.py +195 -0
  5. radical/orbit/batch_system_pbs.py +329 -0
  6. radical/orbit/batch_system_slurm.py +193 -0
  7. radical/orbit/bridge.py +894 -0
  8. radical/orbit/bridge_plugin_host.py +179 -0
  9. radical/orbit/client.py +618 -0
  10. radical/orbit/data/orbit_explorer.html +2782 -0
  11. radical/orbit/data/plugins/globus.js +419 -0
  12. radical/orbit/data/plugins/iri_connect.js +238 -0
  13. radical/orbit/data/plugins/iri_instance.js +604 -0
  14. radical/orbit/data/plugins/lucid.js +31 -0
  15. radical/orbit/data/plugins/psij.js +746 -0
  16. radical/orbit/data/plugins/queue_info.js +624 -0
  17. radical/orbit/data/plugins/rhapsody.js +376 -0
  18. radical/orbit/data/plugins/staging.js +355 -0
  19. radical/orbit/data/plugins/sysinfo.js +253 -0
  20. radical/orbit/data/plugins/task_dispatcher.js +188 -0
  21. radical/orbit/data/plugins/xgfabric.js +567 -0
  22. radical/orbit/data/xgfabric_resource_default.json +4 -0
  23. radical/orbit/data/xgfabric_resource_test.json +25 -0
  24. radical/orbit/data/xgfabric_workflow_default.json +48 -0
  25. radical/orbit/data/xgfabric_workflow_test.json +24 -0
  26. radical/orbit/exceptions.py +152 -0
  27. radical/orbit/http_utils.py +51 -0
  28. radical/orbit/iri_endpoints.py +25 -0
  29. radical/orbit/logging_config.py +167 -0
  30. radical/orbit/models.py +137 -0
  31. radical/orbit/plugin_base.py +506 -0
  32. radical/orbit/plugin_globus.py +691 -0
  33. radical/orbit/plugin_host_base.py +290 -0
  34. radical/orbit/plugin_iri_connect.py +202 -0
  35. radical/orbit/plugin_iri_instance.py +499 -0
  36. radical/orbit/plugin_lucid.py +243 -0
  37. radical/orbit/plugin_psij.py +1258 -0
  38. radical/orbit/plugin_queue_info.py +469 -0
  39. radical/orbit/plugin_rhapsody.py +1546 -0
  40. radical/orbit/plugin_session_base.py +104 -0
  41. radical/orbit/plugin_staging.py +480 -0
  42. radical/orbit/plugin_sysinfo.py +650 -0
  43. radical/orbit/plugin_task_dispatcher.py +1883 -0
  44. radical/orbit/plugin_xgfabric.py +1394 -0
  45. radical/orbit/queue_info.py +321 -0
  46. radical/orbit/queue_info_none.py +21 -0
  47. radical/orbit/queue_info_pbs.py +510 -0
  48. radical/orbit/queue_info_slurm.py +370 -0
  49. radical/orbit/service.py +831 -0
  50. radical/orbit/task_dispatcher_config.py +307 -0
  51. radical/orbit/task_dispatcher_state.py +334 -0
  52. radical/orbit/task_dispatcher_strategy.py +345 -0
  53. radical/orbit/task_dispatcher_strategy_conservative.py +208 -0
  54. radical/orbit/task_dispatcher_strategy_examples.py +173 -0
  55. radical/orbit/tunnel.py +319 -0
  56. radical/orbit/ui_schema.py +178 -0
  57. radical/orbit/utils.py +295 -0
  58. radical_orbit-0.2.0.data/scripts/radical-orbit-bridge.py +58 -0
  59. radical_orbit-0.2.0.data/scripts/radical-orbit-endpoint-wrapper.sh +60 -0
  60. radical_orbit-0.2.0.data/scripts/radical-orbit-endpoint.py +107 -0
  61. radical_orbit-0.2.0.data/scripts/radical-orbit-iri-tunnel-helper.sh +165 -0
  62. radical_orbit-0.2.0.data/scripts/radical-orbit-makeflow +117 -0
  63. radical_orbit-0.2.0.data/scripts/radical-orbit-makeflow-prep +500 -0
  64. radical_orbit-0.2.0.data/scripts/radical-orbit-run +394 -0
  65. radical_orbit-0.2.0.dist-info/METADATA +326 -0
  66. radical_orbit-0.2.0.dist-info/RECORD +70 -0
  67. radical_orbit-0.2.0.dist-info/WHEEL +5 -0
  68. radical_orbit-0.2.0.dist-info/entry_points.txt +3 -0
  69. radical_orbit-0.2.0.dist-info/licenses/LICENSE.md +179 -0
  70. radical_orbit-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,61 @@
1
+
2
+ # ── Public client surface — bind these BEFORE the plugin imports ─────────────
3
+
4
+ from .plugin_base import Plugin # noqa: F401
5
+ from .plugin_session_base import PluginSession # noqa: F401
6
+ from .ui_schema import (UIConfig, UIForm, UIField, # noqa: F401
7
+ UIMonitor, UINotifications,
8
+ ui_config_to_dict)
9
+
10
+ from .service import EndpointService # noqa: F401
11
+ from .bridge import Bridge # noqa: F401
12
+ from .client import BridgeClient, EndpointClient, PluginClient # noqa: F401
13
+
14
+ # Public alias — ``Endpoint`` is the natural counterpart to ``Bridge``.
15
+ Endpoint = EndpointService
16
+
17
+
18
+ # ── Plugins ──────────────────────────────────────────────────────────────────
19
+
20
+ from .plugin_xgfabric import PluginXGFabric # noqa: F401
21
+ from .plugin_iri_connect import PluginIRIConnect # noqa: F401
22
+ from .plugin_queue_info import PluginQueueInfo # noqa: F401
23
+ from .plugin_sysinfo import PluginSysInfo # noqa: F401
24
+ from .plugin_staging import PluginStaging # noqa: F401
25
+ from .plugin_task_dispatcher import PluginTaskDispatcher # noqa: F401
26
+
27
+ # Optional plugins with external dependencies.
28
+ try:
29
+ from .plugin_lucid import PluginLucid # noqa: F401
30
+ except ImportError:
31
+ pass
32
+
33
+ try:
34
+ from .plugin_psij import PluginPSIJ # noqa: F401
35
+ except ImportError:
36
+ pass
37
+
38
+ try:
39
+ from .plugin_rhapsody import PluginRhapsody # noqa: F401
40
+ except ImportError:
41
+ pass
42
+
43
+ try:
44
+ from .plugin_globus import PluginGlobus # noqa: F401
45
+ except ImportError:
46
+ pass
47
+
48
+
49
+ # ── Version (generated at install time) ──────────────────────────────────────
50
+
51
+ # _version.py is generated by setup.py at install time and is
52
+ # git-ignored. When running from an uninstalled source tree the
53
+ # import will fail and we fall back to dev markers.
54
+ try:
55
+ from ._version import version, version_detail # noqa: F401
56
+ except ImportError:
57
+ version = 'unknown'
58
+ version_detail = 'unknown'
59
+
60
+ __version__ = version
61
+
radical/orbit/_prof.py ADDED
@@ -0,0 +1,8 @@
1
+
2
try:
    from radical.prof import Profiler
except ImportError:

    class Profiler:
        """No-op stand-in used when ``radical.prof`` cannot be imported.

        It accepts the same calls this package makes on a profiler
        (construct, ``prof``, ``close``) and does nothing for any of them.
        """

        def __init__(self, name, ns=None):
            """Accept and ignore the profiler *name* and namespace *ns*."""
            pass

        def prof(self, *args, **kwargs):
            """Accept and discard a profiling event."""
            pass

        def close(self):
            """Nothing to release."""
            pass
@@ -0,0 +1,3 @@
1
+ # This file is auto-generated by setup.py — do not edit.
2
+ version = '0.2.0'
3
+ version_detail = '0.2.0-v0.1.0-265-g4b12570@master'
@@ -0,0 +1,195 @@
1
+ """
2
+ Batch system abstraction.
3
+
4
+ Encapsulates everything about the local HPC scheduler that callers
5
+ outside of ``queue_info`` need to know: presence detection, in-allocation
6
+ detection, normalized job state, node lookup, cancel, allocation summary.
7
+
8
+ Backend implementations live in ``batch_system_slurm.py`` and
9
+ ``batch_system_pbs.py`` and register themselves via ``_REGISTRY`` below.
10
+ ``detect_batch_system()`` returns the first backend whose ``detect()`` is
11
+ true; otherwise ``NullBatchSystem``.
12
+
13
+ State vocabulary (used everywhere outside the backend modules):
14
+ PENDING — queued, not yet running
15
+ RUNNING — running on compute nodes
16
+ DONE — completed successfully
17
+ FAILED — completed with non-zero exit, NODE_FAIL, PREEMPTED, TIMEOUT, …
18
+ CANCELLED — user- or admin-cancelled
19
+ HELD — held / suspended
20
+ UNKNOWN — backend reported nothing (job gone, transient error, no scheduler)
21
+ """
22
+
23
+ from abc import ABC, abstractmethod
24
+
25
+
26
+ # Normalized state vocabulary
27
+ STATE_PENDING = 'PENDING'
28
+ STATE_RUNNING = 'RUNNING'
29
+ STATE_DONE = 'DONE'
30
+ STATE_FAILED = 'FAILED'
31
+ STATE_CANCELLED = 'CANCELLED'
32
+ STATE_HELD = 'HELD'
33
+ STATE_UNKNOWN = 'UNKNOWN'
34
+
35
+ TERMINAL_STATES = frozenset({STATE_DONE, STATE_FAILED, STATE_CANCELLED})
36
+
37
+
38
class BatchSystem(ABC):
    """Abstract interface to the local HPC scheduler.

    Concrete backends carry no per-job bookkeeping requirements from this
    base class; a single instance per backend is cached at module level and
    handed out by :func:`detect_batch_system`.
    """

    name          : str = 'none'     # short identifier
    psij_executor : str = 'local'    # corresponding PsiJ executor name

    @classmethod
    @abstractmethod
    def detect(cls) -> bool:
        """Tell whether this scheduler is installed on the local machine."""

    @abstractmethod
    def in_allocation(self) -> bool:
        """Tell whether the current process is running inside a batch job."""

    @abstractmethod
    def job_id(self) -> 'str | None':
        """Return the native id of the current allocation.

        ``None`` is returned on a login node.
        """

    @abstractmethod
    def job_state(self, native_id) -> str:
        """Look up *native_id* and return its normalized state.

        The result is always one of the ``STATE_*`` constants.  Any error
        (job gone, command failure, timeout, parse error) is reported as
        ``STATE_UNKNOWN`` rather than raised.
        """

    @abstractmethod
    def job_nodes(self, native_id) -> list:
        """Return the compute node hostnames allocated to *native_id*.

        An empty list means the job is not running or the lookup failed.
        """

    @abstractmethod
    def nodelist(self) -> list:
        """Return the expanded hostnames of *this* endpoint's allocation.

        One hostname per node, in scheduler-reported order.  The list is
        empty when the endpoint is not running inside a job (i.e. on a
        login node) or when the scheduler doesn't expose the info.
        """

    @abstractmethod
    def cancel(self, native_id) -> None:
        """Cancel the job *native_id*.

        Raises:
            RuntimeError: the cancellation failed.
        """

    @abstractmethod
    def job_allocation(self) -> 'dict | None':
        """Describe the batch job this process is running in.

        Returns:
            ``None`` on a login node.  Inside a batch job, a dict with the
            keys ``job_id``, ``partition``, ``n_nodes``, ``nodelist``,
            ``cpus_per_node``, ``gpus_per_node``, ``account``, ``job_name``
            and ``runtime`` (seconds, ``None`` for unlimited).

        Raises:
            RuntimeError: ``in_allocation()`` is true but the details
                cannot be collected.
        """

    def terminal_states(self) -> frozenset:
        """Return the normalized states that mean 'job is done'."""
        return TERMINAL_STATES

    def default_custom_attributes(self) -> dict:
        """Return per-site PSIJ ``custom_attributes`` for every submission.

        These apply when the caller submits through the PSIJ executor that
        matches this backend (``psij_executor`` on the class).  Attributes
        supplied by the caller take precedence on key conflicts.

        The base implementation contributes nothing.  Site-specific
        subclasses override it to encode hard requirements (e.g. Aurora's
        ``filesystems=home:flare`` resource is mandatory for qsub).
        """
        return {}
118
+
119
+
120
class NullBatchSystem(BatchSystem):
    """Backend used when no scheduler is installed (e.g. dev laptop).

    Every query answers "nothing here"; only :meth:`cancel` raises, since
    there is no scheduler that could honor the request.
    """

    name          = 'none'
    psij_executor = 'local'

    @classmethod
    def detect(cls) -> bool:
        """Always match: this backend is the catch-all fallback."""
        return True

    def in_allocation(self) -> bool:
        """Never inside a batch job."""
        return False

    def job_id(self) -> 'str | None':
        """There is no current allocation, hence no job id."""
        return None

    def job_state(self, native_id) -> str:
        """No scheduler to ask: every job is ``STATE_UNKNOWN``."""
        return STATE_UNKNOWN

    def job_nodes(self, native_id) -> list:
        """No job has nodes without a scheduler."""
        return []

    def nodelist(self) -> list:
        """No allocation, so no hostnames."""
        return []

    def cancel(self, native_id) -> None:
        """Refuse the request; there is nothing that could cancel a job.

        Raises:
            RuntimeError: always.
        """
        raise RuntimeError(f"no batch system available to cancel job {native_id}")

    def job_allocation(self) -> 'dict | None':
        """No allocation details are available."""
        return None
150
+
151
+
152
+ # ---------------------------------------------------------------------------
153
+ # Registry + detection
154
+ # ---------------------------------------------------------------------------
155
+
156
+ _REGISTRY : list = [] # populated by backend modules at import time
157
+ _DETECTED : 'BatchSystem | None' = None
158
+
159
+
160
def register_backend(cls) -> None:
    """Add the BatchSystem subclass *cls* to the backend registry.

    Backend modules call this at import time.  Registering the same class
    again is a no-op, so the registry keeps first-registration order.
    """
    if cls in _REGISTRY:
        return
    _REGISTRY.append(cls)
164
+
165
+
166
def detect_batch_system(force: bool = False) -> BatchSystem:
    """Return the active batch system, probing the registry on first call.

    Backends are tried in registration order; the first one whose
    ``detect()`` returns true is instantiated, cached and returned.  When
    none matches, a ``NullBatchSystem`` is cached and returned instead.

    Args:
        force: re-probe even when a result is already cached (mainly for
            tests).

    Returns:
        The cached (or freshly detected) ``BatchSystem`` instance.
    """
    global _DETECTED
    if _DETECTED is not None and not force:
        return _DETECTED

    # Function-scope import keeps this module free of a new top-level
    # dependency line.
    import logging

    # Import backend modules so they register.  Local import avoids circular
    # imports during package init.
    from . import batch_system_slurm   # noqa: F401
    from . import batch_system_pbs     # noqa: F401

    log = logging.getLogger(__name__)

    for cls in _REGISTRY:
        try:
            if cls.detect():
                _DETECTED = cls()
                return _DETECTED
        except Exception:
            # Detection stays best-effort: a broken backend must not stop
            # the remaining ones from being probed.  Leave a trace so the
            # failure can be diagnosed instead of vanishing silently.
            log.debug("batch system backend %r failed during detection",
                      cls, exc_info=True)
            continue

    _DETECTED = NullBatchSystem()
    return _DETECTED
190
+
191
+
192
def reset_detection() -> None:
    """Forget the cached batch system (tests only).

    The next :func:`detect_batch_system` call probes the registry again.
    """
    global _DETECTED
    _DETECTED = None
@@ -0,0 +1,329 @@
1
+ """PBSPro implementation of BatchSystem.
2
+
3
+ Targets PBS Professional (Altair) as found on Aurora and similar ALCF
4
+ systems. Uses qstat / qdel for job control. Aurora's PBSPro publishes
5
+ state info both as text (qstat -f) and partially as JSON (qstat -f -F json
6
+ on recent versions); this backend only relies on text output, which is
7
+ universal.
8
+ """
9
+
10
+ import os
11
+ import shutil
12
+ import subprocess
13
+
14
+ from .batch_system import (BatchSystem, register_backend,
15
+ STATE_PENDING, STATE_RUNNING, STATE_DONE,
16
+ STATE_FAILED, STATE_CANCELLED,
17
+ STATE_HELD, STATE_UNKNOWN)
18
+
19
+
20
+ # PBS single-letter state → normalized vocabulary.
21
+ # Reference: PBSPro qstat manual (job_state column).
22
+ _STATE_MAP = {
23
+ 'Q': STATE_PENDING, # queued
24
+ 'W': STATE_PENDING, # waiting (begin time / dependency)
25
+ 'T': STATE_PENDING, # being moved
26
+ 'R': STATE_RUNNING, # running
27
+ 'B': STATE_RUNNING, # array job: at least one subjob running
28
+ 'E': STATE_RUNNING, # exiting (still cleaning up)
29
+ 'F': STATE_DONE, # finished (PBSPro only with -x)
30
+ 'X': STATE_DONE, # subjob completed (array)
31
+ 'H': STATE_HELD, # held
32
+ 'S': STATE_HELD, # suspended
33
+ 'M': STATE_HELD, # moved to another server
34
+ 'U': STATE_HELD, # cycle-harvesting suspension
35
+ }
36
+
37
+
38
+ def _parse_pbs_walltime(s: str) -> 'int | None':
39
+ """Parse a PBS walltime string ([[HH:]MM:]SS) to seconds.
40
+
41
+ Returns None on empty / unset values.
42
+ """
43
+ if not s:
44
+ return None
45
+ s = s.strip()
46
+ if not s:
47
+ return None
48
+ parts = s.split(':')
49
+ try:
50
+ if len(parts) == 3: h, m, sec = (int(p) for p in parts)
51
+ elif len(parts) == 2: h, m, sec = 0, int(parts[0]), int(parts[1])
52
+ elif len(parts) == 1: h, m, sec = 0, 0, int(parts[0])
53
+ else: raise ValueError
54
+ except ValueError as e:
55
+ raise RuntimeError(f"Cannot parse PBS walltime: {s!r}") from e
56
+ return h * 3600 + m * 60 + sec
57
+
58
+
59
+ def _parse_qstat_f(stdout: str) -> dict:
60
+ """Parse ``qstat -f <jobid>`` text output into a {key: value} dict.
61
+
62
+ PBSPro indents every attribute line (`` key = value``). Continuation
63
+ lines for long values are indented *more* than attribute lines and may
64
+ themselves contain ``=`` characters (e.g. inside ``Resource_List.select``).
65
+ The rule used here: the first indented attribute line sets the
66
+ *attribute-indent* width; any line indented strictly deeper is a
67
+ continuation of the prior key.
68
+ Section headers like ``Job Id: 12345`` are ignored.
69
+ """
70
+ result = {}
71
+ cur_key = None
72
+ cur_val_parts = []
73
+ attr_indent = None # set by the first attribute line we see
74
+
75
+ def _flush():
76
+ if cur_key is not None:
77
+ result[cur_key] = ''.join(cur_val_parts).strip()
78
+
79
+ for raw in stdout.splitlines():
80
+ if not raw or not raw.strip():
81
+ continue
82
+ stripped = raw.strip()
83
+ if stripped.startswith('Job Id:'):
84
+ continue
85
+ indent = len(raw) - len(raw.lstrip())
86
+ is_continuation = (attr_indent is not None
87
+ and indent > attr_indent
88
+ and cur_key is not None)
89
+ if is_continuation:
90
+ cur_val_parts.append(stripped)
91
+ continue
92
+ if attr_indent is None:
93
+ attr_indent = indent
94
+ # A new attribute line.
95
+ if '=' in stripped:
96
+ _flush()
97
+ k, v = stripped.split('=', 1)
98
+ cur_key = k.strip()
99
+ cur_val_parts = [v.strip()]
100
+ elif cur_key is not None:
101
+ # No '=' and not deeper indented → treat as plain continuation.
102
+ cur_val_parts.append(stripped)
103
+ _flush()
104
+ return result
105
+
106
+
107
+ def _parse_exec_host(s: str) -> list:
108
+ """Parse PBS exec_host into a list of hostnames.
109
+
110
+ Format: ``host1/0*64+host2/0*64`` (host/cpuset*ncpus pairs).
111
+ Returns deduplicated host list preserving order.
112
+ """
113
+ if not s:
114
+ return []
115
+ seen = set()
116
+ hosts = []
117
+ for token in s.split('+'):
118
+ host = token.split('/', 1)[0].split('.', 1)[0]
119
+ if host and host not in seen:
120
+ seen.add(host)
121
+ hosts.append(host)
122
+ return hosts
123
+
124
+
125
+ def _read_pbs_nodefile() -> list:
126
+ """Return deduplicated host list from $PBS_NODEFILE, empty if missing."""
127
+ path = os.environ.get('PBS_NODEFILE')
128
+ if not path or not os.path.isfile(path):
129
+ return []
130
+ try:
131
+ with open(path) as f:
132
+ lines = [l.strip() for l in f if l.strip()]
133
+ except OSError:
134
+ return []
135
+ seen = set()
136
+ hosts = []
137
+ for h in lines:
138
+ h = h.split('.', 1)[0]
139
+ if h not in seen:
140
+ seen.add(h)
141
+ hosts.append(h)
142
+ return hosts
143
+
144
+
145
class PBSProBatchSystem(BatchSystem):
    """PBSPro scheduler interface.

    Job queries go through ``qstat -f`` (text output) and cancellation
    through ``qdel``.  Information about the *current* allocation is taken
    from the ``PBS_*`` environment first and completed from ``qstat``.
    """

    name          = 'pbs'
    psij_executor = 'pbs'

    # Upper bound, in seconds, for any single qstat / qdel invocation.
    _TIMEOUT = 10

    def __init__(self) -> None:
        super().__init__()
        # Native ids we've been asked to cancel.  PBSPro's qstat letter
        # codes have no dedicated 'cancelled' value -- the job ends up in
        # 'F' (finished) just like a normal exit, so we remember the
        # intent here and map terminal states to STATE_CANCELLED in
        # job_state().
        self._cancelled: set = set()

    @classmethod
    def detect(cls) -> bool:
        """PBS counts as installed when ``qstat`` is on the PATH."""
        return shutil.which('qstat') is not None

    def in_allocation(self) -> bool:
        """True when ``PBS_JOBID`` is set to a non-empty value."""
        return bool(os.environ.get('PBS_JOBID'))

    def job_id(self) -> 'str | None':
        """Return ``$PBS_JOBID``, or None when it is not set."""
        return os.environ.get('PBS_JOBID')

    def _qstat_info(self, native_id, history: bool = False) -> 'dict | None':
        """Run ``qstat [-x] -f <native_id>`` and parse its output.

        Args:
            native_id: job id to query.
            history: add ``-x`` so finished jobs are looked up as well.

        Returns:
            The parsed attribute dict, or None when qstat exits non-zero.

        Raises:
            OSError, subprocess.TimeoutExpired: qstat could not be run or
                did not finish in time; callers decide how to degrade.
        """
        cmd = ['qstat']
        if history:
            cmd.append('-x')
        cmd += ['-f', str(native_id)]
        r = subprocess.run(cmd, capture_output=True, text=True,
                           timeout=self._TIMEOUT)
        if r.returncode != 0:
            return None
        return _parse_qstat_f(r.stdout)

    def job_state(self, native_id) -> str:
        """Return the normalized state of *native_id*.

        A plain ``qstat -f`` is tried first; when it exits non-zero the
        lookup is repeated with ``-x`` to find finished jobs.  A finished
        job whose ``Exit_status`` is a non-zero integer is reported as
        ``STATE_FAILED`` rather than ``STATE_DONE``.  Jobs cancelled
        through this instance are reported as ``STATE_CANCELLED`` once
        they reach a terminal state.

        Returns ``STATE_UNKNOWN`` on any lookup failure or unknown code.
        """
        try:
            info = self._qstat_info(native_id)
            if info is None:
                # try -x to look up finished jobs
                info = self._qstat_info(native_id, history=True)
        except (OSError, subprocess.TimeoutExpired):
            return STATE_UNKNOWN
        if info is None:
            return STATE_UNKNOWN

        code = info.get('job_state', '').strip()
        if not code:
            return STATE_UNKNOWN
        state = _STATE_MAP.get(code[0].upper(), STATE_UNKNOWN)

        if state == STATE_DONE:
            # The state letter alone cannot tell success from failure;
            # consult the exit status when the scheduler reports one.
            try:
                if int(info.get('Exit_status', '').strip()) != 0:
                    state = STATE_FAILED
            except ValueError:
                pass    # missing or non-numeric: keep STATE_DONE

        if str(native_id) in self._cancelled \
                and state in (STATE_DONE, STATE_FAILED):
            return STATE_CANCELLED
        return state

    def job_nodes(self, native_id) -> list:
        """Return the hosts in the ``exec_host`` attribute of *native_id*.

        Returns an empty list when qstat fails or reports no ``exec_host``.
        """
        try:
            info = self._qstat_info(native_id)
        except (OSError, subprocess.TimeoutExpired):
            return []
        if info is None:
            return []
        return _parse_exec_host(info.get('exec_host', ''))

    def nodelist(self) -> list:
        """Return the deduplicated hosts of the current allocation."""
        # PBS_NODEFILE lists each host once per slot; ``_read_pbs_nodefile``
        # already dedupes and short-circuits on a missing / empty file.
        return _read_pbs_nodefile()

    def cancel(self, native_id) -> None:
        """Cancel *native_id* with ``qdel`` and remember the request.

        Raises:
            RuntimeError: qdel could not be run, timed out, or exited
                non-zero.
        """
        try:
            r = subprocess.run(['qdel', str(native_id)],
                               capture_output=True, text=True,
                               timeout=self._TIMEOUT)
        except (OSError, subprocess.TimeoutExpired) as exc:
            # Honor the interface contract: callers only expect RuntimeError.
            raise RuntimeError(f"qdel failed: {exc}") from exc
        if r.returncode != 0:
            raise RuntimeError(f"qdel failed: {r.stderr.strip()}")
        self._cancelled.add(str(native_id))

    def job_allocation(self) -> 'dict | None':
        """Describe the current PBS job, or return None outside of one.

        Environment variables and ``$PBS_NODEFILE`` are used first; any
        field still missing is filled from ``qstat -f`` when that succeeds.

        Returns:
            None when ``PBS_JOBID`` is unset, else a dict with the keys
            job_id, partition, n_nodes, nodelist, cpus_per_node,
            gpus_per_node, account, job_name and runtime (seconds or None).

        Raises:
            RuntimeError: the node count cannot be determined, or the
                reported walltime cannot be parsed.
        """
        job_id = os.environ.get('PBS_JOBID')
        if not job_id:
            return None

        # Node count from PBS_NODEFILE first (always present in jobs),
        # fall back to qstat -f Resource_List.nodect if needed.
        nodes   = _read_pbs_nodefile()
        n_nodes = len(nodes) or None

        # Pull walltime / partition / account from qstat.
        runtime       = None
        partition     = os.environ.get('PBS_QUEUE') or os.environ.get('PBS_O_QUEUE')
        account       = os.environ.get('PBS_ACCOUNT')
        job_name      = os.environ.get('PBS_JOBNAME')
        nodelist      = ','.join(nodes) if nodes else None
        cpus_per_node = None
        gpus_per_node = None

        try:
            info = self._qstat_info(job_id)
            if info is not None:
                runtime = _parse_pbs_walltime(
                    info.get('Resource_List.walltime', ''))
                if not n_nodes:
                    nct = info.get('Resource_List.nodect', '')
                    try:
                        n_nodes = int(nct) if nct else None
                    except ValueError:
                        n_nodes = None
                if not partition:
                    partition = info.get('queue', '') or None
                if not account:
                    account = info.get('Account_Name', '') or None
                if not job_name:
                    job_name = info.get('Job_Name', '') or None
                if not nodelist:
                    eh = info.get('exec_host', '')
                    if eh:
                        nodelist = ','.join(_parse_exec_host(eh))

                # Extract per-node resources from select=... when possible.
                # Format: "1:ncpus=64:ngpus=4" or "2:ncpus=64".
                # Only the first '+'-separated chunk is inspected.
                select = info.get('Resource_List.select', '')
                if select:
                    chunk = select.split('+', 1)[0]
                    for tok in chunk.split(':'):
                        if tok.startswith('ncpus='):
                            try:
                                cpus_per_node = int(tok[6:])
                            except ValueError:
                                pass
                        elif tok.startswith('ngpus='):
                            try:
                                gpus_per_node = int(tok[6:])
                            except ValueError:
                                pass
        except (OSError, subprocess.TimeoutExpired) as exc:
            # qstat is only essential when the nodefile gave no node count.
            if not n_nodes:
                raise RuntimeError(
                    f"PBS_JOBID={job_id!r} is set but cannot query qstat: {exc}"
                ) from exc

        if not n_nodes:
            raise RuntimeError(
                f"PBS_JOBID={job_id!r} is set but node count is unavailable")

        return {
            'job_id'       : job_id,
            'partition'    : partition,
            'n_nodes'      : int(n_nodes),
            'nodelist'     : nodelist,
            'cpus_per_node': cpus_per_node,
            'gpus_per_node': gpus_per_node,
            'account'      : account,
            'job_name'     : job_name,
            'runtime'      : runtime,
        }
298
+
299
+
300
class AuroraPBSBatchSystem(PBSProBatchSystem):
    """PBSPro backend specialized for Aurora (ALCF).

    Aurora requires ``#PBS -l filesystems=<list>`` on every submission
    (qsub rejects jobs without it), and radical.orbit users are not
    expected to know PBS-level resource names.  This class supplies that
    default so the UI and the Python client API both work out of the box;
    user-supplied values still win on conflict.

    Detection relies on the vendor-installed ``/opt/aurora`` directory,
    which is present on both login and compute nodes and is unambiguous
    (no hostname regex, no subprocess calls).
    """

    name = 'pbs-aurora'

    @classmethod
    def detect(cls) -> bool:
        """Match when PBS is installed and ``/opt/aurora`` is a directory."""
        if not super().detect():
            return False
        return os.path.isdir('/opt/aurora')

    def default_custom_attributes(self) -> dict:
        """Return the ``filesystems`` resource Aurora's qsub insists on."""
        return {'pbs.l': 'filesystems=home:flare'}
324
+
325
+
326
+ # Register Aurora before the generic backend so detect_batch_system()
327
+ # picks the specialization on ALCF hosts.
328
+ register_backend(AuroraPBSBatchSystem)
329
+ register_backend(PBSProBatchSystem)