radical.orbit 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. radical_orbit-0.2.0/CHANGES.md +19 -0
  2. radical_orbit-0.2.0/CLAUDE.md +254 -0
  3. radical_orbit-0.2.0/DEPLOYMENT.md +151 -0
  4. radical_orbit-0.2.0/LICENSE.md +179 -0
  5. radical_orbit-0.2.0/MANIFEST.in +8 -0
  6. radical_orbit-0.2.0/PKG-INFO +326 -0
  7. radical_orbit-0.2.0/README.md +265 -0
  8. radical_orbit-0.2.0/README_AMSC.md +114 -0
  9. radical_orbit-0.2.0/README_amsc.md +91 -0
  10. radical_orbit-0.2.0/ROADMAP.md +145 -0
  11. radical_orbit-0.2.0/VERSION +1 -0
  12. radical_orbit-0.2.0/bin/radical-orbit-bridge.py +58 -0
  13. radical_orbit-0.2.0/bin/radical-orbit-endpoint-wrapper.sh.in +60 -0
  14. radical_orbit-0.2.0/bin/radical-orbit-endpoint.py +107 -0
  15. radical_orbit-0.2.0/bin/radical-orbit-iri-tunnel-helper.sh +165 -0
  16. radical_orbit-0.2.0/bin/radical-orbit-makeflow +117 -0
  17. radical_orbit-0.2.0/bin/radical-orbit-makeflow-prep +500 -0
  18. radical_orbit-0.2.0/bin/radical-orbit-run +394 -0
  19. radical_orbit-0.2.0/demo_state.md +127 -0
  20. radical_orbit-0.2.0/docs/source/getting_started.md +252 -0
  21. radical_orbit-0.2.0/docs/source/task_dispatcher_strategy.md +246 -0
  22. radical_orbit-0.2.0/dragon_one.py +47 -0
  23. radical_orbit-0.2.0/examples/amsc.py +1469 -0
  24. radical_orbit-0.2.0/examples/analyze_profiles.py +450 -0
  25. radical_orbit-0.2.0/examples/analyze_rhapsody.py +324 -0
  26. radical_orbit-0.2.0/examples/configs/psij_config_bridges2.json +15 -0
  27. radical_orbit-0.2.0/examples/configs/psij_config_polaris.json +18 -0
  28. radical_orbit-0.2.0/examples/example_endpoint.py +185 -0
  29. radical_orbit-0.2.0/examples/example_endpoint_spawn.py +365 -0
  30. radical_orbit-0.2.0/examples/example_globus.py +111 -0
  31. radical_orbit-0.2.0/examples/example_iri.py +209 -0
  32. radical_orbit-0.2.0/examples/example_lucid.py +49 -0
  33. radical_orbit-0.2.0/examples/example_makeflow_multiendpoint/pools.json +50 -0
  34. radical_orbit-0.2.0/examples/example_psij.py +160 -0
  35. radical_orbit-0.2.0/examples/example_queue_info.py +104 -0
  36. radical_orbit-0.2.0/examples/example_rhapsody.py +107 -0
  37. radical_orbit-0.2.0/examples/example_rhapsody_ddict.py +95 -0
  38. radical_orbit-0.2.0/examples/example_rhapsody_individual.py +114 -0
  39. radical_orbit-0.2.0/examples/example_rhapsody_throughput.py +155 -0
  40. radical_orbit-0.2.0/examples/example_rose.py +325 -0
  41. radical_orbit-0.2.0/examples/example_staging.py +58 -0
  42. radical_orbit-0.2.0/examples/example_sysinfo.py +88 -0
  43. radical_orbit-0.2.0/examples/plot_profiles.py +623 -0
  44. radical_orbit-0.2.0/examples/repro_dragon_placement.py +96 -0
  45. radical_orbit-0.2.0/examples/repro_placement.py +112 -0
  46. radical_orbit-0.2.0/examples/run_matey.py +58 -0
  47. radical_orbit-0.2.0/examples/test_placement_throughput.py +179 -0
  48. radical_orbit-0.2.0/examples/xgfabric.py +182 -0
  49. radical_orbit-0.2.0/get_globus_token.py +505 -0
  50. radical_orbit-0.2.0/iri.json +1 -0
  51. radical_orbit-0.2.0/iri_token.sh +34 -0
  52. radical_orbit-0.2.0/matey_debug.md +499 -0
  53. radical_orbit-0.2.0/matey_wrapper.sh +12 -0
  54. radical_orbit-0.2.0/pyproject.toml +6 -0
  55. radical_orbit-0.2.0/requirements.txt +17 -0
  56. radical_orbit-0.2.0/rhapsody_one.py +46 -0
  57. radical_orbit-0.2.0/roadmap.txt +27 -0
  58. radical_orbit-0.2.0/roadmap_q1.txt +109 -0
  59. radical_orbit-0.2.0/run_matey.py +58 -0
  60. radical_orbit-0.2.0/setup.cfg +9 -0
  61. radical_orbit-0.2.0/setup.py +243 -0
  62. radical_orbit-0.2.0/src/radical/orbit/__init__.py +61 -0
  63. radical_orbit-0.2.0/src/radical/orbit/_prof.py +8 -0
  64. radical_orbit-0.2.0/src/radical/orbit/_version.py +3 -0
  65. radical_orbit-0.2.0/src/radical/orbit/batch_system.py +195 -0
  66. radical_orbit-0.2.0/src/radical/orbit/batch_system_pbs.py +329 -0
  67. radical_orbit-0.2.0/src/radical/orbit/batch_system_slurm.py +193 -0
  68. radical_orbit-0.2.0/src/radical/orbit/bridge.py +894 -0
  69. radical_orbit-0.2.0/src/radical/orbit/bridge_plugin_host.py +179 -0
  70. radical_orbit-0.2.0/src/radical/orbit/client.py +618 -0
  71. radical_orbit-0.2.0/src/radical/orbit/data/orbit_explorer.html +2782 -0
  72. radical_orbit-0.2.0/src/radical/orbit/data/plugins/globus.js +419 -0
  73. radical_orbit-0.2.0/src/radical/orbit/data/plugins/iri_connect.js +238 -0
  74. radical_orbit-0.2.0/src/radical/orbit/data/plugins/iri_instance.js +604 -0
  75. radical_orbit-0.2.0/src/radical/orbit/data/plugins/lucid.js +31 -0
  76. radical_orbit-0.2.0/src/radical/orbit/data/plugins/psij.js +746 -0
  77. radical_orbit-0.2.0/src/radical/orbit/data/plugins/queue_info.js +624 -0
  78. radical_orbit-0.2.0/src/radical/orbit/data/plugins/rhapsody.js +376 -0
  79. radical_orbit-0.2.0/src/radical/orbit/data/plugins/staging.js +355 -0
  80. radical_orbit-0.2.0/src/radical/orbit/data/plugins/sysinfo.js +253 -0
  81. radical_orbit-0.2.0/src/radical/orbit/data/plugins/task_dispatcher.js +188 -0
  82. radical_orbit-0.2.0/src/radical/orbit/data/plugins/xgfabric.js +567 -0
  83. radical_orbit-0.2.0/src/radical/orbit/data/xgfabric_resource_default.json +4 -0
  84. radical_orbit-0.2.0/src/radical/orbit/data/xgfabric_resource_test.json +25 -0
  85. radical_orbit-0.2.0/src/radical/orbit/data/xgfabric_workflow_default.json +48 -0
  86. radical_orbit-0.2.0/src/radical/orbit/data/xgfabric_workflow_test.json +24 -0
  87. radical_orbit-0.2.0/src/radical/orbit/exceptions.py +152 -0
  88. radical_orbit-0.2.0/src/radical/orbit/http_utils.py +51 -0
  89. radical_orbit-0.2.0/src/radical/orbit/iri_endpoints.py +25 -0
  90. radical_orbit-0.2.0/src/radical/orbit/logging_config.py +167 -0
  91. radical_orbit-0.2.0/src/radical/orbit/models.py +137 -0
  92. radical_orbit-0.2.0/src/radical/orbit/plugin_base.py +506 -0
  93. radical_orbit-0.2.0/src/radical/orbit/plugin_globus.py +691 -0
  94. radical_orbit-0.2.0/src/radical/orbit/plugin_host_base.py +290 -0
  95. radical_orbit-0.2.0/src/radical/orbit/plugin_iri_connect.py +202 -0
  96. radical_orbit-0.2.0/src/radical/orbit/plugin_iri_instance.py +499 -0
  97. radical_orbit-0.2.0/src/radical/orbit/plugin_lucid.py +243 -0
  98. radical_orbit-0.2.0/src/radical/orbit/plugin_psij.py +1258 -0
  99. radical_orbit-0.2.0/src/radical/orbit/plugin_queue_info.py +469 -0
  100. radical_orbit-0.2.0/src/radical/orbit/plugin_rhapsody.py +1546 -0
  101. radical_orbit-0.2.0/src/radical/orbit/plugin_session_base.py +104 -0
  102. radical_orbit-0.2.0/src/radical/orbit/plugin_staging.py +480 -0
  103. radical_orbit-0.2.0/src/radical/orbit/plugin_sysinfo.py +650 -0
  104. radical_orbit-0.2.0/src/radical/orbit/plugin_task_dispatcher.py +1883 -0
  105. radical_orbit-0.2.0/src/radical/orbit/plugin_xgfabric.py +1394 -0
  106. radical_orbit-0.2.0/src/radical/orbit/queue_info.py +321 -0
  107. radical_orbit-0.2.0/src/radical/orbit/queue_info_none.py +21 -0
  108. radical_orbit-0.2.0/src/radical/orbit/queue_info_pbs.py +510 -0
  109. radical_orbit-0.2.0/src/radical/orbit/queue_info_slurm.py +370 -0
  110. radical_orbit-0.2.0/src/radical/orbit/service.py +831 -0
  111. radical_orbit-0.2.0/src/radical/orbit/task_dispatcher_config.py +307 -0
  112. radical_orbit-0.2.0/src/radical/orbit/task_dispatcher_state.py +334 -0
  113. radical_orbit-0.2.0/src/radical/orbit/task_dispatcher_strategy.py +345 -0
  114. radical_orbit-0.2.0/src/radical/orbit/task_dispatcher_strategy_conservative.py +208 -0
  115. radical_orbit-0.2.0/src/radical/orbit/task_dispatcher_strategy_examples.py +173 -0
  116. radical_orbit-0.2.0/src/radical/orbit/tunnel.py +319 -0
  117. radical_orbit-0.2.0/src/radical/orbit/ui_schema.py +178 -0
  118. radical_orbit-0.2.0/src/radical/orbit/utils.py +295 -0
  119. radical_orbit-0.2.0/src/radical.orbit.egg-info/PKG-INFO +326 -0
  120. radical_orbit-0.2.0/src/radical.orbit.egg-info/SOURCES.txt +126 -0
  121. radical_orbit-0.2.0/src/radical.orbit.egg-info/dependency_links.txt +1 -0
  122. radical_orbit-0.2.0/src/radical.orbit.egg-info/entry_points.txt +3 -0
  123. radical_orbit-0.2.0/src/radical.orbit.egg-info/not-zip-safe +1 -0
  124. radical_orbit-0.2.0/src/radical.orbit.egg-info/requires.txt +13 -0
  125. radical_orbit-0.2.0/src/radical.orbit.egg-info/top_level.txt +1 -0
  126. radical_orbit-0.2.0/stats.txt +286 -0
  127. radical_orbit-0.2.0/xgfabric.md +202 -0
@@ -0,0 +1,19 @@
1
+
2
+ Version 0.0.1 release 2026-03-11
3
+ --------------------------------------------------------------------------------
4
+
5
+ * For a list of bug fixes, see
6
+ https://github.com/radical-cybertools/radical.orbit/issues?q=is%3Aissue+is%3Aclosed+sort%3Aupdated-desc
7
+ * For a list of open issues and known problems, see
8
+ https://github.com/radical-cybertools/radical.orbit/issues?q=is%3Aissue+is%3Aopen+
9
+
10
+ * initial release for radical project 'radical.orbit'
11
+
12
+
13
+ Version 0.1.0 release 2026-03-31
14
+ --------------------------------------------------------------------------------
15
+
16
+ - development milestone toward AmSC demo
17
+
18
+
19
+ --------------------------------------------------------------------------------
@@ -0,0 +1,254 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Workflow Rules
6
+
7
+ **IMPORTANT: Always plan first, then wait for the user's literal "go" before implementing anything.** Do not write code, edit files, or make changes until explicitly told to proceed.
8
+
9
+ ## Project Overview
10
+
11
+ ORBIT is a bridge-based distributed framework that connects external RCT (RADICAL-Cybertools) applications with HPC resources. It uses a three-tier architecture: **Client → Bridge → Endpoint**, communicating over HTTPS and WebSockets.
12
+
13
+ ## Build & Install
14
+
15
+ ```sh
16
+ pip install .
17
+ ```
18
+
19
+ ## Running Locally
20
+
21
+ Requires two terminals (optionally three for testing):
22
+
23
+ ```sh
24
+ # Terminal 1 – Bridge (reverse proxy, public-facing)
25
+ ./bin/radical-orbit-bridge.py
26
+
27
+ # Terminal 2 – Endpoint service (HPC side, connects to bridge via WebSocket)
28
+ ./bin/radical-orbit-endpoint-wrapper.sh # preferred: sets up PATH and PYTHONPATH
29
+ # or: ./bin/radical-orbit-endpoint.py
30
+
31
+ # Terminal 3 – Test client (optional)
32
+ python examples/example_sysinfo.py # System info
33
+ python examples/example_psij.py # PsiJ job submission
34
+ python examples/example_rhapsody.py # Rhapsody tasks
35
+ python examples/example_endpoint.py # Submit a child endpoint service as a batch job
36
+ ```
37
+
38
+ The bridge includes a web-based **Explorer UI** at the root URL (e.g., `http://localhost:8000/`).
39
+
40
+ For HTTPS, generate a self-signed cert first:
41
+ ```sh
42
+ openssl req -x509 -newkey rsa:4096 -nodes -keyout key.pem -out cert.pem -days 365 -subj "/CN=localhost"
43
+ ```
44
+
45
+ ## Testing
46
+
47
+ ```sh
48
+ pytest tests/unittests/ # unit tests (231 tests)
49
+ pytest tests/integration/ # integration tests (require running services)
50
+ ```
51
+
52
+ ## Linting
53
+
54
+ ```sh
55
+ flake8 src/ bin/ # config in .flake8
56
+ pylint src/radical/orbit/ # config in .pylintrc
57
+ ```
58
+
59
+ The flake8 config ignores many whitespace/formatting rules to match the project's alignment-heavy coding style.
60
+
61
+ ## Architecture
62
+
63
+ ### Three-tier request flow
64
+
65
+ 1. **Bridge** (`bin/radical-orbit-bridge.py`) – FastAPI server acting as reverse proxy. Clients send HTTP requests; the bridge forwards them to the appropriate endpoint over a persistent WebSocket, then returns the response. Correlates requests via UUID. Provides SSE endpoint (`/events`) for real-time notifications.
66
+
67
+ 2. **Endpoint** (`bin/radical-orbit-endpoint.py`, wrapper: `bin/radical-orbit-endpoint-wrapper.sh`) – FastAPI service on HPC nodes. Initiates an outbound WebSocket connection to the bridge (firewall-friendly). Receives forwarded requests from the bridge, dispatches them to locally-mounted plugin routes via HTTP loopback, and returns results.
68
+
69
+ 3. **Plugins** – extend the endpoint with domain-specific functionality. Each plugin gets a unique namespace (`/{plugin_name}/{uuid}/`) to avoid route collisions.
70
+
71
+ ### Bridge REST API
72
+
73
+ Key endpoints:
74
+ - `POST /endpoint/list` – List connected endpoints and their plugins
75
+ - `POST /endpoint/disconnect/{endpoint_name}` – Disconnect and terminate an endpoint
76
+ - `POST /bridge/terminate` – Terminate the bridge process
77
+ - `GET /events` – SSE stream for real-time notifications
78
+ - `/{endpoint_name}/{plugin_namespace}/...` – Proxied requests to endpoint plugins
79
+
80
+ ### Plugin system
81
+
82
+ - **Base class**: `src/radical/orbit/plugin_base.py` – provides namespace isolation, session management, route-registration helpers, and notification support.
83
+ - **Session base**: `src/radical/orbit/plugin_session_base.py` – per-client session state management.
84
+ - **Client API**: `src/radical/orbit/client.py` – Python client for bridge/endpoint interaction with notification callback support.
85
+
86
+ **Available plugins:**
87
+ - **sysinfo** (`plugin_sysinfo.py`) – System info (hostname, OS, CPU, memory, disk, network, GPUs). Detects shared filesystems (Lustre, GPFS, NFS, DVS, etc.). Background prefetch on startup. Client API: `SysInfoClient.homedir()` (session-less, returns endpoint home dir), `get_metrics()` (requires session).
88
+ - **psij** (`plugin_psij.py`) – HPC job submission via PsiJ (supports local, SLURM, PBS, LSF). Background job state polling. Default executable: `radical-orbit-endpoint-wrapper.sh`. Stores job metadata at submit time. Client API: `submit_job(job_spec, executor)`, `get_job_status(job_id, stdout_offset, stderr_offset)` (streams stdout/stderr with byte offsets), `list_jobs()`, `cancel_job(job_id)`, `submit_tunneled(job_spec, executor, tunnel='none'|'forward'|'reverse')` (spawns child endpoint via batch job; see tunnel section below), `tunnel_status(endpoint_name)` (session-less, returns `{status, port, pid}`). Notification topic: `job_status` → `{job_id, state, exit_code, stdout, stderr}`.
89
+ - **Tunnel implementation** — three runtime modes selected per-target via the `tunnel` field on `submit_tunneled` (and on the IRI/PsiJ entries in `examples/{amsc,matey}.py`):
90
+ - `'none'` — child connects directly to the bridge. No SSH spawn.
91
+ - `'forward'` (compute→login) — `submit_tunneled` injects `--tunnel forward` and `--tunnel-via <login_hostname>` into the child's argv. The child opens `ssh -L <port>:<bridge_host>:<bridge_port> <login_host> -N` itself, writes `~/.radical/orbit/tunnels/<endpoint_name>.port` on the shared filesystem, then rewrites its bridge URL to `https://localhost:<port>`. Used on Aurora / Perlmutter (compute→login SSH allowed; reverse direction blocked). Failure surfaces naturally as the job's `FAILED` state — no parent-side cancel needed.
92
+ - `'reverse'` (login→compute) — child gets `--tunnel reverse` only; the parent-side `_tunnel_watcher` (running inside the login-node `plugin_psij`) waits for the batch job to reach `RUNNING`, asks `BatchSystem.job_nodes(native_id)` for the compute hostname, and spawns `ssh -R 0:<bridge_host>:<bridge_port> <compute_host> -N` itself. The remote port is parsed from sshd's `"Allocated port N"` stderr line and written to the rendezvous file; the child reads the same file path and connects to `https://localhost:<port>`. On any spawn / lifetime failure the watcher records the reason in `_failure_reasons[job_id]` and cancels the now-useless allocation; `get_job_status` then synthesises `state='FAILED'` with the recorded `error` so a client poll bails early — operator-initiated cancels (no entry in `_failure_reasons`) keep their natural `CANCELLED` state. Used on Odo (compute→login blocked; login→compute allowed).
93
+ Spawn + port-parsing logic for both directions lives in `src/radical/orbit/tunnel.py` (`spawn_tunnel`, `spawn_reverse_tunnel`) and is test-covered. Login host resolution for forward mode: `--tunnel-via` CLI arg → `$PBS_O_HOST` → `$SLURM_SUBMIT_HOST`. The boolean `tunnel=True/False` form is no longer accepted — must be one of the three string values.
94
+ - **Node discovery**: `BatchSystem.job_nodes(native_id)` returns allocated node hostnames; SLURM uses `squeue`/`scontrol show hostnames`, PBSPro parses `qstat -f exec_host`. Used by the tunnel watcher.
95
+ - **BatchSystem abstraction** (`batch_system.py`, `batch_system_slurm.py`, `batch_system_pbs.py`) – isolates scheduler-specific behaviour. `detect_batch_system()` returns the active backend (`SlurmBatchSystem`, `PBSProBatchSystem`, or `NullBatchSystem`). All schedulers expose a normalized state vocabulary (`PENDING`/`RUNNING`/`DONE`/`FAILED`/`CANCELLED`/`HELD`/`UNKNOWN`); callers compare against constants from `batch_system`, never raw scheduler strings. To add a new backend (e.g. LSF, Cobalt): subclass `BatchSystem`, implement the abstract methods, and call `register_backend(YourBackend)` at module load.
96
+ - **queue_info** (`plugin_queue_info.py`) – Batch queue/partition info, job listings, and allocations. Backend selected automatically via `make_queue_info()` factory: SLURM (`queue_info_slurm.py`, sinfo/squeue/sacctmgr), PBSPro (`queue_info_pbs.py`, qstat/pbsnodes; allocations not available — PBSPro has no native sacctmgr equivalent), or no-op (`queue_info_none.py`). Shared backend with caching. Background prefetch on startup. Client API: `backend()` (session-less, returns `'slurm'`/`'pbs'`/`'none'`), `job_allocation()` (session-less, returns `{job_id, partition, n_nodes, nodelist, cpus_per_node, gpus_per_node, account, job_name, runtime}` or None), `get_info(user, force)`, `list_jobs(queue, user, force)`, `list_all_jobs(user, force)`, `cancel_job(job_id)`, `list_allocations(user, force)`.
97
+ - **rhapsody** (`plugin_rhapsody.py`) – Task execution via Rhapsody backends (default: Dragon V3). Registers backend callbacks for intermediate state notifications (e.g. RUNNING). Client API: `submit_tasks(tasks)`, `wait_tasks(uids, timeout)`, `list_tasks()`, `get_task(uid)`, `cancel_task(uid)`, `cancel_all_tasks()`. Function tasks supported via cloudpickle (``"function": "cloudpickle::<base64>"``, ``"_pickled_fields": [...]``) or import path (``"function": "module:func"``). Resource specs via ``task_backend_specific_kwargs`` (timeout, ranks, type, process_template). Session accepts optional `backends` list. Notification topics: `session_status` → `{sid, status}` on session init ready/failed; `task_status` → `{uid, state}` on RUNNING, `{uid, state, exit_code, return_value, error, exception}` on terminal states; `task_status_batch` → `{tasks: [...]}` for bulk terminal notifications. Client-side optimizations: template compression for homogeneous batches, size-aware pipelined submission, SSE-based wait with event wakeup.
98
+ - **lucid** (`plugin_lucid.py`) – RADICAL Pilot integration. Client API: `pilot_submit(description)`, `task_submit(description)`, `task_wait(tid)`.
99
+ - **xgfabric** (`plugin_xgfabric.py`) – ExaGraph fabric operations. Classifies connected endpoints as `immediate_clusters` (direct execution) or `allocate_clusters` (batch submission via SLURM). An endpoint is classified as `allocate` only if it has the `queue_info` plugin **and** `is_enabled` returns `true`; otherwise it is `immediate`. Cluster lists updated in real-time via `on_topology_change`. Client API: `get_workdir()`, `set_workdir(path)`, `list_configs()`, `load_config(name)` (also accepts `'default'`/`'test'` builtins), `save_config(cfg)`, `delete_config(name)`, `get_status()`, `start_workflow(workflow, resource)`, `stop_workflow()`. Notification topic: `workflow_status` → full workflow state dict.
100
+ - **staging** (`plugin_staging.py`) – File transfer between client and endpoint. Paths must be absolute (or use `~/...`) and within `$HOME` or `/tmp`. Never overwrites existing files. Client API: `put(local_src, remote_dst, overwrite=False)`, `get(remote_src, local_dst)`, `list(remote_path)` → `{path, entries: [{name, type, size}]}`.
101
+ - **globus** (`plugin_globus.py`) – File staging via Globus Online (Transfer API). Endpoint-only (gated off the bridge; also disabled when `globus-sdk` is absent). Orchestrator only: Globus moves data **collection-to-collection** out of band, so no bytes flow through endpoint or bridge — distinct from the byte-streaming `staging` plugin. Synchronous `globus-sdk` calls are offloaded with `asyncio.to_thread`. **Auth** is supplied at `register_session`: either `access_token` (wrapped in `AccessTokenAuthorizer`; not renewed — re-register on expiry) **or** `refresh_token`+`client_id` (wrapped in `RefreshTokenAuthorizer`; auto-renews, survives long transfers). The credential lives in endpoint process memory, **never** on disk. **Collections** are UUIDs passed explicitly; the literal "local" resolves to the endpoint's configured collection (`RADICAL_ORBIT_GLOBUS_COLLECTION` env var, or a `local_collection` override at `register_session`). Client API: `submit_transfer(source, destination, items, label, sync_level)` (items = `[{source, destination, recursive}]`) → `{task_id, submission_id, status}`, `get_task(task_id)`, `task_wait(task_id, timeout, polling_interval)`, `cancel_task(task_id)`, `list_tasks(limit)`, `ls(collection, path)`, `mkdir(collection, path)`, `rename(collection, oldpath, newpath)`, `delete(collection, paths, recursive, label)` (Globus delete task), `endpoint_search(filter_text, limit)`, `get_endpoint(endpoint_id)`. Background poller (~10 s) emits notification topic `transfer_status` → `{task_id, status, label, bytes_transferred, files_transferred, nice_status}` on task state change. `ConsentRequired` (mapped-collection `data_access`) is surfaced as a clear 401 telling the caller to re-acquire a token with the collection's `data_access` scope. Explorer UI: `src/radical/orbit/data/plugins/globus.js`.
102
+ - **iri_connect** (`plugin_iri_connect.py`) – IRI endpoint configurator (bridge-only). Lists available IRI endpoints and, on `connect(endpoint, token)`, dynamically registers a `PluginIRIInstance` under the instance name `iri.<endpoint>` (e.g. `iri.nersc`). Hardcoded endpoints: NERSC (`https://api.iri.nersc.gov`, Globus auth), OLCF (`https://amsc-open.s3m.olcf.ornl.gov`, S3M auth). Endpoint constants in `iri_endpoints.py`; shares the `iri_tokens` localStorage key with the Explorer UI. Client API: `list_endpoints()`, `connect(endpoint, token)` → returns an `IRIInstanceClient` bound to the new `iri.<endpoint>` instance (idempotent: on 409 returns a client for the existing instance), `disconnect(endpoint)`, `get_status()`.
103
+ - **iri.&lt;endpoint&gt;** (`plugin_iri_instance.py`, class `PluginIRIInstance`, not auto-registered) – per-endpoint IRI integration dynamically created by `iri_connect`. Combines job submission and resource info on a single pre-created session (no `{sid}` in routes; `register_session` always returns the fixed session ID). The bearer token lives in bridge process memory (inside the httpx client) for the lifetime of the instance and is **never** written to disk. Background job poller every 10 s. Client API (`IRIInstanceClient`): `list_resources(resource_type='compute')`, `get_resource(resource_id)`, `submit_job(resource_id, job_spec)`, `get_job_status(resource_id, job_id)`, `list_jobs(resource_id)`, `cancel_job(resource_id, job_id)`, `list_incidents()`, `list_projects()`, `list_allocations(project_id)`. Notification topic: `job_status` → `{job_id, state, resource_id, name, details}`.
104
+
105
+ ### WebSocket protocol
106
+
107
+ Bridge ↔ Endpoint messages are JSON with `type` field (defined in `models.py`):
108
+ - **Endpoint → Bridge**: `register`, `response`, `notification`, `pong`
109
+ - **Bridge → Endpoint**: `request`, `ping`, `error`, `shutdown`, `topology`
110
+
111
+ Binary payloads use base64 encoding (`is_binary` flag). Heartbeat via WebSocket ping/pong.
112
+
113
+ ### Notifications
114
+
115
+ Plugins can send real-time notifications to clients via Server-Sent Events (SSE).
116
+ The notification flow is: **Session → Plugin → EndpointService → Bridge → SSE clients**.
117
+
118
+ #### Sending notifications from a plugin session
119
+
120
+ ```python
121
+ # In your PluginSession subclass:
122
+ class MySession(PluginSession):
123
+ def do_work(self):
124
+ # ... do some work ...
125
+
126
+ # Send notification (works from sync/async contexts and threads)
127
+ if self._notify:
128
+ self._notify("work_status", {
129
+ "status": "completed",
130
+ "result": {"key": "value"}
131
+ })
132
+ ```
133
+
134
+ #### Sending notifications from a plugin
135
+
136
+ ```python
137
+ # In your Plugin subclass (async context):
138
+ await self.send_notification("my_topic", {"key": "value"})
139
+ ```
140
+
141
+ #### Subscribing to notifications (JavaScript/Browser)
142
+
143
+ ```javascript
144
+ const eventSource = new EventSource('http://bridge:8000/events');
145
+ eventSource.onmessage = (event) => {
146
+ const msg = JSON.parse(event.data);
147
+ if (msg.topic === 'notification') {
148
+ const {endpoint, plugin, topic, data} = msg.data;
149
+ console.log(`${endpoint}/${plugin}: ${topic}`, data);
150
+ } else if (msg.topic === 'topology') {
151
+ // Endpoint connect/disconnect event
152
+ console.log('Topology changed:', msg.data.endpoints);
153
+ }
154
+ };
155
+ ```
156
+
157
+ #### Subscribing to notifications (Python client API)
158
+
159
+ The `BridgeClient` and `PluginClient` classes provide callback-based notification support:
160
+
161
+ ```python
162
+ from radical.orbit.client import BridgeClient
163
+
164
+ # Connect to bridge
165
+ client = BridgeClient(url="http://localhost:8000")
166
+
167
+ # Option 1: Global callback (all notifications)
168
+ def on_any_notification(endpoint, plugin, topic, data):
169
+ print(f"{endpoint}/{plugin}: {topic} -> {data}")
170
+
171
+ client.register_callback(callback=on_any_notification)
172
+
173
+ # Option 2: Plugin-specific callback
174
+ def on_psij_notification(endpoint, plugin, topic, data):
175
+ print(f"PsiJ: {topic} -> {data}")
176
+
177
+ client.register_callback(endpoint_id="hpc1", plugin_name="psij", callback=on_psij_notification)
178
+
179
+ # Option 3: Topic-specific callback
180
+ def on_job_status(endpoint, plugin, topic, data):
181
+ print(f"Job {data['job_id']}: {data['state']}")
182
+
183
+ client.register_callback(endpoint_id="hpc1", plugin_name="psij",
184
+ topic="job_status", callback=on_job_status)
185
+
186
+ # Option 4: Via PluginClient (most common)
187
+ endpoint = client.get_endpoint_client("hpc1")
188
+ psij = endpoint.get_plugin("psij")
189
+ psij.register_notification_callback(on_job_status, topic="job_status")
190
+
191
+ # Topology changes (endpoint connect/disconnect)
192
+ def on_topology(endpoints):
193
+ print(f"Connected endpoints: {list(endpoints.keys())}")
194
+
195
+ client.register_topology_callback(on_topology)
196
+
197
+ # Cleanup
198
+ client.close()
199
+ ```
200
+
201
+ #### Subscribing to notifications (raw SSE)
202
+
203
+ For custom implementations, read the SSE stream directly (shown here in Python; non-Python clients follow the same pattern):
204
+
205
+ ```python
206
+ import json
207
+ import sseclient
208
+ import requests
209
+
210
+ response = requests.get('http://bridge:8000/events', stream=True)
211
+ client = sseclient.SSEClient(response)
212
+ for event in client.events():
213
+ msg = json.loads(event.data)
214
+ if msg['topic'] == 'notification':
215
+ endpoint = msg['data']['endpoint']
216
+ plugin = msg['data']['plugin']
217
+ topic = msg['data']['topic']
218
+ data = msg['data']['data']
219
+ print(f"{endpoint}/{plugin}: {topic} -> {data}")
220
+ ```
221
+
222
+ #### Topology updates (endpoint connect/disconnect)
223
+
224
+ Plugins can react to endpoint connect/disconnect events by overriding `on_topology_change`:
225
+
226
+ ```python
227
+ class MyPlugin(Plugin):
228
+ async def on_topology_change(self, endpoints: dict):
229
+ """Called when endpoints connect or disconnect.
230
+
231
+ Args:
232
+ endpoints: Dict mapping endpoint names to plugin info.
233
+ Example: {"endpoint1": {"plugins": ["sysinfo", "psij"]}}
234
+ """
235
+ for endpoint_name, info in endpoints.items():
236
+ print(f"Endpoint {endpoint_name} has plugins: {info.get('plugins', [])}")
237
+ ```
238
+
239
+ ### Explorer UI
240
+
241
+ The bridge serves a web-based explorer (`src/radical/orbit/data/orbit_explorer.html`) that provides:
242
+ - Real-time view of connected endpoints and plugins
243
+ - Interactive plugin interfaces (job submission, task management, system metrics)
244
+ - Endpoint and bridge termination controls
245
+ - SSE-based live updates
246
+
247
+ ## Code Conventions
248
+
249
+ - Package uses `find_namespace_packages` under `src/radical/orbit/`.
250
+ - Scripts in `bin/` are installed as console entry points.
251
+ - The codebase uses alignment-style formatting (extra spaces for visual column alignment) – this is intentional and should be preserved.
252
+ - Version is derived from `VERSION` file + git tags at build time (see `setup.py:get_version`).
253
+ - Pydantic models for message validation in `models.py`.
254
+ - UI configuration via `ui_schema.py` for dynamic plugin interfaces.
@@ -0,0 +1,151 @@
1
+ # Deployment Guide
2
+
3
+ ## Recommended Network Topology
4
+
5
+ ```
6
+ Internet / User Network
7
+ |
8
+ | HTTPS (8000)
9
+ v
10
+ ┌─────────────┐
11
+ │ Bridge │ ← public-facing, DMZ or bastion
12
+ └─────────────┘
13
+ |
14
+ | WSS (outbound from HPC)
15
+ v
16
+ ┌────────────────┐ ┌────────────────┐
17
+ │ Endpoint (HPC) │ │ Endpoint (HPC) │ ← one per cluster or login node
18
+ └────────────────┘ └────────────────┘
19
+ ```
20
+
21
+ **Key point**: endpoints initiate the outbound WebSocket connection to the bridge.
22
+ No inbound ports need to be opened on the HPC firewall.
23
+
24
+ ## Bridge Setup
25
+
26
+ The bridge is a single FastAPI/uvicorn process. It holds no job state — all
27
+ session state lives in the endpoint processes.
28
+
29
+ ```sh
30
+ # HTTP (development only)
31
+ ./bin/radical-orbit-bridge.py
32
+
33
+ # HTTPS (production)
34
+ export RADICAL_ORBIT_BRIDGE_CERT=/path/to/cert.pem
35
+ export RADICAL_ORBIT_BRIDGE_KEY=/path/to/key.pem
36
+ ./bin/radical-orbit-bridge.py
37
+ ```
38
+
39
+ To change host/port, edit the last line of `bin/radical-orbit-bridge.py`:
40
+
41
+ ```python
42
+ uvicorn.run(app, host="0.0.0.0", port=8000, ...)
43
+ ```
44
+
45
+ ### systemd Unit File (Bridge)
46
+
47
+ ```ini
48
+ [Unit]
49
+ Description=ORBIT Bridge
50
+ After=network.target
51
+
52
+ [Service]
53
+ Type=simple
54
+ User=radical
55
+ WorkingDirectory=/opt/orbit
56
+ Environment=RADICAL_ORBIT_BRIDGE_CERT=/opt/orbit/certs/bridge_cert.pem
57
+ Environment=RADICAL_ORBIT_BRIDGE_KEY=/opt/orbit/certs/bridge_key.pem
58
+ ExecStart=/opt/orbit/bin/radical-orbit-bridge.py
59
+ Restart=on-failure
60
+ RestartSec=5s
61
+
62
+ [Install]
63
+ WantedBy=multi-user.target
64
+ ```
65
+
66
+ ## Endpoint Service Setup
67
+
68
+ Endpoints are typically launched as batch jobs via the host scheduler (SLURM, PBS)
69
+ or as long-running daemon processes on login nodes.
70
+
71
+ ```sh
72
+ # Direct launch (login node daemon)
73
+ ./bin/radical-orbit-endpoint.py \
74
+ --name my-hpc-endpoint \
75
+ --url wss://bridge.example.org:8000 \
76
+ -p sysinfo,psij,queue_info,staging
77
+
78
+ # Via SLURM batch script
79
+ sbatch endpoint_job.sh
80
+ ```
81
+
82
+ ### SLURM Batch Script Example
83
+
84
+ ```sh
85
+ #!/bin/bash
86
+ #SBATCH --job-name=orbit
87
+ #SBATCH --partition=service
88
+ #SBATCH --nodes=1
89
+ #SBATCH --time=24:00:00
90
+
91
+ export RADICAL_ORBIT_BRIDGE_CERT=/path/to/bridge_cert.pem
92
+
93
+ ./bin/radical-orbit-endpoint-wrapper.sh \
94
+ --name "$SLURM_CLUSTER_NAME-endpoint" \
95
+ --url wss://bridge.example.org:8000 \
96
+ -p sysinfo,psij,queue_info,staging,rhapsody
97
+ ```
98
+
99
+ The wrapper script (`radical-orbit-endpoint-wrapper.sh`) sets up `PYTHONPATH` and
100
+ `PATH` for the installed package before starting the endpoint service.
101
+
102
+ ## Session Persistence
103
+
104
+ **Sessions are not persisted.** When an endpoint disconnects and reconnects:
105
+
106
+ - All active sessions are lost
107
+ - The Explorer automatically refreshes its plugin list via SSE topology event
108
+ - Python clients will receive a `404` on next call; they must call
109
+ `register_session()` again
110
+ - The Explorer re-registers sessions transparently on next API call
111
+
112
+ Plan for endpoint restarts by wrapping your client loop with a reconnection
113
+ strategy.
114
+
115
+ ## Health Checks
116
+
117
+ Every plugin exposes a health endpoint, reachable through the bridge at `GET /{endpoint_name}/{plugin}/health`:
118
+
119
+ ```
120
+ GET /my-endpoint/psij/health
121
+ → {"status": "healthy", "plugin": "psij", "version": "...",
122
+ "uptime_seconds": 3600.0, "active_sessions": 2}
123
+ ```
124
+
125
+ The bridge itself does not yet have a dedicated `/health` endpoint, but
126
+ `POST /endpoint/list` returning 200 is a reliable liveness check.
127
+
128
+ For load-balancer health probes:
129
+
130
+ ```sh
131
+ curl -sk https://bridge:8000/endpoint/list -X POST | jq .
132
+ ```
133
+
134
+ ## Observability
135
+
136
+ Log level is controlled via the `RADICAL_ORBIT_LOG_LVL` environment
137
+ variable (falling back to the generic `RADICAL_LOG_LVL`):
138
+
139
+ ```sh
140
+ # DEBUG logging
141
+ RADICAL_ORBIT_LOG_LVL=DEBUG ./bin/radical-orbit-bridge.py
142
+ ```
143
+
144
+ Key log namespaces:
145
+
146
+ | Namespace | Content |
147
+ |--------------------|----------------------------------------------|
148
+ | `radical.orbit` | Bridge, endpoint service, plugin base |
149
+ | `radical.orbit.client` | Python client, SSE listener |
150
+
151
+ Structured logging is not yet enabled; logs go to stderr by default.
@@ -0,0 +1,179 @@
1
+
2
+
3
+ ================================================================================
4
+ The GNU Lesser General Public License, version 3.0 (LGPL-3.0)
5
+ [OSI Approved License]
6
+ ================================================================================
7
+
8
+ This license is a set of additional permissions added to version 3 of the GNU
9
+ General Public License.
10
+
11
+
12
+ GNU LESSER GENERAL PUBLIC LICENSE
13
+ =================================
14
+
15
+ Version 3, 29 June 2007
16
+
17
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
18
+
19
+ Everyone is permitted to copy and distribute verbatim copies of this license
20
+ document, but changing it is not allowed.
21
+
22
+ This version of the GNU Lesser General Public License incorporates the terms and
23
+ conditions of version 3 of the GNU General Public License, supplemented by the
24
+ additional permissions listed below.
25
+
26
+
27
+ 0. Additional Definitions.
28
+ --------------------------
29
+
30
+ As used herein, “this License” refers to version 3 of the GNU Lesser General
31
+ Public License, and the “GNU GPL” refers to version 3 of the GNU General Public
32
+ License.
33
+
34
+ “The Library” refers to a covered work governed by this License, other than an
35
+ Application or a Combined Work as defined below.
36
+
37
+ An “Application” is any work that makes use of an interface provided by the
38
+ Library, but which is not otherwise based on the Library. Defining a subclass of
39
+ a class defined by the Library is deemed a mode of using an interface provided
40
+ by the Library.
41
+
42
+ A “Combined Work” is a work produced by combining or linking an Application with
43
+ the Library. The particular version of the Library with which the Combined Work
44
+ was made is also called the “Linked Version”.
45
+
46
+ The “Minimal Corresponding Source” for a Combined Work means the Corresponding
47
+ Source for the Combined Work, excluding any source code for portions of the
48
+ Combined Work that, considered in isolation, are based on the Application, and
49
+ not on the Linked Version.
50
+
51
+ The “Corresponding Application Code” for a Combined Work means the object code
52
+ and/or source code for the Application, including any data and utility programs
53
+ needed for reproducing the Combined Work from the Application, but excluding the
54
+ System Libraries of the Combined Work.
55
+
56
+
57
+ 1. Exception to Section 3 of the GNU GPL.
58
+ -----------------------------------------
59
+
60
+ You may convey a covered work under sections 3 and 4 of this License without
61
+ being bound by section 3 of the GNU GPL.
62
+
63
+
64
+ 2. Conveying Modified Versions.
65
+ -------------------------------
66
+
67
+ If you modify a copy of the Library, and, in your modifications, a facility
68
+ refers to a function or data to be supplied by an Application that uses the
69
+ facility (other than as an argument passed when the facility is invoked), then
70
+ you may convey a copy of the modified version:
71
+
72
+ a) under this License, provided that you make a good faith effort to ensure
73
+ that, in the event an Application does not supply the function or data, the
74
+ facility still operates, and performs whatever part of its purpose remains
75
+ meaningful, or
76
+
77
+ b) under the GNU GPL, with none of the additional permissions of this License
78
+ applicable to that copy.
79
+
80
+
81
+ 3. Object Code Incorporating Material from Library Header Files.
82
+ ----------------------------------------------------------------
83
+
84
+ The object code form of an Application may incorporate material from a header
85
+ file that is part of the Library. You may convey such object code under terms of
86
+ your choice, provided that, if the incorporated material is not limited to
87
+ numerical parameters, data structure layouts and accessors, or small macros,
88
+ inline functions and templates (ten or fewer lines in length), you do both of
89
+ the following:
90
+
91
+ a) Give prominent notice with each copy of the object code that the Library is
92
+ used in it and that the Library and its use are covered by this License.
93
+
94
+ b) Accompany the object code with a copy of the GNU GPL and this license
95
+ document.
96
+
97
+ 4. Combined Works.
+ ------------------
98
+
99
+ You may convey a Combined Work under terms of your choice that, taken together,
100
+ effectively do not restrict modification of the portions of the Library
101
+ contained in the Combined Work and reverse engineering for debugging such
102
+ modifications, if you also do each of the following:
103
+
104
+ a) Give prominent notice with each copy of the Combined Work that the Library
105
+ is used in it and that the Library and its use are covered by this License.
106
+
107
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
108
+ document.
109
+
110
+ c) For a Combined Work that displays copyright notices during execution,
111
+ include the copyright notice for the Library among these notices, as well
112
+ as a reference directing the user to the copies of the GNU GPL and this
113
+ license document.
114
+
115
+ d) Do one of the following:
116
+
117
+ 0) Convey the Minimal Corresponding Source under the terms of this
118
+ License, and the Corresponding Application Code in a form suitable for,
119
+ and under terms that permit, the user to recombine or relink the
120
+ Application with a modified version of the Linked Version to produce
121
+ a modified Combined Work, in the manner specified by section 6 of the
122
+ GNU GPL for conveying Corresponding Source.
123
+
124
+ 1) Use a suitable shared library mechanism for linking with the Library.
125
+ A suitable mechanism is one that (a) uses at run time a copy of the
126
+ Library already present on the user's computer system, and (b) will
127
+ operate properly with a modified version of the Library that is
128
+ interface-compatible with the Linked Version.
+
+ e) Provide Installation
129
+ Information, but only if you would otherwise be required to provide such
130
+ information under section 6 of the GNU GPL, and only to the extent that
131
+ such information is necessary to install and execute a modified version
132
+ of the Combined Work produced by recombining or relinking the
133
+ Application with a modified version of the Linked Version. (If you use
134
+ option 4d0, the Installation Information must accompany the Minimal
135
+ Corresponding Source and Corresponding Application Code. If you use
136
+ option 4d1, you must provide the Installation Information in the manner
137
+ specified by section 6 of the GNU GPL for conveying Corresponding
138
+ Source.)
139
+
140
+
141
+ 5. Combined Libraries.
142
+ ----------------------
143
+
144
+ You may place library facilities that are a work based on the Library side by
145
+ side in a single library together with other library facilities that are not
146
+ Applications and are not covered by this License, and convey such a combined
147
+ library under terms of your choice, if you do both of the following:
148
+
149
+ a) Accompany the combined library with a copy of the same work based on the
150
+ Library, uncombined with any other library facilities, conveyed under the
151
+ terms of this License.
152
+
153
+ b) Give prominent notice with the combined library that part of it is a work
154
+ based on the Library, and explaining where to find the accompanying
155
+ uncombined form of the same work.
156
+
157
+
158
+ 6. Revised Versions of the GNU Lesser General Public License.
159
+ -------------------------------------------------------------
160
+
161
+ The Free Software Foundation may publish revised and/or new versions of the GNU
162
+ Lesser General Public License from time to time. Such new versions will be
163
+ similar in spirit to the present version, but may differ in detail to address
164
+ new problems or concerns.
165
+
166
+ Each version is given a distinguishing version number. If the Library as you
167
+ received it specifies that a certain numbered version of the GNU Lesser General
168
+ Public License “or any later version” applies to it, you have the option of
169
+ following the terms and conditions either of that published version or of any
170
+ later version published by the Free Software Foundation. If the Library as you
171
+ received it does not specify a version number of the GNU Lesser General Public
172
+ License, you may choose any version of the GNU Lesser General Public License
173
+ ever published by the Free Software Foundation.
174
+
175
+ If the Library as you received it specifies that a proxy can decide whether
176
+ future versions of the GNU Lesser General Public License shall apply, that
177
+ proxy's public statement of acceptance of any version is permanent authorization
178
+ for you to choose that version for the Library.
179
+
@@ -0,0 +1,8 @@
1
+
2
+ include *.py *.json *.md *.sh *.txt
3
+ include MANIFEST.in README.md CHANGES.md LICENSE.md VERSION
4
+
5
+ recursive-include docs *.md
6
+ recursive-include src *.sh *.json
7
+ recursive-include examples *.py *.json
8
+