fluence-hpc 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fluence/__init__.py +16 -0
- fluence/interceptor.py +38 -0
- fluence/providers/__init__.py +27 -0
- fluence/providers/base.py +117 -0
- fluence/providers/braket.py +122 -0
- fluence/sidecar.py +99 -0
- fluence/sitecustomize.py +12 -0
- fluence/stage.py +59 -0
- fluence/ungate.py +73 -0
- fluence_hpc-0.0.0.dist-info/METADATA +111 -0
- fluence_hpc-0.0.0.dist-info/RECORD +14 -0
- fluence_hpc-0.0.0.dist-info/WHEEL +5 -0
- fluence_hpc-0.0.0.dist-info/entry_points.txt +2 -0
- fluence_hpc-0.0.0.dist-info/top_level.txt +1 -0
fluence/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""
|
|
2
|
+
fluence — quantum-classical scheduling coordination for the Fluence Kubernetes
|
|
3
|
+
scheduler.
|
|
4
|
+
|
|
5
|
+
This package is built into the Fluence sidecar image and staged into user
|
|
6
|
+
application containers at admission time (via an init container + shared volume
|
|
7
|
+
on PYTHONPATH), so the interceptor runs with zero user code changes.
|
|
8
|
+
|
|
9
|
+
Submodules:
|
|
10
|
+
fluence.providers provider interface + registry (per-vendor plug-ins)
|
|
11
|
+
fluence.interceptor runs every registered provider's submit-time tag hook
|
|
12
|
+
fluence.sidecar the sidecar coordination main loop
|
|
13
|
+
fluence.ungate generic worker ungating (Kubernetes patch logic)
fluence.stage init-container entrypoint that stages the package and sitecustomize.py
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
__version__ = "0.1.0"
|
fluence/interceptor.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
fluence.interceptor — installs every registered provider's submit-time tag hook.
|
|
3
|
+
|
|
4
|
+
Runs inside the user's application container, triggered automatically by a
|
|
5
|
+
sitecustomize.py on PYTHONPATH (staged there by the Fluence init container). On
|
|
6
|
+
import it asks every registered provider to install its interceptor; each
|
|
7
|
+
provider fail-soft skips if its vendor SDK is not present in this container.
|
|
8
|
+
|
|
9
|
+
This module's import must never raise — sitecustomize guards it, but we also
|
|
10
|
+
guard here so a single provider bug cannot affect the user application.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def install() -> None:
    """Ask every registered provider to install its submit-time tag hook.

    The pod UID is read from the FLUENCE_POD_UID environment variable (empty
    string when unset) and handed to each provider's ``install_interceptor``.
    Any failure — the provider registry being unimportable, or a single
    provider raising — is printed and swallowed so the host application is
    never affected.
    """
    try:
        from fluence.providers import all_providers
    except Exception as exc:  # pragma: no cover - defensive
        print(f"[fluence] interceptor: providers unavailable: {exc}", flush=True)
        return

    uid = os.environ.get("FLUENCE_POD_UID", "")

    for plugin in all_providers():
        try:
            installed = plugin.install_interceptor(uid)
            if installed:
                print(
                    f"[fluence] interceptor installed for provider "
                    f"{plugin.name!r} (pod_uid={uid})",
                    flush=True,
                )
        except Exception as exc:
            # One misbehaving provider must not take the user app down with it.
            print(
                f"[fluence] interceptor for {plugin.name!r} skipped: {exc}",
                flush=True,
            )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Install on import.
|
|
38
|
+
install()
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
fluence.providers — provider registry.
|
|
3
|
+
|
|
4
|
+
Importing this package imports every provider submodule, each of which calls
|
|
5
|
+
fluence.providers.base.register() at import time. This is the single extension
|
|
6
|
+
point: to add a vendor, drop a new module here that defines a Provider subclass
|
|
7
|
+
and calls register() — nothing else in the codebase needs to change.
|
|
8
|
+
|
|
9
|
+
Provider discovery is by explicit submodule import below (simple and debuggable).
|
|
10
|
+
Importing a provider module never fails on a missing vendor SDK: the SDK is only
|
|
11
|
+
imported lazily inside the methods that need it.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from fluence.providers.base import ( # noqa: F401
|
|
15
|
+
Provider,
|
|
16
|
+
Task,
|
|
17
|
+
TAG_KEY,
|
|
18
|
+
log,
|
|
19
|
+
register,
|
|
20
|
+
all_providers,
|
|
21
|
+
resolve,
|
|
22
|
+
resolve_from_env,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Import provider modules so they self-register. Add new providers here.
|
|
26
|
+
from fluence.providers import braket # noqa: F401,E402
|
|
27
|
+
# from fluence.providers import ibm # noqa: F401 (when implemented)
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""
|
|
2
|
+
fluence.providers.base — the provider interface and registration machinery.
|
|
3
|
+
|
|
4
|
+
A provider is a per-vendor plug-in (AWS Braket, IBM Qiskit Runtime, ...) that
|
|
5
|
+
implements two halves of the quantum-coordination mechanism:
|
|
6
|
+
|
|
7
|
+
- INTERCEPTOR hook (`install_interceptor`): runs inside the user's application
|
|
8
|
+
container; monkey-patches the vendor SDK's submit call to stamp the shared
|
|
9
|
+
`fluence-pod-uid` tag on every task. Must fail-soft if the vendor SDK is not
|
|
10
|
+
importable in that container.
|
|
11
|
+
|
|
12
|
+
- SIDECAR methods (`matches`, `find_my_task`, `is_ready_to_ungate`,
|
|
13
|
+
`queue_position`, `job_id`): run inside the Fluence sidecar container; find
|
|
14
|
+
the tagged task, poll readiness, and yield a vendor-neutral job id.
|
|
15
|
+
|
|
16
|
+
Providers self-register by calling `register()` at import time. The package
|
|
17
|
+
imports every provider submodule (see fluence.providers.__init__) so importing
|
|
18
|
+
the package registers them all. Registration is the single extension point:
|
|
19
|
+
adding a vendor is one new module that calls register().
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import os
|
|
25
|
+
from datetime import datetime, timezone
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Shared convention between every interceptor hook and every find_my_task.
|
|
29
|
+
# The interceptor stamps this tag key with the pod UID; the sidecar searches
|
|
30
|
+
# for it. Changing it is a coordinated change across all providers.
|
|
31
|
+
TAG_KEY = "fluence-pod-uid"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def log(msg: str) -> None:
    """Print *msg* to stdout prefixed with ``[fluence]`` and a UTC timestamp.

    The timestamp has second resolution in the form ``YYYY-MM-DDTHH:MM:SSZ``;
    output is flushed immediately.
    """
    now_utc = datetime.now(tz=timezone.utc)
    stamp = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
    print(f"[fluence] {stamp} {msg}", flush=True)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class Task:
    """Opaque handle for one vendor-side quantum task.

    Each provider returns its own subclass from ``find_my_task``. The
    framework never inspects the handle; it only hands it back to the same
    provider. Vendor-specific identifiers (an ARN, a job id, ...) are stored
    on the subclass.
    """
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Provider:
    """Base interface implemented by every quantum vendor plug-in.

    The methods fall into two groups: the interceptor hook, which runs in the
    user's application container, and the sidecar methods, which run in the
    Fluence sidecar container. Except for ``queue_position`` every method here
    is a stub that raises ``NotImplementedError``.
    """

    # Short, stable provider name, e.g. "braket", "ibm".
    name: str = "base"

    # ── interceptor half (runs in the user container) ──────────────────────────

    def install_interceptor(self, pod_uid: str) -> bool:
        """Patch the vendor SDK's submit call so it stamps TAG_KEY=<pod_uid>.

        Implementations return True when the patch was installed and False
        when the vendor SDK is absent (fail-soft); they must never raise.
        """
        raise NotImplementedError

    # ── sidecar half (runs in the sidecar container) ───────────────────────────

    def matches(self, vendor: str, backend: str) -> bool:
        """Report whether this provider handles the given vendor/backend.

        Both values are resolved at runtime from the pod's backend annotation.
        """
        raise NotImplementedError

    def find_my_task(self, pod_uid: str, backend: str, timeout: int) -> "Task | None":
        """Locate the vendor task tagged TAG_KEY=<pod_uid>.

        Implementations poll until the task is found or *timeout* elapses and
        return an opaque Task, or None when nothing was found.
        """
        raise NotImplementedError

    def is_ready_to_ungate(self, task: "Task") -> bool:
        """Report whether workers should be ungated for *task*.

        That is the case when the queue position == 1 or the task is already
        RUNNING/terminal. Every provider is expected to implement this.
        """
        raise NotImplementedError

    def queue_position(self, task: "Task") -> "int | None":
        """Return the integer queue position (1 == next) for telemetry.

        Optional: the default returns None for vendors that expose no
        position. The ungate decision does not depend on it.
        """
        return None

    def job_id(self, task: "Task") -> str:
        """Return the stable, vendor-neutral id given to workers at ungate time."""
        raise NotImplementedError
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ── registry ────────────────────────────────────────────────────────────────────
|
|
91
|
+
|
|
92
|
+
_REGISTRY: "list[Provider]" = []
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def register(provider: Provider) -> None:
    """Add *provider* to the module-level registry.

    Each provider module calls this once when it is imported.
    """
    _REGISTRY.append(provider)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def all_providers() -> "list[Provider]":
    """Return a shallow copy of the registry, in registration order."""
    return [*_REGISTRY]
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def resolve(vendor: str = "", backend: str = "") -> "Provider | None":
    """Return the first registered provider whose ``matches`` accepts the pair.

    Providers are tried in registration order. A provider whose ``matches``
    raises is logged and skipped so that one faulty plug-in cannot break
    resolution. Returns None when no provider matches.
    """
    for candidate in _REGISTRY:
        try:
            accepted = bool(candidate.matches(vendor, backend))
        except Exception as exc:  # a provider's matches() must never break resolution
            log(f"provider {candidate.name!r} matches() error: {exc}")
            continue
        if accepted:
            return candidate
    return None
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def resolve_from_env() -> "Provider | None":
    """Resolve a provider from FLUXION_VENDOR / FLUXION_BACKEND.

    Either variable defaults to the empty string when unset.
    """
    vendor = os.environ.get("FLUXION_VENDOR", "")
    backend = os.environ.get("FLUXION_BACKEND", "")
    return resolve(vendor, backend)
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""
|
|
2
|
+
fluence.providers.braket — AWS Braket provider.
|
|
3
|
+
|
|
4
|
+
Holds both halves of the Braket coordination mechanism:
|
|
5
|
+
- install_interceptor: patches AwsDevice.run() to stamp the pod-uid tag
|
|
6
|
+
(runs in the user container; fail-soft if amazon-braket-sdk is absent).
|
|
7
|
+
- sidecar methods: discover the tagged task, poll queue position, yield the
|
|
8
|
+
task ARN as the (vendor-neutral-typed) job id.
|
|
9
|
+
|
|
10
|
+
Self-registers via register(PROVIDER) at import. Importing this module never
|
|
11
|
+
requires the braket SDK; SDK imports are lazy, inside the methods.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import os
|
|
17
|
+
import time
|
|
18
|
+
|
|
19
|
+
from fluence.providers.base import Provider, Task, TAG_KEY, log, register
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class BraketTask(Task):
    """Task handle for AWS Braket; carries the quantum task ARN."""

    def __init__(self, arn: str):
        # The ARN is the only state kept on the handle.
        self.arn = arn
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _region_from_arn(arn: str) -> str:
    """Extract the region (fourth colon-separated field) from *arn*.

    When the ARN has fewer than four fields or the region field is empty,
    fall back to AWS_DEFAULT_REGION, and finally to ``us-east-1``.
    """
    fields = arn.split(":")
    if len(fields) > 3:
        candidate = fields[3]
        if candidate:
            return candidate
    return os.environ.get("AWS_DEFAULT_REGION", "us-east-1")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class BraketProvider(Provider):
    """AWS Braket implementation of the provider interface.

    The interceptor half patches ``AwsDevice.run`` to add the pod-uid tag; the
    sidecar half searches for the tagged task, polls its state / queue
    position, and exposes the task ARN as the job id. All vendor SDK imports
    (``braket``, ``boto3``) happen lazily inside the methods that need them.
    """

    name = "braket"

    # ── interceptor half ───────────────────────────────────────────────────────

    def install_interceptor(self, pod_uid: str) -> bool:
        """Wrap ``AwsDevice.run`` so every submission carries TAG_KEY=<pod_uid>.

        Returns False when the braket SDK cannot be imported (fail-soft) and
        True once the wrapper is installed. When *pod_uid* is empty the
        wrapper is still installed but passes calls through untouched.
        """
        try:
            from braket.aws import AwsDevice
        except ImportError:
            return False  # braket SDK not in this container — fail-soft

        import functools

        original_run = AwsDevice.run

        # wraps() keeps the original name/docstring/signature visible to the
        # user's tooling, so the patch stays unobtrusive.
        @functools.wraps(original_run)
        def patched_run(self, task_specification, *args, **kwargs):
            if pod_uid:
                # Build a fresh dict: tolerates an explicit ``tags=None`` and
                # never mutates a dict owned by the caller.
                tags = dict(kwargs.get("tags") or {})
                tags[TAG_KEY] = pod_uid
                kwargs["tags"] = tags
            return original_run(self, task_specification, *args, **kwargs)

        AwsDevice.run = patched_run
        return True

    # ── sidecar half ───────────────────────────────────────────────────────────

    def matches(self, vendor: str, backend: str) -> bool:
        """True when the vendor is "braket" or the backend names Braket.

        Comparison is case-insensitive; None is treated as an empty string.
        """
        v, b = (vendor or "").lower(), (backend or "").lower()
        if v == "braket":
            return True
        return "braket" in b or b.startswith("arn:aws:braket")

    def _client(self, backend: str):
        """Create a boto3 ``braket`` client for the region implied by *backend*.

        An ARN backend supplies the region; otherwise AWS_DEFAULT_REGION is
        used, falling back to ``us-east-1``.
        """
        import boto3

        backend = backend or ""  # same None tolerance as matches()
        region = (_region_from_arn(backend) if backend.startswith("arn:")
                  else os.environ.get("AWS_DEFAULT_REGION", "us-east-1"))
        return boto3.client("braket", region_name=region)

    def find_my_task(self, pod_uid, backend, timeout):
        """Search Braket for the task tagged TAG_KEY=<pod_uid>.

        The search is attempted at least once and then retried about every
        10 seconds until *timeout* seconds have elapsed. Search errors are
        logged and retried. When several tasks match, the one with the
        greatest ``createdAt`` is chosen.

        Returns a BraketTask, or None if nothing was found in time.
        """
        backend = backend or ""
        client = self._client(backend)
        log(f"[braket] searching for task tagged {TAG_KEY}={pod_uid}")

        # The filters do not change between attempts, so build them once.
        filters = [{"name": f"tags:{TAG_KEY}", "operator": "EQUAL",
                    "values": [pod_uid]}]
        if backend.startswith("arn:aws:braket"):
            filters.append({"name": "deviceArn", "operator": "EQUAL",
                            "values": [backend]})

        # Monotonic clock: the deadline must not move if the wall clock does.
        deadline = time.monotonic() + timeout
        while True:
            try:
                resp = client.search_quantum_tasks(filters=filters, maxResults=10)
                tasks = resp.get("quantumTasks", [])
                if tasks:
                    newest = max(tasks, key=lambda t: t.get("createdAt", ""))
                    arn = newest["quantumTaskArn"]
                    log(f"[braket] found task by tag: {arn}")
                    return BraketTask(arn)
            except Exception as e:
                log(f"[braket] search error (will retry): {e}")

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Never sleep past the deadline.
            time.sleep(min(10, remaining))

        log("[braket] task discovery timed out")
        return None

    def _aws_task(self, task: BraketTask):
        """Build a braket ``AwsQuantumTask`` for the handle's ARN."""
        import asyncio

        # NOTE(review): a fresh event loop is installed on every call and the
        # previous one is not closed here — presumably required by the SDK
        # constructor; confirm whether a single loop could be reused.
        asyncio.set_event_loop(asyncio.new_event_loop())
        from braket.aws import AwsQuantumTask
        return AwsQuantumTask(arn=task.arn)

    def is_ready_to_ungate(self, task: BraketTask) -> bool:
        """True when the task is RUNNING/terminal or its queue position is 1.

        A failure while reading the queue position counts as "not ready";
        a failure while reading the state propagates to the caller.
        """
        t = self._aws_task(task)
        if t.state() in ("RUNNING", "COMPLETED", "FAILED", "CANCELLED"):
            return True
        try:
            return str(t.queue_position().queue_position) == "1"
        except Exception:
            return False

    def queue_position(self, task: BraketTask):
        """Return the queue position as an int, or None.

        None is returned when the position is missing, is not a plain digit
        string, or cannot be fetched.
        """
        try:
            pos = self._aws_task(task).queue_position().queue_position
            return int(pos) if pos is not None and str(pos).isdigit() else None
        except Exception:
            return None

    def job_id(self, task: BraketTask) -> str:
        """Return the task ARN, which serves as the vendor-neutral job id."""
        return task.arn
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
PROVIDER = BraketProvider()
|
|
122
|
+
register(PROVIDER)
|
fluence/sidecar.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""
|
|
2
|
+
fluence.sidecar — provider-agnostic quantum coordination sidecar main loop.
|
|
3
|
+
|
|
4
|
+
Injected by the Fluence webhook into the quantum-submitting pod. Resolves its
|
|
5
|
+
vendor at runtime from the backend annotation, discovers the task the user
|
|
6
|
+
application submitted (tagged by the interceptor), polls readiness, and either
|
|
7
|
+
ungates gated workers (gang mode) or just logs the queue-position series
|
|
8
|
+
(observe-only mode).
|
|
9
|
+
|
|
10
|
+
Entry point: `fluence-sidecar` console script (see pyproject.toml) -> main().
|
|
11
|
+
|
|
12
|
+
Environment (injected by the Fluence webhook):
|
|
13
|
+
FLUENCE_POD_UID UID of this pod (matches interceptor tag)
|
|
14
|
+
FLUENCE_NAMESPACE Kubernetes namespace
|
|
15
|
+
FLUENCE_GATED_PODS comma-separated gated worker names
|
|
16
|
+
FLUENCE_OBSERVE "true" for observe-only telemetry mode
|
|
17
|
+
FLUXION_BACKEND / FLUXION_VENDOR scheduler-chosen backend / vendor
|
|
18
|
+
FLUENCE_TASK_DISCOVERY_TIMEOUT seconds to wait for discovery (default 300)
|
|
19
|
+
FLUENCE_POLL_INTERVAL seconds between polls (default 30)
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import os
|
|
25
|
+
import sys
|
|
26
|
+
import time
|
|
27
|
+
|
|
28
|
+
from fluence.providers import resolve_from_env
|
|
29
|
+
from fluence.providers.base import log
|
|
30
|
+
from fluence.ungate import ungate_pods, gated_pods_from_env, namespace_from_env
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _poll(provider, task, poll_interval, ungate):
    """Block until *provider* reports *task* ready to ungate.

    Every *poll_interval* seconds the provider is asked whether the task is
    ready; until then the queue position is logged whenever it changes.
    Errors raised by the provider are logged and the poll is retried.
    *ungate* only selects the mode label in the first log line. There is no
    overall timeout.
    """
    mode = "observe-only"
    if ungate:
        mode = "gang"
    log(f"{mode} mode: polling queue position")

    # Unique sentinel so the very first observed position is always logged.
    previous = object()
    while True:
        try:
            if provider.is_ready_to_ungate(task):
                log(f"task ready (position={provider.queue_position(task)})")
                return
            current = provider.queue_position(task)
            if current != previous:
                log(f"queue position: {current}")
                previous = current
        except Exception as exc:
            log(f"poll error (will retry): {exc}")
        time.sleep(poll_interval)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _env_int(name, default):
    """Read a non-negative integer from env var *name*, else return *default*."""
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = int(raw)
    except ValueError:
        log(f"WARNING: invalid {name}={raw!r}; using default {default}")
        return default
    if value < 0:
        log(f"WARNING: negative {name}={raw!r}; using default {default}")
        return default
    return value


def main():
    """Run the sidecar: resolve provider, discover the task, poll, ungate.

    Configuration comes from the environment (FLUENCE_POD_UID,
    FLUXION_BACKEND, FLUENCE_OBSERVE, FLUENCE_TASK_DISCOVERY_TIMEOUT,
    FLUENCE_POLL_INTERVAL, plus the namespace / gated-pod variables read by
    fluence.ungate). Malformed integer settings fall back to their defaults
    instead of aborting the sidecar.

    Whenever the sidecar cannot proceed in gang mode — no provider, task
    discovery failing or raising — the gated workers are ungated without a
    job id before exiting with status 1, so they are never left gated
    forever.
    """
    pod_uid = os.environ.get("FLUENCE_POD_UID", "")
    backend = os.environ.get("FLUXION_BACKEND", "")
    observe = os.environ.get("FLUENCE_OBSERVE", "").lower() == "true"
    discovery_timeout = _env_int("FLUENCE_TASK_DISCOVERY_TIMEOUT", 300)
    poll_interval = _env_int("FLUENCE_POLL_INTERVAL", 30)

    namespace = namespace_from_env()
    gated_pods = gated_pods_from_env()

    log("starting fluence quantum sidecar")
    log(f" pod_uid={pod_uid} namespace={namespace} backend={backend} "
        f"observe={observe} gated_pods={gated_pods}")

    provider = resolve_from_env()
    if provider is None:
        log("ERROR: could not resolve a quantum provider from the backend")
        if gated_pods and not observe:
            log("ungating workers anyway to avoid deadlock")
            ungate_pods(gated_pods, "", namespace)
        sys.exit(1)
    log(f"resolved provider: {provider.name}")

    if not observe and not gated_pods:
        log("no gated workers and not observe mode — nothing to do")
        return

    try:
        task = provider.find_my_task(pod_uid, backend, discovery_timeout)
    except Exception as e:
        # E.g. missing SDK or credentials while building the vendor client.
        # Treat it like "not found" so the deadlock-avoidance path still runs.
        log(f"ERROR: task discovery raised: {e}")
        task = None

    if task is None:
        log("ERROR: could not discover quantum task")
        if gated_pods and not observe:
            log("ungating workers anyway to avoid deadlock")
            ungate_pods(gated_pods, "", namespace)
        sys.exit(1)

    job_id = provider.job_id(task)
    log(f"discovered task, job_id={job_id}")

    _poll(provider, task, poll_interval, ungate=not observe)

    if observe:
        log("observe-only run complete")
        return

    ungate_pods(gated_pods, job_id, namespace)
    log("done — workers ungated")
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
if __name__ == "__main__":
|
|
99
|
+
main()
|
fluence/sitecustomize.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# fluence sitecustomize — staged onto the user container's PYTHONPATH by the
|
|
2
|
+
# Fluence init container. Python imports `sitecustomize` automatically on every
|
|
3
|
+
# interpreter start (interactive OR script), so this runs the interceptor with
|
|
4
|
+
# zero user code changes and without relying on PYTHONSTARTUP (which only fires
|
|
5
|
+
# for interactive sessions).
|
|
6
|
+
#
|
|
7
|
+
# Guarded so a fluence-side error can never break the user's application.
|
|
8
|
+
# Importing fluence.interceptor installs the provider hooks as a side effect.
# Any failure is reported on stderr and otherwise ignored, so the user's
# application always starts.
try:
    import fluence.interceptor  # noqa: F401  (import side-effect installs hooks)
except Exception as _err:  # pragma: no cover
    import sys

    print(f"[fluence] interceptor skipped: {_err}", file=sys.stderr, flush=True)
|
fluence/stage.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
|
|
2
|
+
fluence.stage — init-container entrypoint for Model C delivery.
|
|
3
|
+
|
|
4
|
+
The Fluence webhook injects an init container (the sidecar image, which has
|
|
5
|
+
`fluence` installed) that runs `python -m fluence.stage <dest>`. This copies the
|
|
6
|
+
installed `fluence` package plus a `sitecustomize.py` into <dest>, a shared
|
|
7
|
+
emptyDir volume. The webhook mounts that volume into the user's application
|
|
8
|
+
container and prepends <dest> to PYTHONPATH. Python then auto-imports
|
|
9
|
+
sitecustomize on startup, which imports fluence.interceptor — tagging the user's
|
|
10
|
+
quantum tasks with zero user code changes and no vendor SDK requirement on our
|
|
11
|
+
side (the interceptor patches whatever SDK the user already has).
|
|
12
|
+
|
|
13
|
+
Replaces the old build-interceptor.sh: assembly is real package staging, not
|
|
14
|
+
text concatenation.
|
|
15
|
+
|
|
16
|
+
Usage:
|
|
17
|
+
python -m fluence.stage /opt/fluence-staged
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import os
|
|
23
|
+
import shutil
|
|
24
|
+
import sys
|
|
25
|
+
|
|
26
|
+
import fluence
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def stage(dest: str) -> None:
    """Copy the installed ``fluence`` package and sitecustomize.py into *dest*.

    *dest* is created if needed. Any existing ``<dest>/fluence`` tree is
    removed first, then the package directory is copied without
    ``__pycache__``, ``*.pyc`` or ``tests`` entries. Finally the package's
    ``sitecustomize.py`` is copied to the top level of *dest*.
    """
    os.makedirs(dest, exist_ok=True)

    # Only the pure-Python package is copied — no vendor SDKs — so the user
    # container's own dependencies are left alone.
    source_pkg = os.path.dirname(os.path.abspath(fluence.__file__))
    staged_pkg = os.path.join(dest, "fluence")
    if os.path.exists(staged_pkg):
        shutil.rmtree(staged_pkg)
    skip = shutil.ignore_patterns("__pycache__", "*.pyc", "tests")
    shutil.copytree(source_pkg, staged_pkg, ignore=skip)

    # sitecustomize.py goes at the TOP of <dest>, not inside the package, so
    # Python's site machinery picks it up on interpreter startup.
    hook_src = os.path.join(source_pkg, "sitecustomize.py")
    hook_dst = os.path.join(dest, "sitecustomize.py")
    shutil.copyfile(hook_src, hook_dst)

    print(f"[fluence] staged package + sitecustomize into {dest}", flush=True)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def main(argv=None):
    """Entry point for ``python -m fluence.stage [dest]``.

    The destination is the first argument when one is given; otherwise
    FLUENCE_STAGE_DIR, defaulting to ``/opt/fluence-staged``.
    """
    args = sys.argv[1:] if argv is None else argv
    if args:
        target = args[0]
    else:
        target = os.environ.get("FLUENCE_STAGE_DIR", "/opt/fluence-staged")
    stage(target)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
if __name__ == "__main__":
|
|
59
|
+
main()
|
fluence/ungate.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""
|
|
2
|
+
fluence.ungate — generic worker ungating (Kubernetes side).
|
|
3
|
+
|
|
4
|
+
Once the sidecar determines the quantum task is ready, it ungates the gated
|
|
5
|
+
classical worker pods: stamp the vendor-neutral job-id annotation, set the
|
|
6
|
+
high-priority class, and remove the scheduling gate atomically. This is pure
|
|
7
|
+
Kubernetes plumbing — no vendor specifics.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import subprocess
|
|
15
|
+
|
|
16
|
+
from fluence.providers.base import log
|
|
17
|
+
|
|
18
|
+
JOB_ID_ANNOTATION = "fluence.flux-framework.org/quantum-job-id"
|
|
19
|
+
QUANTUM_GATE_NAME = "quantum.braket/ready"
|
|
20
|
+
PRIORITY_CLASS = "fluence-quantum-classical"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def kubectl(args):
    """Run ``kubectl`` with *args* and return its stripped stdout.

    Args:
        args: iterable of command-line arguments (without the ``kubectl``
            program name).

    Raises:
        RuntimeError: if kubectl exits non-zero, or if the binary cannot be
            started at all (missing, not executable). Start-up failures are
            converted so callers that handle ``RuntimeError`` per pod keep
            their best-effort behaviour instead of crashing.
    """
    cmd = ["kubectl", *args]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as err:
        raise RuntimeError(f"kubectl {' '.join(args)} failed: {err}") from err
    if result.returncode != 0:
        raise RuntimeError(f"kubectl {' '.join(args)} failed: {result.stderr.strip()}")
    return result.stdout.strip()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def ungate_pods(gated_pods, job_id, namespace):
    """Release each gated worker pod in *namespace*.

    For every non-blank name in *gated_pods*:
      1. If *job_id* is non-empty, annotate the pod with JOB_ID_ANNOTATION so
         the worker can locate the quantum result; otherwise log a warning.
      2. Apply one JSON patch that sets the priority class and removes the
         first scheduling gate (priority is set here rather than in the
         webhook to avoid the admission-controller conflict where priority:0
         is already defaulted).

    kubectl failures raised as RuntimeError are logged as warnings and do not
    stop processing of the remaining pods.
    """
    # The patch is identical for every pod, so serialise it once.
    # NOTE(review): the gate is removed by index 0, not by name;
    # QUANTUM_GATE_NAME is not consulted here — confirm the quantum gate is
    # always the first entry.
    release_patch = json.dumps([
        {"op": "add", "path": "/spec/priorityClassName", "value": PRIORITY_CLASS},
        {"op": "remove", "path": "/spec/schedulingGates/0"},
    ])

    for raw_name in gated_pods:
        name = raw_name.strip()
        if not name:
            continue
        log(f"ungating pod: {name}")

        if not job_id:
            log(f" WARNING: no job id to patch onto {name}")
        else:
            try:
                kubectl(["annotate", "pod", name, "-n", namespace,
                         f"{JOB_ID_ANNOTATION}={job_id}", "--overwrite"])
                log(f" patched job id onto {name}: {job_id}")
            except RuntimeError as exc:
                log(f" WARNING: could not annotate {name}: {exc}")

        try:
            kubectl(["patch", "pod", name, "-n", namespace,
                     "--type=json", f"-p={release_patch}"])
            log(f" set priority and removed gate from {name}")
        except RuntimeError as exc:
            log(f" WARNING: could not patch {name}: {exc}")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def gated_pods_from_env():
    """Return the pod names listed in FLUENCE_GATED_PODS.

    The variable is split on commas; each entry is stripped and blank entries
    are dropped. An unset variable yields an empty list.
    """
    raw = os.environ.get("FLUENCE_GATED_PODS", "")
    trimmed = (piece.strip() for piece in raw.split(","))
    return [name for name in trimmed if name]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def namespace_from_env():
    """Return FLUENCE_NAMESPACE, or ``"default"`` when it is unset."""
    configured = os.environ.get("FLUENCE_NAMESPACE")
    if configured is None:
        return "default"
    return configured
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fluence-hpc
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: Fluence quantum-classical scheduling coordination library (sidecar + interceptor + providers)
|
|
5
|
+
Author: Fluence / converged-computing
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Provides-Extra: braket
|
|
10
|
+
Requires-Dist: amazon-braket-sdk; extra == "braket"
|
|
11
|
+
Requires-Dist: boto3; extra == "braket"
|
|
12
|
+
Provides-Extra: all
|
|
13
|
+
Requires-Dist: amazon-braket-sdk; extra == "all"
|
|
14
|
+
Requires-Dist: boto3; extra == "all"
|
|
15
|
+
|
|
16
|
+
# fluence (Python)
|
|
17
|
+
|
|
18
|
+
Quantum-classical scheduling coordination library for the Fluence Kubernetes
|
|
19
|
+
scheduler. Import name `fluence`; distributed on PyPI as `fluence-hpc`.
|
|
20
|
+
|
|
21
|
+
This package is **built into the Fluence sidecar image** and **staged into user
|
|
22
|
+
application containers at admission time** — users never install it.
|
|
23
|
+
|
|
24
|
+
## What it does
|
|
25
|
+
|
|
26
|
+
A hybrid quantum-classical workflow submits work to two queues: the Kubernetes
|
|
27
|
+
scheduler (classical) and a QPU vendor API (quantum). Classical worker pods would
|
|
28
|
+
idle while the QPU queue drains. Fluence gates the workers until the quantum task
|
|
29
|
+
is about to run, then releases them. This library is the runtime half:
|
|
30
|
+
|
|
31
|
+
- **interceptor** (`fluence.interceptor`) — runs inside the user container,
|
|
32
|
+
monkey-patches the vendor SDK submit call to tag each task with the pod UID.
|
|
33
|
+
- **sidecar** (`fluence.sidecar`) — runs in a sidecar container, discovers the
|
|
34
|
+
tagged task, polls queue position, and ungates the classical workers when the
|
|
35
|
+
task is ready (or, in observe-only mode, just records the queue position).
|
|
36
|
+
- **providers** (`fluence.providers`) — per-vendor plug-ins implementing both
|
|
37
|
+
halves. Providers self-register on import.
|
|
38
|
+
|
|
39
|
+
## Delivery (Model C)
|
|
40
|
+
|
|
41
|
+
The interceptor must run in the user's container, which does **not** have this
|
|
42
|
+
package installed. Rather than require a user install or concatenate a text
|
|
43
|
+
snippet, the Fluence webhook:
|
|
44
|
+
|
|
45
|
+
1. injects an **init container** (the sidecar image) running
|
|
46
|
+
`python -m fluence.stage <dir>`, which copies the pure-Python `fluence`
|
|
47
|
+
package plus a `sitecustomize.py` into a shared `emptyDir`;
|
|
48
|
+
2. mounts that volume into the user container and prepends `<dir>` to
|
|
49
|
+
`PYTHONPATH`.
|
|
50
|
+
|
|
51
|
+
Python imports `sitecustomize` automatically on every interpreter start
|
|
52
|
+
(`python app.py` included — unlike `PYTHONSTARTUP`, which only fires for
|
|
53
|
+
interactive sessions), so `import fluence.interceptor` runs before user code.
|
|
54
|
+
The interceptor patches whichever vendor SDK is present and fail-soft skips the
|
|
55
|
+
rest. No user code changes, no vendor SDKs added to the user image.
|
|
56
|
+
|
|
57
|
+
## Adding a provider
|
|
58
|
+
|
|
59
|
+
Add one module under `fluence/providers/` that subclasses `Provider`, implements
|
|
60
|
+
`install_interceptor` (tag hook), `matches`, `find_my_task`, `is_ready_to_ungate`,
|
|
61
|
+
`queue_position` (optional), and `job_id`, and calls `register(PROVIDER)`. Import
|
|
62
|
+
it from `fluence/providers/__init__.py`. Nothing else changes.
|
|
63
|
+
|
|
64
|
+
## Tests
|
|
65
|
+
|
|
66
|
+
python3 python/tests/test_fluence.py
|
|
67
|
+
|
|
68
|
+
## Building and releasing
|
|
69
|
+
|
|
70
|
+
The package is distributed on PyPI as `fluence-hpc` (the import name `fluence` is
|
|
71
|
+
already taken on PyPI). It is also baked into the sidecar image, so a release
|
|
72
|
+
moves the package version and the image tag together.
|
|
73
|
+
|
|
74
|
+
### Build the distributions
|
|
75
|
+
|
|
76
|
+
From `python/`:
|
|
77
|
+
|
|
78
|
+
pip install --upgrade build twine
|
|
79
|
+
python -m build
|
|
80
|
+
|
|
81
|
+
This produces `dist/fluence_hpc-<version>-py3-none-any.whl` and
|
|
82
|
+
`dist/fluence_hpc-<version>.tar.gz`. Upload both.
|
|
83
|
+
|
|
84
|
+
### Test on TestPyPI first
|
|
85
|
+
|
|
86
|
+
twine upload --repository testpypi dist/*
|
|
87
|
+
pip install --index-url https://test.pypi.org/simple/ fluence-hpc
|
|
88
|
+
python -c "import fluence; print(fluence.__version__)"
|
|
89
|
+
|
|
90
|
+
### Release to PyPI
|
|
91
|
+
|
|
92
|
+
twine upload dist/*
|
|
93
|
+
|
|
94
|
+
After this, `pip install fluence-hpc` works anywhere and imports as `fluence`.
|
|
95
|
+
|
|
96
|
+
### Versioning
|
|
97
|
+
|
|
98
|
+
Bump `version` in `pyproject.toml` and `__version__` in `fluence/__init__.py`
|
|
99
|
+
together (PyPI refuses to overwrite an existing version). Because the package is
|
|
100
|
+
version-locked into the sidecar image, tag the release so the image and the
|
|
101
|
+
package share a version — e.g. a `v0.1.1` git tag triggers both the
|
|
102
|
+
`sidecar-build-deploy` workflow (image) and a PyPI publish.
|
|
103
|
+
|
|
104
|
+
### Automated release (recommended)
|
|
105
|
+
|
|
106
|
+
Prefer GitHub Actions with PyPI Trusted Publishing (OIDC) over manual token
|
|
107
|
+
uploads: register the repo + workflow once on PyPI, then a release workflow
|
|
108
|
+
triggered by a version tag builds with `python -m build` and uploads with
|
|
109
|
+
`pypa/gh-action-pypi-publish` — no stored secret. The Docker image is built by
|
|
110
|
+
`.github/workflows/sidecar-build-deploy.yaml` on the same tag, keeping the
|
|
111
|
+
package version and image tag in lockstep.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
fluence/__init__.py,sha256=ujHw9ujIGDoV11-xBjPmkKNVCxd7IliJ84upgcBcw44,641
|
|
2
|
+
fluence/interceptor.py,sha256=jEeco1jNn1FqX2ftjhnjIrU3QYIxA_fv9S7H5H45bd8,1377
|
|
3
|
+
fluence/sidecar.py,sha256=XXoBHFtDQVoQ7GTDJ5UYKwHr0CphA7_UAQu5W-3g4UQ,3542
|
|
4
|
+
fluence/sitecustomize.py,sha256=uOP0SjAZIJRf0HivmT31aFbArfSrHYqeQFd4VDRDdgU,648
|
|
5
|
+
fluence/stage.py,sha256=6NdFW1Mf4M5GKeV5Rbg228UG9cwfa2USOHh5-6lAUQk,2140
|
|
6
|
+
fluence/ungate.py,sha256=w2TMZb9KLHISfQENqvejGt3zBvAddllgm4jO-4b_J40,2596
|
|
7
|
+
fluence/providers/__init__.py,sha256=Rm_PrndQSWue1dD8GCu-I0KtxTR6Vgy2AAOdgJk8GC4,939
|
|
8
|
+
fluence/providers/base.py,sha256=5lLPr0zl_TZwhxqe9q6MRO3YKcxjzKJUAEtmAzuYIgQ,4699
|
|
9
|
+
fluence/providers/braket.py,sha256=mxfS-8eCh6Y6mfWjJjR3oh4ocoETRwJ3a-GozaSaae8,4671
|
|
10
|
+
fluence_hpc-0.0.0.dist-info/METADATA,sha256=uyEYL8SwM0t_8u4K9GOgs8uOu0HRz_AzGaad5DaVwW8,4559
|
|
11
|
+
fluence_hpc-0.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
12
|
+
fluence_hpc-0.0.0.dist-info/entry_points.txt,sha256=YvHH3et_sPBeTUL9VPoi8N_VicHLgScFiuOFl9mUvnw,57
|
|
13
|
+
fluence_hpc-0.0.0.dist-info/top_level.txt,sha256=nEkIvav-ivOCMGH58TC61ZYEsgzKanMhUahAIH5ntqU,8
|
|
14
|
+
fluence_hpc-0.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
fluence
|