quapp-hpc 0.0.1.dev3__tar.gz → 0.0.1.dev5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quapp_hpc-0.0.1.dev3/quapp_hpc.egg-info → quapp_hpc-0.0.1.dev5}/PKG-INFO +2 -2
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/pyproject.toml +2 -2
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/component/backend/hpc_invocation.py +3 -1
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/factory/hpc_provider_factory.py +1 -1
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/model/device/slurm_device.py +90 -20
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/model/provider/slurm_provider.py +1 -1
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5/quapp_hpc.egg-info}/PKG-INFO +2 -2
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc.egg-info/requires.txt +1 -1
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/LICENSE +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/README.md +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/component/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/component/backend/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/component/backend/slurm_job_fetching.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/factory/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/factory/hpc_device_factory.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/factory/hpc_handler_factory.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/handler/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/handler/invocation_handler.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/handler/job_fetching_handler.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/model/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/model/device/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/model/provider/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc.egg-info/SOURCES.txt +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc.egg-info/dependency_links.txt +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc.egg-info/top_level.txt +0 -0
- {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: quapp-hpc
|
|
3
|
-
Version: 0.0.1.dev3
|
|
3
|
+
Version: 0.0.1.dev5
|
|
4
4
|
Summary: Quapp HPC library — Slurm integration for Quapp Platform
|
|
5
5
|
Author-email: "CITYNOW Co. Ltd." <corp@citynow.vn>
|
|
6
6
|
License: The MIT License (MIT)
|
|
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3
|
|
|
19
19
|
Requires-Python: <3.13,>=3.10
|
|
20
20
|
Description-Content-Type: text/markdown
|
|
21
21
|
License-File: LICENSE
|
|
22
|
-
Requires-Dist: quapp-common==0.0.12.
|
|
22
|
+
Requires-Dist: quapp-common==0.0.12.dev12
|
|
23
23
|
Requires-Dist: requests>=2.31.0
|
|
24
24
|
Requires-Dist: boto3>=1.28.0
|
|
25
25
|
Provides-Extra: dev
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "quapp-hpc"
|
|
7
|
-
version = "0.0.1.dev3"
|
|
7
|
+
version = "0.0.1.dev5"
|
|
8
8
|
description = "Quapp HPC library — Slurm integration for Quapp Platform"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{ name = "CITYNOW Co. Ltd.", email = "corp@citynow.vn" }]
|
|
@@ -16,7 +16,7 @@ classifiers = [
|
|
|
16
16
|
]
|
|
17
17
|
keywords = ["quapp", "quapp-hpc", "slurm", "hpc"]
|
|
18
18
|
dependencies = [
|
|
19
|
-
"quapp-common==0.0.12.
|
|
19
|
+
"quapp-common==0.0.12.dev12",
|
|
20
20
|
"requests>=2.31.0",
|
|
21
21
|
"boto3>=1.28.0",
|
|
22
22
|
]
|
|
@@ -12,12 +12,14 @@ class HpcInvocation(Invocation):
|
|
|
12
12
|
super().__init__(request_data)
|
|
13
13
|
raw = request_data.input or {}
|
|
14
14
|
job = raw.get("job", {})
|
|
15
|
+
# s3Bucket intentionally dropped — S3 bucket is system config, read by
|
|
16
|
+
# SlurmDevice from the S3_BUCKET env var (slurm-credentials secret).
|
|
17
|
+
# User input must not override system fields (HPC design philosophy).
|
|
15
18
|
self._hpc_config = {
|
|
16
19
|
"resources": raw.get("resources", {}),
|
|
17
20
|
"container": raw.get("container", {}),
|
|
18
21
|
"environment": job.get("environment", {}),
|
|
19
22
|
"input_s3_paths": job.get("input_s3_paths", []),
|
|
20
|
-
"s3_bucket": raw.get("s3Bucket", ""),
|
|
21
23
|
}
|
|
22
24
|
|
|
23
25
|
def _export_circuit(self, circuit):
|
|
@@ -15,7 +15,7 @@ class HpcProviderFactory:
|
|
|
15
15
|
def create_provider(provider_type: ProviderTag, authentication: dict):
|
|
16
16
|
logger.debug(f"Creating HPC provider: {provider_type}")
|
|
17
17
|
|
|
18
|
-
if provider_type == ProviderTag.
|
|
18
|
+
if provider_type == ProviderTag.QUAPP_HPC:
|
|
19
19
|
jwt = authentication.get("slurm_jwt") or SLURM_JWT
|
|
20
20
|
if not jwt:
|
|
21
21
|
raise ValueError("SLURM_JWT not set — cannot authenticate with Slurm API")
|
|
@@ -14,12 +14,21 @@ from quapp_common.model.provider.provider import Provider
|
|
|
14
14
|
|
|
15
15
|
from ..provider.slurm_provider import SlurmProvider, SLURM_ACCOUNT
|
|
16
16
|
|
|
17
|
+
# ── System config (read from K8s secret slurm-credentials at ksvc startup) ───
|
|
17
18
|
S3_BUCKET = os.getenv("S3_BUCKET", "quapp-slurm-output-dev")
|
|
18
19
|
AWS_REGION = os.getenv("AWS_REGION", "ap-southeast-1")
|
|
19
20
|
SLURM_POLL_SEC = int(os.getenv("SLURM_POLL_SEC", "30"))
|
|
20
21
|
SLURM_TIMEOUT_SEC = int(os.getenv("SLURM_TIMEOUT_SEC", "21600")) # 6 hours
|
|
21
22
|
SLURM_TIME_LIMIT = int(os.getenv("SLURM_TIME_LIMIT_MIN", "60"))
|
|
22
23
|
|
|
24
|
+
# ── Compute runtime config (injected by builder when creating ksvc) ──────────
|
|
25
|
+
# Builder sets these env vars based on the function's qapp_compute_runtime row.
|
|
26
|
+
# The path is resolved per-function: /data/containers/functions/<fn_hash>_<fn_tag>.sif
|
|
27
|
+
# CTS knative_builder/function/handler.sh adds these via --env to `kn ksvc create`.
|
|
28
|
+
COMPUTE_SIF_PATH = os.getenv("COMPUTE_SIF_PATH")
|
|
29
|
+
COMPUTE_ENTRYPOINT = os.getenv("COMPUTE_ENTRYPOINT", "python3 /tmp/quapp_job.py")
|
|
30
|
+
COMPUTE_SCRIPT_EXT = os.getenv("COMPUTE_SCRIPT_EXT", "py")
|
|
31
|
+
|
|
23
32
|
_TERMINAL_STATES = {"COMPLETED", "FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL", "PREEMPTED"}
|
|
24
33
|
_DONE_STATE = "COMPLETED"
|
|
25
34
|
|
|
@@ -44,6 +53,53 @@ _COLLECT_PY = (
|
|
|
44
53
|
)
|
|
45
54
|
|
|
46
55
|
|
|
56
|
+
def _resolve_compute_container(container_cfg: dict) -> tuple[str, str, str]:
|
|
57
|
+
"""Resolve Apptainer image path, entrypoint command, and script extension.
|
|
58
|
+
|
|
59
|
+
Resolution order (first match wins):
|
|
60
|
+
1. User override via invocation `container`:
|
|
61
|
+
container.type = 'sif' + container.image → /data/containers/<image>
|
|
62
|
+
container.type = 'docker' + container.image → docker://<image>
|
|
63
|
+
2. Builder-injected env vars (per-function SIF from build step):
|
|
64
|
+
COMPUTE_SIF_PATH + COMPUTE_ENTRYPOINT + COMPUTE_SCRIPT_EXT
|
|
65
|
+
3. Hard error — no valid configuration. Container is mandatory; running user
|
|
66
|
+
scripts on bare metal is not supported (see HPC design philosophy).
|
|
67
|
+
"""
|
|
68
|
+
container_type = (container_cfg or {}).get("type", "").lower()
|
|
69
|
+
user_image = (container_cfg or {}).get("image", "")
|
|
70
|
+
user_entry = (container_cfg or {}).get("entrypoint")
|
|
71
|
+
user_ext = (container_cfg or {}).get("script_extension")
|
|
72
|
+
|
|
73
|
+
# 1. User override — explicit container choice in invocation input
|
|
74
|
+
if container_type in ("sif", "docker"):
|
|
75
|
+
if not user_image:
|
|
76
|
+
raise ValueError(
|
|
77
|
+
f"container.type={container_type!r} requires container.image"
|
|
78
|
+
)
|
|
79
|
+
if container_type == "sif":
|
|
80
|
+
sif_path = f"/data/containers/{user_image}"
|
|
81
|
+
else:
|
|
82
|
+
sif_path = f"docker://{user_image}"
|
|
83
|
+
return (
|
|
84
|
+
sif_path,
|
|
85
|
+
user_entry or COMPUTE_ENTRYPOINT,
|
|
86
|
+
user_ext or COMPUTE_SCRIPT_EXT,
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
# 2. Builder-injected default — per-function SIF built at deploy time
|
|
90
|
+
if COMPUTE_SIF_PATH:
|
|
91
|
+
return (COMPUTE_SIF_PATH, COMPUTE_ENTRYPOINT, COMPUTE_SCRIPT_EXT)
|
|
92
|
+
|
|
93
|
+
# 3. No path resolved → fail explicitly. Running outside a container is not
|
|
94
|
+
# supported per the Quapp-HPC design philosophy (container-first sandboxing).
|
|
95
|
+
raise RuntimeError(
|
|
96
|
+
"No compute container configured. Either the function pod is missing the "
|
|
97
|
+
"COMPUTE_SIF_PATH env var (builder did not produce a per-function SIF — "
|
|
98
|
+
"rebuild required) or the invocation must explicitly provide "
|
|
99
|
+
"container.type and container.image."
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
|
|
47
103
|
class SlurmDevice(Device):
|
|
48
104
|
|
|
49
105
|
def __init__(
|
|
@@ -58,7 +114,9 @@ class SlurmDevice(Device):
|
|
|
58
114
|
self.slurm: SlurmProvider = provider
|
|
59
115
|
self.logger = job_logger(job_uuid)
|
|
60
116
|
self.hpc_config = hpc_config or {}
|
|
61
|
-
|
|
117
|
+
# S3 bucket is a system field — read only from env (no input override).
|
|
118
|
+
# Per HPC design philosophy: system fields are not user-controlled.
|
|
119
|
+
self.s3_bucket = S3_BUCKET
|
|
62
120
|
|
|
63
121
|
# ── Abstract method implementations ──────────────────────────────────────
|
|
64
122
|
|
|
@@ -143,12 +201,35 @@ class SlurmDevice(Device):
|
|
|
143
201
|
# ── SBATCH script builder ─────────────────────────────────────────────────
|
|
144
202
|
|
|
145
203
|
def _build_sbatch_script(self, user_script: str) -> str:
|
|
146
|
-
"""Build the full SBATCH bash script that wraps the user's computation.
|
|
204
|
+
"""Build the full SBATCH bash script that wraps the user's computation.
|
|
205
|
+
|
|
206
|
+
Container resolution (see _resolve_compute_container):
|
|
207
|
+
- Default: per-function SIF built at deploy time, identified by env
|
|
208
|
+
COMPUTE_SIF_PATH + COMPUTE_ENTRYPOINT + COMPUTE_SCRIPT_EXT
|
|
209
|
+
- Override: invocation `container` block can point at a shared SIF or
|
|
210
|
+
pull a Docker image at runtime
|
|
211
|
+
|
|
212
|
+
The user script string is base64-encoded into the SBATCH script and
|
|
213
|
+
written to /tmp/quapp_job.<ext> on the compute node, then executed
|
|
214
|
+
through Apptainer with the configured entrypoint.
|
|
215
|
+
"""
|
|
147
216
|
resources = self.hpc_config.get("resources", {})
|
|
148
217
|
container = self.hpc_config.get("container", {})
|
|
149
218
|
environment = self.hpc_config.get("environment", {})
|
|
150
219
|
input_s3_paths = self.hpc_config.get("input_s3_paths", [])
|
|
151
220
|
|
|
221
|
+
sif_path, entrypoint, script_ext = _resolve_compute_container(container)
|
|
222
|
+
script_file = f"/tmp/quapp_job.{script_ext}"
|
|
223
|
+
|
|
224
|
+
# Entrypoint may reference the canonical filename; rewrite to actual extension
|
|
225
|
+
entrypoint_resolved = entrypoint.replace("/tmp/quapp_job.py", script_file) \
|
|
226
|
+
.replace("/tmp/quapp_job.sh", script_file)
|
|
227
|
+
|
|
228
|
+
self.logger.info(
|
|
229
|
+
"Compute container resolved: sif=%s entrypoint=%s script_file=%s",
|
|
230
|
+
sif_path, entrypoint_resolved, script_file,
|
|
231
|
+
)
|
|
232
|
+
|
|
152
233
|
lines = ["#!/bin/bash"]
|
|
153
234
|
|
|
154
235
|
# ── SBATCH directives ─────────────────────────────────────────────────
|
|
@@ -187,27 +268,16 @@ class SlurmDevice(Device):
|
|
|
187
268
|
if environment:
|
|
188
269
|
lines.append("")
|
|
189
270
|
|
|
190
|
-
# ──
|
|
191
|
-
container_type = container.get("type", "none")
|
|
192
|
-
image = container.get("image", "")
|
|
193
|
-
if container_type == "sif" and image:
|
|
194
|
-
exec_prefix = f"apptainer exec /data/containers/{image}"
|
|
195
|
-
elif container_type == "docker" and image:
|
|
196
|
-
exec_prefix = f"apptainer exec docker://{image}"
|
|
197
|
-
else:
|
|
198
|
-
exec_prefix = ""
|
|
199
|
-
|
|
200
|
-
# ── Write and run user script ─────────────────────────────────────────
|
|
271
|
+
# ── Write and run user script inside Apptainer ────────────────────────
|
|
201
272
|
b64 = base64.b64encode(user_script.encode()).decode()
|
|
273
|
+
exec_prefix = f"apptainer exec {shlex.quote(sif_path)}" \
|
|
274
|
+
if not sif_path.startswith("docker://") \
|
|
275
|
+
else f"apptainer exec {sif_path}"
|
|
202
276
|
lines += [
|
|
203
|
-
f"echo {shlex.quote(b64)} | base64 -d > /tmp/quapp_job.sh",
|
|
204
|
-
"chmod +x /tmp/quapp_job.sh",
|
|
277
|
+
f"echo {shlex.quote(b64)} | base64 -d > {script_file}",
|
|
278
|
+
f"chmod +x {script_file}",
|
|
205
279
|
"",
|
|
206
|
-
|
|
207
|
-
run_cmd = "bash /tmp/quapp_job.sh"
|
|
208
|
-
full_cmd = f"{exec_prefix} {run_cmd}".strip()
|
|
209
|
-
lines += [
|
|
210
|
-
f"{full_cmd} > /tmp/quapp_stdout.txt 2>/tmp/quapp_stderr.txt || true",
|
|
280
|
+
f"{exec_prefix} {entrypoint_resolved} > /tmp/quapp_stdout.txt 2>/tmp/quapp_stderr.txt || true",
|
|
211
281
|
"echo $? > /tmp/quapp_exit_code.txt",
|
|
212
282
|
"",
|
|
213
283
|
]
|
|
@@ -15,7 +15,7 @@ SLURM_ACCOUNT = os.getenv("SLURM_ACCOUNT", "quapp")
|
|
|
15
15
|
class SlurmProvider(Provider):
|
|
16
16
|
|
|
17
17
|
def __init__(self, jwt_token: str):
|
|
18
|
-
super().__init__(ProviderTag.
|
|
18
|
+
super().__init__(ProviderTag.QUAPP_HPC)
|
|
19
19
|
self.jwt_token = jwt_token
|
|
20
20
|
self.base_url = f"{SLURM_API_URL}/slurm/{SLURM_API_VER}"
|
|
21
21
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: quapp-hpc
|
|
3
|
-
Version: 0.0.1.dev3
|
|
3
|
+
Version: 0.0.1.dev5
|
|
4
4
|
Summary: Quapp HPC library — Slurm integration for Quapp Platform
|
|
5
5
|
Author-email: "CITYNOW Co. Ltd." <corp@citynow.vn>
|
|
6
6
|
License: The MIT License (MIT)
|
|
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3
|
|
|
19
19
|
Requires-Python: <3.13,>=3.10
|
|
20
20
|
Description-Content-Type: text/markdown
|
|
21
21
|
License-File: LICENSE
|
|
22
|
-
Requires-Dist: quapp-common==0.0.12.
|
|
22
|
+
Requires-Dist: quapp-common==0.0.12.dev12
|
|
23
23
|
Requires-Dist: requests>=2.31.0
|
|
24
24
|
Requires-Dist: boto3>=1.28.0
|
|
25
25
|
Provides-Extra: dev
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/component/backend/slurm_job_fetching.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|