quapp-hpc 0.0.1.dev2__tar.gz → 0.0.1.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quapp_hpc-0.0.1.dev2/quapp_hpc.egg-info → quapp_hpc-0.0.1.dev4}/PKG-INFO +1 -1
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/pyproject.toml +1 -1
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/component/backend/hpc_invocation.py +3 -1
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/component/backend/slurm_job_fetching.py +3 -7
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/model/device/slurm_device.py +92 -26
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4/quapp_hpc.egg-info}/PKG-INFO +1 -1
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/LICENSE +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/README.md +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/component/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/component/backend/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/factory/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/factory/hpc_device_factory.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/factory/hpc_handler_factory.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/factory/hpc_provider_factory.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/handler/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/handler/invocation_handler.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/handler/job_fetching_handler.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/model/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/model/device/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/model/provider/__init__.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/model/provider/slurm_provider.py +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc.egg-info/SOURCES.txt +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc.egg-info/dependency_links.txt +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc.egg-info/requires.txt +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc.egg-info/top_level.txt +0 -0
- {quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/setup.cfg +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "quapp-hpc"
|
|
7
|
-
version = "0.0.1.dev2"
|
|
7
|
+
version = "0.0.1.dev4"
|
|
8
8
|
description = "Quapp HPC library — Slurm integration for Quapp Platform"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
authors = [{ name = "CITYNOW Co. Ltd.", email = "corp@citynow.vn" }]
|
|
@@ -12,12 +12,14 @@ class HpcInvocation(Invocation):
|
|
|
12
12
|
super().__init__(request_data)
|
|
13
13
|
raw = request_data.input or {}
|
|
14
14
|
job = raw.get("job", {})
|
|
15
|
+
# s3Bucket intentionally dropped — S3 bucket is system config, read by
|
|
16
|
+
# SlurmDevice from the S3_BUCKET env var (slurm-credentials secret).
|
|
17
|
+
# User input must not override system fields (HPC design philosophy).
|
|
15
18
|
self._hpc_config = {
|
|
16
19
|
"resources": raw.get("resources", {}),
|
|
17
20
|
"container": raw.get("container", {}),
|
|
18
21
|
"environment": job.get("environment", {}),
|
|
19
22
|
"input_s3_paths": job.get("input_s3_paths", []),
|
|
20
|
-
"s3_bucket": raw.get("s3Bucket", ""),
|
|
21
23
|
}
|
|
22
24
|
|
|
23
25
|
def _export_circuit(self, circuit):
|
{quapp_hpc-0.0.1.dev2 → quapp_hpc-0.0.1.dev4}/quapp_hpc/component/backend/slurm_job_fetching.py
RENAMED
|
@@ -14,11 +14,7 @@ SLURM_JWT = os.getenv("SLURM_JWT", "")
|
|
|
14
14
|
S3_BUCKET = os.getenv("S3_BUCKET", "quapp-slurm-output-dev")
|
|
15
15
|
AWS_REGION = os.getenv("AWS_REGION", "ap-southeast-1")
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
"PENDING": JobStatus.RUNNING.value,
|
|
19
|
-
"CONFIGURING": JobStatus.RUNNING.value,
|
|
20
|
-
"RUNNING": JobStatus.RUNNING.value,
|
|
21
|
-
"COMPLETING": JobStatus.RUNNING.value,
|
|
17
|
+
_SLURM_TERMINAL_TO_JOB_STATUS = {
|
|
22
18
|
"COMPLETED": JobStatus.DONE.value,
|
|
23
19
|
"FAILED": JobStatus.ERROR.value,
|
|
24
20
|
"CANCELLED": JobStatus.ERROR.value,
|
|
@@ -85,7 +81,7 @@ class SlurmJobFetching(JobFetching):
|
|
|
85
81
|
|
|
86
82
|
jobs = data.get("jobs", [data])
|
|
87
83
|
if not jobs:
|
|
88
|
-
return
|
|
84
|
+
return "UNKNOWN"
|
|
89
85
|
|
|
90
86
|
raw_state = jobs[0].get("job_state", "UNKNOWN")
|
|
91
87
|
if isinstance(raw_state, list):
|
|
@@ -93,7 +89,7 @@ class SlurmJobFetching(JobFetching):
|
|
|
93
89
|
|
|
94
90
|
state = str(raw_state).strip()
|
|
95
91
|
self._logger.info(f"Slurm job {slurm_job_id} state: {state}")
|
|
96
|
-
return
|
|
92
|
+
return _SLURM_TERMINAL_TO_JOB_STATUS.get(state, state)
|
|
97
93
|
|
|
98
94
|
def _get_job_result(self, job: dict) -> _SlurmJobResult:
|
|
99
95
|
return _SlurmJobResult(job_uuid=self.job_id, s3_bucket=S3_BUCKET, aws_region=AWS_REGION)
|
|
@@ -14,20 +14,25 @@ from quapp_common.model.provider.provider import Provider
|
|
|
14
14
|
|
|
15
15
|
from ..provider.slurm_provider import SlurmProvider, SLURM_ACCOUNT
|
|
16
16
|
|
|
17
|
+
# ── System config (read from K8s secret slurm-credentials at ksvc startup) ───
|
|
17
18
|
S3_BUCKET = os.getenv("S3_BUCKET", "quapp-slurm-output-dev")
|
|
18
19
|
AWS_REGION = os.getenv("AWS_REGION", "ap-southeast-1")
|
|
19
20
|
SLURM_POLL_SEC = int(os.getenv("SLURM_POLL_SEC", "30"))
|
|
20
21
|
SLURM_TIMEOUT_SEC = int(os.getenv("SLURM_TIMEOUT_SEC", "21600")) # 6 hours
|
|
21
22
|
SLURM_TIME_LIMIT = int(os.getenv("SLURM_TIME_LIMIT_MIN", "60"))
|
|
22
23
|
|
|
24
|
+
# ── Compute runtime config (injected by builder when creating ksvc) ──────────
|
|
25
|
+
# Builder sets these env vars based on the function's qapp_compute_runtime row.
|
|
26
|
+
# The path is resolved per-function: /data/containers/functions/<fn_hash>_<fn_tag>.sif
|
|
27
|
+
# CTS knative_builder/function/handler.sh adds these via --env to `kn ksvc create`.
|
|
28
|
+
COMPUTE_SIF_PATH = os.getenv("COMPUTE_SIF_PATH")
|
|
29
|
+
COMPUTE_ENTRYPOINT = os.getenv("COMPUTE_ENTRYPOINT", "python3 /tmp/quapp_job.py")
|
|
30
|
+
COMPUTE_SCRIPT_EXT = os.getenv("COMPUTE_SCRIPT_EXT", "py")
|
|
31
|
+
|
|
23
32
|
_TERMINAL_STATES = {"COMPLETED", "FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL", "PREEMPTED"}
|
|
24
33
|
_DONE_STATE = "COMPLETED"
|
|
25
34
|
|
|
26
|
-
|
|
27
|
-
"PENDING": JobStatus.RUNNING.value,
|
|
28
|
-
"CONFIGURING": JobStatus.RUNNING.value,
|
|
29
|
-
"RUNNING": JobStatus.RUNNING.value,
|
|
30
|
-
"COMPLETING": JobStatus.RUNNING.value,
|
|
35
|
+
_SLURM_TERMINAL_TO_JOB_STATUS = {
|
|
31
36
|
"COMPLETED": JobStatus.DONE.value,
|
|
32
37
|
"FAILED": JobStatus.ERROR.value,
|
|
33
38
|
"CANCELLED": JobStatus.ERROR.value,
|
|
@@ -48,6 +53,53 @@ _COLLECT_PY = (
|
|
|
48
53
|
)
|
|
49
54
|
|
|
50
55
|
|
|
56
|
+
def _resolve_compute_container(container_cfg: dict) -> tuple[str, str, str]:
|
|
57
|
+
"""Resolve Apptainer image path, entrypoint command, and script extension.
|
|
58
|
+
|
|
59
|
+
Resolution order (first match wins):
|
|
60
|
+
1. User override via invocation `container`:
|
|
61
|
+
container.type = 'sif' + container.image → /data/containers/<image>
|
|
62
|
+
container.type = 'docker' + container.image → docker://<image>
|
|
63
|
+
2. Builder-injected env vars (per-function SIF from build step):
|
|
64
|
+
COMPUTE_SIF_PATH + COMPUTE_ENTRYPOINT + COMPUTE_SCRIPT_EXT
|
|
65
|
+
3. Hard error — no valid configuration. Container is mandatory; running user
|
|
66
|
+
scripts on bare metal is not supported (see HPC design philosophy).
|
|
67
|
+
"""
|
|
68
|
+
container_type = (container_cfg or {}).get("type", "").lower()
|
|
69
|
+
user_image = (container_cfg or {}).get("image", "")
|
|
70
|
+
user_entry = (container_cfg or {}).get("entrypoint")
|
|
71
|
+
user_ext = (container_cfg or {}).get("script_extension")
|
|
72
|
+
|
|
73
|
+
# 1. User override — explicit container choice in invocation input
|
|
74
|
+
if container_type in ("sif", "docker"):
|
|
75
|
+
if not user_image:
|
|
76
|
+
raise ValueError(
|
|
77
|
+
f"container.type={container_type!r} requires container.image"
|
|
78
|
+
)
|
|
79
|
+
if container_type == "sif":
|
|
80
|
+
sif_path = f"/data/containers/{user_image}"
|
|
81
|
+
else:
|
|
82
|
+
sif_path = f"docker://{user_image}"
|
|
83
|
+
return (
|
|
84
|
+
sif_path,
|
|
85
|
+
user_entry or COMPUTE_ENTRYPOINT,
|
|
86
|
+
user_ext or COMPUTE_SCRIPT_EXT,
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
# 2. Builder-injected default — per-function SIF built at deploy time
|
|
90
|
+
if COMPUTE_SIF_PATH:
|
|
91
|
+
return (COMPUTE_SIF_PATH, COMPUTE_ENTRYPOINT, COMPUTE_SCRIPT_EXT)
|
|
92
|
+
|
|
93
|
+
# 3. No path resolved → fail explicitly. Running outside a container is not
|
|
94
|
+
# supported per the Quapp-HPC design philosophy (container-first sandboxing).
|
|
95
|
+
raise RuntimeError(
|
|
96
|
+
"No compute container configured. Either the function pod is missing the "
|
|
97
|
+
"COMPUTE_SIF_PATH env var (builder did not produce a per-function SIF — "
|
|
98
|
+
"rebuild required) or the invocation must explicitly provide "
|
|
99
|
+
"container.type and container.image."
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
|
|
51
103
|
class SlurmDevice(Device):
|
|
52
104
|
|
|
53
105
|
def __init__(
|
|
@@ -62,7 +114,9 @@ class SlurmDevice(Device):
|
|
|
62
114
|
self.slurm: SlurmProvider = provider
|
|
63
115
|
self.logger = job_logger(job_uuid)
|
|
64
116
|
self.hpc_config = hpc_config or {}
|
|
65
|
-
|
|
117
|
+
# S3 bucket is a system field — read only from env (no input override).
|
|
118
|
+
# Per HPC design philosophy: system fields are not user-controlled.
|
|
119
|
+
self.s3_bucket = S3_BUCKET
|
|
66
120
|
|
|
67
121
|
# ── Abstract method implementations ──────────────────────────────────────
|
|
68
122
|
|
|
@@ -115,7 +169,7 @@ class SlurmDevice(Device):
|
|
|
115
169
|
|
|
116
170
|
def _get_job_status(self, job: dict) -> str:
|
|
117
171
|
state = self._fetch_slurm_state(job["slurm_job_id"])
|
|
118
|
-
return
|
|
172
|
+
return _SLURM_TERMINAL_TO_JOB_STATUS.get(state, state)
|
|
119
173
|
|
|
120
174
|
def _get_job_result(self, job: dict):
|
|
121
175
|
"""Block until Slurm job finishes, then download result from S3."""
|
|
@@ -147,12 +201,35 @@ class SlurmDevice(Device):
|
|
|
147
201
|
# ── SBATCH script builder ─────────────────────────────────────────────────
|
|
148
202
|
|
|
149
203
|
def _build_sbatch_script(self, user_script: str) -> str:
|
|
150
|
-
"""Build the full SBATCH bash script that wraps the user's computation."""
|
|
204
|
+
"""Build the full SBATCH bash script that wraps the user's computation.
|
|
205
|
+
|
|
206
|
+
Container resolution (see _resolve_compute_container):
|
|
207
|
+
- Default: per-function SIF built at deploy time, identified by env
|
|
208
|
+
COMPUTE_SIF_PATH + COMPUTE_ENTRYPOINT + COMPUTE_SCRIPT_EXT
|
|
209
|
+
- Override: invocation `container` block can point at a shared SIF or
|
|
210
|
+
pull a Docker image at runtime
|
|
211
|
+
|
|
212
|
+
The user script string is base64-encoded into the SBATCH script and
|
|
213
|
+
written to /tmp/quapp_job.<ext> on the compute node, then executed
|
|
214
|
+
through Apptainer with the configured entrypoint.
|
|
215
|
+
"""
|
|
151
216
|
resources = self.hpc_config.get("resources", {})
|
|
152
217
|
container = self.hpc_config.get("container", {})
|
|
153
218
|
environment = self.hpc_config.get("environment", {})
|
|
154
219
|
input_s3_paths = self.hpc_config.get("input_s3_paths", [])
|
|
155
220
|
|
|
221
|
+
sif_path, entrypoint, script_ext = _resolve_compute_container(container)
|
|
222
|
+
script_file = f"/tmp/quapp_job.{script_ext}"
|
|
223
|
+
|
|
224
|
+
# Entrypoint may reference the canonical filename; rewrite to actual extension
|
|
225
|
+
entrypoint_resolved = entrypoint.replace("/tmp/quapp_job.py", script_file) \
|
|
226
|
+
.replace("/tmp/quapp_job.sh", script_file)
|
|
227
|
+
|
|
228
|
+
self.logger.info(
|
|
229
|
+
"Compute container resolved: sif=%s entrypoint=%s script_file=%s",
|
|
230
|
+
sif_path, entrypoint_resolved, script_file,
|
|
231
|
+
)
|
|
232
|
+
|
|
156
233
|
lines = ["#!/bin/bash"]
|
|
157
234
|
|
|
158
235
|
# ── SBATCH directives ─────────────────────────────────────────────────
|
|
@@ -191,27 +268,16 @@ class SlurmDevice(Device):
|
|
|
191
268
|
if environment:
|
|
192
269
|
lines.append("")
|
|
193
270
|
|
|
194
|
-
# ──
|
|
195
|
-
container_type = container.get("type", "none")
|
|
196
|
-
image = container.get("image", "")
|
|
197
|
-
if container_type == "sif" and image:
|
|
198
|
-
exec_prefix = f"apptainer exec /data/containers/{image}"
|
|
199
|
-
elif container_type == "docker" and image:
|
|
200
|
-
exec_prefix = f"apptainer exec docker://{image}"
|
|
201
|
-
else:
|
|
202
|
-
exec_prefix = ""
|
|
203
|
-
|
|
204
|
-
# ── Write and run user script ─────────────────────────────────────────
|
|
271
|
+
# ── Write and run user script inside Apptainer ────────────────────────
|
|
205
272
|
b64 = base64.b64encode(user_script.encode()).decode()
|
|
273
|
+
exec_prefix = f"apptainer exec {shlex.quote(sif_path)}" \
|
|
274
|
+
if not sif_path.startswith("docker://") \
|
|
275
|
+
else f"apptainer exec {sif_path}"
|
|
206
276
|
lines += [
|
|
207
|
-
f"echo {shlex.quote(b64)} | base64 -d > /tmp/quapp_job.sh",
|
|
208
|
-
"chmod +x /tmp/quapp_job.sh",
|
|
277
|
+
f"echo {shlex.quote(b64)} | base64 -d > {script_file}",
|
|
278
|
+
f"chmod +x {script_file}",
|
|
209
279
|
"",
|
|
210
|
-
|
|
211
|
-
run_cmd = "bash /tmp/quapp_job.sh"
|
|
212
|
-
full_cmd = f"{exec_prefix} {run_cmd}".strip()
|
|
213
|
-
lines += [
|
|
214
|
-
f"{full_cmd} > /tmp/quapp_stdout.txt 2>/tmp/quapp_stderr.txt || true",
|
|
280
|
+
f"{exec_prefix} {entrypoint_resolved} > /tmp/quapp_stdout.txt 2>/tmp/quapp_stderr.txt || true",
|
|
215
281
|
"echo $? > /tmp/quapp_exit_code.txt",
|
|
216
282
|
"",
|
|
217
283
|
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|