quapp-hpc 0.0.1.dev3__tar.gz → 0.0.1.dev5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {quapp_hpc-0.0.1.dev3/quapp_hpc.egg-info → quapp_hpc-0.0.1.dev5}/PKG-INFO +2 -2
  2. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/pyproject.toml +2 -2
  3. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/component/backend/hpc_invocation.py +3 -1
  4. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/factory/hpc_provider_factory.py +1 -1
  5. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/model/device/slurm_device.py +90 -20
  6. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/model/provider/slurm_provider.py +1 -1
  7. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5/quapp_hpc.egg-info}/PKG-INFO +2 -2
  8. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc.egg-info/requires.txt +1 -1
  9. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/LICENSE +0 -0
  10. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/README.md +0 -0
  11. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/__init__.py +0 -0
  12. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/component/__init__.py +0 -0
  13. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/component/backend/__init__.py +0 -0
  14. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/component/backend/slurm_job_fetching.py +0 -0
  15. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/factory/__init__.py +0 -0
  16. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/factory/hpc_device_factory.py +0 -0
  17. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/factory/hpc_handler_factory.py +0 -0
  18. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/handler/__init__.py +0 -0
  19. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/handler/invocation_handler.py +0 -0
  20. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/handler/job_fetching_handler.py +0 -0
  21. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/model/__init__.py +0 -0
  22. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/model/device/__init__.py +0 -0
  23. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc/model/provider/__init__.py +0 -0
  24. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc.egg-info/SOURCES.txt +0 -0
  25. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc.egg-info/dependency_links.txt +0 -0
  26. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/quapp_hpc.egg-info/top_level.txt +0 -0
  27. {quapp_hpc-0.0.1.dev3 → quapp_hpc-0.0.1.dev5}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: quapp-hpc
3
- Version: 0.0.1.dev3
3
+ Version: 0.0.1.dev5
4
4
  Summary: Quapp HPC library — Slurm integration for Quapp Platform
5
5
  Author-email: "CITYNOW Co. Ltd." <corp@citynow.vn>
6
6
  License: The MIT License (MIT)
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3
19
19
  Requires-Python: <3.13,>=3.10
20
20
  Description-Content-Type: text/markdown
21
21
  License-File: LICENSE
22
- Requires-Dist: quapp-common==0.0.12.dev3
22
+ Requires-Dist: quapp-common==0.0.12.dev12
23
23
  Requires-Dist: requests>=2.31.0
24
24
  Requires-Dist: boto3>=1.28.0
25
25
  Provides-Extra: dev
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "quapp-hpc"
7
- version = "0.0.1.dev3"
7
+ version = "0.0.1.dev5"
8
8
  description = "Quapp HPC library — Slurm integration for Quapp Platform"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "CITYNOW Co. Ltd.", email = "corp@citynow.vn" }]
@@ -16,7 +16,7 @@ classifiers = [
16
16
  ]
17
17
  keywords = ["quapp", "quapp-hpc", "slurm", "hpc"]
18
18
  dependencies = [
19
- "quapp-common==0.0.12.dev3",
19
+ "quapp-common==0.0.12.dev12",
20
20
  "requests>=2.31.0",
21
21
  "boto3>=1.28.0",
22
22
  ]
@@ -12,12 +12,14 @@ class HpcInvocation(Invocation):
12
12
  super().__init__(request_data)
13
13
  raw = request_data.input or {}
14
14
  job = raw.get("job", {})
15
+ # s3Bucket intentionally dropped — S3 bucket is system config, read by
16
+ # SlurmDevice from the S3_BUCKET env var (slurm-credentials secret).
17
+ # User input must not override system fields (HPC design philosophy).
15
18
  self._hpc_config = {
16
19
  "resources": raw.get("resources", {}),
17
20
  "container": raw.get("container", {}),
18
21
  "environment": job.get("environment", {}),
19
22
  "input_s3_paths": job.get("input_s3_paths", []),
20
- "s3_bucket": raw.get("s3Bucket", ""),
21
23
  }
22
24
 
23
25
  def _export_circuit(self, circuit):
@@ -15,7 +15,7 @@ class HpcProviderFactory:
15
15
  def create_provider(provider_type: ProviderTag, authentication: dict):
16
16
  logger.debug(f"Creating HPC provider: {provider_type}")
17
17
 
18
- if provider_type == ProviderTag.SLURM_HPC:
18
+ if provider_type == ProviderTag.QUAPP_HPC:
19
19
  jwt = authentication.get("slurm_jwt") or SLURM_JWT
20
20
  if not jwt:
21
21
  raise ValueError("SLURM_JWT not set — cannot authenticate with Slurm API")
@@ -14,12 +14,21 @@ from quapp_common.model.provider.provider import Provider
14
14
 
15
15
  from ..provider.slurm_provider import SlurmProvider, SLURM_ACCOUNT
16
16
 
17
+ # ── System config (read from K8s secret slurm-credentials at ksvc startup) ───
17
18
  S3_BUCKET = os.getenv("S3_BUCKET", "quapp-slurm-output-dev")
18
19
  AWS_REGION = os.getenv("AWS_REGION", "ap-southeast-1")
19
20
  SLURM_POLL_SEC = int(os.getenv("SLURM_POLL_SEC", "30"))
20
21
  SLURM_TIMEOUT_SEC = int(os.getenv("SLURM_TIMEOUT_SEC", "21600")) # 6 hours
21
22
  SLURM_TIME_LIMIT = int(os.getenv("SLURM_TIME_LIMIT_MIN", "60"))
22
23
 
24
+ # ── Compute runtime config (injected by builder when creating ksvc) ──────────
25
+ # Builder sets these env vars based on the function's qapp_compute_runtime row.
26
+ # The path is resolved per-function: /data/containers/functions/<fn_hash>_<fn_tag>.sif
27
+ # CTS knative_builder/function/handler.sh adds these via --env to `kn ksvc create`.
28
+ COMPUTE_SIF_PATH = os.getenv("COMPUTE_SIF_PATH")
29
+ COMPUTE_ENTRYPOINT = os.getenv("COMPUTE_ENTRYPOINT", "python3 /tmp/quapp_job.py")
30
+ COMPUTE_SCRIPT_EXT = os.getenv("COMPUTE_SCRIPT_EXT", "py")
31
+
23
32
  _TERMINAL_STATES = {"COMPLETED", "FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL", "PREEMPTED"}
24
33
  _DONE_STATE = "COMPLETED"
25
34
 
@@ -44,6 +53,53 @@ _COLLECT_PY = (
44
53
  )
45
54
 
46
55
 
56
+ def _resolve_compute_container(container_cfg: dict) -> tuple[str, str, str]:
57
+ """Resolve Apptainer image path, entrypoint command, and script extension.
58
+
59
+ Resolution order (first match wins):
60
+ 1. User override via invocation `container`:
61
+ container.type = 'sif' + container.image → /data/containers/<image>
62
+ container.type = 'docker' + container.image → docker://<image>
63
+ 2. Builder-injected env vars (per-function SIF from build step):
64
+ COMPUTE_SIF_PATH + COMPUTE_ENTRYPOINT + COMPUTE_SCRIPT_EXT
65
+ 3. Hard error — no valid configuration. Container is mandatory; running user
66
+ scripts on bare metal is not supported (see HPC design philosophy).
67
+ """
68
+ container_type = (container_cfg or {}).get("type", "").lower()
69
+ user_image = (container_cfg or {}).get("image", "")
70
+ user_entry = (container_cfg or {}).get("entrypoint")
71
+ user_ext = (container_cfg or {}).get("script_extension")
72
+
73
+ # 1. User override — explicit container choice in invocation input
74
+ if container_type in ("sif", "docker"):
75
+ if not user_image:
76
+ raise ValueError(
77
+ f"container.type={container_type!r} requires container.image"
78
+ )
79
+ if container_type == "sif":
80
+ sif_path = f"/data/containers/{user_image}"
81
+ else:
82
+ sif_path = f"docker://{user_image}"
83
+ return (
84
+ sif_path,
85
+ user_entry or COMPUTE_ENTRYPOINT,
86
+ user_ext or COMPUTE_SCRIPT_EXT,
87
+ )
88
+
89
+ # 2. Builder-injected default — per-function SIF built at deploy time
90
+ if COMPUTE_SIF_PATH:
91
+ return (COMPUTE_SIF_PATH, COMPUTE_ENTRYPOINT, COMPUTE_SCRIPT_EXT)
92
+
93
+ # 3. No path resolved → fail explicitly. Running outside a container is not
94
+ # supported per the Quapp-HPC design philosophy (container-first sandboxing).
95
+ raise RuntimeError(
96
+ "No compute container configured. Either the function pod is missing the "
97
+ "COMPUTE_SIF_PATH env var (builder did not produce a per-function SIF — "
98
+ "rebuild required) or the invocation must explicitly provide "
99
+ "container.type and container.image."
100
+ )
101
+
102
+
47
103
  class SlurmDevice(Device):
48
104
 
49
105
  def __init__(
@@ -58,7 +114,9 @@ class SlurmDevice(Device):
58
114
  self.slurm: SlurmProvider = provider
59
115
  self.logger = job_logger(job_uuid)
60
116
  self.hpc_config = hpc_config or {}
61
- self.s3_bucket = self.hpc_config.get("s3_bucket") or S3_BUCKET
117
+ # S3 bucket is a system field — read only from env (no input override).
118
+ # Per HPC design philosophy: system fields are not user-controlled.
119
+ self.s3_bucket = S3_BUCKET
62
120
 
63
121
  # ── Abstract method implementations ──────────────────────────────────────
64
122
 
@@ -143,12 +201,35 @@ class SlurmDevice(Device):
143
201
  # ── SBATCH script builder ─────────────────────────────────────────────────
144
202
 
145
203
  def _build_sbatch_script(self, user_script: str) -> str:
146
- """Build the full SBATCH bash script that wraps the user's computation."""
204
+ """Build the full SBATCH bash script that wraps the user's computation.
205
+
206
+ Container resolution (see _resolve_compute_container):
207
+ - Default: per-function SIF built at deploy time, identified by env
208
+ COMPUTE_SIF_PATH + COMPUTE_ENTRYPOINT + COMPUTE_SCRIPT_EXT
209
+ - Override: invocation `container` block can point at a shared SIF or
210
+ pull a Docker image at runtime
211
+
212
+ The user script string is base64-encoded into the SBATCH script and
213
+ written to /tmp/quapp_job.<ext> on the compute node, then executed
214
+ through Apptainer with the configured entrypoint.
215
+ """
147
216
  resources = self.hpc_config.get("resources", {})
148
217
  container = self.hpc_config.get("container", {})
149
218
  environment = self.hpc_config.get("environment", {})
150
219
  input_s3_paths = self.hpc_config.get("input_s3_paths", [])
151
220
 
221
+ sif_path, entrypoint, script_ext = _resolve_compute_container(container)
222
+ script_file = f"/tmp/quapp_job.{script_ext}"
223
+
224
+ # Entrypoint may reference the canonical filename; rewrite to actual extension
225
+ entrypoint_resolved = entrypoint.replace("/tmp/quapp_job.py", script_file) \
226
+ .replace("/tmp/quapp_job.sh", script_file)
227
+
228
+ self.logger.info(
229
+ "Compute container resolved: sif=%s entrypoint=%s script_file=%s",
230
+ sif_path, entrypoint_resolved, script_file,
231
+ )
232
+
152
233
  lines = ["#!/bin/bash"]
153
234
 
154
235
  # ── SBATCH directives ─────────────────────────────────────────────────
@@ -187,27 +268,16 @@ class SlurmDevice(Device):
187
268
  if environment:
188
269
  lines.append("")
189
270
 
190
- # ── Container exec prefix ─────────────────────────────────────────────
191
- container_type = container.get("type", "none")
192
- image = container.get("image", "")
193
- if container_type == "sif" and image:
194
- exec_prefix = f"apptainer exec /data/containers/{image}"
195
- elif container_type == "docker" and image:
196
- exec_prefix = f"apptainer exec docker://{image}"
197
- else:
198
- exec_prefix = ""
199
-
200
- # ── Write and run user script ─────────────────────────────────────────
271
+ # ── Write and run user script inside Apptainer ────────────────────────
201
272
  b64 = base64.b64encode(user_script.encode()).decode()
273
+ exec_prefix = f"apptainer exec {shlex.quote(sif_path)}" \
274
+ if not sif_path.startswith("docker://") \
275
+ else f"apptainer exec {sif_path}"
202
276
  lines += [
203
- f"echo {shlex.quote(b64)} | base64 -d > /tmp/quapp_job.sh",
204
- "chmod +x /tmp/quapp_job.sh",
277
+ f"echo {shlex.quote(b64)} | base64 -d > {script_file}",
278
+ f"chmod +x {script_file}",
205
279
  "",
206
- ]
207
- run_cmd = "bash /tmp/quapp_job.sh"
208
- full_cmd = f"{exec_prefix} {run_cmd}".strip()
209
- lines += [
210
- f"{full_cmd} > /tmp/quapp_stdout.txt 2>/tmp/quapp_stderr.txt || true",
280
+ f"{exec_prefix} {entrypoint_resolved} > /tmp/quapp_stdout.txt 2>/tmp/quapp_stderr.txt || true",
211
281
  "echo $? > /tmp/quapp_exit_code.txt",
212
282
  "",
213
283
  ]
@@ -15,7 +15,7 @@ SLURM_ACCOUNT = os.getenv("SLURM_ACCOUNT", "quapp")
15
15
  class SlurmProvider(Provider):
16
16
 
17
17
  def __init__(self, jwt_token: str):
18
- super().__init__(ProviderTag.SLURM_HPC)
18
+ super().__init__(ProviderTag.QUAPP_HPC)
19
19
  self.jwt_token = jwt_token
20
20
  self.base_url = f"{SLURM_API_URL}/slurm/{SLURM_API_VER}"
21
21
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: quapp-hpc
3
- Version: 0.0.1.dev3
3
+ Version: 0.0.1.dev5
4
4
  Summary: Quapp HPC library — Slurm integration for Quapp Platform
5
5
  Author-email: "CITYNOW Co. Ltd." <corp@citynow.vn>
6
6
  License: The MIT License (MIT)
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3
19
19
  Requires-Python: <3.13,>=3.10
20
20
  Description-Content-Type: text/markdown
21
21
  License-File: LICENSE
22
- Requires-Dist: quapp-common==0.0.12.dev3
22
+ Requires-Dist: quapp-common==0.0.12.dev12
23
23
  Requires-Dist: requests>=2.31.0
24
24
  Requires-Dist: boto3>=1.28.0
25
25
  Provides-Extra: dev
@@ -1,4 +1,4 @@
1
- quapp-common==0.0.12.dev3
1
+ quapp-common==0.0.12.dev12
2
2
  requests>=2.31.0
3
3
  boto3>=1.28.0
4
4
 
File without changes
File without changes
File without changes