quapp-hpc 0.0.1.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. quapp_hpc-0.0.1.dev1/LICENSE +8 -0
  2. quapp_hpc-0.0.1.dev1/PKG-INFO +141 -0
  3. quapp_hpc-0.0.1.dev1/README.md +109 -0
  4. quapp_hpc-0.0.1.dev1/pyproject.toml +33 -0
  5. quapp_hpc-0.0.1.dev1/quapp_hpc/__init__.py +3 -0
  6. quapp_hpc-0.0.1.dev1/quapp_hpc/component/__init__.py +0 -0
  7. quapp_hpc-0.0.1.dev1/quapp_hpc/component/backend/__init__.py +0 -0
  8. quapp_hpc-0.0.1.dev1/quapp_hpc/component/backend/hpc_invocation.py +41 -0
  9. quapp_hpc-0.0.1.dev1/quapp_hpc/component/backend/slurm_job_fetching.py +99 -0
  10. quapp_hpc-0.0.1.dev1/quapp_hpc/factory/__init__.py +0 -0
  11. quapp_hpc-0.0.1.dev1/quapp_hpc/factory/hpc_device_factory.py +19 -0
  12. quapp_hpc-0.0.1.dev1/quapp_hpc/factory/hpc_handler_factory.py +30 -0
  13. quapp_hpc-0.0.1.dev1/quapp_hpc/factory/hpc_provider_factory.py +24 -0
  14. quapp_hpc-0.0.1.dev1/quapp_hpc/handler/__init__.py +0 -0
  15. quapp_hpc-0.0.1.dev1/quapp_hpc/handler/invocation_handler.py +23 -0
  16. quapp_hpc-0.0.1.dev1/quapp_hpc/handler/job_fetching_handler.py +18 -0
  17. quapp_hpc-0.0.1.dev1/quapp_hpc/model/__init__.py +0 -0
  18. quapp_hpc-0.0.1.dev1/quapp_hpc/model/device/__init__.py +0 -0
  19. quapp_hpc-0.0.1.dev1/quapp_hpc/model/device/slurm_device.py +255 -0
  20. quapp_hpc-0.0.1.dev1/quapp_hpc/model/provider/__init__.py +0 -0
  21. quapp_hpc-0.0.1.dev1/quapp_hpc/model/provider/slurm_provider.py +40 -0
  22. quapp_hpc-0.0.1.dev1/quapp_hpc.egg-info/PKG-INFO +141 -0
  23. quapp_hpc-0.0.1.dev1/quapp_hpc.egg-info/SOURCES.txt +25 -0
  24. quapp_hpc-0.0.1.dev1/quapp_hpc.egg-info/dependency_links.txt +1 -0
  25. quapp_hpc-0.0.1.dev1/quapp_hpc.egg-info/requires.txt +10 -0
  26. quapp_hpc-0.0.1.dev1/quapp_hpc.egg-info/top_level.txt +1 -0
  27. quapp_hpc-0.0.1.dev1/setup.cfg +4 -0
@@ -0,0 +1,8 @@
1
+ The MIT License (MIT)
2
+ Copyright © CITYNOW Co. Ltd. All rights reserved.
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5
+
6
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7
+
8
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,141 @@
1
+ Metadata-Version: 2.4
2
+ Name: quapp-hpc
3
+ Version: 0.0.1.dev1
4
+ Summary: Quapp HPC library — Slurm integration for Quapp Platform
5
+ Author-email: "CITYNOW Co. Ltd." <corp@citynow.vn>
6
+ License: The MIT License (MIT)
7
+ Copyright © CITYNOW Co. Ltd. All rights reserved.
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
14
+ Project-URL: Homepage, https://quapp.cloud/
15
+ Keywords: quapp,quapp-hpc,slurm,hpc
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python
18
+ Classifier: Programming Language :: Python :: 3
19
+ Requires-Python: <3.13,>=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: quapp-common==0.0.11.dev9
23
+ Requires-Dist: requests>=2.31.0
24
+ Requires-Dist: boto3>=1.28.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: black; extra == "dev"
27
+ Requires-Dist: bumpver; extra == "dev"
28
+ Requires-Dist: isort; extra == "dev"
29
+ Requires-Dist: pip-tools; extra == "dev"
30
+ Requires-Dist: pytest; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # quapp-hpc
34
+
35
+ Python library cho Quapp HPC functions — cầu nối giữa Quapp FaaS platform và Slurm HPC cluster.
36
+
37
+ ## Architecture
38
+
39
+ ```
40
+ ksvc (Docker)
41
+ ├── index.py FastAPI server
42
+ ├── quapp_hpc/
43
+ │ ├── factory/
44
+ │ │ └── hpc_handler_factory.py Entry point cho user
45
+ │ ├── component/backend/
46
+ │ │ └── hpc_invocation.py Orchestrates job lifecycle
47
+ │ └── model/
48
+ │ ├── provider/slurm_provider.py Auth headers, base URL
49
+ │ └── device/slurm_device.py Submit → Poll → S3 download
50
+ └── function/
51
+ └── handler.py User-defined processing() + post_processing()
52
+ ```
53
+
54
+ ## Luồng thực thi
55
+
56
+ ```
57
+ index.py nhận HTTP POST
58
+ → HpcHandlerFactory.create_handler(event, processing_fn, post_processing_fn)
59
+ → InvocationHandler.handle()
60
+ → HpcInvocation.submit_job()
61
+ 1. processing_fn(invocation_input) → bash script string
62
+ 2. SlurmDevice._create_job(script) → POST Slurm REST API → slurm_job_id
63
+ 3. SlurmDevice._get_job_result() → poll mỗi 30s → COMPLETED/FAILED
64
+ 4. SlurmDevice._download_s3_result()→ boto3 get s3://$S3_BUCKET/$JOB_UUID/output.json
65
+ 5. post_processing_fn(s3_result) → final response
66
+ ```
67
+
68
+ ## Environment variables (từ K8s Secret `slurm-credentials`)
69
+
70
+ | Var | Ví dụ | Mô tả |
71
+ |-----|-------|-------|
72
+ | `SLURM_API_URL` | `http://10.1.0.15:6820` | Slurm REST API base URL |
73
+ | `SLURM_JWT` | `eyJ...` | JWT token cho Slurm auth |
74
+ | `SLURM_USERNAME` | `quapp-svc` | Slurm username |
75
+ | `SLURM_ACCOUNT` | `quapp` | Slurm account/allocation |
76
+ | `S3_BUCKET` | `quapp-slurm-output-dev` | S3 bucket cho job output |
77
+ | `AWS_REGION` | `ap-southeast-1` | AWS region |
78
+ | `SLURM_POLL_SEC` | `30` | Polling interval (giây) |
79
+ | `SLURM_TIMEOUT_SEC` | `21600` | Max wait time (giây, default 6h) |
80
+
81
+ ## invocation_input schema
82
+
83
+ Xem chi tiết tại [`../../qapp-sdk-templates/slurm-hpc/README.md`](../../qapp-sdk-templates/slurm-hpc/README.md).
84
+
85
+ Tóm tắt:
86
+ ```json
87
+ {
88
+ "resources": { "partition", "nodes", "cpus_per_task", "gpus", "memory_gb", "time_limit" },
89
+ "container": { "type": "sif"|"docker"|"none", "image": "..." },
90
+ "job": { "type": "script"|"command", "script"|"command": "...", "environment": {}, "input_s3_paths": [] }
91
+ }
92
+ ```
93
+
94
+ ## Slurm REST API
95
+
96
+ - Version: `v0.0.40`
97
+ - Submit: `POST {SLURM_API_URL}/slurm/v0.0.40/job/submit`
98
+ - Status: `GET {SLURM_API_URL}/slurm/v0.0.40/job/{job_id}`
99
+ - Auth headers: `X-SLURM-USER-NAME`, `X-SLURM-USER-TOKEN`
100
+
101
+ ### Job state mapping
102
+
103
+ | Slurm state | Quapp state |
104
+ |---|---|
105
+ | PENDING, CONFIGURING, RUNNING, COMPLETING | RUNNING |
106
+ | COMPLETED | DONE |
107
+ | FAILED, CANCELLED, TIMEOUT, NODE_FAIL, PREEMPTED | ERROR |
108
+
109
+ ## S3 output pattern
110
+
111
+ Job script phải upload kết quả:
112
+ ```bash
113
+ aws s3 cp /tmp/output.json s3://$S3_BUCKET/$JOB_UUID/output.json
114
+ ```
115
+
116
+ `$JOB_UUID` và `$S3_BUCKET` được inject tự động bởi `SlurmDevice._create_job()` qua Slurm `environment` array.
117
+
118
+ ## K8s Secret
119
+
120
+ ```yaml
121
+ # infrastructure/quapp-job-scheduler/k8s/cts/slurm-secret.yaml
122
+ apiVersion: v1
123
+ kind: Secret
124
+ metadata:
125
+ name: slurm-credentials
126
+ namespace: quapp-functions-dev
127
+ stringData:
128
+ SLURM_JWT: "<generate: sudo scontrol token username=quapp-svc lifespan=2592000>"
129
+ SLURM_API_URL: "http://10.1.0.15:6820"
130
+ SLURM_USERNAME: "quapp-svc"
131
+ SLURM_ACCOUNT: "quapp"
132
+ S3_BUCKET: "quapp-slurm-output-dev"
133
+ AWS_REGION: "ap-southeast-1"
134
+ ```
135
+
136
+ ## DB seed required
137
+
138
+ Chạy script trước khi deploy:
139
+ ```
140
+ infrastructure/quapp-functions-backend/docs/db/seed_slurm_hpc.sql
141
+ ```
@@ -0,0 +1,109 @@
1
+ # quapp-hpc
2
+
3
+ Python library cho Quapp HPC functions — cầu nối giữa Quapp FaaS platform và Slurm HPC cluster.
4
+
5
+ ## Architecture
6
+
7
+ ```
8
+ ksvc (Docker)
9
+ ├── index.py FastAPI server
10
+ ├── quapp_hpc/
11
+ │ ├── factory/
12
+ │ │ └── hpc_handler_factory.py Entry point cho user
13
+ │ ├── component/backend/
14
+ │ │ └── hpc_invocation.py Orchestrates job lifecycle
15
+ │ └── model/
16
+ │ ├── provider/slurm_provider.py Auth headers, base URL
17
+ │ └── device/slurm_device.py Submit → Poll → S3 download
18
+ └── function/
19
+ └── handler.py User-defined processing() + post_processing()
20
+ ```
21
+
22
+ ## Luồng thực thi
23
+
24
+ ```
25
+ index.py nhận HTTP POST
26
+ → HpcHandlerFactory.create_handler(event, processing_fn, post_processing_fn)
27
+ → InvocationHandler.handle()
28
+ → HpcInvocation.submit_job()
29
+ 1. processing_fn(invocation_input) → bash script string
30
+ 2. SlurmDevice._create_job(script) → POST Slurm REST API → slurm_job_id
31
+ 3. SlurmDevice._get_job_result() → poll mỗi 30s → COMPLETED/FAILED
32
+ 4. SlurmDevice._download_s3_result()→ boto3 get s3://$S3_BUCKET/$JOB_UUID/output.json
33
+ 5. post_processing_fn(s3_result) → final response
34
+ ```
35
+
36
+ ## Environment variables (từ K8s Secret `slurm-credentials`)
37
+
38
+ | Var | Ví dụ | Mô tả |
39
+ |-----|-------|-------|
40
+ | `SLURM_API_URL` | `http://10.1.0.15:6820` | Slurm REST API base URL |
41
+ | `SLURM_JWT` | `eyJ...` | JWT token cho Slurm auth |
42
+ | `SLURM_USERNAME` | `quapp-svc` | Slurm username |
43
+ | `SLURM_ACCOUNT` | `quapp` | Slurm account/allocation |
44
+ | `S3_BUCKET` | `quapp-slurm-output-dev` | S3 bucket cho job output |
45
+ | `AWS_REGION` | `ap-southeast-1` | AWS region |
46
+ | `SLURM_POLL_SEC` | `30` | Polling interval (giây) |
47
+ | `SLURM_TIMEOUT_SEC` | `21600` | Max wait time (giây, default 6h) |
48
+
49
+ ## invocation_input schema
50
+
51
+ Xem chi tiết tại [`../../qapp-sdk-templates/slurm-hpc/README.md`](../../qapp-sdk-templates/slurm-hpc/README.md).
52
+
53
+ Tóm tắt:
54
+ ```json
55
+ {
56
+ "resources": { "partition", "nodes", "cpus_per_task", "gpus", "memory_gb", "time_limit" },
57
+ "container": { "type": "sif"|"docker"|"none", "image": "..." },
58
+ "job": { "type": "script"|"command", "script"|"command": "...", "environment": {}, "input_s3_paths": [] }
59
+ }
60
+ ```
61
+
62
+ ## Slurm REST API
63
+
64
+ - Version: `v0.0.40`
65
+ - Submit: `POST {SLURM_API_URL}/slurm/v0.0.40/job/submit`
66
+ - Status: `GET {SLURM_API_URL}/slurm/v0.0.40/job/{job_id}`
67
+ - Auth headers: `X-SLURM-USER-NAME`, `X-SLURM-USER-TOKEN`
68
+
69
+ ### Job state mapping
70
+
71
+ | Slurm state | Quapp state |
72
+ |---|---|
73
+ | PENDING, CONFIGURING, RUNNING, COMPLETING | RUNNING |
74
+ | COMPLETED | DONE |
75
+ | FAILED, CANCELLED, TIMEOUT, NODE_FAIL, PREEMPTED | ERROR |
76
+
77
+ ## S3 output pattern
78
+
79
+ Job script phải upload kết quả:
80
+ ```bash
81
+ aws s3 cp /tmp/output.json s3://$S3_BUCKET/$JOB_UUID/output.json
82
+ ```
83
+
84
+ `$JOB_UUID` và `$S3_BUCKET` được inject tự động bởi `SlurmDevice._create_job()` qua Slurm `environment` array.
85
+
86
+ ## K8s Secret
87
+
88
+ ```yaml
89
+ # infrastructure/quapp-job-scheduler/k8s/cts/slurm-secret.yaml
90
+ apiVersion: v1
91
+ kind: Secret
92
+ metadata:
93
+ name: slurm-credentials
94
+ namespace: quapp-functions-dev
95
+ stringData:
96
+ SLURM_JWT: "<generate: sudo scontrol token username=quapp-svc lifespan=2592000>"
97
+ SLURM_API_URL: "http://10.1.0.15:6820"
98
+ SLURM_USERNAME: "quapp-svc"
99
+ SLURM_ACCOUNT: "quapp"
100
+ S3_BUCKET: "quapp-slurm-output-dev"
101
+ AWS_REGION: "ap-southeast-1"
102
+ ```
103
+
104
+ ## DB seed required
105
+
106
+ Chạy script trước khi deploy:
107
+ ```
108
+ infrastructure/quapp-functions-backend/docs/db/seed_slurm_hpc.sql
109
+ ```
@@ -0,0 +1,33 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0.0", "wheel==0.45.1"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "quapp-hpc"
7
+ version = "0.0.1.dev1"
8
+ description = "Quapp HPC library — Slurm integration for Quapp Platform"
9
+ readme = "README.md"
10
+ authors = [{ name = "CITYNOW Co. Ltd.", email = "corp@citynow.vn" }]
11
+ license = { file = "LICENSE" }
12
+ classifiers = [
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python",
15
+ "Programming Language :: Python :: 3",
16
+ ]
17
+ keywords = ["quapp", "quapp-hpc", "slurm", "hpc"]
18
+ dependencies = [
19
+ "quapp-common==0.0.11.dev9",
20
+ "requests>=2.31.0",
21
+ "boto3>=1.28.0",
22
+ ]
23
+ requires-python = ">=3.10,<3.13"
24
+
25
+ [project.optional-dependencies]
26
+ dev = ["black", "bumpver", "isort", "pip-tools", "pytest"]
27
+
28
+ [tool.setuptools.packages.find]
29
+ include = ["quapp_hpc*"]
30
+ exclude = ["*.md", "*.yml", "*.yaml", "*.toml", "tests*", ".gitignore"]
31
+
32
+ [project.urls]
33
+ Homepage = "https://quapp.cloud/"
@@ -0,0 +1,3 @@
1
+ from .factory.hpc_handler_factory import HpcHandlerFactory
2
+
3
+ __all__ = ["HpcHandlerFactory"]
File without changes
@@ -0,0 +1,41 @@
1
+ from quapp_common.component.backend.invocation import Invocation
2
+ from quapp_common.data.request.invocation_request import InvocationRequest
3
+ from quapp_common.model.provider.provider import Provider
4
+
5
+ from ...factory.hpc_device_factory import HpcDeviceFactory
6
+ from ...factory.hpc_provider_factory import HpcProviderFactory
7
+
8
+
9
class HpcInvocation(Invocation):
    """Invocation that creates a Slurm provider and device for an HPC job.

    The HPC-specific parts of the request input (resources, container,
    job environment, S3 inputs, bucket override) are extracted once in the
    constructor and handed to the device factory when the device is created.
    """

    def __init__(self, request_data: InvocationRequest):
        """Capture the HPC configuration from ``request_data.input``.

        A missing/empty input is treated as an empty dict, so every entry
        falls back to its default.
        """
        super().__init__(request_data)
        payload = request_data.input or {}
        job_section = payload.get("job", {})

        hpc_config = {}
        hpc_config["resources"] = payload.get("resources", {})
        hpc_config["container"] = payload.get("container", {})
        # Environment and S3 inputs live under the nested "job" section.
        hpc_config["environment"] = job_section.get("environment", {})
        hpc_config["input_s3_paths"] = job_section.get("input_s3_paths", [])
        # Note the camelCase key on the request side.
        hpc_config["s3_bucket"] = payload.get("s3Bucket", "")
        self._hpc_config = hpc_config

    def _export_circuit(self, circuit):
        """Do nothing; this invocation has no circuit export step."""
        return None

    def _get_qubit_amount(self, circuit) -> int:
        """Always report zero qubits."""
        return 0

    def _create_provider(self) -> Provider:
        """Build the provider from the backend's provider tag and authentication."""
        backend = self.backend_information
        return HpcProviderFactory.create_provider(
            provider_type=backend.provider_tag,
            authentication=backend.authentication,
        )

    def _create_device(self, provider: Provider):
        """Build the device for this job, passing along the captured HPC config."""
        device_name = self.backend_information.device_name
        return HpcDeviceFactory.create_device(
            provider=provider,
            device_specification=device_name,
            job_uuid=self.job_id,
            hpc_config=self._hpc_config,
        )
@@ -0,0 +1,99 @@
1
+ import json
2
+ import os
3
+
4
+ import boto3
5
+ import requests
6
+ from quapp_common.component.backend.job_fetching import JobFetching
7
+ from quapp_common.config.logging_config import job_logger
8
+ from quapp_common.data.request.job_fetching_request import JobFetchingRequest
9
+ from quapp_common.enum.status.job_status import JobStatus
10
+
11
+ from ...model.provider.slurm_provider import SlurmProvider
12
+
13
# Environment-provided fallbacks, read once at import time.
SLURM_JWT = os.getenv("SLURM_JWT", "")
S3_BUCKET = os.getenv("S3_BUCKET", "quapp-slurm-output-dev")
AWS_REGION = os.getenv("AWS_REGION", "ap-southeast-1")

# Translation from Slurm job states to platform job status values.
# States that are not listed here are treated as still running by the caller,
# so every terminal failure state must be present or the job is never
# reported as finished.
_SLURM_TO_JOB_STATUS = {
    # Not finished yet.
    "PENDING": JobStatus.RUNNING.value,
    "CONFIGURING": JobStatus.RUNNING.value,
    "RUNNING": JobStatus.RUNNING.value,
    "COMPLETING": JobStatus.RUNNING.value,
    # Finished successfully.
    "COMPLETED": JobStatus.DONE.value,
    # Finished unsuccessfully.
    "FAILED": JobStatus.ERROR.value,
    "CANCELLED": JobStatus.ERROR.value,
    "TIMEOUT": JobStatus.ERROR.value,
    "NODE_FAIL": JobStatus.ERROR.value,
    "PREEMPTED": JobStatus.ERROR.value,
    "BOOT_FAIL": JobStatus.ERROR.value,
    "DEADLINE": JobStatus.ERROR.value,
    "OUT_OF_MEMORY": JobStatus.ERROR.value,
}
29
+
30
+
31
class _SlurmJobResult:
    """Handle to a Slurm job's ``output.json`` stored in S3.

    Nothing is downloaded at construction time; ``result()`` performs the
    S3 read when it is called. ``usage()`` returns ``None`` so that
    ``JobFetching.__get_execution_time()`` doesn't raise.
    """

    def __init__(self, job_uuid: str, s3_bucket: str = S3_BUCKET, aws_region: str = AWS_REGION):
        """Remember where the job's output lives and create a job-scoped logger."""
        self._logger = job_logger(job_uuid)
        self._job_uuid = job_uuid
        self._s3_bucket = s3_bucket
        self._aws_region = aws_region

    def usage(self):
        """Return ``None``; no usage information is provided."""
        return None

    def result(self):
        """Download ``<job_uuid>/output.json`` from the bucket and return it parsed as JSON."""
        object_key = f"{self._job_uuid}/output.json"
        self._logger.info(f"Downloading result from s3://{self._s3_bucket}/{object_key}")

        client = boto3.client("s3", region_name=self._aws_region)
        s3_object = client.get_object(Bucket=self._s3_bucket, Key=object_key)
        raw_body = s3_object["Body"].read()
        return json.loads(raw_body)
53
+
54
+
55
class SlurmJobFetching(JobFetching):
    """Job-fetching component that asks the Slurm REST API for a job's state."""

    def __init__(self, request_data: JobFetchingRequest):
        """Keep the platform job id and create a logger scoped to it."""
        super().__init__(request_data)
        platform_job_id = request_data.job_id
        self.job_id = platform_job_id
        self._logger = job_logger(platform_job_id)

    def _collect_provider(self) -> SlurmProvider:
        """Create a ``SlurmProvider`` from the request's JWT, or the env fallback.

        Raises:
            ValueError: if neither the request nor ``SLURM_JWT`` supplies a token.
        """
        credentials = self.provider_authentication or {}
        token = credentials.get("slurm_jwt") or SLURM_JWT
        if token:
            return SlurmProvider(jwt_token=token)
        raise ValueError("SLURM_JWT not set — cannot authenticate with Slurm API")

    def _retrieve_job(self, provider: SlurmProvider) -> dict:
        """Bundle the Slurm job id with the provider; no remote call is made here."""
        job = {}
        job["slurm_job_id"] = self.provider_job_id
        job["provider"] = provider
        return job

    def _get_job_status(self, job: dict) -> str:
        """Query Slurm for the job and translate its state to a platform status.

        An empty job list, or a state that is not in the mapping table, is
        reported as still running.
        """
        slurm_job_id = job["slurm_job_id"]
        provider: SlurmProvider = job["provider"]
        still_running = JobStatus.RUNNING.value

        response = requests.get(
            f"{provider.base_url}/job/{slurm_job_id}",
            headers=provider.auth_headers(),
            timeout=15,
        )
        response.raise_for_status()
        body = response.json()

        # Without a "jobs" key the whole body is treated as the single job entry.
        entries = body.get("jobs", [body])
        if not entries:
            return still_running

        reported = entries[0].get("job_state", "UNKNOWN")
        if isinstance(reported, list):
            # A list of states: only the first entry is considered.
            reported = reported[0] if reported else "UNKNOWN"

        state = str(reported).strip()
        self._logger.info(f"Slurm job {slurm_job_id} state: {state}")
        return _SLURM_TO_JOB_STATUS.get(state, still_running)

    def _get_job_result(self, job: dict) -> _SlurmJobResult:
        """Return a result handle for this job; the S3 download happens in its ``result()``."""
        return _SlurmJobResult(
            job_uuid=self.job_id,
            s3_bucket=S3_BUCKET,
            aws_region=AWS_REGION,
        )
File without changes
@@ -0,0 +1,19 @@
1
+ from quapp_common.config.logging_config import job_logger
2
+ from quapp_common.model.provider.provider import Provider
3
+
4
+ from ..model.device.slurm_device import SlurmDevice
5
+
6
logger = job_logger('HpcDeviceFactory')


class HpcDeviceFactory:
    """Factory for the device objects used by HPC invocations."""

    @staticmethod
    def create_device(
        provider: Provider,
        device_specification: str,
        job_uuid: str,
        hpc_config: dict = None,
    ) -> SlurmDevice:
        """Create a ``SlurmDevice`` for ``job_uuid``.

        ``device_specification`` is logged as the partition; ``hpc_config``
        is forwarded to the device unchanged.
        """
        logger.debug(f"Creating SlurmDevice: partition={device_specification}")
        device = SlurmDevice(
            provider,
            device_specification,
            job_uuid,
            hpc_config=hpc_config,
        )
        return device
@@ -0,0 +1,30 @@
1
+ from quapp_common.config.logging_config import job_logger
2
+ from quapp_common.factory.handler_factory import HandlerFactory
3
+ from quapp_common.handler.handler import Handler
4
+
5
+ from ..handler.invocation_handler import InvocationHandler
6
+ from ..handler.job_fetching_handler import SlurmJobFetchingHandler
7
+
8
+
9
class HpcHandlerFactory(HandlerFactory):
    """Chooses the handler for an incoming HPC request."""

    @staticmethod
    def create_handler(event, circuit_preparation_fn, post_processing_fn) -> Handler:
        """Return the handler matching the request body of ``event``.

        A request carrying a truthy ``providerJobId`` is a status poll for an
        already-submitted Slurm job; anything else is a fresh invocation.
        """
        payload = event.json()
        log = job_logger(payload.get("jobId"))
        slurm_job_id = payload.get("providerJobId")

        if not slurm_job_id:
            log.debug("HpcHandlerFactory: initial invocation")
            return InvocationHandler(
                request_data=payload,
                circuit_preparation_fn=circuit_preparation_fn,
                post_processing_fn=post_processing_fn,
            )

        log.debug(f"HpcHandlerFactory: job fetching (slurmJobId={slurm_job_id})")
        return SlurmJobFetchingHandler(
            request_data=payload,
            post_processing_fn=post_processing_fn,
        )
@@ -0,0 +1,24 @@
1
+ import os
2
+ from quapp_common.enum.provider_tag import ProviderTag
3
+ from quapp_common.config.logging_config import job_logger
4
+
5
+ from ..model.provider.slurm_provider import SlurmProvider
6
+
7
logger = job_logger('HpcProviderFactory')

# Fallback token used when the request carries no "slurm_jwt"; read once at import.
SLURM_JWT = os.getenv("SLURM_JWT", "")


class HpcProviderFactory:
    """Factory for HPC provider objects."""

    @staticmethod
    def create_provider(provider_type: ProviderTag, authentication: dict):
        """Create the provider for ``provider_type``.

        Args:
            provider_type: provider tag; only ``ProviderTag.SLURM_HPC`` is supported.
            authentication: mapping that may carry a ``"slurm_jwt"`` entry.
                ``None`` or an empty mapping is accepted, in which case the
                ``SLURM_JWT`` environment value is used.

        Returns:
            A ``SlurmProvider`` configured with the resolved token.

        Raises:
            ValueError: if no token is available from either source.
            NotImplementedError: for any other provider type.
        """
        logger.debug(f"Creating HPC provider: {provider_type}")

        if provider_type == ProviderTag.SLURM_HPC:
            # Tolerate a missing authentication object so the environment
            # fallback still applies (same guard as SlurmJobFetching).
            jwt = (authentication or {}).get("slurm_jwt") or SLURM_JWT
            if not jwt:
                raise ValueError("SLURM_JWT not set — cannot authenticate with Slurm API")
            return SlurmProvider(jwt_token=jwt)

        raise NotImplementedError(f"Unsupported HPC provider: {provider_type}")
File without changes
@@ -0,0 +1,23 @@
1
+ from quapp_common.data.request.invocation_request import InvocationRequest
2
+ from quapp_common.handler.handler import Handler
3
+
4
+ from ..component.backend.hpc_invocation import HpcInvocation
5
+
6
+
7
class InvocationHandler(Handler):
    """Handler for a fresh HPC invocation request."""

    def __init__(self, request_data: dict, circuit_preparation_fn, post_processing_fn):
        """Store the preparation callback; the rest is handled by the base class."""
        super().__init__(request_data, post_processing_fn)
        self.circuit_preparation_fn = circuit_preparation_fn

    def handle(self):
        """Build an ``HpcInvocation`` from the request and submit the job.

        Any exception is logged with its traceback and then re-raised.
        """
        self.logger.debug("HPC InvocationHandler: start")
        try:
            invocation = HpcInvocation(InvocationRequest(self.request_data))
            invocation.submit_job(
                circuit_preparation_fn=self.circuit_preparation_fn,
                post_processing_fn=self.post_processing_fn,
            )
        except Exception as error:
            self.logger.exception(f"HPC invocation failed: {error}")
            raise
@@ -0,0 +1,18 @@
1
+ from quapp_common.data.request.job_fetching_request import JobFetchingRequest
2
+ from quapp_common.handler.handler import Handler
3
+
4
+ from ..component.backend.slurm_job_fetching import SlurmJobFetching
5
+
6
+
7
class SlurmJobFetchingHandler(Handler):
    """Handler for status-poll requests of already-submitted Slurm jobs."""

    def __init__(self, request_data: dict, post_processing_fn):
        """Delegate entirely to the base handler."""
        super().__init__(request_data, post_processing_fn)

    def handle(self):
        """Run a ``SlurmJobFetching`` for the request and return what ``fetch`` returns."""
        self.logger.debug("SlurmJobFetchingHandler: start")
        fetcher = SlurmJobFetching(JobFetchingRequest(self.request_data))
        return fetcher.fetch(post_processing_fn=self.post_processing_fn)
File without changes
@@ -0,0 +1,255 @@
1
+ import base64
2
+ import json
3
+ import os
4
+ import shlex
5
+ import time
6
+
7
+ import boto3
8
+ import requests
9
+ from quapp_common.config.logging_config import job_logger
10
+ from quapp_common.data.device.circuit_running_option import CircuitRunningOption
11
+ from quapp_common.enum.status.job_status import JobStatus
12
+ from quapp_common.model.device.device import Device
13
+ from quapp_common.model.provider.provider import Provider
14
+
15
+ from ..provider.slurm_provider import SlurmProvider, SLURM_ACCOUNT
16
+
17
# Runtime configuration, read once at import time.
S3_BUCKET = os.getenv("S3_BUCKET", "quapp-slurm-output-dev")
AWS_REGION = os.getenv("AWS_REGION", "ap-southeast-1")
SLURM_POLL_SEC = int(os.getenv("SLURM_POLL_SEC", "30"))
SLURM_TIMEOUT_SEC = int(os.getenv("SLURM_TIMEOUT_SEC", "21600"))  # 6 hours
SLURM_TIME_LIMIT = int(os.getenv("SLURM_TIME_LIMIT_MIN", "60"))

# States after which a job will not change any more. Every failure state must
# be listed: a terminal state missing from this set makes the blocking poll
# wait until SLURM_TIMEOUT_SEC expires instead of failing right away.
_TERMINAL_STATES = {
    "COMPLETED",
    "FAILED",
    "CANCELLED",
    "TIMEOUT",
    "NODE_FAIL",
    "PREEMPTED",
    "BOOT_FAIL",
    "DEADLINE",
    "OUT_OF_MEMORY",
}
_DONE_STATE = "COMPLETED"

# Translation from Slurm job states to platform job status values; states not
# listed here are treated as still running by the lookup's default.
_SLURM_TO_JOB_STATUS = {
    "PENDING": JobStatus.RUNNING.value,
    "CONFIGURING": JobStatus.RUNNING.value,
    "RUNNING": JobStatus.RUNNING.value,
    "COMPLETING": JobStatus.RUNNING.value,
    "COMPLETED": JobStatus.DONE.value,
    "FAILED": JobStatus.ERROR.value,
    "CANCELLED": JobStatus.ERROR.value,
    "TIMEOUT": JobStatus.ERROR.value,
    "NODE_FAIL": JobStatus.ERROR.value,
    "PREEMPTED": JobStatus.ERROR.value,
    "BOOT_FAIL": JobStatus.ERROR.value,
    "DEADLINE": JobStatus.ERROR.value,
    "OUT_OF_MEMORY": JobStatus.ERROR.value,
}

# One-line Python program executed on the compute node after the user script:
# it gathers the exit code, stdout and stderr files (each optional) into
# /tmp/output.json.
_COLLECT_PY = (
    "import json, os; "
    "ec = int(open('/tmp/quapp_exit_code.txt').read().strip()) "
    "if os.path.exists('/tmp/quapp_exit_code.txt') else 0; "
    "out = open('/tmp/quapp_stdout.txt').read() "
    "if os.path.exists('/tmp/quapp_stdout.txt') else ''; "
    "err = open('/tmp/quapp_stderr.txt').read() "
    "if os.path.exists('/tmp/quapp_stderr.txt') else ''; "
    "json.dump({'exit_code': ec, 'stdout': out, 'stderr': err}, open('/tmp/output.json','w'))"
)
49
+
50
+
51
class SlurmDevice(Device):
    """Device that runs a user script as a Slurm batch job.

    The user script is wrapped in a generated batch script, submitted through
    the Slurm REST API of the given provider, polled until it reaches a
    terminal state, and its ``output.json`` is then read back from S3.
    """

    def __init__(
        self,
        provider: Provider,
        device_specification: str,
        job_uuid: str,
        hpc_config: dict | None = None,
    ):
        """Store job identity and configuration.

        Args:
            provider: provider used for the REST base URL and auth headers.
            device_specification: forwarded to the base ``Device``.
            job_uuid: platform job id; also the S3 key prefix of the result.
            hpc_config: optional dict with ``resources``, ``container``,
                ``environment``, ``input_s3_paths`` and ``s3_bucket`` entries.
        """
        super().__init__(provider, device_specification)
        self.job_uuid: str = job_uuid
        self.slurm: SlurmProvider = provider
        self.logger = job_logger(job_uuid)
        self.hpc_config = hpc_config or {}
        # A bucket given in the request wins over the environment default.
        self.s3_bucket = self.hpc_config.get("s3_bucket") or S3_BUCKET

    # ── Abstract method implementations ──────────────────────────────────────

    def _is_simulator(self) -> bool:
        """Always ``False``."""
        return False

    def _create_job(self, circuit: str, options: CircuitRunningOption) -> dict:
        """Wrap ``circuit`` (the user script) in a batch script and submit it.

        Returns:
            ``{"slurm_job_id": <str>, "job_uuid": <str>}``.

        Raises:
            requests.HTTPError: if the submit request fails.
            RuntimeError: if the response carries no job id.
            ValueError: if the HPC configuration contains unsafe values
                (see ``_build_sbatch_script``).
        """
        bash = self._build_sbatch_script(circuit)

        payload = {
            "job": {
                "name": f"quapp-{self.job_uuid[:8]}",
                "account": SLURM_ACCOUNT,
                "environment": [
                    "PATH=/usr/bin:/bin:/usr/local/bin",
                    f"AWS_DEFAULT_REGION={AWS_REGION}",
                    # Exposed so user scripts can address their own output
                    # location (s3://$S3_BUCKET/$JOB_UUID/...), as the package
                    # README documents.
                    f"JOB_UUID={self.job_uuid}",
                    f"S3_BUCKET={self.s3_bucket}",
                ],
                "time_limit": {
                    "number": SLURM_TIME_LIMIT,
                    "set": True,
                    "infinite": False,
                },
                "partition": self.device,
                "current_working_directory": "/data/jobs",
                "standard_output": f"/data/jobs/{self.job_uuid}.out",
                "standard_error": f"/data/jobs/{self.job_uuid}.err",
            },
            "script": bash,
        }

        resp = requests.post(
            f"{self.slurm.base_url}/job/submit",
            headers=self.slurm.auth_headers(),
            json=payload,
            timeout=30,
        )
        resp.raise_for_status()
        data = resp.json()
        slurm_job_id = data.get("job_id") or data.get("jobId")

        if not slurm_job_id:
            raise RuntimeError(f"Slurm submit response missing job_id: {data}")

        self.logger.info(f"Slurm job submitted: slurm_job_id={slurm_job_id}")
        return {"slurm_job_id": str(slurm_job_id), "job_uuid": self.job_uuid}

    def _get_provider_job_id(self, job: dict) -> str:
        """Return the Slurm job id stored in ``job``."""
        return job["slurm_job_id"]

    def _get_job_status(self, job: dict) -> str:
        """Fetch the Slurm state once and map it to a platform status.

        Unmapped states are reported as running.
        """
        state = self._fetch_slurm_state(job["slurm_job_id"])
        return _SLURM_TO_JOB_STATUS.get(state, JobStatus.RUNNING.value)

    def _get_job_result(self, job: dict):
        """Block until the Slurm job finishes, then download its result from S3.

        Raises:
            RuntimeError: if the job ends in a terminal state other than COMPLETED.
            TimeoutError: if the job is not finished within ``SLURM_TIMEOUT_SEC``.
        """
        slurm_job_id = job["slurm_job_id"]
        job_uuid = job["job_uuid"]

        # Measure real elapsed time: counting only the sleep intervals ignores
        # the time spent in HTTP calls and never advances when the poll
        # interval is configured as 0.
        started = time.monotonic()
        while time.monotonic() - started < SLURM_TIMEOUT_SEC:
            state = self._fetch_slurm_state(slurm_job_id)
            self.logger.debug(f"Slurm job {slurm_job_id} state: {state}")

            if state == _DONE_STATE:
                return self._download_s3_result(job_uuid)

            if state in _TERMINAL_STATES:
                raise RuntimeError(f"Slurm job {slurm_job_id} ended with state: {state}")

            # time.sleep rejects negative values; clamp a misconfigured interval.
            time.sleep(max(SLURM_POLL_SEC, 0))

        raise TimeoutError(f"Slurm job {slurm_job_id} timed out after {SLURM_TIMEOUT_SEC}s")

    def _produce_histogram_data(self, job_result) -> None:
        """Return ``None``; no histogram is produced."""
        return None

    def _calculate_execution_time(self, job_result) -> None:
        """Set ``execution_time`` to ``None``."""
        self.execution_time = None

    # ── SBATCH script builder ─────────────────────────────────────────────────

    @staticmethod
    def _sbatch_value(option: str, value) -> str:
        """Return ``value`` as text for an ``#SBATCH`` line, rejecting unsafe input.

        Directive values come from the request; embedded whitespace (in
        particular a newline) would let a value append arbitrary lines to the
        generated script.
        """
        text = str(value)
        if not text or any(ch.isspace() for ch in text):
            raise ValueError(f"Invalid value for Slurm option '{option}': {value!r}")
        return text

    def _build_sbatch_script(self, user_script: str) -> str:
        """Build the full batch script that wraps the user's computation.

        The script declares the requested resources, downloads S3 inputs,
        exports extra environment variables, runs the user script (optionally
        inside an Apptainer container), and uploads ``output.json`` to S3.

        Raises:
            ValueError: if a resource value contains whitespace or an
                environment variable name is not a valid shell identifier.
        """
        # ``or`` also covers sections that are present but null in the request.
        resources = self.hpc_config.get("resources") or {}
        container = self.hpc_config.get("container") or {}
        environment = self.hpc_config.get("environment") or {}
        input_s3_paths = self.hpc_config.get("input_s3_paths") or []

        lines = ["#!/bin/bash"]

        # ── SBATCH directives ─────────────────────────────────────────────────
        gpus = int(resources.get("gpus") or 0)
        # Convert GB to MB before truncating so fractional values such as 0.5
        # do not collapse to 0 (which Slurm would not treat as "half a GB").
        memory_mb = (resources.get("memoryMb")
                     or resources.get("memory_mb")
                     or int(float(resources.get("memory_gb", 4)) * 1024))
        walltime = resources.get("walltime") or resources.get("time_limit", "01:00:00")
        checked = self._sbatch_value
        lines += [
            # Same account as the REST payload instead of a hard-coded name.
            f"#SBATCH --account={checked('account', SLURM_ACCOUNT)}",
            f"#SBATCH --partition={checked('partition', resources.get('partition', 'cpu'))}",
            f"#SBATCH --nodes={checked('nodes', resources.get('nodes', 1))}",
            f"#SBATCH --ntasks={checked('ntasks', resources.get('ntasks', 1))}",
            f"#SBATCH --cpus-per-task={checked('cpus_per_task', resources.get('cpus_per_task', 1))}",
            f"#SBATCH --mem={checked('memory', memory_mb)}M",
            f"#SBATCH --time={checked('time_limit', walltime)}",
        ]
        if gpus > 0:
            gpu_type = resources.get("gpu_type", "")
            gres = f"gpu:{gpu_type}:{gpus}" if gpu_type else f"gpu:{gpus}"
            lines.append(f"#SBATCH --gres={checked('gres', gres)}")
        lines.append("")

        # ── Runtime setup ─────────────────────────────────────────────────────
        lines += ["set -euo pipefail", "module load apptainer 2>/dev/null || true", ""]

        # ── Download S3 input files ───────────────────────────────────────────
        if input_s3_paths:
            lines.append("mkdir -p /tmp/quapp_inputs")
            for path in input_s3_paths:
                lines.append(f"aws s3 cp {shlex.quote(path)} /tmp/quapp_inputs/")
            lines.append("")

        # ── Extra environment variables ───────────────────────────────────────
        for key, value in environment.items():
            name = str(key)
            # The value is quoted, but the name is emitted verbatim, so it must
            # be a plain shell identifier.
            if not (name.isascii() and name.isidentifier()):
                raise ValueError(f"Invalid environment variable name: {key!r}")
            lines.append(f"export {name}={shlex.quote(str(value))}")
        if environment:
            lines.append("")

        # ── Container exec prefix ─────────────────────────────────────────────
        container_type = container.get("type", "none")
        image = container.get("image", "")
        if container_type == "sif" and image:
            image_ref = shlex.quote(f"/data/containers/{image}")
            exec_prefix = f"apptainer exec {image_ref}"
        elif container_type == "docker" and image:
            image_ref = shlex.quote(f"docker://{image}")
            exec_prefix = f"apptainer exec {image_ref}"
        else:
            exec_prefix = ""

        # ── Write and run user script ─────────────────────────────────────────
        # Base64 keeps the user script intact regardless of quoting inside it.
        b64 = base64.b64encode(user_script.encode()).decode()
        lines += [
            f"echo {shlex.quote(b64)} | base64 -d > /tmp/quapp_job.sh",
            "chmod +x /tmp/quapp_job.sh",
            "",
        ]
        run_cmd = "bash /tmp/quapp_job.sh"
        full_cmd = f"{exec_prefix} {run_cmd}".strip()
        lines += [
            # Capture the real exit status: after "cmd || true" the value of $?
            # is always 0. The "||" form also keeps "set -e" from aborting the
            # wrapper before the result is collected.
            "exit_code=0",
            f"{full_cmd} > /tmp/quapp_stdout.txt 2>/tmp/quapp_stderr.txt || exit_code=$?",
            'echo "$exit_code" > /tmp/quapp_exit_code.txt',
            "",
        ]

        # ── Collect output and upload to S3 ──────────────────────────────────
        s3_uri = f"s3://{self.s3_bucket}/{self.job_uuid}/output.json"
        lines += [
            f"python3 -c {shlex.quote(_COLLECT_PY)}",
            f"aws s3 cp /tmp/output.json {shlex.quote(s3_uri)}",
        ]

        return "\n".join(lines)

    # ── Helpers ───────────────────────────────────────────────────────────────

    def _fetch_slurm_state(self, slurm_job_id: str) -> str:
        """Return the job's current Slurm state string, or ``"UNKNOWN"``.

        Raises:
            requests.HTTPError: if the status request fails.
        """
        resp = requests.get(
            f"{self.slurm.base_url}/job/{slurm_job_id}",
            headers=self.slurm.auth_headers(),
            timeout=15,
        )
        resp.raise_for_status()
        data = resp.json()

        # Without a "jobs" key the whole body is treated as the single job entry.
        jobs = data.get("jobs", [data])
        if not jobs:
            return "UNKNOWN"

        raw_state = jobs[0].get("job_state", "UNKNOWN")
        if isinstance(raw_state, list):
            # A list of states: only the first entry is considered.
            raw_state = raw_state[0] if raw_state else "UNKNOWN"

        return str(raw_state).strip()

    def _download_s3_result(self, job_uuid: str) -> dict:
        """Download ``<job_uuid>/output.json`` from the job's bucket and parse it as JSON."""
        s3_key = f"{job_uuid}/output.json"
        self.logger.info(f"Downloading result from s3://{self.s3_bucket}/{s3_key}")

        s3 = boto3.client("s3", region_name=AWS_REGION)
        resp = s3.get_object(Bucket=self.s3_bucket, Key=s3_key)
        return json.loads(resp["Body"].read())
@@ -0,0 +1,40 @@
1
+ import os
2
+ import requests
3
+ from quapp_common.config.logging_config import job_logger
4
+ from quapp_common.enum.provider_tag import ProviderTag
5
+ from quapp_common.model.provider.provider import Provider
6
+
7
logger = job_logger('SlurmProvider')

# Connection settings, read from the environment once at import time.
# Each falls back to the literal default shown when the variable is unset.
SLURM_API_URL = os.getenv("SLURM_API_URL", "http://10.1.0.15:6820")
SLURM_API_VER = os.getenv("SLURM_API_VER", "v0.0.40")
SLURM_USERNAME = os.getenv("SLURM_USERNAME", "quapp-svc")
# Not referenced by SlurmProvider itself; presumably imported by other
# modules of the package — verify against callers.
SLURM_ACCOUNT = os.getenv("SLURM_ACCOUNT", "quapp")


class SlurmProvider(Provider):
    """Provider that holds the Slurm REST endpoint and JWT credentials.

    Attributes are plain values:
      * ``jwt_token`` — the token passed to the constructor.
      * ``base_url``  — ``<SLURM_API_URL>/slurm/<SLURM_API_VER>``, the prefix
        under which every versioned REST endpoint is addressed.
    """

    def __init__(self, jwt_token: str):
        """Store the token and derive the versioned REST base URL.

        Args:
            jwt_token: Value sent as the ``X-SLURM-USER-TOKEN`` header.
        """
        super().__init__(ProviderTag.SLURM_HPC)
        self.jwt_token = jwt_token
        # Drop a trailing slash so "http://host:6820/" does not produce
        # "http://host:6820//slurm/...".
        self.base_url = f"{SLURM_API_URL.rstrip('/')}/slurm/{SLURM_API_VER}"

    def get_backend(self, device_specification: str) -> str:
        """Return the Slurm partition name for a device specification.

        The specification is used as the partition name as-is; an empty or
        ``None`` value falls back to ``"compute"``.
        """
        return device_specification or "compute"

    def collect_provider(self):
        """Check that the Slurm REST API is reachable, then return ``self``.

        The ping is sent to the versioned endpoint (``<base_url>/ping``) with
        the same authentication headers as every other request. slurmrestd
        exposes ``ping`` under ``/slurm/<version>/`` and rejects
        unauthenticated calls, so an unversioned, header-less request fails
        even when the service is healthy.

        Raises:
            requests.HTTPError: If the ping returns a non-success status.
        """
        resp = requests.get(
            f"{self.base_url}/ping",
            headers=self.auth_headers(),
            timeout=10,
        )
        resp.raise_for_status()
        logger.debug(f"Slurm API reachable: {resp.json()}")
        return self

    def auth_headers(self) -> dict:
        """Build the HTTP headers that authenticate a Slurm REST request."""
        return {
            "X-SLURM-USER-NAME": SLURM_USERNAME,
            "X-SLURM-USER-TOKEN": self.jwt_token,
            "Content-Type": "application/json",
        }
@@ -0,0 +1,141 @@
1
+ Metadata-Version: 2.4
2
+ Name: quapp-hpc
3
+ Version: 0.0.1.dev1
4
+ Summary: Quapp HPC library — Slurm integration for Quapp Platform
5
+ Author-email: "CITYNOW Co. Ltd." <corp@citynow.vn>
6
+ License: The MIT License (MIT)
7
+ Copyright © CITYNOW Co. Ltd. All rights reserved.
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
14
+ Project-URL: Homepage, https://quapp.cloud/
15
+ Keywords: quapp,quapp-hpc,slurm,hpc
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python
18
+ Classifier: Programming Language :: Python :: 3
19
+ Requires-Python: <3.13,>=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: quapp-common==0.0.11.dev9
23
+ Requires-Dist: requests>=2.31.0
24
+ Requires-Dist: boto3>=1.28.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: black; extra == "dev"
27
+ Requires-Dist: bumpver; extra == "dev"
28
+ Requires-Dist: isort; extra == "dev"
29
+ Requires-Dist: pip-tools; extra == "dev"
30
+ Requires-Dist: pytest; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # quapp-hpc
34
+
35
+ Python library cho Quapp HPC functions — cầu nối giữa Quapp FaaS platform và Slurm HPC cluster.
36
+
37
+ ## Architecture
38
+
39
+ ```
40
+ ksvc (Docker)
41
+ ├── index.py FastAPI server
42
+ ├── quapp_hpc/
43
+ │ ├── factory/
44
+ │ │ └── hpc_handler_factory.py Entry point cho user
45
+ │ ├── component/backend/
46
+ │ │ └── hpc_invocation.py Orchestrates job lifecycle
47
+ │ └── model/
48
+ │ ├── provider/slurm_provider.py Auth headers, base URL
49
+ │ └── device/slurm_device.py Submit → Poll → S3 download
50
+ └── function/
51
+ └── handler.py User-defined processing() + post_processing()
52
+ ```
53
+
54
+ ## Luồng thực thi
55
+
56
+ ```
57
+ index.py nhận HTTP POST
58
+ → HpcHandlerFactory.create_handler(event, processing_fn, post_processing_fn)
59
+ → InvocationHandler.handle()
60
+ → HpcInvocation.submit_job()
61
+ 1. processing_fn(invocation_input) → bash script string
62
+ 2. SlurmDevice._create_job(script) → POST Slurm REST API → slurm_job_id
63
+ 3. SlurmDevice._get_job_result() → poll mỗi 30s → COMPLETED/FAILED
64
+ 4. SlurmDevice._download_s3_result()→ boto3 get s3://$S3_BUCKET/$JOB_UUID/output.json
65
+ 5. post_processing_fn(s3_result) → final response
66
+ ```
67
+
68
+ ## Environment variables (từ K8s Secret `slurm-credentials`)
69
+
70
+ | Var | Ví dụ | Mô tả |
71
+ |-----|-------|-------|
72
+ | `SLURM_API_URL` | `http://10.1.0.15:6820` | Slurm REST API base URL |
+ | `SLURM_API_VER` | `v0.0.40` | Phiên bản Slurm REST API (default `v0.0.40`) |
73
+ | `SLURM_JWT` | `eyJ...` | JWT token cho Slurm auth |
74
+ | `SLURM_USERNAME` | `quapp-svc` | Slurm username |
75
+ | `SLURM_ACCOUNT` | `quapp` | Slurm account/allocation |
76
+ | `S3_BUCKET` | `quapp-slurm-output-dev` | S3 bucket cho job output |
77
+ | `AWS_REGION` | `ap-southeast-1` | AWS region |
78
+ | `SLURM_POLL_SEC` | `30` | Polling interval (giây) |
79
+ | `SLURM_TIMEOUT_SEC` | `21600` | Max wait time (giây, default 6h) |
80
+
81
+ ## invocation_input schema
82
+
83
+ Xem chi tiết tại [`../../qapp-sdk-templates/slurm-hpc/README.md`](../../qapp-sdk-templates/slurm-hpc/README.md).
84
+
85
+ Tóm tắt:
86
+ ```text
87
+ {
88
+ "resources": { "partition", "nodes", "cpus_per_task", "gpus", "memory_gb", "time_limit" },
89
+ "container": { "type": "sif"|"docker"|"none", "image": "..." },
90
+ "job": { "type": "script"|"command", "script"|"command": "...", "environment": {}, "input_s3_paths": [] }
91
+ }
92
+ ```
93
+
94
+ ## Slurm REST API
95
+
96
+ - Version: `v0.0.40`
97
+ - Submit: `POST {SLURM_API_URL}/slurm/v0.0.40/job/submit`
98
+ - Status: `GET {SLURM_API_URL}/slurm/v0.0.40/job/{job_id}`
99
+ - Auth headers: `X-SLURM-USER-NAME`, `X-SLURM-USER-TOKEN`
100
+
101
+ ### Job state mapping
102
+
103
+ | Slurm state | Quapp state |
104
+ |---|---|
105
+ | PENDING, CONFIGURING, RUNNING, COMPLETING | RUNNING |
106
+ | COMPLETED | DONE |
107
+ | FAILED, CANCELLED, TIMEOUT, NODE_FAIL, PREEMPTED | ERROR |
108
+
109
+ ## S3 output pattern
110
+
111
+ Job script phải upload kết quả:
112
+ ```bash
113
+ aws s3 cp /tmp/output.json s3://$S3_BUCKET/$JOB_UUID/output.json
114
+ ```
115
+
116
+ `$JOB_UUID` và `$S3_BUCKET` được inject tự động bởi `SlurmDevice._create_job()` qua Slurm `environment` array.
117
+
118
+ ## K8s Secret
119
+
120
+ ```yaml
121
+ # infrastructure/quapp-job-scheduler/k8s/cts/slurm-secret.yaml
122
+ apiVersion: v1
123
+ kind: Secret
124
+ metadata:
125
+ name: slurm-credentials
126
+ namespace: quapp-functions-dev
127
+ stringData:
128
+ SLURM_JWT: "<generate: sudo scontrol token username=quapp-svc lifespan=2592000>"
129
+ SLURM_API_URL: "http://10.1.0.15:6820"
130
+ SLURM_USERNAME: "quapp-svc"
131
+ SLURM_ACCOUNT: "quapp"
132
+ S3_BUCKET: "quapp-slurm-output-dev"
133
+ AWS_REGION: "ap-southeast-1"
134
+ ```
135
+
136
+ ## DB seed required
137
+
138
+ Chạy script trước khi deploy:
139
+ ```
140
+ infrastructure/quapp-functions-backend/docs/db/seed_slurm_hpc.sql
141
+ ```
@@ -0,0 +1,25 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ quapp_hpc/__init__.py
5
+ quapp_hpc.egg-info/PKG-INFO
6
+ quapp_hpc.egg-info/SOURCES.txt
7
+ quapp_hpc.egg-info/dependency_links.txt
8
+ quapp_hpc.egg-info/requires.txt
9
+ quapp_hpc.egg-info/top_level.txt
10
+ quapp_hpc/component/__init__.py
11
+ quapp_hpc/component/backend/__init__.py
12
+ quapp_hpc/component/backend/hpc_invocation.py
13
+ quapp_hpc/component/backend/slurm_job_fetching.py
14
+ quapp_hpc/factory/__init__.py
15
+ quapp_hpc/factory/hpc_device_factory.py
16
+ quapp_hpc/factory/hpc_handler_factory.py
17
+ quapp_hpc/factory/hpc_provider_factory.py
18
+ quapp_hpc/handler/__init__.py
19
+ quapp_hpc/handler/invocation_handler.py
20
+ quapp_hpc/handler/job_fetching_handler.py
21
+ quapp_hpc/model/__init__.py
22
+ quapp_hpc/model/device/__init__.py
23
+ quapp_hpc/model/device/slurm_device.py
24
+ quapp_hpc/model/provider/__init__.py
25
+ quapp_hpc/model/provider/slurm_provider.py
@@ -0,0 +1,10 @@
1
+ quapp-common==0.0.11.dev9
2
+ requests>=2.31.0
3
+ boto3>=1.28.0
4
+
5
+ [dev]
6
+ black
7
+ bumpver
8
+ isort
9
+ pip-tools
10
+ pytest
@@ -0,0 +1 @@
1
+ quapp_hpc
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+