quapp-hpc 0.0.1.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quapp_hpc-0.0.1.dev1/LICENSE +8 -0
- quapp_hpc-0.0.1.dev1/PKG-INFO +141 -0
- quapp_hpc-0.0.1.dev1/README.md +109 -0
- quapp_hpc-0.0.1.dev1/pyproject.toml +33 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/__init__.py +3 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/component/__init__.py +0 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/component/backend/__init__.py +0 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/component/backend/hpc_invocation.py +41 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/component/backend/slurm_job_fetching.py +99 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/factory/__init__.py +0 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/factory/hpc_device_factory.py +19 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/factory/hpc_handler_factory.py +30 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/factory/hpc_provider_factory.py +24 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/handler/__init__.py +0 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/handler/invocation_handler.py +23 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/handler/job_fetching_handler.py +18 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/model/__init__.py +0 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/model/device/__init__.py +0 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/model/device/slurm_device.py +255 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/model/provider/__init__.py +0 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc/model/provider/slurm_provider.py +40 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc.egg-info/PKG-INFO +141 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc.egg-info/SOURCES.txt +25 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc.egg-info/dependency_links.txt +1 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc.egg-info/requires.txt +10 -0
- quapp_hpc-0.0.1.dev1/quapp_hpc.egg-info/top_level.txt +1 -0
- quapp_hpc-0.0.1.dev1/setup.cfg +4 -0
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
Copyright © CITYNOW Co. Ltd. All rights reserved.
|
|
3
|
+
|
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
5
|
+
|
|
6
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
7
|
+
|
|
8
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: quapp-hpc
|
|
3
|
+
Version: 0.0.1.dev1
|
|
4
|
+
Summary: Quapp HPC library — Slurm integration for Quapp Platform
|
|
5
|
+
Author-email: "CITYNOW Co. Ltd." <corp@citynow.vn>
|
|
6
|
+
License: The MIT License (MIT)
|
|
7
|
+
Copyright © CITYNOW Co. Ltd. All rights reserved.
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
14
|
+
Project-URL: Homepage, https://quapp.cloud/
|
|
15
|
+
Keywords: quapp,quapp-hpc,slurm,hpc
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Requires-Python: <3.13,>=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: quapp-common==0.0.11.dev9
|
|
23
|
+
Requires-Dist: requests>=2.31.0
|
|
24
|
+
Requires-Dist: boto3>=1.28.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: black; extra == "dev"
|
|
27
|
+
Requires-Dist: bumpver; extra == "dev"
|
|
28
|
+
Requires-Dist: isort; extra == "dev"
|
|
29
|
+
Requires-Dist: pip-tools; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# quapp-hpc
|
|
34
|
+
|
|
35
|
+
Python library cho Quapp HPC functions — cầu nối giữa Quapp FaaS platform và Slurm HPC cluster.
|
|
36
|
+
|
|
37
|
+
## Architecture
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
ksvc (Docker)
|
|
41
|
+
├── index.py FastAPI server
|
|
42
|
+
├── quapp_hpc/
|
|
43
|
+
│ ├── factory/
|
|
44
|
+
│ │ └── hpc_handler_factory.py Entry point cho user
|
|
45
|
+
│ ├── component/backend/
|
|
46
|
+
│ │ └── hpc_invocation.py Orchestrates job lifecycle
|
|
47
|
+
│ └── model/
|
|
48
|
+
│ ├── provider/slurm_provider.py Auth headers, base URL
|
|
49
|
+
│ └── device/slurm_device.py Submit → Poll → S3 download
|
|
50
|
+
└── function/
|
|
51
|
+
└── handler.py User-defined processing() + post_processing()
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Luồng thực thi
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
index.py nhận HTTP POST
|
|
58
|
+
→ HpcHandlerFactory.create_handler(event, processing_fn, post_processing_fn)
|
|
59
|
+
→ InvocationHandler.handle()
|
|
60
|
+
→ HpcInvocation.submit_job()
|
|
61
|
+
1. processing_fn(invocation_input) → bash script string
|
|
62
|
+
2. SlurmDevice._create_job(script) → POST Slurm REST API → slurm_job_id
|
|
63
|
+
3. SlurmDevice._get_job_result() → poll mỗi 30s → COMPLETED/FAILED
|
|
64
|
+
4. SlurmDevice._download_s3_result()→ boto3 get s3://$S3_BUCKET/$JOB_UUID/output.json
|
|
65
|
+
5. post_processing_fn(s3_result) → final response
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Environment variables (từ K8s Secret `slurm-credentials`)
|
|
69
|
+
|
|
70
|
+
| Var | Ví dụ | Mô tả |
|
|
71
|
+
|-----|-------|-------|
|
|
72
|
+
| `SLURM_API_URL` | `http://10.1.0.15:6820` | Slurm REST API base URL |
|
|
73
|
+
| `SLURM_JWT` | `eyJ...` | JWT token cho Slurm auth |
|
|
74
|
+
| `SLURM_USERNAME` | `quapp-svc` | Slurm username |
|
|
75
|
+
| `SLURM_ACCOUNT` | `quapp` | Slurm account/allocation |
|
|
76
|
+
| `S3_BUCKET` | `quapp-slurm-output-dev` | S3 bucket cho job output |
|
|
77
|
+
| `AWS_REGION` | `ap-southeast-1` | AWS region |
|
|
78
|
+
| `SLURM_POLL_SEC` | `30` | Polling interval (giây) |
|
|
79
|
+
| `SLURM_TIMEOUT_SEC` | `21600` | Max wait time (giây, default 6h) |
|
|
80
|
+
|
|
81
|
+
## invocation_input schema
|
|
82
|
+
|
|
83
|
+
Xem chi tiết tại [`../../qapp-sdk-templates/slurm-hpc/README.md`](../../qapp-sdk-templates/slurm-hpc/README.md).
|
|
84
|
+
|
|
85
|
+
Tóm tắt:
|
|
86
|
+
```json
|
|
87
|
+
{
|
|
88
|
+
"resources": { "partition", "nodes", "cpus_per_task", "gpus", "memory_gb", "time_limit" },
|
|
89
|
+
"container": { "type": "sif"|"docker"|"none", "image": "..." },
|
|
90
|
+
"job": { "type": "script"|"command", "script"|"command": "...", "environment": {}, "input_s3_paths": [] }
|
|
91
|
+
}
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Slurm REST API
|
|
95
|
+
|
|
96
|
+
- Version: `v0.0.40`
|
|
97
|
+
- Submit: `POST {SLURM_API_URL}/slurm/v0.0.40/job/submit`
|
|
98
|
+
- Status: `GET {SLURM_API_URL}/slurm/v0.0.40/job/{job_id}`
|
|
99
|
+
- Auth headers: `X-SLURM-USER-NAME`, `X-SLURM-USER-TOKEN`
|
|
100
|
+
|
|
101
|
+
### Job state mapping
|
|
102
|
+
|
|
103
|
+
| Slurm state | Quapp state |
|
|
104
|
+
|---|---|
|
|
105
|
+
| PENDING, CONFIGURING, RUNNING, COMPLETING | RUNNING |
|
|
106
|
+
| COMPLETED | DONE |
|
|
107
|
+
| FAILED, CANCELLED, TIMEOUT, NODE_FAIL, PREEMPTED | ERROR |
|
|
108
|
+
|
|
109
|
+
## S3 output pattern
|
|
110
|
+
|
|
111
|
+
Job script phải upload kết quả:
|
|
112
|
+
```bash
|
|
113
|
+
aws s3 cp /tmp/output.json s3://$S3_BUCKET/$JOB_UUID/output.json
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
`$JOB_UUID` và `$S3_BUCKET` được inject tự động bởi `SlurmDevice._create_job()` qua Slurm `environment` array.
|
|
117
|
+
|
|
118
|
+
## K8s Secret
|
|
119
|
+
|
|
120
|
+
```yaml
|
|
121
|
+
# infrastructure/quapp-job-scheduler/k8s/cts/slurm-secret.yaml
|
|
122
|
+
apiVersion: v1
|
|
123
|
+
kind: Secret
|
|
124
|
+
metadata:
|
|
125
|
+
name: slurm-credentials
|
|
126
|
+
namespace: quapp-functions-dev
|
|
127
|
+
stringData:
|
|
128
|
+
SLURM_JWT: "<generate: sudo scontrol token username=quapp-svc lifespan=2592000>"
|
|
129
|
+
SLURM_API_URL: "http://10.1.0.15:6820"
|
|
130
|
+
SLURM_USERNAME: "quapp-svc"
|
|
131
|
+
SLURM_ACCOUNT: "quapp"
|
|
132
|
+
S3_BUCKET: "quapp-slurm-output-dev"
|
|
133
|
+
AWS_REGION: "ap-southeast-1"
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## DB seed required
|
|
137
|
+
|
|
138
|
+
Chạy script trước khi deploy:
|
|
139
|
+
```
|
|
140
|
+
infrastructure/quapp-functions-backend/docs/db/seed_slurm_hpc.sql
|
|
141
|
+
```
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# quapp-hpc
|
|
2
|
+
|
|
3
|
+
Python library cho Quapp HPC functions — cầu nối giữa Quapp FaaS platform và Slurm HPC cluster.
|
|
4
|
+
|
|
5
|
+
## Architecture
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
ksvc (Docker)
|
|
9
|
+
├── index.py FastAPI server
|
|
10
|
+
├── quapp_hpc/
|
|
11
|
+
│ ├── factory/
|
|
12
|
+
│ │ └── hpc_handler_factory.py Entry point cho user
|
|
13
|
+
│ ├── component/backend/
|
|
14
|
+
│ │ └── hpc_invocation.py Orchestrates job lifecycle
|
|
15
|
+
│ └── model/
|
|
16
|
+
│ ├── provider/slurm_provider.py Auth headers, base URL
|
|
17
|
+
│ └── device/slurm_device.py Submit → Poll → S3 download
|
|
18
|
+
└── function/
|
|
19
|
+
└── handler.py User-defined processing() + post_processing()
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Luồng thực thi
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
index.py nhận HTTP POST
|
|
26
|
+
→ HpcHandlerFactory.create_handler(event, processing_fn, post_processing_fn)
|
|
27
|
+
→ InvocationHandler.handle()
|
|
28
|
+
→ HpcInvocation.submit_job()
|
|
29
|
+
1. processing_fn(invocation_input) → bash script string
|
|
30
|
+
2. SlurmDevice._create_job(script) → POST Slurm REST API → slurm_job_id
|
|
31
|
+
3. SlurmDevice._get_job_result() → poll mỗi 30s → COMPLETED/FAILED
|
|
32
|
+
4. SlurmDevice._download_s3_result()→ boto3 get s3://$S3_BUCKET/$JOB_UUID/output.json
|
|
33
|
+
5. post_processing_fn(s3_result) → final response
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Environment variables (từ K8s Secret `slurm-credentials`)
|
|
37
|
+
|
|
38
|
+
| Var | Ví dụ | Mô tả |
|
|
39
|
+
|-----|-------|-------|
|
|
40
|
+
| `SLURM_API_URL` | `http://10.1.0.15:6820` | Slurm REST API base URL |
|
|
41
|
+
| `SLURM_JWT` | `eyJ...` | JWT token cho Slurm auth |
|
|
42
|
+
| `SLURM_USERNAME` | `quapp-svc` | Slurm username |
|
|
43
|
+
| `SLURM_ACCOUNT` | `quapp` | Slurm account/allocation |
|
|
44
|
+
| `S3_BUCKET` | `quapp-slurm-output-dev` | S3 bucket cho job output |
|
|
45
|
+
| `AWS_REGION` | `ap-southeast-1` | AWS region |
|
|
46
|
+
| `SLURM_POLL_SEC` | `30` | Polling interval (giây) |
|
|
47
|
+
| `SLURM_TIMEOUT_SEC` | `21600` | Max wait time (giây, default 6h) |
|
|
48
|
+
|
|
49
|
+
## invocation_input schema
|
|
50
|
+
|
|
51
|
+
Xem chi tiết tại [`../../qapp-sdk-templates/slurm-hpc/README.md`](../../qapp-sdk-templates/slurm-hpc/README.md).
|
|
52
|
+
|
|
53
|
+
Tóm tắt:
|
|
54
|
+
```json
|
|
55
|
+
{
|
|
56
|
+
"resources": { "partition", "nodes", "cpus_per_task", "gpus", "memory_gb", "time_limit" },
|
|
57
|
+
"container": { "type": "sif"|"docker"|"none", "image": "..." },
|
|
58
|
+
"job": { "type": "script"|"command", "script"|"command": "...", "environment": {}, "input_s3_paths": [] }
|
|
59
|
+
}
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Slurm REST API
|
|
63
|
+
|
|
64
|
+
- Version: `v0.0.40`
|
|
65
|
+
- Submit: `POST {SLURM_API_URL}/slurm/v0.0.40/job/submit`
|
|
66
|
+
- Status: `GET {SLURM_API_URL}/slurm/v0.0.40/job/{job_id}`
|
|
67
|
+
- Auth headers: `X-SLURM-USER-NAME`, `X-SLURM-USER-TOKEN`
|
|
68
|
+
|
|
69
|
+
### Job state mapping
|
|
70
|
+
|
|
71
|
+
| Slurm state | Quapp state |
|
|
72
|
+
|---|---|
|
|
73
|
+
| PENDING, CONFIGURING, RUNNING, COMPLETING | RUNNING |
|
|
74
|
+
| COMPLETED | DONE |
|
|
75
|
+
| FAILED, CANCELLED, TIMEOUT, NODE_FAIL, PREEMPTED | ERROR |
|
|
76
|
+
|
|
77
|
+
## S3 output pattern
|
|
78
|
+
|
|
79
|
+
Job script phải upload kết quả:
|
|
80
|
+
```bash
|
|
81
|
+
aws s3 cp /tmp/output.json s3://$S3_BUCKET/$JOB_UUID/output.json
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
`$JOB_UUID` và `$S3_BUCKET` được inject tự động bởi `SlurmDevice._create_job()` qua Slurm `environment` array.
|
|
85
|
+
|
|
86
|
+
## K8s Secret
|
|
87
|
+
|
|
88
|
+
```yaml
|
|
89
|
+
# infrastructure/quapp-job-scheduler/k8s/cts/slurm-secret.yaml
|
|
90
|
+
apiVersion: v1
|
|
91
|
+
kind: Secret
|
|
92
|
+
metadata:
|
|
93
|
+
name: slurm-credentials
|
|
94
|
+
namespace: quapp-functions-dev
|
|
95
|
+
stringData:
|
|
96
|
+
SLURM_JWT: "<generate: sudo scontrol token username=quapp-svc lifespan=2592000>"
|
|
97
|
+
SLURM_API_URL: "http://10.1.0.15:6820"
|
|
98
|
+
SLURM_USERNAME: "quapp-svc"
|
|
99
|
+
SLURM_ACCOUNT: "quapp"
|
|
100
|
+
S3_BUCKET: "quapp-slurm-output-dev"
|
|
101
|
+
AWS_REGION: "ap-southeast-1"
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## DB seed required
|
|
105
|
+
|
|
106
|
+
Chạy script trước khi deploy:
|
|
107
|
+
```
|
|
108
|
+
infrastructure/quapp-functions-backend/docs/db/seed_slurm_hpc.sql
|
|
109
|
+
```
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0.0", "wheel==0.45.1"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "quapp-hpc"
|
|
7
|
+
version = "0.0.1.dev1"
|
|
8
|
+
description = "Quapp HPC library — Slurm integration for Quapp Platform"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [{ name = "CITYNOW Co. Ltd.", email = "corp@citynow.vn" }]
|
|
11
|
+
license = { file = "LICENSE" }
|
|
12
|
+
classifiers = [
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Programming Language :: Python",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
]
|
|
17
|
+
keywords = ["quapp", "quapp-hpc", "slurm", "hpc"]
|
|
18
|
+
dependencies = [
|
|
19
|
+
"quapp-common==0.0.11.dev9",
|
|
20
|
+
"requests>=2.31.0",
|
|
21
|
+
"boto3>=1.28.0",
|
|
22
|
+
]
|
|
23
|
+
requires-python = ">=3.10,<3.13"
|
|
24
|
+
|
|
25
|
+
[project.optional-dependencies]
|
|
26
|
+
dev = ["black", "bumpver", "isort", "pip-tools", "pytest"]
|
|
27
|
+
|
|
28
|
+
[tool.setuptools.packages.find]
|
|
29
|
+
include = ["quapp_hpc*"]
|
|
30
|
+
exclude = ["*.md", "*.yml", "*.yaml", "*.toml", "tests*", ".gitignore"]
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://quapp.cloud/"
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from quapp_common.component.backend.invocation import Invocation
|
|
2
|
+
from quapp_common.data.request.invocation_request import InvocationRequest
|
|
3
|
+
from quapp_common.model.provider.provider import Provider
|
|
4
|
+
|
|
5
|
+
from ...factory.hpc_device_factory import HpcDeviceFactory
|
|
6
|
+
from ...factory.hpc_provider_factory import HpcProviderFactory
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class HpcInvocation(Invocation):
    """HPC-flavoured Invocation: extracts the Slurm resource/container config
    from the request input and forwards it to the device factory."""

    def __init__(self, request_data: InvocationRequest):
        super().__init__(request_data)
        payload = request_data.input or {}
        job_section = payload.get("job", {})
        # Collected once here so the device factory receives a single config dict.
        self._hpc_config = {
            "resources": payload.get("resources", {}),
            "container": payload.get("container", {}),
            "environment": job_section.get("environment", {}),
            "input_s3_paths": job_section.get("input_s3_paths", []),
            "s3_bucket": payload.get("s3Bucket", ""),
        }

    def _export_circuit(self, circuit):
        # HPC jobs carry a bash script, not a quantum circuit: nothing to export.
        pass

    def _get_qubit_amount(self, circuit) -> int:
        # Qubit count is not meaningful for HPC workloads; report zero.
        return 0

    def _create_provider(self) -> Provider:
        backend = self.backend_information
        return HpcProviderFactory.create_provider(
            provider_type=backend.provider_tag,
            authentication=backend.authentication,
        )

    def _create_device(self, provider: Provider):
        backend = self.backend_information
        return HpcDeviceFactory.create_device(
            provider=provider,
            device_specification=backend.device_name,
            job_uuid=self.job_id,
            hpc_config=self._hpc_config,
        )
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
import boto3
|
|
5
|
+
import requests
|
|
6
|
+
from quapp_common.component.backend.job_fetching import JobFetching
|
|
7
|
+
from quapp_common.config.logging_config import job_logger
|
|
8
|
+
from quapp_common.data.request.job_fetching_request import JobFetchingRequest
|
|
9
|
+
from quapp_common.enum.status.job_status import JobStatus
|
|
10
|
+
|
|
11
|
+
from ...model.provider.slurm_provider import SlurmProvider
|
|
12
|
+
|
|
13
|
+
# Credentials and output location come from the pod environment
# (K8s secret `slurm-credentials`).
SLURM_JWT = os.getenv("SLURM_JWT", "")
S3_BUCKET = os.getenv("S3_BUCKET", "quapp-slurm-output-dev")
AWS_REGION = os.getenv("AWS_REGION", "ap-southeast-1")

# Map raw Slurm job states onto the platform's coarse job statuses.
_SLURM_TO_JOB_STATUS = {
    **dict.fromkeys(
        ("PENDING", "CONFIGURING", "RUNNING", "COMPLETING"),
        JobStatus.RUNNING.value,
    ),
    "COMPLETED": JobStatus.DONE.value,
    **dict.fromkeys(
        ("FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL", "PREEMPTED"),
        JobStatus.ERROR.value,
    ),
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class _SlurmJobResult:
    """Deferred fetch of the Slurm job's output.json from S3.

    result() performs the S3 download only when invoked (after DONE is
    confirmed). usage() yields None so JobFetching.__get_execution_time()
    does not raise.
    """

    def __init__(self, job_uuid: str, s3_bucket: str = S3_BUCKET, aws_region: str = AWS_REGION):
        self._job_uuid = job_uuid
        self._s3_bucket = s3_bucket
        self._aws_region = aws_region
        self._logger = job_logger(job_uuid)

    def usage(self):
        # No usage/accounting payload is available for Slurm jobs.
        return None

    def result(self):
        key = f"{self._job_uuid}/output.json"
        self._logger.info(f"Downloading result from s3://{self._s3_bucket}/{key}")
        client = boto3.client("s3", region_name=self._aws_region)
        body = client.get_object(Bucket=self._s3_bucket, Key=key)["Body"]
        return json.loads(body.read())
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class SlurmJobFetching(JobFetching):
    """Polls Slurm REST API for job status (IBM-pattern, triggered by watchdog)."""

    def __init__(self, request_data: JobFetchingRequest):
        super().__init__(request_data)
        self.job_id = request_data.job_id
        self._logger = job_logger(request_data.job_id)

    # ── Abstract implementations ──────────────────────────────────────────────

    def _collect_provider(self) -> SlurmProvider:
        auth = self.provider_authentication or {}
        token = auth.get("slurm_jwt") or SLURM_JWT
        if not token:
            raise ValueError("SLURM_JWT not set — cannot authenticate with Slurm API")
        return SlurmProvider(jwt_token=token)

    def _retrieve_job(self, provider: SlurmProvider) -> dict:
        # No remote handle object exists; carry the id and provider in a plain dict.
        return {"slurm_job_id": self.provider_job_id, "provider": provider}

    def _get_job_status(self, job: dict) -> str:
        provider: SlurmProvider = job["provider"]
        slurm_job_id = job["slurm_job_id"]

        response = requests.get(
            f"{provider.base_url}/job/{slurm_job_id}",
            headers=provider.auth_headers(),
            timeout=15,
        )
        response.raise_for_status()
        payload = response.json()

        jobs = payload.get("jobs", [payload])
        if not jobs:
            # Nothing reported yet — keep treating the job as in flight.
            return JobStatus.RUNNING.value

        raw_state = jobs[0].get("job_state", "UNKNOWN")
        if isinstance(raw_state, list):
            # slurmrestd may return the state as a (possibly empty) list.
            raw_state = raw_state[0] if raw_state else "UNKNOWN"

        state = str(raw_state).strip()
        self._logger.info(f"Slurm job {slurm_job_id} state: {state}")
        return _SLURM_TO_JOB_STATUS.get(state, JobStatus.RUNNING.value)

    def _get_job_result(self, job: dict) -> _SlurmJobResult:
        return _SlurmJobResult(job_uuid=self.job_id, s3_bucket=S3_BUCKET, aws_region=AWS_REGION)
|
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from quapp_common.config.logging_config import job_logger
|
|
2
|
+
from quapp_common.model.provider.provider import Provider
|
|
3
|
+
|
|
4
|
+
from ..model.device.slurm_device import SlurmDevice
|
|
5
|
+
|
|
6
|
+
logger = job_logger('HpcDeviceFactory')
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class HpcDeviceFactory:
    """Builds the concrete HPC device for an invocation."""

    @staticmethod
    def create_device(
        provider: Provider,
        device_specification: str,
        job_uuid: str,
        hpc_config: dict | None = None,  # annotation fixed: None is the real default
    ) -> SlurmDevice:
        """Create a SlurmDevice.

        Args:
            provider: authenticated Slurm provider.
            device_specification: Slurm partition name for the job.
            job_uuid: platform job id, used as logging context by the device.
            hpc_config: optional resources/container/environment config dict;
                the device substitutes an empty dict when omitted.

        Returns:
            A configured SlurmDevice instance.
        """
        logger.debug(f"Creating SlurmDevice: partition={device_specification}")
        return SlurmDevice(provider, device_specification, job_uuid, hpc_config=hpc_config)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from quapp_common.config.logging_config import job_logger
|
|
2
|
+
from quapp_common.factory.handler_factory import HandlerFactory
|
|
3
|
+
from quapp_common.handler.handler import Handler
|
|
4
|
+
|
|
5
|
+
from ..handler.invocation_handler import InvocationHandler
|
|
6
|
+
from ..handler.job_fetching_handler import SlurmJobFetchingHandler
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class HpcHandlerFactory(HandlerFactory):
|
|
10
|
+
|
|
11
|
+
@staticmethod
|
|
12
|
+
def create_handler(event, circuit_preparation_fn, post_processing_fn) -> Handler:
|
|
13
|
+
request_data = event.json()
|
|
14
|
+
logger = job_logger(request_data.get("jobId"))
|
|
15
|
+
|
|
16
|
+
provider_job_id = request_data.get("providerJobId")
|
|
17
|
+
|
|
18
|
+
if provider_job_id:
|
|
19
|
+
logger.debug(f"HpcHandlerFactory: job fetching (slurmJobId={provider_job_id})")
|
|
20
|
+
return SlurmJobFetchingHandler(
|
|
21
|
+
request_data=request_data,
|
|
22
|
+
post_processing_fn=post_processing_fn,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
logger.debug("HpcHandlerFactory: initial invocation")
|
|
26
|
+
return InvocationHandler(
|
|
27
|
+
request_data=request_data,
|
|
28
|
+
circuit_preparation_fn=circuit_preparation_fn,
|
|
29
|
+
post_processing_fn=post_processing_fn,
|
|
30
|
+
)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from quapp_common.enum.provider_tag import ProviderTag
|
|
3
|
+
from quapp_common.config.logging_config import job_logger
|
|
4
|
+
|
|
5
|
+
from ..model.provider.slurm_provider import SlurmProvider
|
|
6
|
+
|
|
7
|
+
logger = job_logger('HpcProviderFactory')
|
|
8
|
+
|
|
9
|
+
SLURM_JWT = os.getenv("SLURM_JWT", "")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class HpcProviderFactory:
    """Creates HPC provider instances from a provider tag and auth payload."""

    @staticmethod
    def create_provider(provider_type: ProviderTag, authentication: dict):
        """Return a provider instance for *provider_type*.

        Args:
            provider_type: the platform provider tag.
            authentication: backend authentication dict; may be None/empty, in
                which case the pod-level SLURM_JWT environment value is used.

        Raises:
            ValueError: no JWT is available for Slurm authentication.
            NotImplementedError: the provider tag is not supported.
        """
        logger.debug(f"Creating HPC provider: {provider_type}")

        if provider_type == ProviderTag.SLURM_HPC:
            # Guard against a None authentication payload (same pattern as
            # SlurmJobFetching._collect_provider); fall back to the env JWT.
            jwt = (authentication or {}).get("slurm_jwt") or SLURM_JWT
            if not jwt:
                raise ValueError("SLURM_JWT not set — cannot authenticate with Slurm API")
            return SlurmProvider(jwt_token=jwt)

        raise NotImplementedError(f"Unsupported HPC provider: {provider_type}")
|
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from quapp_common.data.request.invocation_request import InvocationRequest
|
|
2
|
+
from quapp_common.handler.handler import Handler
|
|
3
|
+
|
|
4
|
+
from ..component.backend.hpc_invocation import HpcInvocation
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class InvocationHandler(Handler):
    """Handles the initial HPC invocation: builds the request and submits the job."""

    def __init__(self, request_data: dict, circuit_preparation_fn, post_processing_fn):
        super().__init__(request_data, post_processing_fn)
        self.circuit_preparation_fn = circuit_preparation_fn

    def handle(self):
        self.logger.debug("HPC InvocationHandler: start")
        try:
            request = InvocationRequest(self.request_data)
            invocation = HpcInvocation(request)
            invocation.submit_job(
                circuit_preparation_fn=self.circuit_preparation_fn,
                post_processing_fn=self.post_processing_fn,
            )
        except Exception as exc:
            # Log with traceback, then let the caller observe the failure.
            self.logger.exception(f"HPC invocation failed: {exc}")
            raise
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from quapp_common.data.request.job_fetching_request import JobFetchingRequest
|
|
2
|
+
from quapp_common.handler.handler import Handler
|
|
3
|
+
|
|
4
|
+
from ..component.backend.slurm_job_fetching import SlurmJobFetching
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class SlurmJobFetchingHandler(Handler):
    """Handles watchdog-triggered job status polls for Slurm jobs."""

    def __init__(self, request_data: dict, post_processing_fn):
        super().__init__(request_data, post_processing_fn)

    def handle(self):
        self.logger.debug("SlurmJobFetchingHandler: start")
        fetching = SlurmJobFetching(JobFetchingRequest(self.request_data))
        return fetching.fetch(post_processing_fn=self.post_processing_fn)
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import shlex
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
import boto3
|
|
8
|
+
import requests
|
|
9
|
+
from quapp_common.config.logging_config import job_logger
|
|
10
|
+
from quapp_common.data.device.circuit_running_option import CircuitRunningOption
|
|
11
|
+
from quapp_common.enum.status.job_status import JobStatus
|
|
12
|
+
from quapp_common.model.device.device import Device
|
|
13
|
+
from quapp_common.model.provider.provider import Provider
|
|
14
|
+
|
|
15
|
+
from ..provider.slurm_provider import SlurmProvider, SLURM_ACCOUNT
|
|
16
|
+
|
|
17
|
+
# Output location and polling cadence; all overridable via pod environment.
S3_BUCKET = os.getenv("S3_BUCKET", "quapp-slurm-output-dev")
AWS_REGION = os.getenv("AWS_REGION", "ap-southeast-1")
SLURM_POLL_SEC = int(os.getenv("SLURM_POLL_SEC", "30"))
SLURM_TIMEOUT_SEC = int(os.getenv("SLURM_TIMEOUT_SEC", "21600"))  # 6 hours
SLURM_TIME_LIMIT = int(os.getenv("SLURM_TIME_LIMIT_MIN", "60"))

# States after which Slurm will make no further progress on the job.
_TERMINAL_STATES = {"COMPLETED", "FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL", "PREEMPTED"}
_DONE_STATE = "COMPLETED"

# Slurm state -> platform status. NOTE(review): this map is duplicated in
# component/backend/slurm_job_fetching.py — keep the two copies in sync.
_SLURM_TO_JOB_STATUS = {
    "PENDING": JobStatus.RUNNING.value,
    "CONFIGURING": JobStatus.RUNNING.value,
    "RUNNING": JobStatus.RUNNING.value,
    "COMPLETING": JobStatus.RUNNING.value,
    "COMPLETED": JobStatus.DONE.value,
    "FAILED": JobStatus.ERROR.value,
    "CANCELLED": JobStatus.ERROR.value,
    "TIMEOUT": JobStatus.ERROR.value,
    "NODE_FAIL": JobStatus.ERROR.value,
    "PREEMPTED": JobStatus.ERROR.value,
}

# One-liner Python executed on the compute node: folds the captured exit-code,
# stdout and stderr files into /tmp/output.json (which the job script then
# uploads to S3). Missing capture files degrade to 0 / empty strings.
_COLLECT_PY = (
    "import json, os; "
    "ec = int(open('/tmp/quapp_exit_code.txt').read().strip()) "
    "if os.path.exists('/tmp/quapp_exit_code.txt') else 0; "
    "out = open('/tmp/quapp_stdout.txt').read() "
    "if os.path.exists('/tmp/quapp_stdout.txt') else ''; "
    "err = open('/tmp/quapp_stderr.txt').read() "
    "if os.path.exists('/tmp/quapp_stderr.txt') else ''; "
    "json.dump({'exit_code': ec, 'stdout': out, 'stderr': err}, open('/tmp/output.json','w'))"
)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class SlurmDevice(Device):
|
|
52
|
+
|
|
53
|
+
    def __init__(
        self,
        provider: Provider,
        device_specification: str,
        job_uuid: str,
        hpc_config: dict | None = None,
    ):
        """Device that submits a job to a Slurm cluster and tracks it.

        Args:
            provider: authenticated provider; expected to be a SlurmProvider.
            device_specification: Slurm partition the job will be submitted to.
            job_uuid: platform job id, used as logging context.
            hpc_config: optional resources/container/environment config dict
                (as assembled by HpcInvocation); defaults to an empty dict.
        """
        super().__init__(provider, device_specification)
        self.job_uuid: str = job_uuid
        # Narrowed alias: the concrete provider here is always a SlurmProvider.
        self.slurm: SlurmProvider = provider
        self.logger = job_logger(job_uuid)
        self.hpc_config = hpc_config or {}
        # A request-level bucket override wins over the pod-level default.
        self.s3_bucket = self.hpc_config.get("s3_bucket") or S3_BUCKET
|
|
66
|
+
|
|
67
|
+
# ── Abstract method implementations ──────────────────────────────────────
|
|
68
|
+
|
|
69
|
+
    def _is_simulator(self) -> bool:
        # Slurm HPC jobs run on real cluster hardware, never a simulator.
        return False
|
|
71
|
+
|
|
72
|
+
    def _create_job(self, circuit: str, options: CircuitRunningOption) -> dict:
        """Wrap circuit (user script string) in SBATCH infrastructure and submit to Slurm REST API.

        Args:
            circuit: the user-prepared bash payload (not a quantum circuit).
            options: common running options; not consulted here.

        Returns:
            dict with the Slurm-side job id and the platform job uuid.

        Raises:
            requests.HTTPError: the submit endpoint rejected the request.
            RuntimeError: the submit response carried no job id.
        """
        bash = self._build_sbatch_script(circuit)

        payload = {
            "job": {
                # Human-readable name: short prefix of the platform job uuid.
                "name": f"quapp-{self.job_uuid[:8]}",
                "account": SLURM_ACCOUNT,
                "environment": [
                    "PATH=/usr/bin:/bin:/usr/local/bin",
                    f"AWS_DEFAULT_REGION={AWS_REGION}",
                ],
                # presumably minutes (env var is SLURM_TIME_LIMIT_MIN) — TODO
                # confirm against the slurmrestd v0.0.40 schema.
                "time_limit": {
                    "number": SLURM_TIME_LIMIT,
                    "set": True,
                    "infinite": False,
                },
                # The device specification doubles as the Slurm partition name.
                "partition": self.device,
                "current_working_directory": "/data/jobs",
                "standard_output": f"/data/jobs/{self.job_uuid}.out",
                "standard_error": f"/data/jobs/{self.job_uuid}.err",
            },
            "script": bash,
        }

        resp = requests.post(
            f"{self.slurm.base_url}/job/submit",
            headers=self.slurm.auth_headers(),
            json=payload,
            timeout=30,
        )
        resp.raise_for_status()
        data = resp.json()
        # Field name varies between slurmrestd responses — accept either spelling.
        slurm_job_id = data.get("job_id") or data.get("jobId")

        if not slurm_job_id:
            raise RuntimeError(f"Slurm submit response missing job_id: {data}")

        self.logger.info(f"Slurm job submitted: slurm_job_id={slurm_job_id}")
        return {"slurm_job_id": str(slurm_job_id), "job_uuid": self.job_uuid}
|
|
112
|
+
|
|
113
|
+
    def _get_provider_job_id(self, job: dict) -> str:
        """Return the Slurm-side job id recorded at submission time."""
        return job["slurm_job_id"]
|
|
115
|
+
|
|
116
|
+
    def _get_job_status(self, job: dict) -> str:
        """Map the job's current Slurm state onto a platform job status."""
        state = self._fetch_slurm_state(job["slurm_job_id"])
        # Unknown/unmapped states are treated as still running.
        return _SLURM_TO_JOB_STATUS.get(state, JobStatus.RUNNING.value)
|
|
119
|
+
|
|
120
|
+
def _get_job_result(self, job: dict):
    """Block until Slurm job finishes, then download result from S3."""
    slurm_job_id = job["slurm_job_id"]
    job_uuid = job["job_uuid"]

    waited = 0
    # Poll every SLURM_POLL_SEC seconds until done, failed, or timed out.
    while waited < SLURM_TIMEOUT_SEC:
        current_state = self._fetch_slurm_state(slurm_job_id)
        self.logger.debug(f"Slurm job {slurm_job_id} state: {current_state}")

        if current_state == _DONE_STATE:
            return self._download_s3_result(job_uuid)
        if current_state in _TERMINAL_STATES:
            raise RuntimeError(f"Slurm job {slurm_job_id} ended with state: {current_state}")

        time.sleep(SLURM_POLL_SEC)
        waited += SLURM_POLL_SEC

    raise TimeoutError(f"Slurm job {slurm_job_id} timed out after {SLURM_TIMEOUT_SEC}s")
|
|
140
|
+
|
|
141
|
+
def _produce_histogram_data(self, job_result) -> None:
|
|
142
|
+
return None
|
|
143
|
+
|
|
144
|
+
def _calculate_execution_time(self, job_result) -> None:
|
|
145
|
+
self.execution_time = None
|
|
146
|
+
|
|
147
|
+
# ── SBATCH script builder ─────────────────────────────────────────────────
|
|
148
|
+
|
|
149
|
+
def _build_sbatch_script(self, user_script: str) -> str:
    """Build the full SBATCH bash script that wraps the user's computation.

    The generated script:
      1. declares #SBATCH resource directives from ``hpc_config["resources"]``,
      2. downloads any requested S3 input files,
      3. exports extra environment variables,
      4. runs the user script (optionally inside an Apptainer container),
      5. collects stdout/stderr/exit code and uploads output.json to S3.

    :param user_script: the user's bash script content (embedded base64-encoded
        so arbitrary content survives quoting).
    :return: the complete sbatch script as a single string.
    """
    resources = self.hpc_config.get("resources", {})
    container = self.hpc_config.get("container", {})
    environment = self.hpc_config.get("environment", {})
    input_s3_paths = self.hpc_config.get("input_s3_paths", [])

    lines = ["#!/bin/bash"]

    # ── SBATCH directives ─────────────────────────────────────────────────
    gpus = int(resources.get("gpus", 0))
    # Accept both camelCase and snake_case memory keys; otherwise GB * 1024.
    memory_mb = (resources.get("memoryMb")
                 or resources.get("memory_mb")
                 or int(resources.get("memory_gb", 4)) * 1024)
    lines += [
        # Fix: honour the configurable SLURM_ACCOUNT (the same account that
        # _create_job sends in the REST payload) instead of a hard-coded
        # literal; the previous line was also an f-string with no placeholder.
        f"#SBATCH --account={SLURM_ACCOUNT}",
        f"#SBATCH --partition={resources.get('partition', 'cpu')}",
        f"#SBATCH --nodes={resources.get('nodes', 1)}",
        f"#SBATCH --ntasks={resources.get('ntasks', 1)}",
        f"#SBATCH --cpus-per-task={resources.get('cpus_per_task', 1)}",
        f"#SBATCH --mem={memory_mb}M",
        f"#SBATCH --time={resources.get('walltime') or resources.get('time_limit', '01:00:00')}",
    ]
    if gpus > 0:
        gpu_type = resources.get("gpu_type", "")
        gres = f"gpu:{gpu_type}:{gpus}" if gpu_type else f"gpu:{gpus}"
        lines.append(f"#SBATCH --gres={gres}")
    lines.append("")

    # ── Runtime setup ─────────────────────────────────────────────────────
    # `module load` may not exist on all nodes; failure is tolerated.
    lines += ["set -euo pipefail", "module load apptainer 2>/dev/null || true", ""]

    # ── Download S3 input files ───────────────────────────────────────────
    if input_s3_paths:
        lines.append("mkdir -p /tmp/quapp_inputs")
        for path in input_s3_paths:
            lines.append(f"aws s3 cp {shlex.quote(path)} /tmp/quapp_inputs/")
        lines.append("")

    # ── Extra environment variables ───────────────────────────────────────
    for k, v in environment.items():
        lines.append(f"export {k}={shlex.quote(str(v))}")
    if environment:
        lines.append("")

    # ── Container exec prefix ─────────────────────────────────────────────
    container_type = container.get("type", "none")
    image = container.get("image", "")
    if container_type == "sif" and image:
        exec_prefix = f"apptainer exec /data/containers/{image}"
    elif container_type == "docker" and image:
        exec_prefix = f"apptainer exec docker://{image}"
    else:
        exec_prefix = ""

    # ── Write and run user script ─────────────────────────────────────────
    # Base64 round-trip avoids any shell-quoting issues in the user script.
    b64 = base64.b64encode(user_script.encode()).decode()
    lines += [
        f"echo {shlex.quote(b64)} | base64 -d > /tmp/quapp_job.sh",
        "chmod +x /tmp/quapp_job.sh",
        "",
    ]
    run_cmd = "bash /tmp/quapp_job.sh"
    full_cmd = f"{exec_prefix} {run_cmd}".strip()
    lines += [
        # `|| true` so a failing user script still reaches the upload step.
        f"{full_cmd} > /tmp/quapp_stdout.txt 2>/tmp/quapp_stderr.txt || true",
        "echo $? > /tmp/quapp_exit_code.txt",
        "",
    ]

    # ── Collect output and upload to S3 ──────────────────────────────────
    s3_uri = f"s3://{self.s3_bucket}/{self.job_uuid}/output.json"
    lines += [
        f"python3 -c {shlex.quote(_COLLECT_PY)}",
        f"aws s3 cp /tmp/output.json {shlex.quote(s3_uri)}",
    ]

    return "\n".join(lines)
|
|
227
|
+
|
|
228
|
+
# ── Helpers ───────────────────────────────────────────────────────────────
|
|
229
|
+
|
|
230
|
+
def _fetch_slurm_state(self, slurm_job_id: str) -> str:
    """Query the Slurm REST API for one job and return its state string."""
    response = requests.get(
        f"{self.slurm.base_url}/job/{slurm_job_id}",
        headers=self.slurm.auth_headers(),
        timeout=15,
    )
    response.raise_for_status()
    payload = response.json()

    # Some slurmrestd responses nest records under "jobs"; others are flat.
    records = payload.get("jobs", [payload])
    if not records:
        return "UNKNOWN"

    raw_state = records[0].get("job_state", "UNKNOWN")
    # job_state may be a list of state flags; the first entry is primary.
    if isinstance(raw_state, list):
        raw_state = raw_state[0] if raw_state else "UNKNOWN"

    return str(raw_state).strip()
|
|
248
|
+
|
|
249
|
+
def _download_s3_result(self, job_uuid: str) -> dict:
    """Fetch and decode the JSON result object the job uploaded to S3."""
    key = f"{job_uuid}/output.json"
    self.logger.info(f"Downloading result from s3://{self.s3_bucket}/{key}")

    client = boto3.client("s3", region_name=AWS_REGION)
    obj = client.get_object(Bucket=self.s3_bucket, Key=key)
    return json.loads(obj["Body"].read())
|
|
File without changes
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import requests
|
|
3
|
+
from quapp_common.config.logging_config import job_logger
|
|
4
|
+
from quapp_common.enum.provider_tag import ProviderTag
|
|
5
|
+
from quapp_common.model.provider.provider import Provider
|
|
6
|
+
|
|
7
|
+
logger = job_logger('SlurmProvider')

# Slurm REST API connection settings; all overridable via environment
# variables (presumably supplied by the `slurm-credentials` K8s secret —
# TODO confirm against deployment manifests).
SLURM_API_URL = os.getenv("SLURM_API_URL", "http://10.1.0.15:6820")
SLURM_API_VER = os.getenv("SLURM_API_VER", "v0.0.40")
SLURM_USERNAME = os.getenv("SLURM_USERNAME", "quapp-svc")
SLURM_ACCOUNT = os.getenv("SLURM_ACCOUNT", "quapp")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SlurmProvider(Provider):
    """Quapp provider that talks to a Slurm cluster via its REST API."""

    def __init__(self, jwt_token: str):
        super().__init__(ProviderTag.SLURM_HPC)
        self.jwt_token = jwt_token
        # e.g. http://10.1.0.15:6820/slurm/v0.0.40
        self.base_url = f"{SLURM_API_URL}/slurm/{SLURM_API_VER}"

    def get_backend(self, device_specification: str) -> str:
        """Return the Slurm partition name; empty spec falls back to 'compute'."""
        if device_specification:
            return device_specification
        return "compute"

    def collect_provider(self):
        """Verify the Slurm REST endpoint is reachable, then return self."""
        ping_response = requests.get(f"{SLURM_API_URL}/ping", timeout=10)
        ping_response.raise_for_status()
        logger.debug(f"Slurm API reachable: {ping_response.json()}")
        return self

    def auth_headers(self) -> dict:
        """Headers required by slurmrestd JWT authentication."""
        headers = {"X-SLURM-USER-NAME": SLURM_USERNAME}
        headers["X-SLURM-USER-TOKEN"] = self.jwt_token
        headers["Content-Type"] = "application/json"
        return headers
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: quapp-hpc
|
|
3
|
+
Version: 0.0.1.dev1
|
|
4
|
+
Summary: Quapp HPC library — Slurm integration for Quapp Platform
|
|
5
|
+
Author-email: "CITYNOW Co. Ltd." <corp@citynow.vn>
|
|
6
|
+
License: The MIT License (MIT)
|
|
7
|
+
Copyright © CITYNOW Co. Ltd. All rights reserved.
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
14
|
+
Project-URL: Homepage, https://quapp.cloud/
|
|
15
|
+
Keywords: quapp,quapp-hpc,slurm,hpc
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Requires-Python: <3.13,>=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: quapp-common==0.0.11.dev9
|
|
23
|
+
Requires-Dist: requests>=2.31.0
|
|
24
|
+
Requires-Dist: boto3>=1.28.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: black; extra == "dev"
|
|
27
|
+
Requires-Dist: bumpver; extra == "dev"
|
|
28
|
+
Requires-Dist: isort; extra == "dev"
|
|
29
|
+
Requires-Dist: pip-tools; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# quapp-hpc
|
|
34
|
+
|
|
35
|
+
Python library cho Quapp HPC functions — cầu nối giữa Quapp FaaS platform và Slurm HPC cluster.
|
|
36
|
+
|
|
37
|
+
## Architecture
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
ksvc (Docker)
|
|
41
|
+
├── index.py FastAPI server
|
|
42
|
+
├── quapp_hpc/
|
|
43
|
+
│ ├── factory/
|
|
44
|
+
│ │ └── hpc_handler_factory.py Entry point cho user
|
|
45
|
+
│ ├── component/backend/
|
|
46
|
+
│ │ └── hpc_invocation.py Orchestrates job lifecycle
|
|
47
|
+
│ └── model/
|
|
48
|
+
│ ├── provider/slurm_provider.py Auth headers, base URL
|
|
49
|
+
│ └── device/slurm_device.py Submit → Poll → S3 download
|
|
50
|
+
└── function/
|
|
51
|
+
└── handler.py User-defined processing() + post_processing()
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Luồng thực thi
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
index.py nhận HTTP POST
|
|
58
|
+
→ HpcHandlerFactory.create_handler(event, processing_fn, post_processing_fn)
|
|
59
|
+
→ InvocationHandler.handle()
|
|
60
|
+
→ HpcInvocation.submit_job()
|
|
61
|
+
1. processing_fn(invocation_input) → bash script string
|
|
62
|
+
2. SlurmDevice._create_job(script) → POST Slurm REST API → slurm_job_id
|
|
63
|
+
3. SlurmDevice._get_job_result() → poll mỗi 30s → COMPLETED/FAILED
|
|
64
|
+
4. SlurmDevice._download_s3_result()→ boto3 get s3://$S3_BUCKET/$JOB_UUID/output.json
|
|
65
|
+
5. post_processing_fn(s3_result) → final response
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Environment variables (từ K8s Secret `slurm-credentials`)
|
|
69
|
+
|
|
70
|
+
| Var | Ví dụ | Mô tả |
|
|
71
|
+
|-----|-------|-------|
|
|
72
|
+
| `SLURM_API_URL` | `http://10.1.0.15:6820` | Slurm REST API base URL |
|
|
73
|
+
| `SLURM_JWT` | `eyJ...` | JWT token cho Slurm auth |
|
|
74
|
+
| `SLURM_USERNAME` | `quapp-svc` | Slurm username |
|
|
75
|
+
| `SLURM_ACCOUNT` | `quapp` | Slurm account/allocation |
|
|
76
|
+
| `S3_BUCKET` | `quapp-slurm-output-dev` | S3 bucket cho job output |
|
|
77
|
+
| `AWS_REGION` | `ap-southeast-1` | AWS region |
|
|
78
|
+
| `SLURM_POLL_SEC` | `30` | Polling interval (giây) |
|
|
79
|
+
| `SLURM_TIMEOUT_SEC` | `21600` | Max wait time (giây, default 6h) |
|
|
80
|
+
|
|
81
|
+
## invocation_input schema
|
|
82
|
+
|
|
83
|
+
Xem chi tiết tại [`../qapp-sdk-templates/slurm-hpc/README.md`](../../qapp-sdk-templates/slurm-hpc/README.md).
|
|
84
|
+
|
|
85
|
+
Tóm tắt:
|
|
86
|
+
```json
|
|
87
|
+
{
|
|
88
|
+
"resources": { "partition", "nodes", "cpus_per_task", "gpus", "memory_gb", "time_limit" },
|
|
89
|
+
"container": { "type": "sif"|"docker"|"none", "image": "..." },
|
|
90
|
+
"job": { "type": "script"|"command", "script"|"command": "...", "environment": {}, "input_s3_paths": [] }
|
|
91
|
+
}
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Slurm REST API
|
|
95
|
+
|
|
96
|
+
- Version: `v0.0.40`
|
|
97
|
+
- Submit: `POST {SLURM_API_URL}/slurm/v0.0.40/job/submit`
|
|
98
|
+
- Status: `GET {SLURM_API_URL}/slurm/v0.0.40/job/{job_id}`
|
|
99
|
+
- Auth headers: `X-SLURM-USER-NAME`, `X-SLURM-USER-TOKEN`
|
|
100
|
+
|
|
101
|
+
### Job state mapping
|
|
102
|
+
|
|
103
|
+
| Slurm state | Quapp state |
|
|
104
|
+
|---|---|
|
|
105
|
+
| PENDING, CONFIGURING, RUNNING, COMPLETING | RUNNING |
|
|
106
|
+
| COMPLETED | DONE |
|
|
107
|
+
| FAILED, CANCELLED, TIMEOUT, NODE_FAIL, PREEMPTED | ERROR |
|
|
108
|
+
|
|
109
|
+
## S3 output pattern
|
|
110
|
+
|
|
111
|
+
Job script phải upload kết quả:
|
|
112
|
+
```bash
|
|
113
|
+
aws s3 cp /tmp/output.json s3://$S3_BUCKET/$JOB_UUID/output.json
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Đường dẫn S3 đầy đủ (`s3://<S3_BUCKET>/<JOB_UUID>/output.json`) được `SlurmDevice._build_sbatch_script()` ghi sẵn vào bước upload cuối script, nên job script không cần tự đọc `$JOB_UUID` hay `$S3_BUCKET` (Slurm `environment` array hiện chỉ inject `PATH` và `AWS_DEFAULT_REGION`).
|
|
117
|
+
|
|
118
|
+
## K8s Secret
|
|
119
|
+
|
|
120
|
+
```yaml
|
|
121
|
+
# infrastructure/quapp-job-scheduler/k8s/cts/slurm-secret.yaml
|
|
122
|
+
apiVersion: v1
|
|
123
|
+
kind: Secret
|
|
124
|
+
metadata:
|
|
125
|
+
name: slurm-credentials
|
|
126
|
+
namespace: quapp-functions-dev
|
|
127
|
+
stringData:
|
|
128
|
+
SLURM_JWT: "<generate: sudo scontrol token username=quapp-svc lifespan=2592000>"
|
|
129
|
+
SLURM_API_URL: "http://10.1.0.15:6820"
|
|
130
|
+
SLURM_USERNAME: "quapp-svc"
|
|
131
|
+
SLURM_ACCOUNT: "quapp"
|
|
132
|
+
S3_BUCKET: "quapp-slurm-output-dev"
|
|
133
|
+
AWS_REGION: "ap-southeast-1"
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## DB seed required
|
|
137
|
+
|
|
138
|
+
Chạy script trước khi deploy:
|
|
139
|
+
```
|
|
140
|
+
infrastructure/quapp-functions-backend/docs/db/seed_slurm_hpc.sql
|
|
141
|
+
```
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
quapp_hpc/__init__.py
|
|
5
|
+
quapp_hpc.egg-info/PKG-INFO
|
|
6
|
+
quapp_hpc.egg-info/SOURCES.txt
|
|
7
|
+
quapp_hpc.egg-info/dependency_links.txt
|
|
8
|
+
quapp_hpc.egg-info/requires.txt
|
|
9
|
+
quapp_hpc.egg-info/top_level.txt
|
|
10
|
+
quapp_hpc/component/__init__.py
|
|
11
|
+
quapp_hpc/component/backend/__init__.py
|
|
12
|
+
quapp_hpc/component/backend/hpc_invocation.py
|
|
13
|
+
quapp_hpc/component/backend/slurm_job_fetching.py
|
|
14
|
+
quapp_hpc/factory/__init__.py
|
|
15
|
+
quapp_hpc/factory/hpc_device_factory.py
|
|
16
|
+
quapp_hpc/factory/hpc_handler_factory.py
|
|
17
|
+
quapp_hpc/factory/hpc_provider_factory.py
|
|
18
|
+
quapp_hpc/handler/__init__.py
|
|
19
|
+
quapp_hpc/handler/invocation_handler.py
|
|
20
|
+
quapp_hpc/handler/job_fetching_handler.py
|
|
21
|
+
quapp_hpc/model/__init__.py
|
|
22
|
+
quapp_hpc/model/device/__init__.py
|
|
23
|
+
quapp_hpc/model/device/slurm_device.py
|
|
24
|
+
quapp_hpc/model/provider/__init__.py
|
|
25
|
+
quapp_hpc/model/provider/slurm_provider.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
quapp_hpc
|