qcsc-prefect-executor 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qcsc_prefect_executor-0.1.0/PKG-INFO +21 -0
- qcsc_prefect_executor-0.1.0/README.md +3 -0
- qcsc_prefect_executor-0.1.0/pyproject.toml +32 -0
- qcsc_prefect_executor-0.1.0/setup.cfg +4 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor/__init__.py +19 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor/from_blocks.py +377 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor/fugaku/__init__.py +3 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor/fugaku/run.py +150 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor/miyabi/__init__.py +15 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor/miyabi/from_blocks.py +44 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor/miyabi/run.py +195 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor/slurm/__init__.py +15 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor/slurm/from_blocks.py +42 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor/slurm/run.py +140 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor.egg-info/PKG-INFO +21 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor.egg-info/SOURCES.txt +23 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor.egg-info/dependency_links.txt +1 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor.egg-info/requires.txt +4 -0
- qcsc_prefect_executor-0.1.0/src/qcsc_prefect_executor.egg-info/top_level.txt +1 -0
- qcsc_prefect_executor-0.1.0/tests/test_run_fugaku_job_fugaku_integration.py +93 -0
- qcsc_prefect_executor-0.1.0/tests/test_run_fugaku_job_local.py +122 -0
- qcsc_prefect_executor-0.1.0/tests/test_run_job_from_blocks_local.py +422 -0
- qcsc_prefect_executor-0.1.0/tests/test_run_miyabi_job_local.py +121 -0
- qcsc_prefect_executor-0.1.0/tests/test_run_miyabi_job_miyabi_integration.py +90 -0
- qcsc_prefect_executor-0.1.0/tests/test_run_slurm_job_local.py +117 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: qcsc-prefect-executor
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Prefect executor integrations for HPC schedulers
|
|
5
|
+
Author: QCSC Prefect Contributors
|
|
6
|
+
Maintainer: QCSC Prefect Maintainers
|
|
7
|
+
License-Expression: Apache-2.0
|
|
8
|
+
Project-URL: Homepage, https://github.com/qiskit-community/qcsc-prefect
|
|
9
|
+
Project-URL: Repository, https://github.com/qiskit-community/qcsc-prefect
|
|
10
|
+
Project-URL: Documentation, https://qiskit-community.github.io/qcsc-prefect/
|
|
11
|
+
Project-URL: Issues, https://github.com/qiskit-community/qcsc-prefect/issues
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: prefect>=2.19
|
|
15
|
+
Requires-Dist: qcsc-prefect-adapters==0.1.0
|
|
16
|
+
Requires-Dist: qcsc-prefect-blocks==0.1.0
|
|
17
|
+
Requires-Dist: qcsc-prefect-core==0.1.0
|
|
18
|
+
|
|
19
|
+
# QCSC Prefect Executor
|
|
20
|
+
|
|
21
|
+
Prefect executor integrations for QCSC HPC scheduler workflows.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77.0.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "qcsc-prefect-executor"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Prefect executor integrations for HPC schedulers"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "Apache-2.0"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "QCSC Prefect Contributors" },
|
|
14
|
+
]
|
|
15
|
+
maintainers = [
|
|
16
|
+
{ name = "QCSC Prefect Maintainers" },
|
|
17
|
+
]
|
|
18
|
+
dependencies = [
|
|
19
|
+
"prefect>=2.19",
|
|
20
|
+
"qcsc-prefect-adapters==0.1.0",
|
|
21
|
+
"qcsc-prefect-blocks==0.1.0",
|
|
22
|
+
"qcsc-prefect-core==0.1.0",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Homepage = "https://github.com/qiskit-community/qcsc-prefect"
|
|
27
|
+
Repository = "https://github.com/qiskit-community/qcsc-prefect"
|
|
28
|
+
Documentation = "https://qiskit-community.github.io/qcsc-prefect/"
|
|
29
|
+
Issues = "https://github.com/qiskit-community/qcsc-prefect/issues"
|
|
30
|
+
|
|
31
|
+
[tool.setuptools.packages.find]
|
|
32
|
+
where = ["src"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""HPC executor package.
|
|
2
|
+
|
|
3
|
+
Keep top-level imports lightweight to avoid circular-import side effects when
|
|
4
|
+
submodules (e.g. ``qcsc_prefect_executor.miyabi.run``) are imported directly.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
async def run_job_from_blocks(*args: Any, **kwargs: Any):
    """Forward every argument to ``from_blocks.run_job_from_blocks`` and await it."""
    # Deferred import: resolving the submodule at call time prevents circular
    # initialization across package __init__.py files.
    from . import from_blocks as _from_blocks

    return await _from_blocks.run_job_from_blocks(*args, **kwargs)


__all__ = ["run_job_from_blocks"]
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import inspect
|
|
4
|
+
import re
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from qcsc_prefect_adapters.fugaku.builder import FugakuJobRequest
|
|
10
|
+
from qcsc_prefect_adapters.miyabi.builder import MiyabiJobRequest
|
|
11
|
+
from qcsc_prefect_adapters.slurm.builder import SlurmJobRequest
|
|
12
|
+
from qcsc_prefect_blocks.common.blocks import CommandBlock, ExecutionProfileBlock, HPCProfileBlock
|
|
13
|
+
from qcsc_prefect_core.models.execution_profile import ExecutionProfile
|
|
14
|
+
|
|
15
|
+
from qcsc_prefect_executor.fugaku.run import run_fugaku_job
|
|
16
|
+
from qcsc_prefect_executor.miyabi.run import run_miyabi_job
|
|
17
|
+
from qcsc_prefect_executor.slurm.run import run_slurm_job
|
|
18
|
+
|
|
19
|
+
# Execution-profile fields that may be replaced through
# ``execution_profile_overrides``; any other key is rejected.
_EXECUTION_PROFILE_OVERRIDE_KEYS = {
    # resources
    "num_nodes",
    "mpiprocs",
    "ompthreads",
    "walltime",
    # launch configuration
    "launcher",
    "mpi_options",
    # environment setup
    "modules",
    "pre_commands",
    "environments",
}

# Script filename suffix for each supported scheduler target.
_SCRIPT_SUFFIX_BY_TARGET = dict(
    miyabi=".pbs",
    fugaku=".pjm",
    slurm=".slurm",
)

# Every suffix above; used to recognise filenames that already carry one.
_KNOWN_SCRIPT_SUFFIXES = frozenset(_SCRIPT_SUFFIX_BY_TARGET.values())
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True)
class SubmissionTarget:
    """Immutable scheduler routing record resolved from Prefect blocks.

    Attributes:
        hpc_target: Name of the scheduler backend, e.g. ``"miyabi"``,
            ``"fugaku"``, or ``"slurm"``.
        queue_name: Queue, partition, or resource group chosen for the
            execution profile's resource class.
        project: Project, group, or account chosen for the resource class.
            May be an empty string for targets that do not need an account.
    """

    hpc_target: str
    queue_name: str
    project: str
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
async def _resolve_loaded_block(value):
    """Return ``value``, awaiting it first when it is awaitable."""
    return (await value) if inspect.isawaitable(value) else value
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
async def _load_block(block_cls, block_name: str):
    """Call ``block_cls.load(block_name)`` and return the loaded block.

    The result is passed through ``_resolve_loaded_block`` so that ``load``
    may return either the block itself or an awaitable producing it.
    """
    loaded = block_cls.load(block_name)
    return await _resolve_loaded_block(loaded)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _resolve_submission_target_from_loaded_blocks(
    hpc_block: HPCProfileBlock, resource_class: str
) -> SubmissionTarget:
    """Pick the queue/project pair of ``hpc_block`` matching ``resource_class``.

    The GPU fields are used only when ``resource_class`` equals ``"gpu"``;
    every other value selects the CPU fields.
    """
    if resource_class == "gpu":
        queue, project = hpc_block.queue_gpu, hpc_block.project_gpu
    else:
        queue, project = hpc_block.queue_cpu, hpc_block.project_cpu

    return SubmissionTarget(
        hpc_target=hpc_block.hpc_target,
        queue_name=queue,
        project=project,
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
async def resolve_hpc_target(*, hpc_profile_block_name: str) -> str:
    """Return the scheduler target name stored in an ``HPCProfileBlock``.

    Args:
        hpc_profile_block_name: Prefect block document name for
            `qcsc_prefect_blocks.common.blocks.HPCProfileBlock`.

    Returns:
        The block's ``hpc_target`` converted to ``str``, for example
        ``"miyabi"``, ``"fugaku"``, or ``"slurm"``.
    """
    block = await _load_block(HPCProfileBlock, hpc_profile_block_name)
    target = block.hpc_target
    return str(target)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
async def resolve_submission_target(
    *,
    hpc_profile_block_name: str,
    execution_profile_block_name: str,
) -> SubmissionTarget:
    """Work out scheduler routing from block names; no job is submitted.

    Handy when a flow wants to look at the target queue or project before it
    creates scheduler-specific filenames or logs. Both the
    ``HPCProfileBlock`` and the ``ExecutionProfileBlock`` are loaded, and the
    execution profile's ``resource_class`` decides whether the CPU or GPU
    queue/project fields are used.

    Args:
        hpc_profile_block_name: Prefect block document name holding the
            target-specific scheduler settings.
        execution_profile_block_name: Prefect block document name holding the
            scheduler-independent execution settings.

    Returns:
        The resolved scheduler target, queue/partition/resource group, and
        project/account values.
    """
    hpc_block = await _load_block(HPCProfileBlock, hpc_profile_block_name)
    profile_block = await _load_block(ExecutionProfileBlock, execution_profile_block_name)
    resource_class = profile_block.resource_class
    return _resolve_submission_target_from_loaded_blocks(hpc_block, resource_class)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def build_scheduler_script_filename(script_stem: str, hpc_target: str) -> str:
    """Return ``script_stem`` carrying the suffix that ``hpc_target`` requires.

    A name that already ends in one of the known scheduler suffixes has that
    suffix swapped for the target's; any other name gets the target suffix
    appended. So ``"batch"`` turns into ``"batch.pbs"`` for Miyabi and
    ``"batch.slurm"`` for Slurm, while ``"batch.pbs"`` turns into
    ``"batch.pjm"`` for Fugaku.

    Args:
        script_stem: Logical script name or existing scheduler script filename.
        hpc_target: Scheduler target name.

    Returns:
        Script filename with the suffix required by the scheduler target.

    Raises:
        NotImplementedError: If ``hpc_target`` is not supported.
    """
    target_suffix = _SCRIPT_SUFFIX_BY_TARGET.get(hpc_target)
    if target_suffix is None:
        raise NotImplementedError(f"Unsupported hpc_target for script naming: {hpc_target}")

    candidate = Path(script_stem)
    already_suffixed = candidate.suffix in _KNOWN_SCRIPT_SUFFIXES
    renamed = (
        candidate.with_suffix(target_suffix)
        if already_suffixed
        else candidate.with_name(candidate.name + target_suffix)
    )
    return str(renamed)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
async def resolve_scheduler_script_filename(
    *,
    script_stem: str,
    hpc_profile_block_name: str,
) -> str:
    """Look up the scheduler target from a block and name the script for it.

    Args:
        script_stem: Logical script name or existing scheduler script filename.
        hpc_profile_block_name: Prefect block document name from which the
            scheduler target is read.

    Returns:
        Scheduler-specific script filename.
    """
    return build_scheduler_script_filename(
        script_stem,
        await resolve_hpc_target(hpc_profile_block_name=hpc_profile_block_name),
    )
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _build_execution_profile(
    *,
    command_block: CommandBlock,
    execution_profile_block: ExecutionProfileBlock,
    user_args: list[str] | None,
    execution_profile_overrides: dict[str, Any] | None,
) -> ExecutionProfile:
    """Assemble an ``ExecutionProfile`` from blocks, extra args, and overrides.

    Arguments are the command block's ``default_args`` followed by
    ``user_args``. Field values come from the execution profile block and are
    then replaced by any entries in ``execution_profile_overrides``.

    Raises:
        ValueError: If ``execution_profile_overrides`` contains a key outside
            ``_EXECUTION_PROFILE_OVERRIDE_KEYS``.
    """
    merged_args = [*command_block.default_args, *(user_args or [])]

    fields: dict[str, Any] = dict(
        command_key=command_block.command_name,
        num_nodes=execution_profile_block.num_nodes,
        mpiprocs=execution_profile_block.mpiprocs,
        ompthreads=execution_profile_block.ompthreads,
        walltime=execution_profile_block.walltime,
        launcher=execution_profile_block.launcher,
        mpi_options=list(execution_profile_block.mpi_options),
        modules=list(execution_profile_block.modules),
        # pre_commands is read defensively; the block may not define it.
        pre_commands=list(getattr(execution_profile_block, "pre_commands", [])),
        environments=dict(execution_profile_block.environments),
        arguments=merged_args,
    )

    overrides = execution_profile_overrides or {}
    unsupported = sorted(set(overrides) - _EXECUTION_PROFILE_OVERRIDE_KEYS)
    if unsupported:
        raise ValueError(
            "Unsupported execution_profile_overrides keys: " + ", ".join(unsupported)
        )

    sequence_fields = {"mpi_options", "modules", "pre_commands"}
    for name, override in overrides.items():
        if override is None:
            # An explicit None is passed through untouched.
            fields[name] = None
        elif name in sequence_fields:
            # Copy so the profile does not share the caller's list.
            fields[name] = list(override)
        elif name == "environments":
            fields[name] = dict(override)
        else:
            fields[name] = override

    return ExecutionProfile(**fields)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _default_fugaku_job_name(command_name: str) -> str:
    """Derive a job name from ``command_name``.

    Each run of characters other than ASCII letters, digits, ``_`` and ``-``
    collapses to a single ``-``; leading and trailing ``-`` are removed; the
    result is cut to 63 characters. ``"prefect-job"`` is returned when
    nothing is left.
    """
    slug = re.sub(r"[^a-zA-Z0-9_-]+", "-", command_name).strip("-")
    return slug[:63] if slug else "prefect-job"
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
async def run_job_from_blocks(
    *,
    command_block_name: str,
    execution_profile_block_name: str,
    hpc_profile_block_name: str,
    work_dir: Path,
    script_filename: str,
    user_args: list[str] | None = None,
    watch_poll_interval: float = 10.0,
    timeout_seconds: float | None = None,
    metrics_artifact_key: str = "hpc-job-metrics",
    fugaku_job_name: str | None = None,
    execution_profile_overrides: dict[str, Any] | None = None,
) -> Any:
    """Load Prefect blocks and run the job on whichever HPC target they name.

    Main block-driven entrypoint for workflow authors. The command, execution
    profile, and HPC profile blocks are loaded and cross-checked, turned into
    the internal runtime models, wrapped in a target-specific scheduler
    request, and handed to the Miyabi, Fugaku, or Slurm executor.

    Args:
        command_block_name: Prefect block document name for the command to run.
        execution_profile_block_name: Prefect block document name describing
            resources, launcher, environment, and default execution behavior.
        hpc_profile_block_name: Prefect block document name describing the
            scheduler target, queues, projects, and executable mapping.
        work_dir: Working directory where scheduler scripts and job outputs are
            created. It is expanded (``~``) and resolved to an absolute path.
        script_filename: Logical or scheduler-specific script filename. The
            suffix is normalized for the resolved target.
        user_args: Optional extra command-line arguments appended after the
            command block's default arguments.
        watch_poll_interval: Seconds to wait between scheduler status polls.
        timeout_seconds: Optional maximum wait time for terminal job status.
        metrics_artifact_key: Prefect artifact key used for job metrics.
        fugaku_job_name: Optional Fugaku PJM job name. When omitted, a name is
            derived from the command name.
        execution_profile_overrides: Optional runtime overrides for selected
            execution profile fields, such as ``num_nodes`` or ``walltime``.

    Returns:
        A target-specific result object: ``MiyabiRunResult``,
        ``FugakuRunResult``, or ``SlurmRunResult``.

    Raises:
        ValueError: If the command and execution profile blocks refer to
            different command names, if a required project/group is missing, or
            if unsupported execution profile override keys are provided.
        KeyError: If the command's executable key is missing from the HPC
            profile's executable map.
        NotImplementedError: If the resolved ``hpc_target`` is unsupported.
    """
    command_block = await _load_block(CommandBlock, command_block_name)
    execution_profile_block = await _load_block(ExecutionProfileBlock, execution_profile_block_name)
    hpc_block = await _load_block(HPCProfileBlock, hpc_profile_block_name)

    # The execution profile must have been written for this very command.
    if execution_profile_block.command_name != command_block.command_name:
        raise ValueError(
            f"ExecutionProfileBlock '{execution_profile_block_name}' is for command "
            f"'{execution_profile_block.command_name}', but command block "
            f"'{command_block_name}' is '{command_block.command_name}'."
        )

    executable = hpc_block.executable_map.get(command_block.executable_key)
    if not executable:
        raise KeyError(
            f"Executable key '{command_block.executable_key}' was not found in "
            f"HPCProfileBlock '{hpc_profile_block_name}'."
        )

    target = _resolve_submission_target_from_loaded_blocks(
        hpc_block, execution_profile_block.resource_class
    )
    scheduler = target.hpc_target
    resolved_script_filename = build_scheduler_script_filename(script_filename, scheduler)

    # Miyabi and Fugaku need a project/group; Slurm may run without an account.
    if scheduler in {"miyabi", "fugaku"} and not target.project:
        raise ValueError("Project/Group is empty. Update HPCProfileBlock project_cpu/project_gpu.")

    exec_profile = _build_execution_profile(
        command_block=command_block,
        execution_profile_block=execution_profile_block,
        user_args=user_args,
        execution_profile_overrides=execution_profile_overrides,
    )

    # Keyword arguments that every backend executor receives unchanged.
    shared_kwargs: dict[str, Any] = {
        "work_dir": Path(work_dir).expanduser().resolve(),
        "script_filename": resolved_script_filename,
        "exec_profile": exec_profile,
        "watch_poll_interval": watch_poll_interval,
        "timeout_seconds": timeout_seconds,
        "metrics_artifact_key": metrics_artifact_key,
    }

    if scheduler == "miyabi":
        miyabi_req = MiyabiJobRequest(
            queue_name=target.queue_name,
            project=target.project,
            executable=executable,
        )
        return await run_miyabi_job(req=miyabi_req, **shared_kwargs)

    if scheduler == "fugaku":
        job_name = fugaku_job_name or _default_fugaku_job_name(command_block.command_name)
        fugaku_req = FugakuJobRequest(
            queue_name=target.queue_name,
            project=target.project,
            executable=executable,
            job_name=job_name,
            gfscache=hpc_block.gfscache or "/vol0002",
            # Unset/empty block fields become fresh empty lists.
            spack_modules=list(hpc_block.spack_modules or []),
            mpi_options_for_pjm=list(hpc_block.mpi_options_for_pjm or []),
            pjm_resources=list(hpc_block.pjm_resources or []),
        )
        return await run_fugaku_job(req=fugaku_req, **shared_kwargs)

    if scheduler == "slurm":
        slurm_req = SlurmJobRequest(
            partition=target.queue_name,
            # An empty project is sent as "no account".
            account=target.project or None,
            executable=executable,
            qpu=hpc_block.slurm_qpu,
        )
        return await run_slurm_job(req=slurm_req, **shared_kwargs)

    raise NotImplementedError(
        f"hpc_target='{target.hpc_target}' is not supported yet by run_job_from_blocks."
    )
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from prefect.artifacts import create_table_artifact
|
|
8
|
+
from prefect.logging import get_run_logger
|
|
9
|
+
from qcsc_prefect_adapters.fugaku.builder import FugakuJobRequest, render_script, write_script_file
|
|
10
|
+
from qcsc_prefect_adapters.fugaku.runtime import FugakuPJMRuntime
|
|
11
|
+
from qcsc_prefect_core.models.execution_profile import ExecutionProfile
|
|
12
|
+
|
|
13
|
+
# Maximum number of characters of a log that truncate_log keeps.
MAX_LOG_SIZE = 10_000


def truncate_log(text: str) -> str:
    """Cap ``text`` at ``MAX_LOG_SIZE`` characters.

    Text within the limit is returned unchanged. Longer text is cut to the
    first ``MAX_LOG_SIZE`` characters and followed by a note stating how many
    characters were dropped.
    """
    overflow = len(text) - MAX_LOG_SIZE
    if overflow <= 0:
        return text
    return f"{text[:MAX_LOG_SIZE]}... (truncated {overflow} chars)"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _parse_stats_file(stats_file: Path | None) -> dict[str, str]:
    """Turn ``key : value`` lines of a stats file into a flat dictionary.

    Blank lines, the "Job/Node Statistical Information" heading lines, and
    lines without a ``" :"`` separator are ignored. Each key is lower-cased,
    has its spaces replaced by underscores, and is prefixed with ``"stats."``.
    An empty dictionary is returned when ``stats_file`` is ``None`` or does
    not exist.
    """
    if stats_file is None or not stats_file.exists():
        return {}

    headings = ("Job Statistical Information", "Node Statistical Information")
    parsed: dict[str, str] = {}
    for raw in stats_file.read_text(errors="replace").splitlines():
        entry = raw.strip()
        if not entry or entry.startswith(headings) or " :" not in entry:
            continue
        label, _, payload = entry.partition(" :")
        parsed["stats." + label.strip().lower().replace(" ", "_")] = payload.strip()
    return parsed
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _read_text_if_exists(path: Path) -> str:
    """Return the text of ``path``, or ``""`` when the path does not exist.

    Undecodable bytes are replaced rather than raising.
    """
    return path.read_text(errors="replace") if path.exists() else ""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass(frozen=True)
class FugakuRunResult:
    """Immutable outcome record produced by `run_fugaku_job`.

    Attributes:
        job_id: PJM job id returned by ``pjsub``.
        exit_status: Integer exit status, taken from PJM ``EC`` when present.
        state: Final PJM state, for example ``"EXT"`` or ``"CCL"``.
        job_status: Final PJM status dictionary parsed from ``pjstat``.
    """

    job_id: str
    exit_status: int
    state: str
    job_status: dict[str, Any]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _exit_status_from_pjm(final_status: dict[str, Any]) -> int:
    """Derive an integer exit status from a final PJM status dictionary.

    The ``EC`` field is used whenever it parses as an integer (signed values
    included). Otherwise the status falls back to ``0`` when the final state
    ``ST`` is ``"EXT"`` and ``-1`` for any other state.
    """
    raw_code = final_status.get("EC")
    if raw_code is not None:
        try:
            return int(str(raw_code).strip())
        except ValueError:
            # Non-numeric EC: fall through to the state-based fallback.
            pass
    return 0 if final_status.get("ST") == "EXT" else -1


async def run_fugaku_job(
    *,
    work_dir: Path,
    script_filename: str,
    exec_profile: ExecutionProfile,
    req: FugakuJobRequest,
    watch_poll_interval: float = 10.0,
    timeout_seconds: float | None = None,
    metrics_artifact_key: str = "fugaku-job-metrics",
) -> FugakuRunResult:
    """Execute a Fugaku job end-to-end from runtime models.

    .. note::
        This function is the high-level executor entrypoint. It internally
        renders a script, submits it, waits for final status, captures logs,
        parses stats, and publishes a metrics artifact.

    Args:
        work_dir: Working directory where scripts and job outputs are written.
            A plain string path is accepted and converted to ``Path``.
        script_filename: Job script filename to create in ``work_dir``.
        exec_profile: Scheduler-independent execution profile.
        req: Fugaku-specific scheduler request fields.
        watch_poll_interval: Poll interval in seconds for job status checks.
        timeout_seconds: Optional timeout for waiting final status.
        metrics_artifact_key: Prefect artifact key for job metrics table.

    Returns:
        `FugakuRunResult` containing job id, exit status, state, and
        final scheduler status payload. ``exit_status`` is the integer value
        of PJM ``EC`` when it is numeric; otherwise it is ``0`` for final
        state ``"EXT"`` and ``-1`` for any other state.
    """

    logger = get_run_logger()

    # Output paths below are built with the "/" operator, so normalise first.
    work_dir = Path(work_dir)

    script_basename = Path(script_filename).name
    script_text = render_script(
        work_dir=work_dir,
        exec_profile=exec_profile,
        req=req,
        script_basename=script_basename,
    )
    script_path = write_script_file(work_dir=work_dir, filename=script_filename, text=script_text)

    runtime = FugakuPJMRuntime()
    submit = await runtime.submit(script_path, cwd=work_dir)
    final_status = await runtime.wait_final_status(
        submit.job_id,
        watch_poll_interval=watch_poll_interval,
        timeout_seconds=timeout_seconds,
    )

    # Job output files are looked up as "<script>.<job_name>.<ext>" in work_dir.
    output_prefix = f"{script_basename}.{req.job_name}"
    out_file = work_dir / f"{output_prefix}.out"
    err_file = work_dir / f"{output_prefix}.err"
    stats_file = work_dir / f"{output_prefix}.stats"

    # Forward (truncated) job output to the Prefect run logger.
    if logs := _read_text_if_exists(out_file):
        logger.info(truncate_log(logs))
    if logs := _read_text_if_exists(err_file):
        logger.error(truncate_log(logs))

    artifact: dict[str, Any] = {
        "job_id": submit.job_id,
        "state": final_status.get("ST"),
        "exit_code": final_status.get("EC"),
        "stdout_file": str(out_file) if out_file.exists() else None,
        "stderr_file": str(err_file) if err_file.exists() else None,
        "stats_file": str(stats_file) if stats_file.exists() else None,
    }
    artifact.update(_parse_stats_file(stats_file))

    # Two-row table: header row of keys, then the matching values.
    await create_table_artifact(
        table=[list(artifact.keys()), list(artifact.values())],
        key=metrics_artifact_key,
    )

    return FugakuRunResult(
        job_id=submit.job_id,
        exit_status=_exit_status_from_pjm(final_status),
        state=str(final_status.get("ST", "")),
        job_status=final_status,
    )
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from .run import MiyabiRunResult, run_miyabi_job
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
async def run_miyabi_job_from_blocks(*args: Any, **kwargs: Any):
    """Forward every argument to ``from_blocks.run_miyabi_job_from_blocks``."""
    # Deferred import: avoids cycles with qcsc_prefect_executor.from_blocks.
    from . import from_blocks as _from_blocks

    return await _from_blocks.run_miyabi_job_from_blocks(*args, **kwargs)


__all__ = ["MiyabiRunResult", "run_miyabi_job", "run_miyabi_job_from_blocks"]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from qcsc_prefect_blocks.common.blocks import HPCProfileBlock
|
|
6
|
+
|
|
7
|
+
from qcsc_prefect_executor.from_blocks import run_job_from_blocks
|
|
8
|
+
from qcsc_prefect_executor.miyabi.run import MiyabiRunResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
async def run_miyabi_job_from_blocks(
    *,
    command_block_name: str,
    execution_profile_block_name: str,
    hpc_profile_block_name: str,
    work_dir: Path,
    script_filename: str,
    user_args: list[str] | None = None,
    watch_poll_interval: float = 10.0,
    timeout_seconds: float | None = None,
    metrics_artifact_key: str = "miyabi-job-metrics",
) -> MiyabiRunResult:
    """Backward-compatible wrapper around `run_job_from_blocks`.

    The HPC profile block is loaded first to confirm that its ``hpc_target``
    is ``"miyabi"``; all arguments are then passed unchanged to
    `run_job_from_blocks`.

    Args:
        command_block_name: Prefect block document name for the command.
        execution_profile_block_name: Prefect block document name for the
            execution profile.
        hpc_profile_block_name: Prefect block document name for the HPC
            profile; its ``hpc_target`` must be ``"miyabi"``.
        work_dir: Working directory forwarded to `run_job_from_blocks`.
        script_filename: Script filename forwarded to `run_job_from_blocks`.
        user_args: Optional extra command-line arguments.
        watch_poll_interval: Seconds between scheduler status polls.
        timeout_seconds: Optional maximum wait time for the job.
        metrics_artifact_key: Prefect artifact key used for job metrics.

    Returns:
        The result returned by `run_job_from_blocks`.

    Raises:
        ValueError: If the HPC profile block's ``hpc_target`` is not
            ``"miyabi"``.
    """
    import inspect

    # ``load`` may hand back the block itself or an awaitable producing it;
    # awaiting only when needed accepts both forms.
    hpc_block = HPCProfileBlock.load(hpc_profile_block_name)
    if inspect.isawaitable(hpc_block):
        hpc_block = await hpc_block

    if hpc_block.hpc_target != "miyabi":
        raise ValueError(
            f"run_miyabi_job_from_blocks requires hpc_target='miyabi', "
            f"got '{hpc_block.hpc_target}'."
        )

    return await run_job_from_blocks(
        command_block_name=command_block_name,
        execution_profile_block_name=execution_profile_block_name,
        hpc_profile_block_name=hpc_profile_block_name,
        work_dir=work_dir,
        script_filename=script_filename,
        user_args=user_args,
        watch_poll_interval=watch_poll_interval,
        timeout_seconds=timeout_seconds,
        metrics_artifact_key=metrics_artifact_key,
    )
|