idmtools-platform-slurm 0.0.0.dev0__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported public registry, and is provided for informational purposes only; it reflects the changes between the two versions as they appear in that registry.
- dockerized_slurm/Dockerfile +107 -0
- dockerized_slurm/README.md +17 -0
- dockerized_slurm/docker-compose.yml +89 -0
- dockerized_slurm/docker-entrypoint.sh +64 -0
- dockerized_slurm/id_rsa +27 -0
- dockerized_slurm/id_rsa.pub +1 -0
- dockerized_slurm/register_cluster.sh +12 -0
- dockerized_slurm/slurm.conf +94 -0
- dockerized_slurm/slurmdbd.conf +37 -0
- idmtools_platform_slurm/__init__.py +12 -8
- idmtools_platform_slurm/assets/__init__.py +157 -0
- idmtools_platform_slurm/assets/_run.sh.jinja2 +44 -0
- idmtools_platform_slurm/assets/batch.sh.jinja2 +54 -0
- idmtools_platform_slurm/assets/run_simulation.sh +23 -0
- idmtools_platform_slurm/assets/sbatch.sh.jinja2 +77 -0
- idmtools_platform_slurm/cli/__init__.py +4 -0
- idmtools_platform_slurm/cli/slurm.py +151 -0
- idmtools_platform_slurm/platform_operations/__init__.py +0 -0
- idmtools_platform_slurm/platform_operations/asset_collection_operations.py +25 -0
- idmtools_platform_slurm/platform_operations/experiment_operations.py +107 -0
- idmtools_platform_slurm/platform_operations/json_metadata_operations.py +17 -0
- idmtools_platform_slurm/platform_operations/simulation_operations.py +46 -0
- idmtools_platform_slurm/platform_operations/suite_operations.py +38 -0
- idmtools_platform_slurm/platform_operations/utils.py +45 -0
- idmtools_platform_slurm/plugin_info.py +75 -0
- idmtools_platform_slurm/slurm_operations/__init__.py +5 -0
- idmtools_platform_slurm/slurm_operations/slurm_operations.py +58 -0
- idmtools_platform_slurm/slurm_platform.py +207 -0
- idmtools_platform_slurm/utils/__init__.py +4 -0
- idmtools_platform_slurm/utils/slurm_job/__init__.py +90 -0
- idmtools_platform_slurm/utils/slurm_job/script_sbatch.sh.jinja2 +78 -0
- idmtools_platform_slurm/utils/slurm_job/slurm_job.py +214 -0
- idmtools_platform_slurm/utils/status_report/__init__.py +5 -0
- idmtools_platform_slurm/utils/status_report/status_report.py +242 -0
- idmtools_platform_slurm/utils/status_report/utils.py +108 -0
- idmtools_platform_slurm-0.0.2.dist-info/METADATA +185 -0
- idmtools_platform_slurm-0.0.2.dist-info/RECORD +43 -0
- idmtools_platform_slurm-0.0.2.dist-info/entry_points.txt +5 -0
- idmtools_platform_slurm-0.0.2.dist-info/licenses/LICENSE.TXT +3 -0
- {idmtools_platform_slurm-0.0.0.dev0.dist-info → idmtools_platform_slurm-0.0.2.dist-info}/top_level.txt +2 -0
- tests/input/hello.sh +2 -0
- tests/input/script.py +49 -0
- idmtools_platform_slurm-0.0.0.dev0.dist-info/METADATA +0 -41
- idmtools_platform_slurm-0.0.0.dev0.dist-info/RECORD +0 -5
- {idmtools_platform_slurm-0.0.0.dev0.dist-info → idmtools_platform_slurm-0.0.2.dist-info}/WHEEL +0 -0
idmtools_platform_slurm/slurm_operations/slurm_operations.py (new file)

```diff
@@ -0,0 +1,58 @@
+"""
+Here we implement the Slurm Operations.
+
+Copyright 2025, Gates Foundation. All rights reserved.
+"""
+import subprocess
+from dataclasses import dataclass, field
+from logging import getLogger
+from typing import Union, List, Any, Type
+
+from idmtools.entities.experiment import Experiment
+from idmtools.entities.simulation import Simulation
+from idmtools_platform_file.file_operations.file_operations import FileOperations
+from idmtools_platform_slurm.assets import generate_batch, generate_script, generate_simulation_script
+
+
+logger = getLogger(__name__)
+
+
+@dataclass
+class SlurmOperations(FileOperations):
+
+    platform: 'SlurmPlatform'  # noqa: F821
+    platform_type: Type = field(default=None)
+
+    def create_batch_file(self, item: Union[Experiment, Simulation], max_running_jobs: int = None, retries: int = None,
+                          array_batch_size: int = None, dependency: bool = True, **kwargs) -> None:
+        """
+        Create batch file.
+        Args:
+            item: the item to build batch file for
+            kwargs: keyword arguments used to expand functionality.
+        Returns:
+            None
+        """
+        if isinstance(item, Experiment):
+            generate_batch(self.platform, item, max_running_jobs, array_batch_size, dependency)
+            generate_script(self.platform, item, max_running_jobs)
+        elif isinstance(item, Simulation):
+            generate_simulation_script(self.platform, item, retries)
+        else:
+            raise NotImplementedError(f"{item.__class__.__name__} is not supported for batch creation.")
+
+    @staticmethod
+    def cancel_job(job_ids: Union[str, List[str]]) -> Any:
+        """
+        Cancel Slurm jobs for the given job ids.
+        Args:
+            job_ids: slurm job ids
+        Returns:
+            Any
+        """
+        if isinstance(job_ids, str):
+            job_ids = [job_ids]
+        logger.debug(f"Submit slurm cancel job: {job_ids}")
+        result = subprocess.run(['scancel', *job_ids], stdout=subprocess.PIPE)
+        stdout = "Success" if result.returncode == 0 else 'Error'
+        return stdout
```
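The new `cancel_job` helper is a thin wrapper around `scancel`. A minimal usage sketch, assuming a machine where Slurm's `scancel` is on the PATH; the job ids below are hypothetical:

```python
from idmtools_platform_slurm.slurm_operations.slurm_operations import SlurmOperations

# cancel_job is a @staticmethod, so no platform instance is needed;
# a single id string is normalized to a one-element list before scancel runs.
print(SlurmOperations.cancel_job('12345'))             # -> "Success" or "Error"
print(SlurmOperations.cancel_job(['12345', '12346']))  # several jobs at once
```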
idmtools_platform_slurm/slurm_platform.py (new file)

```diff
@@ -0,0 +1,207 @@
+"""
+Here we implement the SlurmPlatform object.
+
+Copyright 2025, Gates Foundation. All rights reserved.
+"""
+import subprocess
+from typing import Optional, Any, Dict, List, Union, Literal
+from dataclasses import dataclass, field, fields
+from logging import getLogger
+from idmtools.core import ItemType
+from idmtools.entities.experiment import Experiment
+from idmtools.entities.simulation import Simulation
+from idmtools_platform_file.file_platform import FilePlatform
+from idmtools_platform_slurm.platform_operations.json_metadata_operations import SlurmJSONMetadataOperations
+from idmtools_platform_slurm.platform_operations.asset_collection_operations import \
+    SlurmPlatformAssetCollectionOperations
+from idmtools_platform_slurm.platform_operations.experiment_operations import SlurmPlatformExperimentOperations
+from idmtools_platform_slurm.platform_operations.simulation_operations import SlurmPlatformSimulationOperations
+from idmtools_platform_slurm.platform_operations.suite_operations import SlurmPlatformSuiteOperations
+from idmtools_platform_slurm.platform_operations.utils import get_max_array_size
+from idmtools_platform_slurm.slurm_operations.slurm_operations import SlurmOperations
+
+from idmtools_platform_slurm.utils.slurm_job import run_script_on_slurm, slurm_installed
+
+logger = getLogger(__name__)
+
+op_defaults = dict(default=None, compare=False, metadata={"pickle_ignore": True})
+CONFIG_PARAMETERS = ['ntasks', 'partition', 'nodes', 'mail_type', 'mail_user', 'ntasks_per_core', 'cpus_per_task',
+                     'mem_per_cpu', 'time', 'constraint', 'account', 'mem', 'exclusive', 'requeue', 'sbatch_custom',
+                     'max_running_jobs', 'array_batch_size', 'mpi_type']
+
+
+@dataclass(repr=False)
+class SlurmPlatform(FilePlatform):
+    # region: Resources request
+
+    # choose e-mail type
+    mail_type: Optional[str] = field(default=None, metadata=dict(sbatch=True, help="e-mail type"))
+
+    # send e-mail notification
+    # TODO Add Validations here from https://slurm.schedmd.com/sbatch.html#OPT_mail-type
+    mail_user: Optional[str] = field(default=None, metadata=dict(sbatch=True, help="e-mail address"))
+
+    # How many nodes to be used
+    nodes: Optional[int] = field(default=None, metadata=dict(sbatch=True, help="Number of nodes"))
+
+    # Num of tasks
+    ntasks: Optional[int] = field(default=None, metadata=dict(sbatch=True, help="Number of tasks"))
+
+    # CPU # per task
+    cpus_per_task: Optional[int] = field(default=None, metadata=dict(sbatch=True, help="Number of CPUs per task"))
+
+    # Task # per core
+    ntasks_per_core: Optional[int] = field(default=None, metadata=dict(sbatch=True, help="Number of tasks per core"))
+
+    # Maximum number of running jobs (per experiment)
+    max_running_jobs: Optional[int] = field(default=100, metadata=dict(sbatch=True, help="Maximum of running jobs"))
+
+    # Memory per core: MB of memory
+    mem: Optional[int] = field(default=None, metadata=dict(sbatch=True, help="Memory per core"))
+
+    # Memory per CPU: MB of memory
+    mem_per_cpu: Optional[int] = field(default=None, metadata=dict(sbatch=True, help="Memory per CPU"))
+
+    # Which partition to use
+    partition: Optional[str] = field(default=None, metadata=dict(sbatch=True, help="Partition"))
+
+    # Specify compute node
+    constraint: Optional[str] = field(default=None, metadata=dict(sbatch=True, help="Constraint"))
+
+    # Limit time on this job hrs:min:sec
+    time: str = field(default=None, metadata=dict(sbatch=True, help="Limit time on this job"))
+
+    # if set, jobs will run with the specified account in Slurm
+    account: str = field(default=None, metadata=dict(sbatch=True, help="Account"))
+
+    # Allocated nodes cannot be shared with other jobs/users
+    exclusive: bool = field(default=False, metadata=dict(sbatch=True, help="Exclusive"))
+
+    # Specifies that the batch job should be eligible for requeuing
+    requeue: bool = field(default=True, metadata=dict(sbatch=True, help="Requeue"))
+
+    # Default retries for jobs
+    retries: int = field(default=1, metadata=dict(sbatch=False, help="Default retries for jobs"))
+
+    # Pass custom commands to sbatch generation script
+    sbatch_custom: Optional[str] = field(default=None, metadata=dict(sbatch=True, help="Custom sbatch commands"))
+
+    # modules to be loaded
+    modules: list = field(default_factory=list, metadata=dict(sbatch=True, help="Modules to be loaded"))
+
+    # Specifies default setting of whether slurm should fail if item directory already exists
+    dir_exist_ok: bool = field(default=False, repr=False, compare=False, metadata=dict(help="Directory exist ok"))
+
+    # Set array max size for Slurm job
+    array_batch_size: int = field(default=None, metadata=dict(sbatch=False, help="Array batch size"))
+
+    # determine if run script as Slurm job
+    run_on_slurm: bool = field(default=False, repr=False, compare=False, metadata=dict(help="Run script as Slurm job"))
+
+    # mpi type: default to pmi2 for older versions of MPICH or OpenMPI or an MPI library that explicitly requires PMI2
+    mpi_type: Optional[Literal['pmi2', 'pmix', 'mpirun']] = field(default="pmi2", metadata=dict(sbatch=True,
+        help="MPI types ('pmi2', 'pmix' for slurm MPI, 'mpirun' for independently MPI)"))
+
+    # endregion
+
+    _suites: SlurmPlatformSuiteOperations = field(**op_defaults, repr=False, init=False)
+    _experiments: SlurmPlatformExperimentOperations = field(**op_defaults, repr=False, init=False)
+    _simulations: SlurmPlatformSimulationOperations = field(**op_defaults, repr=False, init=False)
+    _assets: SlurmPlatformAssetCollectionOperations = field(**op_defaults, repr=False, init=False)
+    _metas: SlurmJSONMetadataOperations = field(**op_defaults, repr=False, init=False)
+    _op_client: SlurmOperations = field(**op_defaults, repr=False, init=False)
+
+    def __post_init__(self):
+        super().__post_init__()
+        self.__init_interfaces()
+
+        # check max_array_size from slurm configuration
+        self._max_array_size = None
+        if slurm_installed():
+            self._max_array_size = get_max_array_size()
+
+        if self.mpi_type.lower() not in {'pmi2', 'pmix', 'mpirun'}:
+            raise ValueError(f"Invalid mpi_type '{self.mpi_type}'. Allowed values are 'pmi2', 'pmix', or 'mpirun'.")
+
+        # check if run script as a slurm job
+        r = run_script_on_slurm(self, run_on_slurm=self.run_on_slurm)
+        if r:
+            exit(0)  # finish the current workflow
+
+    def __init_interfaces(self):
+        self._op_client = SlurmOperations(platform=self)
+        self._suites = SlurmPlatformSuiteOperations(platform=self)
+        self._experiments = SlurmPlatformExperimentOperations(platform=self)
+        self._simulations = SlurmPlatformSimulationOperations(platform=self)
+        self._assets = SlurmPlatformAssetCollectionOperations(platform=self)
+        self._metas = SlurmJSONMetadataOperations(platform=self)
+
+    @property
+    def slurm_fields(self):
+        """
+        Get the fields that have sbatch metadata.
+        Returns:
+            Set of fields that have sbatch metadata
+        """
+        return set(f.name for f in fields(self) if "sbatch" in f.metadata and f.metadata["sbatch"])
+
+    def get_slurm_configs(self, **kwargs) -> Dict[str, Any]:
+        """
+        Identify the Slurm config parameters from the fields.
+        Args:
+            kwargs: additional parameters
+        Returns:
+            slurm config dict
+        """
+        config_dict = {k: getattr(self, k) for k in self.slurm_fields}
+        config_dict.update(kwargs)
+        return config_dict
+
+    def create_batch_file(self, item: Union[Experiment, Simulation], **kwargs) -> None:
+        """
+        Create batch file.
+        Args:
+            item: the item to build batch file for
+            kwargs: keyword arguments used to expand functionality.
+        Returns:
+            None
+        """
+        self._op_client.create_batch_file(item, **kwargs)
+
+    def get_job_id(self, item_id: str, item_type: ItemType) -> List:
+        """
+        Retrieve the job id for an item that has been run.
+        Args:
+            item_id: id of experiment/simulation
+            item_type: ItemType (Experiment or Simulation)
+        Returns:
+            List of slurm job ids
+        """
+        if item_type not in (ItemType.EXPERIMENT, ItemType.SIMULATION):
+            raise RuntimeError(f"Not support item type: {item_type}")
+
+        item_dir = self.get_directory_by_id(item_id, item_type)
+        job_id_file = item_dir.joinpath('job_id.txt')
+        if not job_id_file.exists():
+            logger.debug(f"{job_id_file} not found.")
+            return None
+
+        job_id = open(job_id_file).read().strip()
+        return job_id.split('\n')
+
+    def submit_job(self, item: Union[Experiment, Simulation], **kwargs) -> None:
+        """
+        Submit a Slurm job.
+        Args:
+            item: idmtools Experiment or Simulation
+            kwargs: keyword arguments used to expand functionality
+        Returns:
+            None
+        """
+        if isinstance(item, Experiment):
+            working_directory = self.get_directory(item)
+            subprocess.run(['bash', 'batch.sh'], stdout=subprocess.PIPE, cwd=str(working_directory))
+        elif isinstance(item, Simulation):
+            pass
+        else:
+            raise NotImplementedError(f"Submit job is not implemented on SlurmPlatform.")
```
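As a point of reference, the sbatch-related fields above can be set directly on the dataclass, and `get_slurm_configs()` collects every field whose metadata carries `sbatch=True`. A minimal sketch, assuming `job_directory` is the required base-directory field inherited from `FilePlatform`; the path and resource values here are hypothetical:

```python
from idmtools_platform_slurm.slurm_platform import SlurmPlatform

platform = SlurmPlatform(job_directory='/home/user/example_dir',
                         partition='cpu',
                         time='02:00:00',
                         mem_per_cpu=2048,
                         max_running_jobs=50)
# Dict of every sbatch=True field (partition, time, mem_per_cpu, ...):
print(platform.get_slurm_configs())
```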
idmtools_platform_slurm/utils/slurm_job/__init__.py (new file)

```diff
@@ -0,0 +1,90 @@
+"""
+idmtools SlurmPlatform SlurmJob utils.
+
+Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
+"""
+import os
+import sys
+import subprocess
+from pathlib import Path
+from typing import TYPE_CHECKING, NoReturn
+
+if TYPE_CHECKING:  # pragma: no cover
+    from idmtools_platform_slurm.slurm_platform import SlurmPlatform
+
+INDICATOR_VARIABLE = 'RUN_ON_SLURM'
+
+
+def create_slurm_indicator() -> NoReturn:
+    """
+    Add the environment variable.
+    Returns:
+        None
+    """
+    os.environ[INDICATOR_VARIABLE] = '1'
+
+
+def remove_slurm_indicator() -> NoReturn:
+    """
+    Remove the environment variable.
+    Returns:
+        None
+    """
+    os.environ.pop(INDICATOR_VARIABLE, None)
+
+
+def check_slurm_indicator() -> bool:
+    """
+    Check if the environment variable is set to '1'.
+    Returns:
+        True/False
+    """
+    return os.environ.get(INDICATOR_VARIABLE, '0') == '1'
+
+
+def slurm_installed() -> bool:
+    """
+    Check if a Slurm system is installed or available.
+    Returns:
+        True/False
+    """
+    try:
+        subprocess.check_output(["sinfo", "-V"])
+        return True
+    except:
+        return False
+
+
+def run_script_on_slurm(platform: 'SlurmPlatform', run_on_slurm: bool = False,
+                        cleanup: bool = True) -> bool:
+    """
+    This is a utility tool which wraps the SlurmJob creation and run.
+    Args:
+        platform: idmtools Platform
+        run_on_slurm: True/False
+        cleanup: True/False to delete the generated slurm job related files
+    Returns:
+        True/False
+    """
+    from idmtools_platform_slurm.utils.slurm_job.slurm_job import SlurmJob
+    from idmtools_platform_slurm.slurm_platform import SlurmPlatform
+
+    # Double-check that this is a SlurmPlatform
+    if not isinstance(platform, SlurmPlatform):
+        return False
+
+    if run_on_slurm and not check_slurm_indicator():
+        # Locate the script
+        # Wrong path due to emod_malaria bug:
+        # script = os.path.abspath(sys.argv[0])
+        # Workaround: manually build full path
+        script = Path(sys.path[0]).joinpath(Path(sys.argv[0]).name)
+        # Collect script input parameters
+        script_params = sys.argv[1:]
+        # Run script as Slurm job
+        sj = SlurmJob(script_path=script, platform=platform, script_params=script_params, cleanup=cleanup)
+        # Kick off Slurm job
+        sj.run()
+        return True
+    else:
+        return False
```
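Taken together, `INDICATOR_VARIABLE` and `run_script_on_slurm` implement a re-entry guard: the first (login-node) run submits the current script as a Slurm job and returns True so the caller can exit, while the re-launched copy sees `RUN_ON_SLURM=1` in its inherited environment and returns False so the workflow proceeds. A small sketch of just the indicator mechanics:

```python
import os
from idmtools_platform_slurm.utils.slurm_job import (
    INDICATOR_VARIABLE, check_slurm_indicator, create_slurm_indicator)

# First pass: indicator unset, so run_script_on_slurm() would wrap the
# script in a SlurmJob and submit it via sbatch.
assert not check_slurm_indicator()

# SlurmJob.run() sets the indicator before submitting; the batch job
# inherits it, so the same check falls through on the compute node.
create_slurm_indicator()
assert os.environ[INDICATOR_VARIABLE] == '1'
assert check_slurm_indicator()
```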
idmtools_platform_slurm/utils/slurm_job/script_sbatch.sh.jinja2 (new file)

```diff
@@ -0,0 +1,78 @@
+#!/bin/bash
+{% if ntasks is defined and ntasks is not none %}
+#SBATCH --ntasks={{ntasks}}
+{% endif %}
+{% if partition is defined and partition is not none %}
+#SBATCH --partition={{partition}}
+{% endif %}
+{% if nodes is defined and nodes is not none %}
+#SBATCH --nodes={{nodes}}
+{% endif %}
+{% if mail_type is defined and mail_type is not none %}
+#SBATCH --mail-type={{mail_type}}
+{% endif %}
+{% if mail_user is defined and mail_user is not none %}
+#SBATCH --mail-user={{mail_user}}
+{% endif %}
+{% if constraint is defined and constraint is not none %}
+#SBATCH --constraint={{constraint}}
+{% endif %}
+{% if ntasks_per_core is defined and ntasks_per_core is not none %}
+#SBATCH --ntasks-per-core={{ntasks_per_core}}
+{% endif %}
+{% if cpus_per_task is defined and cpus_per_task is not none %}
+#SBATCH --cpus-per-task={{cpus_per_task}}
+{% endif %}
+{% if mem_per_cpu is defined and mem_per_cpu is not none %}
+#SBATCH --mem-per-cpu={{mem_per_cpu}}
+{% endif %}
+{% if time is defined and time is not none %}
+#SBATCH --time={{time}}
+{% endif %}
+{% if account is defined and account is not none %}
+#SBATCH --account={{account}}
+{% endif %}
+{% if exclusive is defined and exclusive is not none and exclusive %}
+#SBATCH --exclusive
+{% endif %}
+{% if mem is defined and mem is not none %}
+#SBATCH --mem={{mem}}
+{% endif %}
+{% if requeue is defined and requeue is not none and requeue %}
+#SBATCH --requeue
+{% endif %}
+{% if sbatch_custom is defined and sbatch_custom is not none %}
+#SBATCH {{sbatch_custom}}
+{% endif %}
+#SBATCH --open-mode=append
+#SBATCH --output=stdout.txt
+#SBATCH --error=stderr.txt
+
+{% if modules is defined and modules is not none and modules|length > 0 %}
+{% for m in modules %}
+module load {{ m }}
+{% endfor %}
+{% endif %}
+
+# define the handler function
+term_handler()
+{
+    # do whatever cleanup you want here
+    echo "-1" > job_status.txt
+    exit -1
+}
+
+# associate the function "term_handler" with the TERM signal
+trap 'term_handler' TERM
+
+echo $SLURM_JOB_ID > job_id.txt
+
+echo "100" > job_status.txt
+{{ command }}
+RESULT=$?
+if [ $RESULT -eq 0 ]; then
+    echo "0" > job_status.txt
+    exit $RESULT
+fi
+echo "-1" > job_status.txt
+exit $RESULT
```
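To see what the template produces, it can be rendered standalone the same way `generate_script` in `slurm_job.py` does, via `jinja2.Template`. A small sketch, assuming the template file sits in the current directory; the partition, time, and command values are hypothetical:

```python
from jinja2 import Template

with open('script_sbatch.sh.jinja2') as fh:
    template = Template(fh.read())

# Variables left undefined fail the `is defined` guard, so their #SBATCH
# lines are simply dropped; only directives for values passed here appear.
print(template.render(partition='cpu', time='02:00:00', requeue=True,
                      command='python3 my_script.py'))
```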
idmtools_platform_slurm/utils/slurm_job/slurm_job.py (new file)

```diff
@@ -0,0 +1,214 @@
+"""
+This is a SlurmPlatform utility.
+
+Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
+"""
+import os
+import subprocess
+import time
+from os import PathLike
+from pathlib import Path
+from dataclasses import dataclass, field
+from typing import NoReturn, Union, List, TYPE_CHECKING
+from idmtools.core import NoPlatformException
+from jinja2 import Template
+from logging import getLogger
+from idmtools_platform_slurm.utils.slurm_job import create_slurm_indicator, slurm_installed
+from typing import Tuple
+
+
+user_logger = getLogger('user')
+
+if TYPE_CHECKING:
+    from idmtools_platform_slurm.slurm_platform import SlurmPlatform
+
+DEFAULT_TEMPLATE_FILE = "script_sbatch.sh.jinja2"
+MSG = """Note: any output information from your script is stored in file stdout.txt under the script folder. For example, if you are running a script under current directory which kicks out another Slurm job, then the second Slurm job id is stored in stdout.txt under the current directory."""
+
+TEMP_FILES = ['sbatch.sh', 'job_id.txt', 'job_status.txt', 'stdout.txt', 'stderr.txt']
+
+
+def generate_script(platform: 'SlurmPlatform', command: str,
+                    template: Union[Path, str] = DEFAULT_TEMPLATE_FILE, batch_dir: str = None, **kwargs) -> None:
+    """
+    Generate batch file sbatch.sh.
+    Args:
+        platform: Slurm Platform
+        command: execution command
+        template: template to be used to build batch file
+        kwargs: keyword arguments used to expand functionality
+    Returns:
+        None
+    """
+    from idmtools_platform_slurm.slurm_platform import CONFIG_PARAMETERS
+    template_vars = dict(
+        platform=platform,
+        command=command
+    )
+    # Populate from our platform config vars
+    for p in CONFIG_PARAMETERS:
+        if getattr(platform, p) is not None:
+            template_vars[p] = getattr(platform, p)
+
+    template_vars.update(kwargs)
+
+    if platform.modules:
+        template_vars['modules'] = platform.modules
+
+    with open(Path(__file__).parent.joinpath(template)) as tin:
+        t = Template(tin.read())
+
+    # Write our file
+    if batch_dir is None:
+        output_target = Path.cwd().joinpath("sbatch.sh")
+    else:
+        output_target = Path(batch_dir).joinpath("sbatch.sh")
+
+    with open(output_target, "w") as tout:
+        tout.write(t.render(template_vars))
+
+    # Make executable
+    platform.update_script_mode(output_target)
+
+
+def check_file_and_job_id(file_path, timeout: int = 3600, interval: int = 10) -> Tuple[bool, str]:
+    """
+    Wait for a file to be created and check if a slurm job id exists in the file.
+
+    Args:
+        file_path (str): Path to the file to check.
+        timeout (int): Maximum time (in seconds) to wait for the file and line.
+        interval (int): Time interval (in seconds) between checks.
+
+    Returns:
+        Tuple[bool, str]: A tuple containing:
+            - bool: True if the file exists and contains a valid job ID, False otherwise.
+            - str: The job ID if found, otherwise an empty string.
+    """
+    start_time = time.time()
+
+    while time.time() - start_time < timeout:
+        if os.path.exists(file_path):
+            user_logger.info(f"File {file_path} found.")
+            with open(file_path, 'r') as file:
+                for line in file:
+                    if 'Slurm Job Ids (' in line:
+                        slurm_job_id = file.readline().strip()
+                        if slurm_job_id.isdigit():
+                            return True, slurm_job_id
+            user_logger.warning(f"Not found slurm job id in {file_path}.")
+        else:
+            user_logger.info(f"File {file_path} not found yet. Waiting...")
+
+        time.sleep(interval)
+
+    user_logger.error(f"Timeout reached. File {file_path} or slurm job id not found.")
+    return False, None
+
+
+
+@dataclass(repr=False)
+class SlurmJob:
+    script_path: PathLike = field(init=True)
+    platform: 'SlurmPlatform' = field(default=None, init=True)
+    executable: str = field(default='python3', init=True)
+    script_params: List[str] = field(default=None, init=True)
+    cleanup: bool = field(default=True, init=True)
+
+    def __post_init__(self):
+        if self.script_path is None:
+            raise RuntimeError("script_path is missing!")
+        # load platform from context or from passed in value
+        self.platform = self.__check_for_platform_from_context(self.platform)
+        self.working_directory = Path(self.script_path).parent
+        self.script_params = self.script_params if self.script_params is not None and len(
+            self.script_params) > 0 else None
+        self.slurm_job_id = None
+
+    def initialization(self):
+        # make str list so that we may join them together
+        if self.script_params is not None:
+            self.script_params = [str(i) for i in self.script_params]
+
+        if self.script_params is not None:
+            command = f"{self.executable} {Path(self.script_path).name} {' '.join(self.script_params)}"
+        else:
+            command = f"{self.executable} {Path(self.script_path).name}"
+
+        generate_script(self.platform, command, batch_dir=self.working_directory)
+
+
+    def run(self, dry_run: bool = False, **kwargs) -> NoReturn:
+        if self.cleanup:
+            self.clean(self.working_directory)
+
+        self.initialization()
+
+        if not dry_run:
+            if not slurm_installed():
+                user_logger.warning('Slurm is not installed/available!')
+                exit(-1)
+
+            user_logger.info('Script is running as a slurm job!\n')
+            create_slurm_indicator()
+            result = subprocess.run(['sbatch', '--parsable', 'sbatch.sh'], stdout=subprocess.PIPE,
+                                    cwd=str(self.working_directory))
+            self.slurm_job_id = result.stdout.decode('utf-8').strip().split(';')[0]
+
+            user_logger.info(f"{'job_id: '.ljust(20)} {self.slurm_job_id}")
+            user_logger.info(f"{'job_directory: '.ljust(20)} {self.platform.job_directory}\n")
+
+            # Check if stdout.txt is created and job id exists in there
+            stdout_file = os.path.join(self.working_directory, 'stdout.txt')
+            is_exists, slurm_job_id = check_file_and_job_id(stdout_file)
+            if is_exists and slurm_job_id is not None:
+                # print stdout.txt on console
+                with open(stdout_file, "r") as f:
+                    read_data = f.read()
+                    user_logger.info(read_data)
+                user_logger.info("To check job status, run command:")
+                user_logger.info(f"scontrol show job {slurm_job_id}")
+                user_logger.info(f"sacct -j {slurm_job_id} --format=JobID,State,Start,End")
+            else:
+                user_logger.warning("Check status.txt for job details.")
+                user_logger.warning(MSG)
+        else:
+            user_logger.warning('Script is running with dry_run = True')
+
+    def __check_for_platform_from_context(self, platform) -> 'IPlatform':  # noqa: F821
+        """
+        Try to determine platform of current object from self or current platform.
+
+        Args:
+            platform: Passed in platform object
+
+        Raises:
+            NoPlatformException: when no platform is on current context
+        Returns:
+            Platform object
+        """
+        if self.platform is None:
+            # check context for current platform
+            if platform is None:
+                from idmtools.core.context import CURRENT_PLATFORM
+                if CURRENT_PLATFORM is None:
+                    raise NoPlatformException("No Platform defined on object, in current context, or passed to run")
+                platform = CURRENT_PLATFORM
+            self.platform = platform
+        return self.platform
+
+    def clean(self, cwd: str = os.getcwd()):
+        """
+        Delete generated slurm job related files.
+        Args:
+            cwd: the directory containing the files
+        Returns:
+            None
+        """
+        for file_path in TEMP_FILES:
+            f = os.path.join(cwd, file_path)
+            if os.path.exists(f):
+                try:
+                    os.remove(f)
+                except:
+                    pass
```