idmtools-platform-slurm 0.0.0.dev0__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. dockerized_slurm/Dockerfile +107 -0
  2. dockerized_slurm/README.md +17 -0
  3. dockerized_slurm/docker-compose.yml +89 -0
  4. dockerized_slurm/docker-entrypoint.sh +64 -0
  5. dockerized_slurm/id_rsa +27 -0
  6. dockerized_slurm/id_rsa.pub +1 -0
  7. dockerized_slurm/register_cluster.sh +12 -0
  8. dockerized_slurm/slurm.conf +94 -0
  9. dockerized_slurm/slurmdbd.conf +37 -0
  10. idmtools_platform_slurm/__init__.py +12 -8
  11. idmtools_platform_slurm/assets/__init__.py +157 -0
  12. idmtools_platform_slurm/assets/_run.sh.jinja2 +44 -0
  13. idmtools_platform_slurm/assets/batch.sh.jinja2 +54 -0
  14. idmtools_platform_slurm/assets/run_simulation.sh +23 -0
  15. idmtools_platform_slurm/assets/sbatch.sh.jinja2 +77 -0
  16. idmtools_platform_slurm/cli/__init__.py +4 -0
  17. idmtools_platform_slurm/cli/slurm.py +151 -0
  18. idmtools_platform_slurm/platform_operations/__init__.py +0 -0
  19. idmtools_platform_slurm/platform_operations/asset_collection_operations.py +25 -0
  20. idmtools_platform_slurm/platform_operations/experiment_operations.py +107 -0
  21. idmtools_platform_slurm/platform_operations/json_metadata_operations.py +17 -0
  22. idmtools_platform_slurm/platform_operations/simulation_operations.py +46 -0
  23. idmtools_platform_slurm/platform_operations/suite_operations.py +38 -0
  24. idmtools_platform_slurm/platform_operations/utils.py +45 -0
  25. idmtools_platform_slurm/plugin_info.py +75 -0
  26. idmtools_platform_slurm/slurm_operations/__init__.py +5 -0
  27. idmtools_platform_slurm/slurm_operations/slurm_operations.py +58 -0
  28. idmtools_platform_slurm/slurm_platform.py +207 -0
  29. idmtools_platform_slurm/utils/__init__.py +4 -0
  30. idmtools_platform_slurm/utils/slurm_job/__init__.py +90 -0
  31. idmtools_platform_slurm/utils/slurm_job/script_sbatch.sh.jinja2 +78 -0
  32. idmtools_platform_slurm/utils/slurm_job/slurm_job.py +214 -0
  33. idmtools_platform_slurm/utils/status_report/__init__.py +5 -0
  34. idmtools_platform_slurm/utils/status_report/status_report.py +242 -0
  35. idmtools_platform_slurm/utils/status_report/utils.py +108 -0
  36. idmtools_platform_slurm-0.0.2.dist-info/METADATA +185 -0
  37. idmtools_platform_slurm-0.0.2.dist-info/RECORD +43 -0
  38. idmtools_platform_slurm-0.0.2.dist-info/entry_points.txt +5 -0
  39. idmtools_platform_slurm-0.0.2.dist-info/licenses/LICENSE.TXT +3 -0
  40. {idmtools_platform_slurm-0.0.0.dev0.dist-info → idmtools_platform_slurm-0.0.2.dist-info}/top_level.txt +2 -0
  41. tests/input/hello.sh +2 -0
  42. tests/input/script.py +49 -0
  43. idmtools_platform_slurm-0.0.0.dev0.dist-info/METADATA +0 -41
  44. idmtools_platform_slurm-0.0.0.dev0.dist-info/RECORD +0 -5
  45. {idmtools_platform_slurm-0.0.0.dev0.dist-info → idmtools_platform_slurm-0.0.2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,58 @@
1
+ """
2
+ Here we implement the Slurm Operations.
3
+
4
+ Copyright 2025, Gates Foundation. All rights reserved.
5
+ """
6
+ import subprocess
7
+ from dataclasses import dataclass, field
8
+ from logging import getLogger
9
+ from typing import Union, List, Any, Type
10
+
11
+ from idmtools.entities.experiment import Experiment
12
+ from idmtools.entities.simulation import Simulation
13
+ from idmtools_platform_file.file_operations.file_operations import FileOperations
14
+ from idmtools_platform_slurm.assets import generate_batch, generate_script, generate_simulation_script
15
+
16
+
17
+ logger = getLogger(__name__)
18
+
19
+
20
@dataclass
class SlurmOperations(FileOperations):
    """Slurm-specific operations: batch-script generation and job cancellation."""

    platform: 'SlurmPlatform'  # noqa: F821
    platform_type: Type = field(default=None)

    def create_batch_file(self, item: Union[Experiment, Simulation], max_running_jobs: int = None, retries: int = None,
                          array_batch_size: int = None, dependency: bool = True, **kwargs) -> None:
        """
        Create batch file.
        Args:
            item: the item to build batch file for (Experiment or Simulation)
            max_running_jobs: maximum number of simultaneously running jobs
            retries: number of retries for a failed run (used for Simulation items)
            array_batch_size: maximum Slurm job array size (used for Experiment items)
            dependency: whether to submit with Slurm job dependency (used for Experiment items)
            kwargs: keyword arguments used to expand functionality.
        Returns:
            None
        Raises:
            NotImplementedError: if item is neither an Experiment nor a Simulation.
        """
        if isinstance(item, Experiment):
            generate_batch(self.platform, item, max_running_jobs, array_batch_size, dependency)
            generate_script(self.platform, item, max_running_jobs)
        elif isinstance(item, Simulation):
            generate_simulation_script(self.platform, item, retries)
        else:
            raise NotImplementedError(f"{item.__class__.__name__} is not supported for batch creation.")

    @staticmethod
    def cancel_job(job_ids: Union[str, List[str]]) -> Any:
        """
        Cancel Slurm job for given job ids.
        Args:
            job_ids: a single Slurm job id or a list of job ids
        Returns:
            "Success" when scancel exits with return code 0, otherwise "Error"
        """
        if isinstance(job_ids, str):
            job_ids = [job_ids]
        logger.debug(f"Submit slurm cancel job: {job_ids}")
        result = subprocess.run(['scancel', *job_ids], stdout=subprocess.PIPE)
        return "Success" if result.returncode == 0 else "Error"
@@ -0,0 +1,207 @@
1
+ """
2
+ Here we implement the SlurmPlatform object.
3
+
4
+ Copyright 2025, Gates Foundation. All rights reserved.
5
+ """
6
+ import subprocess
7
+ from typing import Optional, Any, Dict, List, Union, Literal
8
+ from dataclasses import dataclass, field, fields
9
+ from logging import getLogger
10
+ from idmtools.core import ItemType
11
+ from idmtools.entities.experiment import Experiment
12
+ from idmtools.entities.simulation import Simulation
13
+ from idmtools_platform_file.file_platform import FilePlatform
14
+ from idmtools_platform_slurm.platform_operations.json_metadata_operations import SlurmJSONMetadataOperations
15
+ from idmtools_platform_slurm.platform_operations.asset_collection_operations import \
16
+ SlurmPlatformAssetCollectionOperations
17
+ from idmtools_platform_slurm.platform_operations.experiment_operations import SlurmPlatformExperimentOperations
18
+ from idmtools_platform_slurm.platform_operations.simulation_operations import SlurmPlatformSimulationOperations
19
+ from idmtools_platform_slurm.platform_operations.suite_operations import SlurmPlatformSuiteOperations
20
+ from idmtools_platform_slurm.platform_operations.utils import get_max_array_size
21
+ from idmtools_platform_slurm.slurm_operations.slurm_operations import SlurmOperations
22
+
23
+ from idmtools_platform_slurm.utils.slurm_job import run_script_on_slurm, slurm_installed
24
+
25
+ logger = getLogger(__name__)
26
+
27
+ op_defaults = dict(default=None, compare=False, metadata={"pickle_ignore": True})
28
+ CONFIG_PARAMETERS = ['ntasks', 'partition', 'nodes', 'mail_type', 'mail_user', 'ntasks_per_core', 'cpus_per_task',
29
+ 'mem_per_cpu', 'time', 'constraint', 'account', 'mem', 'exclusive', 'requeue', 'sbatch_custom',
30
+ 'max_running_jobs', 'array_batch_size', 'mpi_type']
31
+
32
+
33
@dataclass(repr=False)
class SlurmPlatform(FilePlatform):
    """
    Slurm implementation of the idmtools platform, layered on top of FilePlatform.

    Fields whose metadata carries ``sbatch=True`` are forwarded into the
    generated sbatch scripts (see slurm_fields / get_slurm_configs).
    """
    # region: Resources request

    # choose e-mail type
    mail_type: Optional[str] = field(default=None, metadata=dict(sbatch=True, help="e-mail type"))

    # send e-mail notification
    # TODO Add Validations here from https://slurm.schedmd.com/sbatch.html#OPT_mail-type
    mail_user: Optional[str] = field(default=None, metadata=dict(sbatch=True, help="e-mail address"))

    # How many nodes to be used
    nodes: Optional[int] = field(default=None, metadata=dict(sbatch=True, help="Number of nodes"))

    # Num of tasks
    ntasks: Optional[int] = field(default=None, metadata=dict(sbatch=True, help="Number of tasks"))

    # CPU # per task
    cpus_per_task: Optional[int] = field(default=None, metadata=dict(sbatch=True, help="Number of CPUs per task"))

    # Task # per core
    ntasks_per_core: Optional[int] = field(default=None, metadata=dict(sbatch=True, help="Number of tasks per core"))

    # Maximum of running jobs (per experiment)
    max_running_jobs: Optional[int] = field(default=100, metadata=dict(sbatch=True, help="Maximum of running jobs"))

    # Memory per node: MB of memory (maps to sbatch --mem)
    mem: Optional[int] = field(default=None, metadata=dict(sbatch=True, help="Memory per core"))

    # Memory per CPU: MB of memory (maps to sbatch --mem-per-cpu)
    mem_per_cpu: Optional[int] = field(default=None, metadata=dict(sbatch=True, help="Memory per CPU"))

    # Which partition to use
    partition: Optional[str] = field(default=None, metadata=dict(sbatch=True, help="Partition"))

    # Specify compute node
    constraint: Optional[str] = field(default=None, metadata=dict(sbatch=True, help="Constraint"))

    # Limit time on this job hrs:min:sec
    time: Optional[str] = field(default=None, metadata=dict(sbatch=True, help="Limit time on this job"))

    # if set to something, jobs will run with the specified account in slurm
    account: Optional[str] = field(default=None, metadata=dict(sbatch=True, help="Account"))

    # Allocated nodes can not be shared with other jobs/users
    exclusive: bool = field(default=False, metadata=dict(sbatch=True, help="Exclusive"))

    # Specifies that the batch job should be eligible for requeuing
    requeue: bool = field(default=True, metadata=dict(sbatch=True, help="Requeue"))

    # Default retries for jobs
    retries: int = field(default=1, metadata=dict(sbatch=False, help="Default retries for jobs"))

    # Pass custom commands to sbatch generation script
    sbatch_custom: Optional[str] = field(default=None, metadata=dict(sbatch=True, help="Custom sbatch commands"))

    # modules to be loaded
    modules: list = field(default_factory=list, metadata=dict(sbatch=True, help="Modules to be loaded"))

    # Specifies default setting of whether slurm should fail if item directory already exists
    dir_exist_ok: bool = field(default=False, repr=False, compare=False, metadata=dict(help="Directory exist ok"))

    # Set array max size for Slurm job
    array_batch_size: Optional[int] = field(default=None, metadata=dict(sbatch=False, help="Array batch size"))

    # determine if run script as Slurm job
    run_on_slurm: bool = field(default=False, repr=False, compare=False, metadata=dict(help="Run script as Slurm job"))

    # mpi type: default to pmi2 for older versions of MPICH or OpenMPI or an MPI library that explicitly requires PMI2
    mpi_type: Optional[Literal['pmi2', 'pmix', 'mpirun']] = field(
        default="pmi2",
        metadata=dict(sbatch=True,
                      help="MPI types ('pmi2', 'pmix' for slurm MPI, 'mpirun' for independently MPI)"))

    # endregion

    # Operation interfaces, wired up in __post_init__ (pickle-ignored, excluded from comparison)
    _suites: SlurmPlatformSuiteOperations = field(**op_defaults, repr=False, init=False)
    _experiments: SlurmPlatformExperimentOperations = field(**op_defaults, repr=False, init=False)
    _simulations: SlurmPlatformSimulationOperations = field(**op_defaults, repr=False, init=False)
    _assets: SlurmPlatformAssetCollectionOperations = field(**op_defaults, repr=False, init=False)
    _metas: SlurmJSONMetadataOperations = field(**op_defaults, repr=False, init=False)
    _op_client: SlurmOperations = field(**op_defaults, repr=False, init=False)

    def __post_init__(self):
        """Wire operation interfaces, validate mpi_type, and optionally re-submit this script as a Slurm job."""
        super().__post_init__()
        self.__init_interfaces()

        # check max_array_size from slurm configuration
        self._max_array_size = None
        if slurm_installed():
            self._max_array_size = get_max_array_size()

        if self.mpi_type.lower() not in {'pmi2', 'pmix', 'mpirun'}:
            raise ValueError(f"Invalid mpi_type '{self.mpi_type}'. Allowed values are 'pmi2', 'pmix', or 'mpirun'.")

        # check if run script as a slurm job
        r = run_script_on_slurm(self, run_on_slurm=self.run_on_slurm)
        if r:
            exit(0)  # finish the current workflow

    def __init_interfaces(self):
        # Each operations object keeps a back-reference to this platform.
        self._op_client = SlurmOperations(platform=self)
        self._suites = SlurmPlatformSuiteOperations(platform=self)
        self._experiments = SlurmPlatformExperimentOperations(platform=self)
        self._simulations = SlurmPlatformSimulationOperations(platform=self)
        self._assets = SlurmPlatformAssetCollectionOperations(platform=self)
        self._metas = SlurmJSONMetadataOperations(platform=self)

    @property
    def slurm_fields(self):
        """
        Get list of fields that have metadata sbatch.
        Returns:
            Set of fields that have sbatch metadata
        """
        return set(f.name for f in fields(self) if "sbatch" in f.metadata and f.metadata["sbatch"])

    def get_slurm_configs(self, **kwargs) -> Dict[str, Any]:
        """
        Identify the Slurm config parameters from the fields.
        Args:
            kwargs: additional parameters
        Returns:
            slurm config dict
        """
        config_dict = {k: getattr(self, k) for k in self.slurm_fields}
        config_dict.update(kwargs)
        return config_dict

    def create_batch_file(self, item: Union[Experiment, Simulation], **kwargs) -> None:
        """
        Create batch file.
        Args:
            item: the item to build batch file for
            kwargs: keyword arguments used to expand functionality.
        Returns:
            None
        """
        self._op_client.create_batch_file(item, **kwargs)

    def get_job_id(self, item_id: str, item_type: ItemType) -> List:
        """
        Retrieve the job id for item that had been run.
        Args:
            item_id: id of experiment/simulation
            item_type: ItemType (Experiment or Simulation)
        Returns:
            List of slurm job ids, or None when job_id.txt does not exist
        Raises:
            RuntimeError: when item_type is not Experiment or Simulation
        """
        if item_type not in (ItemType.EXPERIMENT, ItemType.SIMULATION):
            raise RuntimeError(f"Not support item type: {item_type}")

        item_dir = self.get_directory_by_id(item_id, item_type)
        job_id_file = item_dir.joinpath('job_id.txt')
        if not job_id_file.exists():
            logger.debug(f"{job_id_file} not found.")
            return None

        # read_text() closes the handle; the original open(...).read() leaked the file object
        job_id = job_id_file.read_text().strip()
        return job_id.split('\n')

    def submit_job(self, item: Union[Experiment, Simulation], **kwargs) -> None:
        """
        Submit a Slurm job.
        Args:
            item: idmtools Experiment or Simulation
            kwargs: keyword arguments used to expand functionality
        Returns:
            None
        Raises:
            NotImplementedError: for any item other than Experiment/Simulation
        """
        if isinstance(item, Experiment):
            working_directory = self.get_directory(item)
            subprocess.run(['bash', 'batch.sh'], stdout=subprocess.PIPE, cwd=str(working_directory))
        elif isinstance(item, Simulation):
            # Simulations are submitted through their experiment's batch.sh; nothing to do here.
            pass
        else:
            raise NotImplementedError("Submit job is not implemented on SlurmPlatform.")
@@ -0,0 +1,4 @@
1
+ """idmtools slurm utils.
2
+
3
+ Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
4
+ """
@@ -0,0 +1,90 @@
1
"""
idmtools SlurmPlatform SlurmJob utils.

Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
"""
import os
import sys
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:  # pragma: no cover
    from idmtools_platform_slurm.slurm_platform import SlurmPlatform

# Environment variable marking that the current process already runs inside a
# Slurm job, so run_script_on_slurm does not re-submit the script recursively.
INDICATOR_VARIABLE = 'RUN_ON_SLURM'


def create_slurm_indicator() -> None:
    """
    Set the indicator environment variable to '1'.
    Returns:
        None
    """
    os.environ[INDICATOR_VARIABLE] = '1'


def remove_slurm_indicator() -> None:
    """
    Remove the indicator environment variable (no-op when unset).
    Returns:
        None
    """
    os.environ.pop(INDICATOR_VARIABLE, None)


def check_slurm_indicator() -> bool:
    """
    Check if the indicator environment variable is set to '1'.
    Returns:
        True/False
    """
    return os.environ.get(INDICATOR_VARIABLE, '0') == '1'


def slurm_installed() -> bool:
    """
    Check if Slurm system is installed or available.
    Returns:
        True/False
    """
    try:
        subprocess.check_output(["sinfo", "-V"])
        return True
    except (OSError, subprocess.CalledProcessError):
        # OSError covers a missing sinfo binary; CalledProcessError a failing one.
        # The original bare `except:` also swallowed KeyboardInterrupt/SystemExit.
        return False


def run_script_on_slurm(platform: 'SlurmPlatform', run_on_slurm: bool = False,
                        cleanup: bool = True) -> bool:
    """
    This is a utility tool which wraps the SlurmJob creation and run.
    Args:
        platform: idmtools Platform
        run_on_slurm: True/False
        cleanup: True/False to delete the generated slurm job related files
    Returns:
        True when the script was re-submitted as a Slurm job, else False
    """
    from idmtools_platform_slurm.utils.slurm_job.slurm_job import SlurmJob
    from idmtools_platform_slurm.slurm_platform import SlurmPlatform

    # Double make sure it is Slurm Platform
    if not isinstance(platform, SlurmPlatform):
        return False

    # Only submit when requested AND not already running inside a Slurm job
    if run_on_slurm and not check_slurm_indicator():
        # Locate the script
        # Wrong path due to emod_malaria bug:
        # script = os.path.abspath(sys.argv[0])
        # Workaround: manually build full path
        script = Path(sys.path[0]).joinpath(Path(sys.argv[0]).name)
        # Collect script input parameters
        script_params = sys.argv[1:]
        # Run script as Slurm job
        sj = SlurmJob(script_path=script, platform=platform, script_params=script_params, cleanup=cleanup)
        # Kick off Slurm job
        sj.run()
        return True
    else:
        return False
@@ -0,0 +1,78 @@
1
#!/bin/bash
{#
  Jinja2 template rendered by generate_script() into sbatch.sh.
  Each #SBATCH directive below is emitted only when the matching
  SlurmPlatform field is set (defined and not none).
#}
{% if ntasks is defined and ntasks is not none %}
#SBATCH --ntasks={{ntasks}}
{% endif %}
{% if partition is defined and partition is not none %}
#SBATCH --partition={{partition}}
{% endif %}
{% if nodes is defined and nodes is not none %}
#SBATCH --nodes={{nodes}}
{% endif %}
{% if mail_type is defined and mail_type is not none %}
#SBATCH --mail-type={{mail_type}}
{% endif %}
{% if mail_user is defined and mail_user is not none %}
#SBATCH --mail-user={{mail_user}}
{% endif %}
{% if constraint is defined and constraint is not none %}
#SBATCH --constraint={{constraint}}
{% endif %}
{% if ntasks_per_core is defined and ntasks_per_core is not none %}
#SBATCH --ntasks-per-core={{ntasks_per_core}}
{% endif %}
{% if cpus_per_task is defined and cpus_per_task is not none %}
#SBATCH --cpus-per-task={{cpus_per_task}}
{% endif %}
{% if mem_per_cpu is defined and mem_per_cpu is not none %}
#SBATCH --mem-per-cpu={{mem_per_cpu}}
{% endif %}
{% if time is defined and time is not none %}
#SBATCH --time={{time}}
{% endif %}
{% if account is defined and account is not none %}
#SBATCH --account={{account}}
{% endif %}
{% if exclusive is defined and exclusive is not none and exclusive %}
#SBATCH --exclusive
{% endif %}
{% if mem is defined and mem is not none %}
#SBATCH --mem={{mem}}
{% endif %}
{% if requeue is defined and requeue is not none and requeue %}
#SBATCH --requeue
{% endif %}
{% if sbatch_custom is defined and sbatch_custom is not none %}
#SBATCH {{sbatch_custom}}
{% endif %}
#SBATCH --open-mode=append
#SBATCH --output=stdout.txt
#SBATCH --error=stderr.txt

{% if modules is defined and modules is not none and modules|length > 0 %}
{% for m in modules %}
module load {{ m }}
{% endfor %}
{% endif %}

# define the handler function
term_handler()
{
    # do whatever cleanup you want here
    echo "-1" > job_status.txt
    exit -1
}

# associate the function "term_handler" with the TERM signal
trap 'term_handler' TERM

# record this job's id so callers can find it
echo $SLURM_JOB_ID > job_id.txt

# job_status.txt protocol: "100" = running, "0" = success, "-1" = failure
echo "100" > job_status.txt
{{ command }}
RESULT=$?
if [ $RESULT -eq 0 ]; then
    echo "0" > job_status.txt
    exit $RESULT
fi
echo "-1" > job_status.txt
exit $RESULT
@@ -0,0 +1,214 @@
1
+ """
2
+ This is a SlurmPlatform utility.
3
+
4
+ Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
5
+ """
6
+ import os
7
+ import subprocess
8
+ import time
9
+ from os import PathLike
10
+ from pathlib import Path
11
+ from dataclasses import dataclass, field
12
+ from typing import NoReturn, Union, List, TYPE_CHECKING
13
+ from idmtools.core import NoPlatformException
14
+ from jinja2 import Template
15
+ from logging import getLogger
16
+ from idmtools_platform_slurm.utils.slurm_job import create_slurm_indicator, slurm_installed
17
+ from typing import Tuple
18
+
19
+
20
+ user_logger = getLogger('user')
21
+
22
+ if TYPE_CHECKING:
23
+ from idmtools_platform_slurm.slurm_platform import SlurmPlatform
24
+
25
+ DEFAULT_TEMPLATE_FILE = "script_sbatch.sh.jinja2"
26
+ MSG = """Note: any output information from your script is stored in file stdout.txt under the script folder. For example, if you are running a script under current directory which kicks out another Slurm job, then the second Slurm job id is stored in stdout.txt under the current directory."""
27
+
28
+ TEMP_FILES = ['sbatch.sh', 'job_id.txt', 'job_status.txt', 'stdout.txt', 'stderr.txt']
29
+
30
+
31
def generate_script(platform: 'SlurmPlatform', command: str,
                    template: Union[Path, str] = DEFAULT_TEMPLATE_FILE, batch_dir: str = None, **kwargs) -> None:
    """
    Generate batch file sbatch.sh
    Args:
        platform: Slurm Platform
        command: execution command
        template: template to be used to build batch file
        batch_dir: directory to write sbatch.sh into (defaults to the current working directory)
        kwargs: keyword arguments used to expand functionality
    Returns:
        None
    """
    from idmtools_platform_slurm.slurm_platform import CONFIG_PARAMETERS
    template_vars = dict(
        platform=platform,
        command=command
    )
    # Populate from our platform config vars
    for p in CONFIG_PARAMETERS:
        if getattr(platform, p) is not None:
            template_vars[p] = getattr(platform, p)

    template_vars.update(kwargs)

    if platform.modules:
        template_vars['modules'] = platform.modules

    # Templates live next to this module
    with open(Path(__file__).parent.joinpath(template)) as tin:
        t = Template(tin.read())

    # Write our file
    if batch_dir is None:
        output_target = Path.cwd().joinpath("sbatch.sh")
    else:
        output_target = Path(batch_dir).joinpath("sbatch.sh")

    with open(output_target, "w") as tout:
        tout.write(t.render(template_vars))

    # Make executable
    platform.update_script_mode(output_target)
72
+
73
+
74
def check_file_and_job_id(file_path, timeout: int = 3600, interval: int = 10) -> Tuple[bool, str]:
    """
    Wait for a file to be created and check if slurm job id exists in the file.

    Args:
        file_path (str): Path to the file to check.
        timeout (int): Maximum time (in seconds) to wait for the file and line.
        interval (int): Time interval (in seconds) between checks.

    Returns:
        Tuple[bool, str]: A tuple containing:
            - bool: True if the file exists and contains a valid job ID, False otherwise.
            - str: The job ID if found, otherwise None.
    """
    start_time = time.time()

    while time.time() - start_time < timeout:
        if os.path.exists(file_path):
            user_logger.info(f"File {file_path} found.")
            with open(file_path, 'r') as file:
                for line in file:
                    if 'Slurm Job Ids (' in line:
                        # The job id is expected on the line immediately after the marker line.
                        slurm_job_id = file.readline().strip()
                        if slurm_job_id.isdigit():
                            return True, slurm_job_id
            user_logger.warning(f"Not found slurm job id in {file_path}.")
        else:
            user_logger.info(f"File {file_path} not found yet. Waiting...")

        time.sleep(interval)

    user_logger.error(f"Timeout reached. File {file_path} or slurm job id not found.")
    return False, None
107
+
108
+
109
+
110
@dataclass(repr=False)
class SlurmJob:
    """
    Wrap a user script so it can be submitted and executed as a Slurm job.
    """
    # path of the script to run as a Slurm job
    script_path: PathLike = field(init=True)
    # platform supplying the Slurm configuration (resolved from context when None)
    platform: 'SlurmPlatform' = field(default=None, init=True)
    # interpreter used to execute the script
    executable: str = field(default='python3', init=True)
    # command-line parameters forwarded to the script
    script_params: List[str] = field(default=None, init=True)
    # whether to delete previously generated job files before running
    cleanup: bool = field(default=True, init=True)

    def __post_init__(self):
        if self.script_path is None:
            raise RuntimeError("script_path is missing!")
        # load platform from context or from passed in value
        self.platform = self.__check_for_platform_from_context(self.platform)
        self.working_directory = Path(self.script_path).parent
        # normalize an empty parameter list to None
        self.script_params = self.script_params if self.script_params is not None and len(
            self.script_params) > 0 else None
        self.slurm_job_id = None

    def initialization(self):
        """Build the sbatch.sh batch file that runs the wrapped script."""
        # make str list so that we may join them together
        if self.script_params is not None:
            self.script_params = [str(i) for i in self.script_params]

        if self.script_params is not None:
            command = f"{self.executable} {Path(self.script_path).name} {' '.join(self.script_params)}"
        else:
            command = f"{self.executable} {Path(self.script_path).name}"

        generate_script(self.platform, command, batch_dir=self.working_directory)

    def run(self, dry_run: bool = False, **kwargs) -> None:
        """
        Submit the generated sbatch.sh with sbatch and report the resulting job id.
        Args:
            dry_run: when True, only generate the batch file without submitting
            kwargs: keyword arguments used to expand functionality
        Returns:
            None
        """
        if self.cleanup:
            self.clean(self.working_directory)

        self.initialization()

        if not dry_run:
            if not slurm_installed():
                user_logger.warning('Slurm is not installed/available!')
                exit(-1)

            user_logger.info('Script is running as a slurm job!\n')
            create_slurm_indicator()
            # --parsable makes sbatch print "<jobid>[;<cluster>]" on stdout
            result = subprocess.run(['sbatch', '--parsable', 'sbatch.sh'], stdout=subprocess.PIPE,
                                    cwd=str(self.working_directory))
            self.slurm_job_id = result.stdout.decode('utf-8').strip().split(';')[0]

            user_logger.info(f"{'job_id: '.ljust(20)} {self.slurm_job_id}")
            user_logger.info(f"{'job_directory: '.ljust(20)} {self.platform.job_directory}\n")

            # Check if stdout.txt is created and job id exists in there
            stdout_file = os.path.join(self.working_directory, 'stdout.txt')
            is_exists, slurm_job_id = check_file_and_job_id(stdout_file)
            if is_exists and slurm_job_id is not None:
                # print stdout.txt on console
                with open(stdout_file, "r") as f:
                    read_data = f.read()
                    user_logger.info(read_data)
                user_logger.info("To check job status, run command:")
                user_logger.info(f"scontrol show job {slurm_job_id}")
                user_logger.info(f"sacct -j {slurm_job_id} --format=JobID,State,Start,End")
            else:
                user_logger.warning("Check status.txt for job details.")
                user_logger.warning(MSG)
        else:
            user_logger.warning('Script is running with dry_run = True')

    def __check_for_platform_from_context(self, platform) -> 'IPlatform':  # noqa: F821
        """
        Try to determine platform of current object from self or current platform.

        Args:
            platform: Passed in platform object

        Raises:
            NoPlatformException: when no platform is on current context
        Returns:
            Platform object
        """
        if self.platform is None:
            # check context for current platform
            if platform is None:
                from idmtools.core.context import CURRENT_PLATFORM
                if CURRENT_PLATFORM is None:
                    raise NoPlatformException("No Platform defined on object, in current context, or passed to run")
                platform = CURRENT_PLATFORM
            self.platform = platform
        return self.platform

    def clean(self, cwd: str = None):
        """
        Delete generated slurm job related files.
        Args:
            cwd: the directory containing the files (defaults to the current working
                directory, evaluated at call time)
        Returns:
            None
        """
        # NOTE: the original default `cwd=os.getcwd()` was evaluated once at import
        # time, silently freezing whatever directory the module was first loaded from.
        if cwd is None:
            cwd = os.getcwd()
        for file_path in TEMP_FILES:
            f = os.path.join(cwd, file_path)
            if os.path.exists(f):
                try:
                    os.remove(f)
                except OSError:
                    # best-effort cleanup; a locked/removed file is not fatal
                    pass
@@ -0,0 +1,5 @@
1
+ """
2
+ idmtools SlurmPlatform utils.
3
+
4
+ Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
5
+ """