idmtools-platform-slurm 0.0.0.dev0__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dockerized_slurm/Dockerfile +107 -0
- dockerized_slurm/README.md +17 -0
- dockerized_slurm/docker-compose.yml +89 -0
- dockerized_slurm/docker-entrypoint.sh +64 -0
- dockerized_slurm/id_rsa +27 -0
- dockerized_slurm/id_rsa.pub +1 -0
- dockerized_slurm/register_cluster.sh +12 -0
- dockerized_slurm/slurm.conf +94 -0
- dockerized_slurm/slurmdbd.conf +37 -0
- idmtools_platform_slurm/__init__.py +12 -8
- idmtools_platform_slurm/assets/__init__.py +157 -0
- idmtools_platform_slurm/assets/_run.sh.jinja2 +44 -0
- idmtools_platform_slurm/assets/batch.sh.jinja2 +54 -0
- idmtools_platform_slurm/assets/run_simulation.sh +23 -0
- idmtools_platform_slurm/assets/sbatch.sh.jinja2 +77 -0
- idmtools_platform_slurm/cli/__init__.py +4 -0
- idmtools_platform_slurm/cli/slurm.py +151 -0
- idmtools_platform_slurm/platform_operations/__init__.py +0 -0
- idmtools_platform_slurm/platform_operations/asset_collection_operations.py +25 -0
- idmtools_platform_slurm/platform_operations/experiment_operations.py +107 -0
- idmtools_platform_slurm/platform_operations/json_metadata_operations.py +17 -0
- idmtools_platform_slurm/platform_operations/simulation_operations.py +46 -0
- idmtools_platform_slurm/platform_operations/suite_operations.py +38 -0
- idmtools_platform_slurm/platform_operations/utils.py +45 -0
- idmtools_platform_slurm/plugin_info.py +75 -0
- idmtools_platform_slurm/slurm_operations/__init__.py +5 -0
- idmtools_platform_slurm/slurm_operations/slurm_operations.py +58 -0
- idmtools_platform_slurm/slurm_platform.py +207 -0
- idmtools_platform_slurm/utils/__init__.py +4 -0
- idmtools_platform_slurm/utils/slurm_job/__init__.py +90 -0
- idmtools_platform_slurm/utils/slurm_job/script_sbatch.sh.jinja2 +78 -0
- idmtools_platform_slurm/utils/slurm_job/slurm_job.py +214 -0
- idmtools_platform_slurm/utils/status_report/__init__.py +5 -0
- idmtools_platform_slurm/utils/status_report/status_report.py +242 -0
- idmtools_platform_slurm/utils/status_report/utils.py +108 -0
- idmtools_platform_slurm-0.0.2.dist-info/METADATA +185 -0
- idmtools_platform_slurm-0.0.2.dist-info/RECORD +43 -0
- idmtools_platform_slurm-0.0.2.dist-info/entry_points.txt +5 -0
- idmtools_platform_slurm-0.0.2.dist-info/licenses/LICENSE.TXT +3 -0
- {idmtools_platform_slurm-0.0.0.dev0.dist-info → idmtools_platform_slurm-0.0.2.dist-info}/top_level.txt +2 -0
- tests/input/hello.sh +2 -0
- tests/input/script.py +49 -0
- idmtools_platform_slurm-0.0.0.dev0.dist-info/METADATA +0 -41
- idmtools_platform_slurm-0.0.0.dev0.dist-info/RECORD +0 -5
- {idmtools_platform_slurm-0.0.0.dev0.dist-info → idmtools_platform_slurm-0.0.2.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#!/bin/bash
# Submit every simulation of the experiment as a chain of Slurm array jobs.
#
# The tasks are split into array jobs of at most batch_size tasks each. Every
# array job hands its task offset to sbatch.sh as the first argument, and the
# ID of every submitted job is appended to job_id.txt.

# Set the total number of tasks
total_tasks={{njobs}}

# Set the number of tasks per array job
batch_size={{array_batch_size}}

# Set max running jobs
max_jobs={{max_running_jobs}}

# A non-positive batch size would make the divisions below fail
if [ "$batch_size" -le 0 ]; then
    echo "Invalid array batch size: $batch_size" >&2
    exit 1
fi

num_batches=$((total_tasks / batch_size))
remainder=$((total_tasks % batch_size))

echo "num_batches: $num_batches"
echo "remainder: $remainder"

# ID of the most recently submitted array job (empty until the first submission)
job_id=""

# Submit one array job and record its ID in job_id.txt.
#   $1: number of tasks in the array job
#   $2: task offset handed to sbatch.sh
submit_batch() {
    local array_size=$1
    local start_task=$2
    local dependency_args=()
{% if dependency is defined and dependency %}
    # Chain onto the previous array job; the first submission has nothing to wait for
    if [ -n "$job_id" ]; then
        dependency_args=(--dependency="afterok:$job_id")
    fi
{% endif %}
    job_id=$(sbatch --array="1-${array_size}%${max_jobs}" "${dependency_args[@]}" sbatch.sh "$start_task" | awk '{print $4}')
    echo "$job_id" >> job_id.txt
}

# Submit the full-size batches (offsets 0, batch_size, 2*batch_size, ...).
# With fewer than batch_size tasks there is no full batch, so nothing is
# submitted here and the tasks are not over-submitted.
for (( i=0; i<num_batches; i+=1 ))
do
    submit_batch "$batch_size" $((i * batch_size))
done

# Submit the remaining tasks as a separate, smaller batch
if [ "$remainder" -gt 0 ]
then
    submit_batch "$remainder" $((num_batches * batch_size))
fi

wait
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Run the simulation that belongs to the current Slurm array task.
#   $1: task offset of the array job (added to SLURM_ARRAY_TASK_ID)
#   $2: MPI launch type: no-mpi, mpirun, pmi2 or pmix

# Get the parameters passed from sbatch.sh
mpi_type="$2"

# A missing offset is treated as 0 instead of breaking the arithmetic
SIMULATION_INDEX=$((SLURM_ARRAY_TASK_ID + ${1:-0}))

# Pick the SIMULATION_INDEX-th sub-directory, skipping anything matching "Assets".
# -maxdepth/-mindepth come before -type because GNU find warns otherwise.
# NOTE(review): relies on find listing the directories in the same order for every task.
JOB_DIRECTORY=$(find . -maxdepth 1 -mindepth 1 -type d | grep -v Assets | head -n "$SIMULATION_INDEX" | tail -n 1)

if [ -z "$JOB_DIRECTORY" ]; then
    echo "No simulation directory found for index $SIMULATION_INDEX" >&2
    exit 1
fi

# Never run a simulation from the wrong directory
cd "$JOB_DIRECTORY" || exit 1
current_dir=$(pwd)
echo "The script is running from: $current_dir"

# Run the simulation based on whether MPI is required
if [ "$mpi_type" = "no-mpi" ]; then
    echo "Run without MPI"
    srun _run.sh 1> stdout.txt 2> stderr.txt
elif [ "$mpi_type" = "mpirun" ]; then
    echo "Run mpirun"
    mpirun "$current_dir"/_run.sh 1> stdout.txt 2> stderr.txt
elif [ "$mpi_type" = "pmi2" ] || [ "$mpi_type" = "pmix" ]; then  # pmi2 or pmix
    echo "Run MPI with $mpi_type"
    srun --mpi="$mpi_type" _run.sh 1> stdout.txt 2> stderr.txt
else
    # Report the failure through the exit status instead of finishing "successfully"
    echo "Invalid MPI type: $mpi_type" >&2
    exit 1
fi
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
{% if ntasks is defined and ntasks is not none %}
|
|
3
|
+
#SBATCH --ntasks={{ntasks}}
|
|
4
|
+
{% endif %}
|
|
5
|
+
{% if partition is defined and partition is not none %}
|
|
6
|
+
#SBATCH --partition={{partition}}
|
|
7
|
+
{% endif %}
|
|
8
|
+
{% if nodes is defined and nodes is not none %}
|
|
9
|
+
#SBATCH --nodes={{nodes}}
|
|
10
|
+
{% endif %}
|
|
11
|
+
{% if mail_type is defined and mail_type is not none %}
|
|
12
|
+
#SBATCH --mail-type={{mail_type}}
|
|
13
|
+
{% endif %}
|
|
14
|
+
{% if mail_user is defined and mail_user is not none %}
|
|
15
|
+
#SBATCH --mail-user={{mail_user}}
|
|
16
|
+
{% endif %}
|
|
17
|
+
{% if constraint is defined and constraint is not none %}
|
|
18
|
+
#SBATCH --constraint={{constraint}}
|
|
19
|
+
{% endif %}
|
|
20
|
+
{% if ntasks_per_core is defined and ntasks_per_core is not none %}
|
|
21
|
+
#SBATCH --ntasks-per-core={{ntasks_per_core}}
|
|
22
|
+
{% endif %}
|
|
23
|
+
{% if cpus_per_task is defined and cpus_per_task is not none %}
|
|
24
|
+
#SBATCH --cpus-per-task={{cpus_per_task}}
|
|
25
|
+
{% endif %}
|
|
26
|
+
{% if mem_per_cpu is defined and mem_per_cpu is not none %}
|
|
27
|
+
#SBATCH --mem-per-cpu={{mem_per_cpu}}
|
|
28
|
+
{% endif %}
|
|
29
|
+
{% if time is defined and time is not none %}
|
|
30
|
+
#SBATCH --time={{time}}
|
|
31
|
+
{% endif %}
|
|
32
|
+
{% if account is defined and account is not none %}
|
|
33
|
+
#SBATCH --account={{account}}
|
|
34
|
+
{% endif %}
|
|
35
|
+
{% if exclusive is defined and exclusive is not none and exclusive %}
|
|
36
|
+
#SBATCH --exclusive
|
|
37
|
+
{% endif %}
|
|
38
|
+
{% if mem is defined and mem is not none %}
|
|
39
|
+
#SBATCH --mem={{mem}}
|
|
40
|
+
{% endif %}
|
|
41
|
+
{% if requeue is defined and requeue is not none and requeue %}
|
|
42
|
+
#SBATCH --requeue
|
|
43
|
+
{% endif %}
|
|
44
|
+
{% if sbatch_custom is defined and sbatch_custom is not none %}
|
|
45
|
+
#SBATCH {{sbatch_custom}}
|
|
46
|
+
{% endif %}
|
|
47
|
+
#SBATCH --open-mode=append
|
|
48
|
+
#SBATCH --output=stdout.txt
|
|
49
|
+
#SBATCH --error=stderr.txt
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
{% if modules is defined and modules is not none and modules|length > 0 %}
|
|
53
|
+
{% for m in modules %}
|
|
54
|
+
module load {{m}}
|
|
55
|
+
{% endfor %}
|
|
56
|
+
{% endif %}
|
|
57
|
+
|
|
58
|
+
# Assign the values from the template
|
|
59
|
+
ntasks={{ntasks|default(1)}}
|
|
60
|
+
|
|
61
|
+
# Get mpi_type
|
|
62
|
+
mpi_type={{mpi_type|lower}}
|
|
63
|
+
|
|
64
|
+
# All submissions happen at the experiment level
|
|
65
|
+
# Check if ntasks is greater than 1 to include --mpi=$mpi_type
|
|
66
|
+
if [ "$ntasks" -gt 1 ]; then
|
|
67
|
+
echo "Running with MPI (ntasks=$ntasks)"
|
|
68
|
+
bash run_simulation.sh "$1" "$mpi_type"
|
|
69
|
+
else
|
|
70
|
+
echo "Running without MPI (ntasks=$ntasks)"
|
|
71
|
+
bash run_simulation.sh "$1" "no-mpi"
|
|
72
|
+
fi
|
|
73
|
+
wait
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""
|
|
2
|
+
idmtools slurm cli commands.
|
|
3
|
+
|
|
4
|
+
Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
|
|
5
|
+
"""
|
|
6
|
+
import json
|
|
7
|
+
import click
|
|
8
|
+
from idmtools.core import ItemType
|
|
9
|
+
from idmtools.core.platform_factory import Platform
|
|
10
|
+
from idmtools_platform_slurm.utils.status_report.status_report import generate_status_report
|
|
11
|
+
from idmtools_platform_slurm.utils.status_report.utils import get_latest_experiment, check_status
|
|
12
|
+
from logging import getLogger
|
|
13
|
+
|
|
14
|
+
logger = getLogger(__name__)
|
|
15
|
+
user_logger = getLogger('user')
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@click.group(short_help="Slurm platform related commands.")
@click.argument('job-directory')
@click.pass_context
def slurm(ctx: click.Context, job_directory):
    """
    Commands related to managing the SLURM platform.

    job_directory: Slurm Working Directory
    """
    # Shared state for the sub-commands: each of them reads ctx.obj['job_directory']
    ctx.obj = {'job_directory': job_directory}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@slurm.command(help="Get simulation's report")
@click.option('--suite-id', default=None, help="Idmtools Suite id")
@click.option('--exp-id', default=None, help="Idmtools Experiment id")
@click.option('--status-filter', type=click.Choice(['0', '-1', '100']), multiple=True, help="list of status")
@click.option('--sim-filter', multiple=True, help="list of simulations")
@click.option('--job-filter', multiple=True, help="list of slurm jobs")
@click.option('--root', default='sim', type=click.Choice(['job', 'sim']), help="Dictionary root key")
@click.option('--verbose/--no-verbose', default=True, help="Enable verbose output in results")
@click.option('--display/--no-display', default=True, help="Display with working directory or not")
@click.option('--display-count', default=20, help="Display Count")
@click.pass_context
def status_report(ctx: click.Context, suite_id, exp_id, status_filter, sim_filter, job_filter, root, verbose, display,
                  display_count):
    """
    Generate a status report for a suite, an experiment, or no explicit scope.

    Args:
        ctx: click.Context carrying the job directory of the slurm group
        suite_id: suite id; takes precedence over exp_id when both are given
        exp_id: experiment id
        status_filter: status values to keep; empty means no filtering
        sim_filter: simulation ids to keep; empty means no filtering
        job_filter: slurm job ids to keep; empty means no filtering
        root: dictionary root key, 'job' or 'sim'
        verbose: bool True/False
        display: bool True/False
        display_count: value forwarded as display_count
    Returns:
        None
    """
    # Resolve the report scope; it stays None when neither id was supplied
    scope = None
    if suite_id is not None:
        scope = (suite_id, ItemType.SUITE)
    elif exp_id is not None:
        scope = (exp_id, ItemType.EXPERIMENT)

    slurm_platform = Platform('SLURM_LOCAL', job_directory=ctx.obj['job_directory'])

    # click hands over empty tuples for unused multi-options; those are passed on as None
    generate_status_report(platform=slurm_platform, scope=scope,
                           status_filter=status_filter or None,
                           job_filter=job_filter or None,
                           sim_filter=sim_filter or None,
                           root=root, verbose=verbose, display=display, display_count=display_count)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@slurm.command(help="Get Suite/Experiment/Simulation directory")
@click.option('--sim-id', default=None, help="Idmtools Simulation id")
@click.option('--exp-id', default=None, help="Idmtools Experiment id")
@click.option('--suite-id', default=None, help="Idmtools Suite id")
@click.pass_context
def get_path(ctx: click.Context, sim_id, exp_id, suite_id):
    """
    Log the directory of a simulation, experiment or suite.

    The first id that is provided wins, checked in the order sim_id, exp_id, suite_id.

    Args:
        ctx: click.Context carrying the job directory of the slurm group
        sim_id: simulation id
        exp_id: experiment id
        suite_id: suite id
    Returns:
        None
    Raises:
        click.UsageError: if none of the ids is provided
    """
    # Validate the options first so a usage mistake is reported before any platform is built
    if sim_id is not None:
        item_id, item_type = sim_id, ItemType.SIMULATION
    elif exp_id is not None:
        item_id, item_type = exp_id, ItemType.EXPERIMENT
    elif suite_id is not None:
        item_id, item_type = suite_id, ItemType.SUITE
    else:
        # UsageError is still an Exception, but click prints it as a clean message instead of a traceback
        raise click.UsageError('Must provide at least one: suite-id, exp-id or sim-id!')

    platform = Platform('SLURM_LOCAL', job_directory=ctx.obj['job_directory'])
    user_logger.info(platform.get_directory_by_id(item_id, item_type))
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@slurm.command(help="Get status of Experiment/Simulation")
@click.option('--sim-id', default=None, help="Idmtools Simulation id")
@click.option('--exp-id', default=None, help="Idmtools Experiment id")
@click.pass_context
def get_status(ctx: click.Context, sim_id, exp_id):
    """
    Log the status name of a simulation or an experiment.

    sim_id takes precedence over exp_id when both are provided.

    Args:
        ctx: click.Context carrying the job directory of the slurm group
        sim_id: simulation id
        exp_id: experiment id
    Returns:
        None
    Raises:
        click.UsageError: if neither id is provided
    """
    if sim_id is None and exp_id is None:
        # UsageError is still an Exception, but click prints it as a clean message instead of a traceback
        raise click.UsageError('Must provide at least one: exp-id or sim-id!')

    platform = Platform('SLURM_LOCAL', job_directory=ctx.obj['job_directory'])

    if sim_id is not None:
        status = platform._op_client.get_simulation_status(sim_id)
    else:
        status = platform.get_item(exp_id, ItemType.EXPERIMENT).status

    # A falsy status is logged as None
    user_logger.info(status.name if status else None)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@slurm.command(help="Get Suite/Experiment/Simulation slurm job")
@click.option('--sim-id', default=None, help="Idmtools Simulation id")
@click.option('--exp-id', default=None, help="Idmtools Experiment id")
@click.option('--suite-id', default=None, help="Idmtools Suite id")
@click.pass_context
def get_job(ctx: click.Context, sim_id, exp_id, suite_id):
    """
    Log the slurm job id of a simulation, an experiment or a suite.

    The first id that is provided wins, checked in the order sim_id, exp_id, suite_id.
    For a suite, the job id of its first experiment is used.

    Args:
        ctx: click.Context carrying the job directory of the slurm group
        sim_id: simulation id
        exp_id: experiment id
        suite_id: suite id
    Returns:
        None
    Raises:
        click.UsageError: if none of the ids is provided
        click.ClickException: if the suite has no experiments
    """
    if sim_id is None and exp_id is None and suite_id is None:
        # UsageError is still an Exception, but click prints it as a clean message instead of a traceback
        raise click.UsageError('Must provide at least one: suite-id, exp-id or sim-id!')

    platform = Platform('SLURM_LOCAL', job_directory=ctx.obj['job_directory'])

    if sim_id is not None:
        job_id = platform._op_client.get_job_id(sim_id, ItemType.SIMULATION)
    else:
        if exp_id is None:
            suite = platform.get_item(suite_id, ItemType.SUITE)
            # An empty suite used to fail with a bare IndexError
            if not suite.experiments:
                raise click.ClickException(f'Suite {suite_id} has no experiments!')
            exp_id = suite.experiments[0].id
        job_id = platform._op_client.get_job_id(exp_id, ItemType.EXPERIMENT)

    user_logger.info(job_id)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@slurm.command(help="Get the latest experiment info")
@click.pass_context
def get_latest(ctx: click.Context):
    """
    Log the result of get_latest_experiment as indented JSON.

    Args:
        ctx: click.Context carrying the job directory of the slurm group
    Returns:
        None
    """
    slurm_platform = Platform('SLURM_LOCAL', job_directory=ctx.obj['job_directory'])
    latest = get_latest_experiment(slurm_platform)
    user_logger.info(json.dumps(latest, indent=3))
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@slurm.command(help="Get simulation's status")
@click.option('--exp-id', default=None, help="Idmtools Experiment id")
@click.option('--display/--no-display', default=False, help="Display with working directory or not")
@click.pass_context
def status(ctx: click.Context, exp_id, display):
    """
    Check job status through check_status.

    Args:
        ctx: click.Context carrying the job directory of the slurm group
        exp_id: experiment id, forwarded unchanged (None when the option is omitted)
        display: bool True/False
    Returns:
        None
    """
    slurm_platform = Platform('SLURM_LOCAL', job_directory=ctx.obj['job_directory'])
    check_status(platform=slurm_platform, exp_id=exp_id, display=display)
|
|
File without changes
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Here we implement the SlurmPlatform asset collection operations.
|
|
3
|
+
|
|
4
|
+
Copyright 2025, Gates Foundation. All rights reserved.
|
|
5
|
+
"""
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from logging import getLogger
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
from idmtools_platform_file.platform_operations.asset_collection_operations import FilePlatformAssetCollectionOperations
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from idmtools_platform_slurm.slurm_platform import SlurmPlatform
|
|
13
|
+
|
|
14
|
+
logger = getLogger(__name__)
|
|
15
|
+
user_logger = getLogger("user")
|
|
16
|
+
|
|
17
|
+
EXCLUDE_FILES = ['_run.sh', 'metadata.json', 'stdout.txt', 'stderr.txt', 'status.txt', 'job_id.txt', 'job_status.txt']
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class SlurmPlatformAssetCollectionOperations(FilePlatformAssetCollectionOperations):
    """
    AssetCollection operations used by the SlurmPlatform.

    The class body only redeclares the platform field; everything else comes from
    FilePlatformAssetCollectionOperations.
    """
    # Quoted annotation: SlurmPlatform is only imported under TYPE_CHECKING
    platform: 'SlurmPlatform'  # noqa F821
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Here we implement the SlurmPlatform experiment operations.
|
|
3
|
+
|
|
4
|
+
Copyright 2025, Gates Foundation. All rights reserved.
|
|
5
|
+
"""
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
from idmtools.core import EntityStatus
|
|
11
|
+
from idmtools.core import ItemType
|
|
12
|
+
from idmtools.entities.experiment import Experiment
|
|
13
|
+
from idmtools_platform_file.platform_operations.experiment_operations import FilePlatformExperimentOperations
|
|
14
|
+
from logging import getLogger
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
logger = getLogger(__name__)
|
|
18
|
+
user_logger = getLogger('user')
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from idmtools_platform_slurm.slurm_platform import SlurmPlatform
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class SlurmPlatformExperimentOperations(FilePlatformExperimentOperations):
    """
    Provides Experiment operations to the SlurmPlatform.
    """
    # Quoted annotation: SlurmPlatform is only imported under TYPE_CHECKING
    platform: 'SlurmPlatform'  # noqa: F821
    # Location of run_simulation.sh inside the package's assets directory
    RUN_SIMULATION_SCRIPT_PATH = Path(__file__).parent.parent / 'assets/run_simulation.sh'

    def platform_run_item(self, experiment: Experiment, dry_run: bool = False, **kwargs):
        """
        Run experiment.

        Args:
            experiment: idmtools Experiment
            dry_run: when True the parent steps still run but no job is submitted
            kwargs: keyword arguments used to expand functionality
        Returns:
            None
        """
        # Parent steps always run, even for a dry run
        super().platform_run_item(experiment, **kwargs)
        if dry_run:
            return
        # Commission
        self.platform.submit_job(experiment, **kwargs)

    def refresh_status(self, experiment: Experiment, **kwargs):
        """
        Refresh the status of every simulation of the experiment.

        Nothing is refreshed while the experiment directory has no job_id.txt.

        Args:
            experiment: idmtools Experiment
            kwargs: keyword arguments used to expand functionality
        Returns:
            None
        """
        experiment_dir = self.platform.get_directory(experiment)
        if not experiment_dir.joinpath('job_id.txt').exists():
            logger.debug(f'job_id is not available for experiment: {experiment.id}')
            return

        for simulation in experiment.simulations:
            simulation.status = self.platform.get_simulation_status(simulation.id, **kwargs)

    def platform_cancel(self, experiment_id: str, force: bool = True) -> None:
        """
        Cancel platform experiment's slurm job.

        Args:
            experiment_id: experiment id
            force: when False, only an experiment with RUNNING status is cancelled
        Returns:
            None
        """
        experiment = self.platform.get_item(experiment_id, ItemType.EXPERIMENT, raw=False)
        if not force and experiment.status != EntityStatus.RUNNING:
            user_logger.info(f"Experiment {experiment_id} is not running, no cancel needed...")
            return

        logger.debug(f"cancel slurm job for experiment: {experiment_id}...")
        slurm_job = self.platform.get_job_id(experiment_id, ItemType.EXPERIMENT)
        if slurm_job is None:
            logger.debug(f"Slurm job for experiment: {experiment_id} is not available!")
            return

        outcome = self.platform._op_client.cancel_job(slurm_job)
        user_logger.info(f"Cancel Experiment {experiment_id}: {outcome}")

    def post_run_item(self, experiment: Experiment, **kwargs):
        """
        Trigger right after commissioning experiment on platform.

        Logs the experiment's slurm job ids and a command for checking the simulations.

        Args:
            experiment: Experiment just commissioned
            kwargs: keyword arguments used to expand functionality
        Returns:
            None
        """
        super().post_run_item(experiment, **kwargs)

        slurm_jobs = self.platform.get_job_id(experiment.id, ItemType.EXPERIMENT)
        if slurm_jobs is None:
            logger.debug(f"Slurm job for experiment: {experiment.id} is not available!")
            user_logger.info("Slurm Job Ids: None")
        else:
            # Each job id is listed on its own line, indented by three spaces
            job_lines = [f'   {job}' for job in slurm_jobs]
            user_logger.info(f"Slurm Job Ids ({len(job_lines)}):")
            user_logger.info('\n'.join(job_lines))

        job_directory = os.path.abspath(self.platform.job_directory)
        user_logger.info(
            f'\nYou may try the following command to check simulations running status: \n idmtools slurm {job_directory} status --exp-id {experiment.id}')
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Here we implement the SlurmPlatform JSON metadata operations.
|
|
3
|
+
|
|
4
|
+
Copyright 2025, Gates Foundation. All rights reserved.
|
|
5
|
+
"""
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from idmtools_platform_file.platform_operations.json_metadata_operations import \
|
|
9
|
+
JSONMetadataOperations as FileJSONMetadataOperations
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from idmtools_platform_slurm.slurm_platform import SlurmPlatform
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class SlurmJSONMetadataOperations(FileJSONMetadataOperations):
    """
    JSON metadata operations used by the SlurmPlatform.

    The class body only redeclares the platform field; everything else comes from
    the file platform's JSONMetadataOperations.
    """
    # Quoted annotation: SlurmPlatform is only imported under TYPE_CHECKING
    platform: 'SlurmPlatform'  # noqa: F821
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Here we implement the SlurmPlatform simulation operations.
|
|
3
|
+
|
|
4
|
+
Copyright 2025, Gates Foundation. All rights reserved.
|
|
5
|
+
"""
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
8
|
+
from idmtools.core import ItemType, EntityStatus
|
|
9
|
+
from idmtools_platform_file.platform_operations.simulation_operations import FilePlatformSimulationOperations
|
|
10
|
+
from logging import getLogger
|
|
11
|
+
|
|
12
|
+
logger = getLogger(__name__)
|
|
13
|
+
user_logger = getLogger('user')
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from idmtools_platform_slurm.slurm_platform import SlurmPlatform
|
|
17
|
+
|
|
18
|
+
logger = getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class SlurmPlatformSimulationOperations(FilePlatformSimulationOperations):
    """
    Provides Simulation operations to the SlurmPlatform.
    """
    # Quoted annotation: SlurmPlatform is only imported under TYPE_CHECKING
    platform: 'SlurmPlatform'  # noqa: F821

    def platform_cancel(self, sim_id: str, force: bool = False) -> Any:
        """
        Cancel platform simulation's slurm job.

        Args:
            sim_id: simulation id
            force: when False, only a simulation with RUNNING status is cancelled
        Returns:
            The result of the cancel call, or None when nothing was cancelled
        """
        simulation = self.platform.get_item(sim_id, ItemType.SIMULATION, raw=False)
        if not force and simulation.status != EntityStatus.RUNNING:
            user_logger.info(f"Simulation {sim_id} is not running, no cancel needed...")
            return None

        logger.debug(f"cancel slurm job for simulation: {sim_id}...")
        slurm_job = self.platform.get_job_id(sim_id, ItemType.SIMULATION)
        if slurm_job is None:
            logger.debug(f"Slurm job for simulation: {sim_id} is not available!")
            return None

        outcome = self.platform._op_client.cancel_job(slurm_job)
        user_logger.info(f"Cancel Simulation: {sim_id}: {outcome}")
        return outcome
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Here we implement the SlurmPlatform suite operations.
|
|
3
|
+
|
|
4
|
+
Copyright 2025, Gates Foundation. All rights reserved.
|
|
5
|
+
"""
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
from logging import getLogger
|
|
9
|
+
from idmtools.core import ItemType
|
|
10
|
+
from idmtools_platform_file.platform_operations.suite_operations import FilePlatformSuiteOperations
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from idmtools_platform_slurm.slurm_platform import SlurmPlatform
|
|
14
|
+
|
|
15
|
+
logger = getLogger(__name__)
|
|
16
|
+
user_logger = getLogger('user')
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class SlurmPlatformSuiteOperations(FilePlatformSuiteOperations):
    """
    Suite operations used by the SlurmPlatform.
    """
    # Quoted annotation: SlurmPlatform is only imported under TYPE_CHECKING
    platform: 'SlurmPlatform'  # noqa F821

    def platform_cancel(self, suite_id: str, force: bool = False) -> None:
        """
        Cancel the slurm jobs of every experiment in the suite.

        Args:
            suite_id: suite id
            force: passed through to each experiment's platform_cancel
        Returns:
            None
        """
        target_suite = self.platform.get_item(suite_id, ItemType.SUITE, force=True, raw=False)
        logger.debug(f"cancel slurm job for suite: {suite_id}...")
        # Cancellation itself is delegated to the experiment operations
        for experiment in target_suite.experiments:
            self.platform._experiments.platform_cancel(experiment.id, force)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This is SlurmPlatform operations utils.
|
|
3
|
+
|
|
4
|
+
Copyright 2025, Gates Foundation. All rights reserved.
|
|
5
|
+
"""
|
|
6
|
+
import os
|
|
7
|
+
import subprocess
|
|
8
|
+
from logging import getLogger
|
|
9
|
+
|
|
10
|
+
logger = getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_max_array_size():
    """
    Get Slurm MaxArraySize from configuration.

    Runs ``scontrol show config`` and reads the first line that starts with
    ``MaxArraySize``.

    Returns:
        The configured MaxArraySize minus one, or None when scontrol cannot be
        run (including when it is not installed), exits with an error, or its
        output cannot be parsed.
    """
    try:
        output = subprocess.check_output(['scontrol', 'show', 'config'])
        for line in output.decode().splitlines():
            if line.startswith("MaxArraySize"):
                max_array_size = int(line.split("=")[1])
                return max_array_size - 1
    except (OSError, subprocess.CalledProcessError, IndexError, ValueError):
        # OSError covers a missing scontrol executable (FileNotFoundError),
        # which previously escaped and crashed callers on machines without Slurm.
        pass

    return None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def check_home(directory: str) -> bool:
    """
    Check if a directory is under HOME.

    Backslashes are treated as forward slashes on both sides. The comparison is
    done on whole path components, so a sibling such as ``/home/alice2`` is not
    considered to be under ``/home/alice``. Paths are compared as given; they
    are not resolved or made absolute.

    Args:
        directory: a directory

    Returns:
        True if directory is HOME itself or lies below it, False otherwise
    """
    home = os.path.expanduser("~").replace('\\', '/')
    directory = directory.replace('\\', '/')
    # Require a separator after HOME so that only real descendants match
    home_prefix = home if home.endswith('/') else home + '/'
    return directory == home or directory.startswith(home_prefix)
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""
|
|
2
|
+
idmtools slurm platform plugin definition.
|
|
3
|
+
|
|
4
|
+
Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
|
|
5
|
+
"""
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Type, Dict
|
|
8
|
+
from idmtools.entities.iplatform import IPlatform
|
|
9
|
+
from idmtools.registry.platform_specification import example_configuration_impl, get_platform_impl, \
|
|
10
|
+
get_platform_type_impl, PlatformSpecification
|
|
11
|
+
from idmtools.registry.plugin_specification import get_description_impl
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Example configuration block returned by SlurmPlatformSpecification.example_configuration()
SLURM_EXAMPLE_CONFIG = """
[Slurm]
job_directory = /data
# mail_type values are ALL or END.
# ALL will email you as the job changes states
# END will email you when the job is done
mail_type = 'END'
mail_user = 'ccollins@idmod.org'
"""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SlurmPlatformSpecification(PlatformSpecification):
    """
    Plugin specification that exposes the SlurmPlatform to idmtools.
    """

    @get_description_impl
    def get_description(self) -> str:
        """Return the description of the plugin."""
        return "Provides access to the Slurm Platform to IDM Tools"

    @get_platform_impl
    def get(self, **configuration) -> IPlatform:
        """
        Build our slurm platform from the passed in configuration object.

        SlurmPlatform is imported inside the method rather than at module level.

        Args:
            configuration: keyword arguments handed to the SlurmPlatform constructor

        Returns:
            The created SlurmPlatform
        """
        from idmtools_platform_slurm.slurm_platform import SlurmPlatform
        slurm_platform = SlurmPlatform(**configuration)
        return slurm_platform

    @example_configuration_impl
    def example_configuration(self):
        """Return the example configuration text for the Slurm platform."""
        return SLURM_EXAMPLE_CONFIG

    @get_platform_type_impl
    def get_type(self) -> Type['SlurmPlatform']:  # noqa: F821
        """Return the SlurmPlatform class."""
        from idmtools_platform_slurm.slurm_platform import SlurmPlatform
        return SlurmPlatform

    def get_version(self) -> str:
        """
        Returns the version of the plugin.

        Returns:
            Plugin Version
        """
        from idmtools_platform_slurm import __version__ as plugin_version
        return plugin_version

    def get_configuration_aliases(self) -> Dict[str, Dict]:
        """Provides configuration aliases that exist in SLURM."""
        # Both aliases use the user's home directory as job directory
        return {
            'SLURM_LOCAL': {'job_directory': str(Path.home())},
            'SLURM_CLUSTER': {'job_directory': str(Path.home())},
        }
|