idmtools-platform-slurm 0.0.0.dev0__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dockerized_slurm/Dockerfile +107 -0
- dockerized_slurm/README.md +17 -0
- dockerized_slurm/docker-compose.yml +89 -0
- dockerized_slurm/docker-entrypoint.sh +64 -0
- dockerized_slurm/id_rsa +27 -0
- dockerized_slurm/id_rsa.pub +1 -0
- dockerized_slurm/register_cluster.sh +12 -0
- dockerized_slurm/slurm.conf +94 -0
- dockerized_slurm/slurmdbd.conf +37 -0
- idmtools_platform_slurm/__init__.py +12 -8
- idmtools_platform_slurm/assets/__init__.py +157 -0
- idmtools_platform_slurm/assets/_run.sh.jinja2 +44 -0
- idmtools_platform_slurm/assets/batch.sh.jinja2 +54 -0
- idmtools_platform_slurm/assets/run_simulation.sh +23 -0
- idmtools_platform_slurm/assets/sbatch.sh.jinja2 +77 -0
- idmtools_platform_slurm/cli/__init__.py +4 -0
- idmtools_platform_slurm/cli/slurm.py +151 -0
- idmtools_platform_slurm/platform_operations/__init__.py +0 -0
- idmtools_platform_slurm/platform_operations/asset_collection_operations.py +25 -0
- idmtools_platform_slurm/platform_operations/experiment_operations.py +107 -0
- idmtools_platform_slurm/platform_operations/json_metadata_operations.py +17 -0
- idmtools_platform_slurm/platform_operations/simulation_operations.py +46 -0
- idmtools_platform_slurm/platform_operations/suite_operations.py +38 -0
- idmtools_platform_slurm/platform_operations/utils.py +45 -0
- idmtools_platform_slurm/plugin_info.py +75 -0
- idmtools_platform_slurm/slurm_operations/__init__.py +5 -0
- idmtools_platform_slurm/slurm_operations/slurm_operations.py +58 -0
- idmtools_platform_slurm/slurm_platform.py +207 -0
- idmtools_platform_slurm/utils/__init__.py +4 -0
- idmtools_platform_slurm/utils/slurm_job/__init__.py +90 -0
- idmtools_platform_slurm/utils/slurm_job/script_sbatch.sh.jinja2 +78 -0
- idmtools_platform_slurm/utils/slurm_job/slurm_job.py +214 -0
- idmtools_platform_slurm/utils/status_report/__init__.py +5 -0
- idmtools_platform_slurm/utils/status_report/status_report.py +242 -0
- idmtools_platform_slurm/utils/status_report/utils.py +108 -0
- idmtools_platform_slurm-0.0.3.dist-info/METADATA +185 -0
- idmtools_platform_slurm-0.0.3.dist-info/RECORD +43 -0
- idmtools_platform_slurm-0.0.3.dist-info/entry_points.txt +5 -0
- idmtools_platform_slurm-0.0.3.dist-info/licenses/LICENSE.TXT +3 -0
- {idmtools_platform_slurm-0.0.0.dev0.dist-info → idmtools_platform_slurm-0.0.3.dist-info}/top_level.txt +2 -0
- tests/input/hello.sh +2 -0
- tests/input/script.py +49 -0
- idmtools_platform_slurm-0.0.0.dev0.dist-info/METADATA +0 -41
- idmtools_platform_slurm-0.0.0.dev0.dist-info/RECORD +0 -5
- {idmtools_platform_slurm-0.0.0.dev0.dist-info → idmtools_platform_slurm-0.0.3.dist-info}/WHEEL +0 -0

dockerized_slurm/Dockerfile
ADDED
@@ -0,0 +1,107 @@
+FROM centos:7
+
+LABEL org.opencontainers.image.source="https://github.com/giovtorres/slurm-docker-cluster" \
+      org.opencontainers.image.title="slurm-docker-cluster" \
+      org.opencontainers.image.description="Slurm Docker cluster on CentOS 7" \
+      org.label-schema.docker.cmd="docker-compose up -d" \
+      maintainer="Giovanni Torres"
+
+ARG SLURM_TAG=slurm-19-05-1-2
+ARG GOSU_VERSION=1.11
+
+RUN set -ex \
+    && yum makecache fast \
+    && yum -y update \
+    && yum -y install epel-release \
+    && yum -y install \
+        wget \
+        bzip2 \
+        perl \
+        gcc \
+        gcc-c++\
+        git \
+        gnupg \
+        make \
+        munge \
+        munge-devel \
+        openssh-server \
+        python-devel \
+        python-pip \
+        python36 \
+        python36-devel \
+        python36-pip \
+        mariadb-server \
+        mariadb-devel \
+        psmisc \
+        bash-completion \
+        vim-enhanced \
+    && yum clean all \
+    && rm -rf /var/cache/yum
+
+RUN pip install Cython nose && pip3.6 install Cython nose
+
+RUN set -ex \
+    && wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" \
+    && wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64.asc" \
+    && export GNUPGHOME="$(mktemp -d)" \
+    && gpg --keyserver ha.pool.sks-keyservers.net --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 \
+    && gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu \
+    && rm -rf "${GNUPGHOME}" /usr/local/bin/gosu.asc \
+    && chmod +x /usr/local/bin/gosu \
+    && gosu nobody true
+
+RUN set -x \
+    && git clone https://github.com/SchedMD/slurm.git \
+    && pushd slurm \
+    && git checkout tags/$SLURM_TAG \
+    && ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \
+        --with-mysql_config=/usr/bin --libdir=/usr/lib64 \
+    && make install \
+    && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
+    && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \
+    && install -D -m644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \
+    && install -D -m644 contribs/slurm_completion_help/slurm_completion.sh /etc/profile.d/slurm_completion.sh \
+    && popd \
+    && rm -rf slurm
+RUN /usr/bin/ssh-keygen -A \
+    && groupadd -r --gid=996 slurm-data \
+    && groupadd -r --gid=995 slurm \
+    && useradd -r -g slurm -G slurm-data --uid=995 slurm \
+    && groupadd -r --gid=1005 test \
+    && useradd -r -g test -G slurm-data --uid=1005 test \
+    && echo "test:test" | chpasswd \
+    && mkdir -p /home/test/.ssh \
+    && chown -R test:test /home/test \
+    && mkdir /etc/sysconfig/slurm \
+        /var/spool/slurmd \
+        /var/run/slurmd \
+        /var/run/slurmdbd \
+        /var/lib/slurmd \
+        /var/log/slurm \
+        /data \
+    && touch /var/lib/slurmd/node_state \
+        /var/lib/slurmd/front_end_state \
+        /var/lib/slurmd/job_state \
+        /var/lib/slurmd/resv_state \
+        /var/lib/slurmd/trigger_state \
+        /var/lib/slurmd/assoc_mgr_state \
+        /var/lib/slurmd/assoc_usage \
+        /var/lib/slurmd/qos_usage \
+        /var/lib/slurmd/fed_mgr_state \
+    && chown -R slurm:slurm /var/*/slurm* \
+    && /sbin/create-munge-key
+
+COPY id_rsa id_rsa.pub /home/test/.ssh/
+COPY slurm.conf /etc/slurm/slurm.conf
+COPY slurmdbd.conf /etc/slurm/slurmdbd.conf
+
+
+RUN cp /home/test/.ssh/id_rsa.pub /home/test/.ssh/authorized_keys \
+    && chmod 400 /home/test/.ssh/id_rsa.pub \
+    && chmod 644 /home/test/.ssh/authorized_keys \
+    && chown -R 1005:1005 /home/test/.ssh/
+
+COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
+ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
+
+CMD ["slurmdbd"]

dockerized_slurm/README.md
ADDED
@@ -0,0 +1,17 @@
+<!-- START doctoc generated TOC please keep comment here to allow auto update -->
+<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
+
+
+
+<!-- END doctoc generated TOC please keep comment here to allow auto update -->
+
+To use the Slurm docker test platform, follow these steps:
+
+1. Within this directory, run docker-compose up -d
+2. Wait one minute and check the docker logs. Once slurmctld is ready, go to the next step.
+3. Run register_cluster.sh. On Windows, you can run the docker-compose exec commands.
+4. Grab the IP address of slurmctld:
+   `docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' slurmctld`
+5. Update remote_host in idmtools_platform_slurm/tests/idmtools.ini to the IP address from the previous output.
+6. Update the key_file path in idmtools_platform_slurm/tests/idmtools.ini to point to the absolute path of
+   idmtools_platform_slurm/dockerized_slurm/id_rsa on your machine.
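
The walkthrough above reduces to a handful of shell commands; a minimal sketch, assuming it is run from the dockerized_slurm directory (the `docker inspect` format string is copied verbatim from step 4):

```bash
# Step 1: bring the cluster up in the background
docker-compose up -d

# Step 2: follow the controller logs until slurmctld reports it is ready (Ctrl-C to stop)
docker-compose logs -f slurmctld

# Step 3: register the cluster, account, and user with the accounting database
./register_cluster.sh

# Step 4: grab the IP address of slurmctld for idmtools.ini
docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' slurmctld
```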

dockerized_slurm/docker-compose.yml
ADDED
@@ -0,0 +1,89 @@
+version: "2.2"
+
+services:
+  mysql:
+    image: mysql:5.7
+    hostname: mysql
+    container_name: mysql
+    environment:
+      MYSQL_RANDOM_ROOT_PASSWORD: "yes"
+      MYSQL_DATABASE: slurm_acct_db
+      MYSQL_USER: slurm
+      MYSQL_PASSWORD: password
+    volumes:
+      - var_lib_mysql:/var/lib/mysql
+
+  slurmdbd:
+    image: slurm-docker-cluster:19.05.1
+    build:
+      context: .
+    command: ["slurmdbd"]
+    container_name: slurmdbd
+    hostname: slurmdbd
+    volumes:
+      - etc_munge:/etc/munge
+      - etc_slurm:/etc/slurm
+      - var_log_slurm:/var/log/slurm
+    expose:
+      - "6819"
+    depends_on:
+      - mysql
+
+  slurmctld:
+    image: slurm-docker-cluster:19.05.1
+    build:
+      context: .
+    command: ["slurmctld"]
+    container_name: slurmctld
+    hostname: slurmctld
+    volumes:
+      - etc_munge:/etc/munge
+      - etc_slurm:/etc/slurm
+      - ./test_slurm_data:/data
+      - var_log_slurm:/var/log/slurm
+    expose:
+      - "6817"
+    ports:
+      - "2222:22"
+    depends_on:
+      - "slurmdbd"
+
+  c1:
+    image: slurm-docker-cluster:19.05.1
+    build:
+      context: .
+    command: ["slurmd"]
+    hostname: c1
+    container_name: c1
+    volumes:
+      - etc_munge:/etc/munge
+      - etc_slurm:/etc/slurm
+      - ./test_slurm_data:/data
+      - var_log_slurm:/var/log/slurm
+    expose:
+      - "6818"
+    depends_on:
+      - "slurmctld"
+
+  c2:
+    image: slurm-docker-cluster:19.05.1
+    build:
+      context: .
+    command: ["slurmd"]
+    hostname: c2
+    container_name: c2
+    volumes:
+      - etc_munge:/etc/munge
+      - etc_slurm:/etc/slurm
+      - ./test_slurm_data:/data
+      - var_log_slurm:/var/log/slurm
+    expose:
+      - "6818"
+    depends_on:
+      - "slurmctld"
+
+volumes:
+  etc_munge:
+  etc_slurm:
+  var_lib_mysql:
+  var_log_slurm:

dockerized_slurm/docker-entrypoint.sh
ADDED
@@ -0,0 +1,64 @@
+#!/bin/bash
+set -e
+
+chown -R slurm:slurm-data /data
+chmod 0775 /data
+
+if [ "$1" = "slurmdbd" ]
+then
+    echo "---> Starting the MUNGE Authentication service (munged) ..."
+    gosu munge /usr/sbin/munged
+
+    echo "---> Starting the Slurm Database Daemon (slurmdbd) ..."
+
+    {
+        . /etc/slurm/slurmdbd.conf
+        until echo "SELECT 1" | mysql -h $StorageHost -u$StorageUser -p$StoragePass 2>&1 > /dev/null
+        do
+            echo "-- Waiting for database to become active ..."
+            sleep 2
+        done
+    }
+    echo "-- Database is now active ..."
+
+    exec gosu slurm /usr/sbin/slurmdbd -Dvvv
+fi
+
+if [ "$1" = "slurmctld" ]
+then
+    echo "---> Starting the MUNGE Authentication service (munged) ..."
+    gosu munge /usr/sbin/munged
+
+    echo "---> Waiting for slurmdbd to become active before starting slurmctld ..."
+
+    until 2>/dev/null >/dev/tcp/slurmdbd/6819
+    do
+        echo "-- slurmdbd is not available. Sleeping ..."
+        sleep 2
+    done
+    echo "-- slurmdbd is now active ..."
+
+    echo "---> Starting the Slurm Controller Daemon (slurmctld) ..."
+    exec /usr/sbin/sshd -D &
+    exec gosu slurm /usr/sbin/slurmctld -Dvvv
+fi
+
+if [ "$1" = "slurmd" ]
+then
+    echo "---> Starting the MUNGE Authentication service (munged) ..."
+    gosu munge /usr/sbin/munged
+
+    echo "---> Waiting for slurmctld to become active before starting slurmd..."
+
+    until 2>/dev/null >/dev/tcp/slurmctld/6817
+    do
+        echo "-- slurmctld is not available. Sleeping ..."
+        sleep 2
+    done
+    echo "-- slurmctld is now active ..."
+
+    echo "---> Starting the Slurm Node Daemon (slurmd) ..."
+    exec /usr/sbin/slurmd -Dvvv
+fi
+
+exec "$@"

dockerized_slurm/id_rsa
ADDED
@@ -0,0 +1,27 @@
+-----BEGIN RSA PRIVATE KEY-----
+MIIEowIBAAKCAQEApPDLAzN1H0re2oKCHRnqBOlv07MPL13CpQAD7eqzoQajhP+t
+qps0zcNAKsiC1fRbn/+oX4jLlGXApnPvzo2X7PDUw8slIwS7odFNle3vQVf24r3L
+ZGlBrWaomX3cYRO5gFFqfZwS6PDa22/YuoEVSHanml3u2+hBoqAFrGuR0RzF4GUG
+1bZATBH2/YC3wEDR3kCQ98Z+mOS/tenf4/0s1rz1x/7ljQVTbGssc8Xis4S5JBEc
+vg3VKsbz4mb0iuKcojvUJqAWDrpZVqeH6fUXuOHrz4vNFbJh+YBmES9/MxwRxd9U
+JJ8oF2RO2udr6JFFYkorOr3bJiWYoLC28QsVwQIDAQABAoIBAA8ZF4w4dp0hrlqU
+HbLqP1ipwZnAR0CPtZSC9tkdZcn0oJ05Bj2arW+0UrhX2FobXxO7RD9Sd0gjNEpI
+TIg8v85pkSBHBSQ6d65tSUvTFtaFZc0FkIulcuSbhA1gzv533sAXM8dBtR1rhq0V
+hOI1lKwoaFkiBg5NKUzolvxccGGSugsHCER1itoHblZBTxvVnJ/dXhFzQ2yAhErQ
+4vYjjXbyM3cIWv3edoB7oI8Kc4w0cKdvSQ12GPIYSrq3k5RDMB36+GhokNlez0t2
+VhcxeMBJAdIwif3tFXluwBYBmQsn9B7ovnpNm594Nn833vD56dFeH0a9soFm4Q/x
+BotVxFECgYEA0/Ne5LVzGcftzxr5VY014+oPNKeQwnx9OIUWHdVgoFocZFtfCGa7
+jC9cVj27GRw1WUEMKaTcUhreXA3dMzymFDNau89YX8BR3s5DZRtaM48aJCBDoK7W
+YC+pHTXgfhd66+Q5LWtACRRz+lx6O1ZuMF3G4/CJXsvhH5FHEjobw10CgYEAxzhM
+GxBaZPdZLwZY5XUy8BwixN8yEfsJOlIwwPJL/2LS59Qood25LwYA1oYj8JwMIG+s
+uNwJFSBT7NYExV3Sg9we09VW74jX1C3Ejhvn63iGGbzsE2AHQtgKrzLjgcelyaBo
+FkrtnK61L4UcUOyRqJ0+rJpuz28pRRdYo3YvebUCgYAUUP3Fmob/76Qh+AnyY2jJ
+AgSXHYDIw7oVEty921g5xql2DTQc0AwmMdv+AEjQ7V1Hwu5xh3X/AMhTtph/cs7Q
+nOOToRptgzfVzZu0M10AGDV/RQB2hIvUCH2DaUitjX6g4e2BJwiqViWP2BF3Yp+J
+T1PjQYlJZu2bYweW/a7vmQKBgQCcPwMxlrnOxlAkGY5PKIQy3V5HmeXjREgQfbXP
+HjmMqy1OtY7IpOVAhCzUE5DMfRblubB1q91TvG8WKSTExrj8wf9LlN8CLwlXWC34
+ZtqWS4ihVxKwf3gybM60ae0VNEhKwovgMBP79uoTwwpaTbBP1kP5i4WtGzn6/jx+
+t4q0oQKBgAy9CCCO8lnToFcso4KSh8+z2FLRFBjNmFDfb6UHkaI///PVG/posSl9
+pAghYIVCRtlapc7D9SYB5dQdmJ38RDGkkrvz87J3PQZeCGVWou0rYhHyKlVyUVor
+JlwyOKbIXih8ew81IzbYAJKni981nj58dwJViA9m3Zhyb+1p2tKg
+-----END RSA PRIVATE KEY-----

dockerized_slurm/id_rsa.pub
ADDED
@@ -0,0 +1 @@
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCk8MsDM3UfSt7agoIdGeoE6W/Tsw8vXcKlAAPt6rOhBqOE/62qmzTNw0AqyILV9Fuf/6hfiMuUZcCmc+/OjZfs8NTDyyUjBLuh0U2V7e9BV/bivctkaUGtZqiZfdxhE7mAUWp9nBLo8Nrbb9i6gRVIdqeaXe7b6EGioAWsa5HRHMXgZQbVtkBMEfb9gLfAQNHeQJD3xn6Y5L+16d/j/SzWvPXH/uWNBVNsayxzxeKzhLkkERy+DdUqxvPiZvSK4pyiO9QmoBYOullWp4fp9Re44evPi80VsmH5gGYRL38zHBHF31QknygXZE7a52vokUViSis6vdsmJZigsLbxCxXB clinton@sahara

dockerized_slurm/register_cluster.sh
ADDED
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+# https://github.com/giovtorres/docker-centos7-slurm/issues/3
+# sacctmgr add cluster linux
+#
+
+docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add cluster name=linux" && \
+docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add account test Cluster=linux Description='none' Organization='none'" && \
+docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add user test DefaultAccount=test" && \
+docker-compose restart slurmdbd slurmctld c1 c2
+#ssh-copy-id -i id
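
To confirm that the registration above took effect, the same entities can be listed back with sacctmgr; a hedged sketch using standard Slurm accounting commands (the exact table layout varies by Slurm version):

```bash
# List the cluster, account, and user that register_cluster.sh created
docker exec slurmctld sacctmgr show cluster
docker exec slurmctld sacctmgr show account test
docker exec slurmctld sacctmgr show user test
```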

dockerized_slurm/slurm.conf
ADDED
@@ -0,0 +1,94 @@
+# slurm.conf
+#
+# See the slurm.conf man page for more information.
+#
+ClusterName=linux
+ControlMachine=slurmctld
+ControlAddr=slurmctld
+#BackupController=
+#BackupAddr=
+#
+SlurmUser=slurm
+#SlurmdUser=root
+SlurmctldPort=6817
+SlurmdPort=6818
+AuthType=auth/munge
+#JobCredentialPrivateKey=
+#JobCredentialPublicCertificate=
+StateSaveLocation=/var/lib/slurmd
+SlurmdSpoolDir=/var/spool/slurmd
+SwitchType=switch/none
+MpiDefault=none
+SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
+SlurmdPidFile=/var/run/slurmd/slurmd.pid
+ProctrackType=proctrack/linuxproc
+#PluginDir=
+CacheGroups=0
+#FirstJobId=
+ReturnToService=0
+#MaxJobCount=
+#PlugStackConfig=
+#PropagatePrioProcess=
+#PropagateResourceLimits=
+#PropagateResourceLimitsExcept=
+#Prolog=
+#Epilog=
+#SrunProlog=
+#SrunEpilog=
+#TaskProlog=
+#TaskEpilog=
+#TaskPlugin=
+#TrackWCKey=no
+#TreeWidth=50
+#TmpFS=
+#UsePAM=1
+#
+# TIMERS
+SlurmctldTimeout=300
+SlurmdTimeout=300
+InactiveLimit=0
+MinJobAge=300
+KillWait=30
+Waittime=0
+#
+# SCHEDULING
+SchedulerType=sched/backfill
+#SchedulerAuth=
+#SchedulerPort=
+#SchedulerRootFilter=
+SelectType=select/cons_res
+SelectTypeParameters=CR_CPU_Memory
+FastSchedule=1
+#PriorityType=priority/multifactor
+#PriorityDecayHalfLife=14-0
+#PriorityUsageResetPeriod=14-0
+#PriorityWeightFairshare=100000
+#PriorityWeightAge=1000
+#PriorityWeightPartition=10000
+#PriorityWeightJobSize=1000
+#PriorityMaxAge=1-0
+#
+# LOGGING
+SlurmctldDebug=3
+SlurmctldLogFile=/var/log/slurm/slurmctld.log
+SlurmdDebug=3
+SlurmdLogFile=/var/log/slurm/slurmd.log
+JobCompType=jobcomp/filetxt
+JobCompLoc=/var/log/slurm/jobcomp.log
+#
+# ACCOUNTING
+JobAcctGatherType=jobacct_gather/linux
+JobAcctGatherFrequency=30
+#
+AccountingStorageType=accounting_storage/slurmdbd
+AccountingStorageHost=slurmdbd
+AccountingStoragePort=6819
+AccountingStorageLoc=slurm_acct_db
+#AccountingStoragePass=
+#AccountingStorageUser=
+#
+# COMPUTE NODES
+NodeName=c[1-2] RealMemory=1000 State=UNKNOWN
+#
+# PARTITIONS
+PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=1 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
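
Once the containers are running, the node and partition definitions above (compute nodes c[1-2], default partition normal) can be exercised directly; a minimal sketch, where the --wrap payload is only a placeholder command:

```bash
# Confirm the partition and nodes from slurm.conf are visible
docker exec slurmctld sinfo

# Submit a trivial job to the default "normal" partition
docker exec slurmctld sbatch --partition=normal --wrap="hostname"

# Inspect the queue for the submitted job
docker exec slurmctld squeue
```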

dockerized_slurm/slurmdbd.conf
ADDED
@@ -0,0 +1,37 @@
+#
+# Example slurmdbd.conf file.
+#
+# See the slurmdbd.conf man page for more information.
+#
+# Archive info
+#ArchiveJobs=yes
+#ArchiveDir="/tmp"
+#ArchiveSteps=yes
+#ArchiveScript=
+#JobPurge=12
+#StepPurge=1
+#
+# Authentication info
+AuthType=auth/munge
+#AuthInfo=/var/run/munge/munge.socket.2
+#
+# slurmDBD info
+DbdAddr=slurmdbd
+DbdHost=slurmdbd
+#DbdPort=6819
+SlurmUser=slurm
+#MessageTimeout=300
+DebugLevel=4
+#DefaultQOS=normal,standby
+LogFile=/var/log/slurm/slurmdbd.log
+PidFile=/var/run/slurmdbd/slurmdbd.pid
+#PluginDir=/usr/lib/slurm
+#PrivateData=accounts,users,usage,jobs
+#TrackWCKey=yes
+#
+# Database info
+StorageType=accounting_storage/mysql
+StorageHost=mysql
+StorageUser=slurm
+StoragePass=password
+StorageLoc=slurm_acct_db

idmtools_platform_slurm/__init__.py
@@ -1,8 +1,12 @@
-
-
-
-
-
-
-
-__version__ = "
+try:
+    from importlib.metadata import version, PackageNotFoundError
+except ImportError:
+    # Python < 3.8
+    from importlib_metadata import version, PackageNotFoundError
+
+try:
+    __version__ = version("idmtools-platform-slurm") # Use your actual package name
+except PackageNotFoundError:
+    # Package not installed, use fallback
+    __version__ = "0.0.0+unknown"
+
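
With the importlib.metadata lookup above, the reported version now comes from the installed distribution metadata rather than a hard-coded string; a quick check, assuming the wheel is installed in the active Python environment:

```bash
python -c "import idmtools_platform_slurm; print(idmtools_platform_slurm.__version__)"
# should print 0.0.3 for this wheel, or 0.0.0+unknown if the distribution metadata is missing
```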

idmtools_platform_slurm/assets/__init__.py
ADDED
@@ -0,0 +1,157 @@
+"""
+SlurmPlatform utilities.
+
+Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
+"""
+from pathlib import Path
+from jinja2 import Template
+from typing import TYPE_CHECKING, Optional, Union
+from idmtools.entities.experiment import Experiment
+from idmtools_platform_slurm.platform_operations.utils import check_home
+
+if TYPE_CHECKING:
+    from idmtools_platform_slurm.slurm_platform import SlurmPlatform, CONFIG_PARAMETERS
+
+DEFAULT_TEMPLATE_FILE = Path(__file__).parent.joinpath("sbatch.sh.jinja2")
+BATCH_TEMPLATE_FILE = Path(__file__).parent.joinpath("batch.sh.jinja2")
+
+
+def generate_batch(platform: 'SlurmPlatform', experiment: Experiment,
+                   max_running_jobs: Optional[int] = None, array_batch_size: Optional[int] = None,
+                   dependency: Optional[bool] = None,
+                   template: Union[Path, str] = BATCH_TEMPLATE_FILE, **kwargs) -> None:
+    """
+    Generate bash script file batch.sh
+    Args:
+        platform: Slurm Platform
+        experiment: idmtools Experiment
+        max_running_jobs: int, how many allowed to run
+        array_size: INT, array size for slurm job
+        dependency: bool, determine if Slurm jobs depend on each other
+        template: template to be used to build batch file
+        kwargs: keyword arguments used to expand functionality
+    Returns:
+        None
+    """
+    template_vars = dict(njobs=experiment.simulation_count)
+
+    # Set max_running_jobs
+    if max_running_jobs is not None:
+        if platform.max_running_jobs is not None:
+            template_vars['max_running_jobs'] = min(max_running_jobs, platform.max_running_jobs)
+        else:
+            template_vars['max_running_jobs'] = max_running_jobs
+    else:
+        if platform.max_running_jobs is not None:
+            template_vars['max_running_jobs'] = platform.max_running_jobs
+        else:
+            template_vars['max_running_jobs'] = 1
+
+    # Set array_size
+    if array_batch_size is not None:
+        platform.array_batch_size = array_batch_size
+
+    if platform._max_array_size is not None:
+        if platform.array_batch_size is not None:
+            template_vars['array_batch_size'] = min(platform._max_array_size, platform.array_batch_size,
+                                                    experiment.simulation_count)
+        else:
+            template_vars['array_batch_size'] = min(platform._max_array_size, experiment.simulation_count)
+    elif platform.array_batch_size is not None:
+        template_vars['array_batch_size'] = min(platform.array_batch_size, experiment.simulation_count)
+    else:
+        template_vars['array_batch_size'] = experiment.simulation_count
+
+    # Consider dependency
+    if dependency is None:
+        dependency = True
+    template_vars['dependency'] = dependency
+
+    # Update with possible override values
+    template_vars.update(kwargs)
+
+    # Build batch based on the given template
+    with open(template) as file_:
+        t = Template(file_.read())
+
+    # Write out file
+    output_target = platform.get_directory(experiment).joinpath("batch.sh")
+    with open(output_target, "w") as tout:
+        tout.write(t.render(template_vars))
+
+    # Make executable
+    platform.update_script_mode(output_target)
+
+
+def generate_script(platform: 'SlurmPlatform', experiment: Experiment, max_running_jobs: Optional[int] = None,
+                    template: Union[Path, str] = DEFAULT_TEMPLATE_FILE, **kwargs) -> None:
+    """
+    Generate batch file sbatch.sh
+    Args:
+        platform: Slurm Platform
+        experiment: idmtools Experiment
+        max_running_jobs: int, how many allowed to run at the same time
+        template: template to be used to build batch file
+        kwargs: keyword arguments used to expand functionality
+    Returns:
+        None
+    """
+    from idmtools_platform_slurm.slurm_platform import CONFIG_PARAMETERS
+    template_vars = dict(njobs=experiment.simulation_count)
+    # populate from our platform config vars
+    for p in CONFIG_PARAMETERS:
+        if getattr(platform, p) is not None:
+            template_vars[p] = getattr(platform, p)
+
+    # Set default here
+    if max_running_jobs is not None:
+        template_vars['max_running_jobs'] = max_running_jobs
+    if max_running_jobs is None and platform.max_running_jobs is None:
+        template_vars['max_running_jobs'] = 1
+
+    # Add any overides. We need some validation here later
+    # TODO add validation for valid config options
+    template_vars.update(kwargs)
+
+    if platform.modules:
+        template_vars['modules'] = platform.modules
+
+    with open(template) as file_:
+        t = Template(file_.read())
+
+    # Write out file
+    output_target = platform.get_directory(experiment).joinpath("sbatch.sh")
+    with open(output_target, "w") as tout:
+        tout.write(t.render(template_vars))
+    # Make executable
+    platform.update_script_mode(output_target)
+
+
+def generate_simulation_script(platform: 'SlurmPlatform', simulation, retries: Optional[int] = None) -> None:
+    """
+    Generate batch file _run.sh
+    Args:
+        platform: Slurm Platform
+        simulation: idmtools Simulation
+        retries: int
+    Returns:
+        None
+    """
+    experiment_dir = platform.get_directory(simulation.parent).absolute()
+    experiment_dir = str(experiment_dir).replace('\\', '/')
+    check = check_home(experiment_dir)
+    sim_script = platform.get_directory(simulation).joinpath("_run.sh")
+    with open(sim_script, "w") as tout:
+        with open(Path(__file__).parent.parent.joinpath("assets/_run.sh.jinja2")) as tin:
+            tvars = dict(
+                platform=platform,
+                simulation=simulation,
+                retries=retries if retries else platform.retries
+            )
+            if not check:
+                tvars['experiment_dir'] = str(experiment_dir)
+
+            t = Template(tin.read())
+            tout.write(t.render(tvars))
+    # Make executable
+    platform.update_script_mode(sim_script)

idmtools_platform_slurm/assets/_run.sh.jinja2
ADDED
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+#SBATCH --signal=B:SIGTERM@30
+
+# define the handler function
+term_handler()
+{
+    # do whatever cleanup you want here
+    echo "-1" > job_status.txt
+    exit -1
+}
+
+# associate the function "term_handler" with the TERM signal
+trap 'term_handler' TERM
+
+echo ${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID} > job_id.txt
+
+n=0
+until [ "$n" -ge {{retries}} ]
+do
+  echo "100" > job_status.txt
+  {% if simulation.task.sif_path is defined and simulation.task.sif_path %}
+  {% if simulation.task.command.cmd.startswith('singularity') %}
+  {{simulation.task.command.cmd}}
+  {% else %}
+  {% if experiment_dir is defined and experiment_dir %}
+  singularity exec --bind {{experiment_dir}} {{simulation.task.sif_path}} {{simulation.task.command.cmd}}
+  {% else %}
+  singularity exec {{simulation.task.sif_path}} {{simulation.task.command.cmd}}
+  {% endif %}
+  {% endif %}
+  {% else %}
+  {{simulation.task.command.cmd}}
+  {% endif %}
+  RESULT=$?
+  if [ $RESULT -eq 0 ]; then
+    echo "0" > job_status.txt
+    exit $RESULT
+  fi
+  n=$((n+1))
+  sleep 15
+done
+echo "-1" > job_status.txt
+exit $RESULT
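
For orientation, the template above renders to one of three command forms depending on whether the task carries a sif_path and whether generate_simulation_script passed an experiment_dir through; a hypothetical sketch of the resulting command lines (model.sif, python model.py, and the /shared/... path are placeholders, not part of the package):

```bash
# Task command already starts with "singularity": emitted unchanged
singularity exec model.sif python model.py

# Task with a sif_path and an experiment_dir passed through: bind-mount the experiment directory
singularity exec --bind /shared/experiments/exp_001 model.sif python model.py

# Task with no sif_path: command emitted unchanged
python model.py
```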