idmtools-platform-slurm 0.0.0.dev0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. dockerized_slurm/Dockerfile +107 -0
  2. dockerized_slurm/README.md +17 -0
  3. dockerized_slurm/docker-compose.yml +89 -0
  4. dockerized_slurm/docker-entrypoint.sh +64 -0
  5. dockerized_slurm/id_rsa +27 -0
  6. dockerized_slurm/id_rsa.pub +1 -0
  7. dockerized_slurm/register_cluster.sh +12 -0
  8. dockerized_slurm/slurm.conf +94 -0
  9. dockerized_slurm/slurmdbd.conf +37 -0
  10. idmtools_platform_slurm/__init__.py +12 -8
  11. idmtools_platform_slurm/assets/__init__.py +157 -0
  12. idmtools_platform_slurm/assets/_run.sh.jinja2 +44 -0
  13. idmtools_platform_slurm/assets/batch.sh.jinja2 +54 -0
  14. idmtools_platform_slurm/assets/run_simulation.sh +23 -0
  15. idmtools_platform_slurm/assets/sbatch.sh.jinja2 +77 -0
  16. idmtools_platform_slurm/cli/__init__.py +4 -0
  17. idmtools_platform_slurm/cli/slurm.py +151 -0
  18. idmtools_platform_slurm/platform_operations/__init__.py +0 -0
  19. idmtools_platform_slurm/platform_operations/asset_collection_operations.py +25 -0
  20. idmtools_platform_slurm/platform_operations/experiment_operations.py +107 -0
  21. idmtools_platform_slurm/platform_operations/json_metadata_operations.py +17 -0
  22. idmtools_platform_slurm/platform_operations/simulation_operations.py +46 -0
  23. idmtools_platform_slurm/platform_operations/suite_operations.py +38 -0
  24. idmtools_platform_slurm/platform_operations/utils.py +45 -0
  25. idmtools_platform_slurm/plugin_info.py +75 -0
  26. idmtools_platform_slurm/slurm_operations/__init__.py +5 -0
  27. idmtools_platform_slurm/slurm_operations/slurm_operations.py +58 -0
  28. idmtools_platform_slurm/slurm_platform.py +207 -0
  29. idmtools_platform_slurm/utils/__init__.py +4 -0
  30. idmtools_platform_slurm/utils/slurm_job/__init__.py +90 -0
  31. idmtools_platform_slurm/utils/slurm_job/script_sbatch.sh.jinja2 +78 -0
  32. idmtools_platform_slurm/utils/slurm_job/slurm_job.py +214 -0
  33. idmtools_platform_slurm/utils/status_report/__init__.py +5 -0
  34. idmtools_platform_slurm/utils/status_report/status_report.py +242 -0
  35. idmtools_platform_slurm/utils/status_report/utils.py +108 -0
  36. idmtools_platform_slurm-0.0.3.dist-info/METADATA +185 -0
  37. idmtools_platform_slurm-0.0.3.dist-info/RECORD +43 -0
  38. idmtools_platform_slurm-0.0.3.dist-info/entry_points.txt +5 -0
  39. idmtools_platform_slurm-0.0.3.dist-info/licenses/LICENSE.TXT +3 -0
  40. {idmtools_platform_slurm-0.0.0.dev0.dist-info → idmtools_platform_slurm-0.0.3.dist-info}/top_level.txt +2 -0
  41. tests/input/hello.sh +2 -0
  42. tests/input/script.py +49 -0
  43. idmtools_platform_slurm-0.0.0.dev0.dist-info/METADATA +0 -41
  44. idmtools_platform_slurm-0.0.0.dev0.dist-info/RECORD +0 -5
  45. {idmtools_platform_slurm-0.0.0.dev0.dist-info → idmtools_platform_slurm-0.0.3.dist-info}/WHEEL +0 -0
@@ -0,0 +1,107 @@
+ FROM centos:7
+
+ LABEL org.opencontainers.image.source="https://github.com/giovtorres/slurm-docker-cluster" \
+       org.opencontainers.image.title="slurm-docker-cluster" \
+       org.opencontainers.image.description="Slurm Docker cluster on CentOS 7" \
+       org.label-schema.docker.cmd="docker-compose up -d" \
+       maintainer="Giovanni Torres"
+
+ ARG SLURM_TAG=slurm-19-05-1-2
+ ARG GOSU_VERSION=1.11
+
+ RUN set -ex \
+     && yum makecache fast \
+     && yum -y update \
+     && yum -y install epel-release \
+     && yum -y install \
+        wget \
+        bzip2 \
+        perl \
+        gcc \
+        gcc-c++\
+        git \
+        gnupg \
+        make \
+        munge \
+        munge-devel \
+        openssh-server \
+        python-devel \
+        python-pip \
+        python36 \
+        python36-devel \
+        python36-pip \
+        mariadb-server \
+        mariadb-devel \
+        psmisc \
+        bash-completion \
+        vim-enhanced \
+     && yum clean all \
+     && rm -rf /var/cache/yum
+
+ RUN pip install Cython nose && pip3.6 install Cython nose
+
+ RUN set -ex \
+     && wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" \
+     && wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64.asc" \
+     && export GNUPGHOME="$(mktemp -d)" \
+     && gpg --keyserver ha.pool.sks-keyservers.net --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 \
+     && gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu \
+     && rm -rf "${GNUPGHOME}" /usr/local/bin/gosu.asc \
+     && chmod +x /usr/local/bin/gosu \
+     && gosu nobody true
+
+ RUN set -x \
+     && git clone https://github.com/SchedMD/slurm.git \
+     && pushd slurm \
+     && git checkout tags/$SLURM_TAG \
+     && ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \
+        --with-mysql_config=/usr/bin --libdir=/usr/lib64 \
+     && make install \
+     && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
+     && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \
+     && install -D -m644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \
+     && install -D -m644 contribs/slurm_completion_help/slurm_completion.sh /etc/profile.d/slurm_completion.sh \
+     && popd \
+     && rm -rf slurm
+ RUN /usr/bin/ssh-keygen -A \
+     && groupadd -r --gid=996 slurm-data \
+     && groupadd -r --gid=995 slurm \
+     && useradd -r -g slurm -G slurm-data --uid=995 slurm \
+     && groupadd -r --gid=1005 test \
+     && useradd -r -g test -G slurm-data --uid=1005 test \
+     && echo "test:test" | chpasswd \
+     && mkdir -p /home/test/.ssh \
+     && chown -R test:test /home/test \
+     && mkdir /etc/sysconfig/slurm \
+         /var/spool/slurmd \
+         /var/run/slurmd \
+         /var/run/slurmdbd \
+         /var/lib/slurmd \
+         /var/log/slurm \
+         /data \
+     && touch /var/lib/slurmd/node_state \
+         /var/lib/slurmd/front_end_state \
+         /var/lib/slurmd/job_state \
+         /var/lib/slurmd/resv_state \
+         /var/lib/slurmd/trigger_state \
+         /var/lib/slurmd/assoc_mgr_state \
+         /var/lib/slurmd/assoc_usage \
+         /var/lib/slurmd/qos_usage \
+         /var/lib/slurmd/fed_mgr_state \
+     && chown -R slurm:slurm /var/*/slurm* \
+     && /sbin/create-munge-key
+
+ COPY id_rsa id_rsa.pub /home/test/.ssh/
+ COPY slurm.conf /etc/slurm/slurm.conf
+ COPY slurmdbd.conf /etc/slurm/slurmdbd.conf
+
+
+ RUN cp /home/test/.ssh/id_rsa.pub /home/test/.ssh/authorized_keys \
+     && chmod 400 /home/test/.ssh/id_rsa.pub \
+     && chmod 644 /home/test/.ssh/authorized_keys \
+     && chown -R 1005:1005 /home/test/.ssh/
+
+ COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
+ ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
+
+ CMD ["slurmdbd"]
@@ -0,0 +1,17 @@
+ <!-- START doctoc generated TOC please keep comment here to allow auto update -->
+ <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
+
+
+
+ <!-- END doctoc generated TOC please keep comment here to allow auto update -->
+
+ To use the Slurm docker test platform, follow these steps:
+
+ 1. Within this directory, run `docker-compose up -d`.
+ 2. Wait one minute and check the docker logs. Once slurmctld is ready, go to the next step.
+ 3. Run `register_cluster.sh`. On Windows, you can run the `docker-compose exec` commands instead.
+ 4. Grab the IP address of slurmctld:
+    `docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' slurmctld`
+ 5. Update `remote_host` in idmtools_platform_slurm/tests/idmtools.ini to the IP address from the previous output.
+ 6. Update the `key_file` path in idmtools_platform_slurm/tests/idmtools.ini to point to the absolute path of
+    idmtools_platform_slurm/dockerized_slurm/id_rsa on your machine.
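Steps 4-6 above boil down to two values. A minimal sketch for collecting them, assuming the commands are run from the dockerized_slurm directory and that `remote_host` and `key_file` are the idmtools.ini keys to update (the ini section name is not shown in this diff):

```bash
#!/bin/bash
# Sketch of steps 4-6: collect the two values the README asks you to paste into idmtools.ini.
SLURMCTLD_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' slurmctld)
KEY_FILE=$(readlink -f id_rsa)   # absolute path to the bundled test key in this directory
echo "remote_host = ${SLURMCTLD_IP}"
echo "key_file    = ${KEY_FILE}"
# Copy these two values into idmtools_platform_slurm/tests/idmtools.ini.
```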
@@ -0,0 +1,89 @@
+ version: "2.2"
+
+ services:
+   mysql:
+     image: mysql:5.7
+     hostname: mysql
+     container_name: mysql
+     environment:
+       MYSQL_RANDOM_ROOT_PASSWORD: "yes"
+       MYSQL_DATABASE: slurm_acct_db
+       MYSQL_USER: slurm
+       MYSQL_PASSWORD: password
+     volumes:
+       - var_lib_mysql:/var/lib/mysql
+
+   slurmdbd:
+     image: slurm-docker-cluster:19.05.1
+     build:
+       context: .
+     command: ["slurmdbd"]
+     container_name: slurmdbd
+     hostname: slurmdbd
+     volumes:
+       - etc_munge:/etc/munge
+       - etc_slurm:/etc/slurm
+       - var_log_slurm:/var/log/slurm
+     expose:
+       - "6819"
+     depends_on:
+       - mysql
+
+   slurmctld:
+     image: slurm-docker-cluster:19.05.1
+     build:
+       context: .
+     command: ["slurmctld"]
+     container_name: slurmctld
+     hostname: slurmctld
+     volumes:
+       - etc_munge:/etc/munge
+       - etc_slurm:/etc/slurm
+       - ./test_slurm_data:/data
+       - var_log_slurm:/var/log/slurm
+     expose:
+       - "6817"
+     ports:
+       - "2222:22"
+     depends_on:
+       - "slurmdbd"
+
+   c1:
+     image: slurm-docker-cluster:19.05.1
+     build:
+       context: .
+     command: ["slurmd"]
+     hostname: c1
+     container_name: c1
+     volumes:
+       - etc_munge:/etc/munge
+       - etc_slurm:/etc/slurm
+       - ./test_slurm_data:/data
+       - var_log_slurm:/var/log/slurm
+     expose:
+       - "6818"
+     depends_on:
+       - "slurmctld"
+
+   c2:
+     image: slurm-docker-cluster:19.05.1
+     build:
+       context: .
+     command: ["slurmd"]
+     hostname: c2
+     container_name: c2
+     volumes:
+       - etc_munge:/etc/munge
+       - etc_slurm:/etc/slurm
+       - ./test_slurm_data:/data
+       - var_log_slurm:/var/log/slurm
+     expose:
+       - "6818"
+     depends_on:
+       - "slurmctld"
+
+ volumes:
+   etc_munge:
+   etc_slurm:
+   var_lib_mysql:
+   var_log_slurm:
@@ -0,0 +1,64 @@
+ #!/bin/bash
+ set -e
+
+ chown -R slurm:slurm-data /data
+ chmod 0775 /data
+
+ if [ "$1" = "slurmdbd" ]
+ then
+     echo "---> Starting the MUNGE Authentication service (munged) ..."
+     gosu munge /usr/sbin/munged
+
+     echo "---> Starting the Slurm Database Daemon (slurmdbd) ..."
+
+     {
+         . /etc/slurm/slurmdbd.conf
+         until echo "SELECT 1" | mysql -h $StorageHost -u$StorageUser -p$StoragePass 2>&1 > /dev/null
+         do
+             echo "-- Waiting for database to become active ..."
+             sleep 2
+         done
+     }
+     echo "-- Database is now active ..."
+
+     exec gosu slurm /usr/sbin/slurmdbd -Dvvv
+ fi
+
+ if [ "$1" = "slurmctld" ]
+ then
+     echo "---> Starting the MUNGE Authentication service (munged) ..."
+     gosu munge /usr/sbin/munged
+
+     echo "---> Waiting for slurmdbd to become active before starting slurmctld ..."
+
+     until 2>/dev/null >/dev/tcp/slurmdbd/6819
+     do
+         echo "-- slurmdbd is not available. Sleeping ..."
+         sleep 2
+     done
+     echo "-- slurmdbd is now active ..."
+
+     echo "---> Starting the Slurm Controller Daemon (slurmctld) ..."
+     exec /usr/sbin/sshd -D &
+     exec gosu slurm /usr/sbin/slurmctld -Dvvv
+ fi
+
+ if [ "$1" = "slurmd" ]
+ then
+     echo "---> Starting the MUNGE Authentication service (munged) ..."
+     gosu munge /usr/sbin/munged
+
+     echo "---> Waiting for slurmctld to become active before starting slurmd..."
+
+     until 2>/dev/null >/dev/tcp/slurmctld/6817
+     do
+         echo "-- slurmctld is not available. Sleeping ..."
+         sleep 2
+     done
+     echo "-- slurmctld is now active ..."
+
+     echo "---> Starting the Slurm Node Daemon (slurmd) ..."
+     exec /usr/sbin/slurmd -Dvvv
+ fi
+
+ exec "$@"
@@ -0,0 +1,27 @@
+ -----BEGIN RSA PRIVATE KEY-----
+ MIIEowIBAAKCAQEApPDLAzN1H0re2oKCHRnqBOlv07MPL13CpQAD7eqzoQajhP+t
+ qps0zcNAKsiC1fRbn/+oX4jLlGXApnPvzo2X7PDUw8slIwS7odFNle3vQVf24r3L
+ ZGlBrWaomX3cYRO5gFFqfZwS6PDa22/YuoEVSHanml3u2+hBoqAFrGuR0RzF4GUG
+ 1bZATBH2/YC3wEDR3kCQ98Z+mOS/tenf4/0s1rz1x/7ljQVTbGssc8Xis4S5JBEc
+ vg3VKsbz4mb0iuKcojvUJqAWDrpZVqeH6fUXuOHrz4vNFbJh+YBmES9/MxwRxd9U
+ JJ8oF2RO2udr6JFFYkorOr3bJiWYoLC28QsVwQIDAQABAoIBAA8ZF4w4dp0hrlqU
+ HbLqP1ipwZnAR0CPtZSC9tkdZcn0oJ05Bj2arW+0UrhX2FobXxO7RD9Sd0gjNEpI
+ TIg8v85pkSBHBSQ6d65tSUvTFtaFZc0FkIulcuSbhA1gzv533sAXM8dBtR1rhq0V
+ hOI1lKwoaFkiBg5NKUzolvxccGGSugsHCER1itoHblZBTxvVnJ/dXhFzQ2yAhErQ
+ 4vYjjXbyM3cIWv3edoB7oI8Kc4w0cKdvSQ12GPIYSrq3k5RDMB36+GhokNlez0t2
+ VhcxeMBJAdIwif3tFXluwBYBmQsn9B7ovnpNm594Nn833vD56dFeH0a9soFm4Q/x
+ BotVxFECgYEA0/Ne5LVzGcftzxr5VY014+oPNKeQwnx9OIUWHdVgoFocZFtfCGa7
+ jC9cVj27GRw1WUEMKaTcUhreXA3dMzymFDNau89YX8BR3s5DZRtaM48aJCBDoK7W
+ YC+pHTXgfhd66+Q5LWtACRRz+lx6O1ZuMF3G4/CJXsvhH5FHEjobw10CgYEAxzhM
+ GxBaZPdZLwZY5XUy8BwixN8yEfsJOlIwwPJL/2LS59Qood25LwYA1oYj8JwMIG+s
+ uNwJFSBT7NYExV3Sg9we09VW74jX1C3Ejhvn63iGGbzsE2AHQtgKrzLjgcelyaBo
+ FkrtnK61L4UcUOyRqJ0+rJpuz28pRRdYo3YvebUCgYAUUP3Fmob/76Qh+AnyY2jJ
+ AgSXHYDIw7oVEty921g5xql2DTQc0AwmMdv+AEjQ7V1Hwu5xh3X/AMhTtph/cs7Q
+ nOOToRptgzfVzZu0M10AGDV/RQB2hIvUCH2DaUitjX6g4e2BJwiqViWP2BF3Yp+J
+ T1PjQYlJZu2bYweW/a7vmQKBgQCcPwMxlrnOxlAkGY5PKIQy3V5HmeXjREgQfbXP
+ HjmMqy1OtY7IpOVAhCzUE5DMfRblubB1q91TvG8WKSTExrj8wf9LlN8CLwlXWC34
+ ZtqWS4ihVxKwf3gybM60ae0VNEhKwovgMBP79uoTwwpaTbBP1kP5i4WtGzn6/jx+
+ t4q0oQKBgAy9CCCO8lnToFcso4KSh8+z2FLRFBjNmFDfb6UHkaI///PVG/posSl9
+ pAghYIVCRtlapc7D9SYB5dQdmJ38RDGkkrvz87J3PQZeCGVWou0rYhHyKlVyUVor
+ JlwyOKbIXih8ew81IzbYAJKni981nj58dwJViA9m3Zhyb+1p2tKg
+ -----END RSA PRIVATE KEY-----
@@ -0,0 +1 @@
+ ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCk8MsDM3UfSt7agoIdGeoE6W/Tsw8vXcKlAAPt6rOhBqOE/62qmzTNw0AqyILV9Fuf/6hfiMuUZcCmc+/OjZfs8NTDyyUjBLuh0U2V7e9BV/bivctkaUGtZqiZfdxhE7mAUWp9nBLo8Nrbb9i6gRVIdqeaXe7b6EGioAWsa5HRHMXgZQbVtkBMEfb9gLfAQNHeQJD3xn6Y5L+16d/j/SzWvPXH/uWNBVNsayxzxeKzhLkkERy+DdUqxvPiZvSK4pyiO9QmoBYOullWp4fp9Re44evPi80VsmH5gGYRL38zHBHF31QknygXZE7a52vokUViSis6vdsmJZigsLbxCxXB clinton@sahara
@@ -0,0 +1,12 @@
+ #!/bin/bash
+ set -e
+
+ # https://github.com/giovtorres/docker-centos7-slurm/issues/3
+ # sacctmgr add cluster linux
+ #
+
+ docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add cluster name=linux" && \
+ docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add account test Cluster=linux Description='none' Organization='none'" && \
+ docker exec slurmctld bash -c "/usr/bin/sacctmgr --immediate add user test DefaultAccount=test" && \
+ docker-compose restart slurmdbd slurmctld c1 c2
+ #ssh-copy-id -i id
@@ -0,0 +1,94 @@
+ # slurm.conf
+ #
+ # See the slurm.conf man page for more information.
+ #
+ ClusterName=linux
+ ControlMachine=slurmctld
+ ControlAddr=slurmctld
+ #BackupController=
+ #BackupAddr=
+ #
+ SlurmUser=slurm
+ #SlurmdUser=root
+ SlurmctldPort=6817
+ SlurmdPort=6818
+ AuthType=auth/munge
+ #JobCredentialPrivateKey=
+ #JobCredentialPublicCertificate=
+ StateSaveLocation=/var/lib/slurmd
+ SlurmdSpoolDir=/var/spool/slurmd
+ SwitchType=switch/none
+ MpiDefault=none
+ SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
+ SlurmdPidFile=/var/run/slurmd/slurmd.pid
+ ProctrackType=proctrack/linuxproc
+ #PluginDir=
+ CacheGroups=0
+ #FirstJobId=
+ ReturnToService=0
+ #MaxJobCount=
+ #PlugStackConfig=
+ #PropagatePrioProcess=
+ #PropagateResourceLimits=
+ #PropagateResourceLimitsExcept=
+ #Prolog=
+ #Epilog=
+ #SrunProlog=
+ #SrunEpilog=
+ #TaskProlog=
+ #TaskEpilog=
+ #TaskPlugin=
+ #TrackWCKey=no
+ #TreeWidth=50
+ #TmpFS=
+ #UsePAM=1
+ #
+ # TIMERS
+ SlurmctldTimeout=300
+ SlurmdTimeout=300
+ InactiveLimit=0
+ MinJobAge=300
+ KillWait=30
+ Waittime=0
+ #
+ # SCHEDULING
+ SchedulerType=sched/backfill
+ #SchedulerAuth=
+ #SchedulerPort=
+ #SchedulerRootFilter=
+ SelectType=select/cons_res
+ SelectTypeParameters=CR_CPU_Memory
+ FastSchedule=1
+ #PriorityType=priority/multifactor
+ #PriorityDecayHalfLife=14-0
+ #PriorityUsageResetPeriod=14-0
+ #PriorityWeightFairshare=100000
+ #PriorityWeightAge=1000
+ #PriorityWeightPartition=10000
+ #PriorityWeightJobSize=1000
+ #PriorityMaxAge=1-0
+ #
+ # LOGGING
+ SlurmctldDebug=3
+ SlurmctldLogFile=/var/log/slurm/slurmctld.log
+ SlurmdDebug=3
+ SlurmdLogFile=/var/log/slurm/slurmd.log
+ JobCompType=jobcomp/filetxt
+ JobCompLoc=/var/log/slurm/jobcomp.log
+ #
+ # ACCOUNTING
+ JobAcctGatherType=jobacct_gather/linux
+ JobAcctGatherFrequency=30
+ #
+ AccountingStorageType=accounting_storage/slurmdbd
+ AccountingStorageHost=slurmdbd
+ AccountingStoragePort=6819
+ AccountingStorageLoc=slurm_acct_db
+ #AccountingStoragePass=
+ #AccountingStorageUser=
+ #
+ # COMPUTE NODES
+ NodeName=c[1-2] RealMemory=1000 State=UNKNOWN
+ #
+ # PARTITIONS
+ PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=1 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
@@ -0,0 +1,37 @@
+ #
+ # Example slurmdbd.conf file.
+ #
+ # See the slurmdbd.conf man page for more information.
+ #
+ # Archive info
+ #ArchiveJobs=yes
+ #ArchiveDir="/tmp"
+ #ArchiveSteps=yes
+ #ArchiveScript=
+ #JobPurge=12
+ #StepPurge=1
+ #
+ # Authentication info
+ AuthType=auth/munge
+ #AuthInfo=/var/run/munge/munge.socket.2
+ #
+ # slurmDBD info
+ DbdAddr=slurmdbd
+ DbdHost=slurmdbd
+ #DbdPort=6819
+ SlurmUser=slurm
+ #MessageTimeout=300
+ DebugLevel=4
+ #DefaultQOS=normal,standby
+ LogFile=/var/log/slurm/slurmdbd.log
+ PidFile=/var/run/slurmdbd/slurmdbd.pid
+ #PluginDir=/usr/lib/slurm
+ #PrivateData=accounts,users,usage,jobs
+ #TrackWCKey=yes
+ #
+ # Database info
+ StorageType=accounting_storage/mysql
+ StorageHost=mysql
+ StorageUser=slurm
+ StoragePass=password
+ StorageLoc=slurm_acct_db
@@ -1,8 +1,12 @@
- """
- idmtools-platform-slurm - Placeholder Package
-
- This is a placeholder package to reserve the name on PyPI.
- The actual package will be published later.
- """
-
- __version__ = "0.0.0.dev0"
+ try:
+     from importlib.metadata import version, PackageNotFoundError
+ except ImportError:
+     # Python < 3.8
+     from importlib_metadata import version, PackageNotFoundError
+
+ try:
+     __version__ = version("idmtools-platform-slurm")  # Use your actual package name
+ except PackageNotFoundError:
+     # Package not installed, use fallback
+     __version__ = "0.0.0+unknown"
+
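The rewritten `__init__.py` resolves the version from installed package metadata instead of a hard-coded placeholder, so the version can be checked directly; a minimal sketch, assuming the 0.0.3 wheel is installed in the active environment:

```bash
# Prints "0.0.3" when the wheel is installed; "0.0.0+unknown" if the metadata cannot be found.
python -c "import idmtools_platform_slurm; print(idmtools_platform_slurm.__version__)"
```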
@@ -0,0 +1,157 @@
+ """
+ SlurmPlatform utilities.
+
+ Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
+ """
+ from pathlib import Path
+ from jinja2 import Template
+ from typing import TYPE_CHECKING, Optional, Union
+ from idmtools.entities.experiment import Experiment
+ from idmtools_platform_slurm.platform_operations.utils import check_home
+
+ if TYPE_CHECKING:
+     from idmtools_platform_slurm.slurm_platform import SlurmPlatform, CONFIG_PARAMETERS
+
+ DEFAULT_TEMPLATE_FILE = Path(__file__).parent.joinpath("sbatch.sh.jinja2")
+ BATCH_TEMPLATE_FILE = Path(__file__).parent.joinpath("batch.sh.jinja2")
+
+
+ def generate_batch(platform: 'SlurmPlatform', experiment: Experiment,
+                    max_running_jobs: Optional[int] = None, array_batch_size: Optional[int] = None,
+                    dependency: Optional[bool] = None,
+                    template: Union[Path, str] = BATCH_TEMPLATE_FILE, **kwargs) -> None:
+     """
+     Generate bash script file batch.sh.
+     Args:
+         platform: Slurm Platform
+         experiment: idmtools Experiment
+         max_running_jobs: int, how many jobs are allowed to run at the same time
+         array_batch_size: int, array size for the Slurm job
+         dependency: bool, determines whether Slurm jobs depend on each other
+         template: template to be used to build the batch file
+         kwargs: keyword arguments used to expand functionality
+     Returns:
+         None
+     """
+     template_vars = dict(njobs=experiment.simulation_count)
+
+     # Set max_running_jobs
+     if max_running_jobs is not None:
+         if platform.max_running_jobs is not None:
+             template_vars['max_running_jobs'] = min(max_running_jobs, platform.max_running_jobs)
+         else:
+             template_vars['max_running_jobs'] = max_running_jobs
+     else:
+         if platform.max_running_jobs is not None:
+             template_vars['max_running_jobs'] = platform.max_running_jobs
+         else:
+             template_vars['max_running_jobs'] = 1
+
+     # Set array_batch_size
+     if array_batch_size is not None:
+         platform.array_batch_size = array_batch_size
+
+     if platform._max_array_size is not None:
+         if platform.array_batch_size is not None:
+             template_vars['array_batch_size'] = min(platform._max_array_size, platform.array_batch_size,
+                                                     experiment.simulation_count)
+         else:
+             template_vars['array_batch_size'] = min(platform._max_array_size, experiment.simulation_count)
+     elif platform.array_batch_size is not None:
+         template_vars['array_batch_size'] = min(platform.array_batch_size, experiment.simulation_count)
+     else:
+         template_vars['array_batch_size'] = experiment.simulation_count
+
+     # Consider dependency
+     if dependency is None:
+         dependency = True
+     template_vars['dependency'] = dependency
+
+     # Update with possible override values
+     template_vars.update(kwargs)
+
+     # Build batch based on the given template
+     with open(template) as file_:
+         t = Template(file_.read())
+
+     # Write out file
+     output_target = platform.get_directory(experiment).joinpath("batch.sh")
+     with open(output_target, "w") as tout:
+         tout.write(t.render(template_vars))
+
+     # Make executable
+     platform.update_script_mode(output_target)
+
+
+ def generate_script(platform: 'SlurmPlatform', experiment: Experiment, max_running_jobs: Optional[int] = None,
+                     template: Union[Path, str] = DEFAULT_TEMPLATE_FILE, **kwargs) -> None:
+     """
+     Generate batch file sbatch.sh.
+     Args:
+         platform: Slurm Platform
+         experiment: idmtools Experiment
+         max_running_jobs: int, how many jobs are allowed to run at the same time
+         template: template to be used to build the batch file
+         kwargs: keyword arguments used to expand functionality
+     Returns:
+         None
+     """
+     from idmtools_platform_slurm.slurm_platform import CONFIG_PARAMETERS
+     template_vars = dict(njobs=experiment.simulation_count)
+     # Populate from our platform config vars
+     for p in CONFIG_PARAMETERS:
+         if getattr(platform, p) is not None:
+             template_vars[p] = getattr(platform, p)
+
+     # Set default here
+     if max_running_jobs is not None:
+         template_vars['max_running_jobs'] = max_running_jobs
+     if max_running_jobs is None and platform.max_running_jobs is None:
+         template_vars['max_running_jobs'] = 1
+
+     # Add any overrides. We need some validation here later
+     # TODO add validation for valid config options
+     template_vars.update(kwargs)
+
+     if platform.modules:
+         template_vars['modules'] = platform.modules
+
+     with open(template) as file_:
+         t = Template(file_.read())
+
+     # Write out file
+     output_target = platform.get_directory(experiment).joinpath("sbatch.sh")
+     with open(output_target, "w") as tout:
+         tout.write(t.render(template_vars))
+     # Make executable
+     platform.update_script_mode(output_target)
+
+
+ def generate_simulation_script(platform: 'SlurmPlatform', simulation, retries: Optional[int] = None) -> None:
+     """
+     Generate batch file _run.sh.
+     Args:
+         platform: Slurm Platform
+         simulation: idmtools Simulation
+         retries: int, number of retries for the simulation command
+     Returns:
+         None
+     """
+     experiment_dir = platform.get_directory(simulation.parent).absolute()
+     experiment_dir = str(experiment_dir).replace('\\', '/')
+     check = check_home(experiment_dir)
+     sim_script = platform.get_directory(simulation).joinpath("_run.sh")
+     with open(sim_script, "w") as tout:
+         with open(Path(__file__).parent.parent.joinpath("assets/_run.sh.jinja2")) as tin:
+             tvars = dict(
+                 platform=platform,
+                 simulation=simulation,
+                 retries=retries if retries else platform.retries
+             )
+             if not check:
+                 tvars['experiment_dir'] = str(experiment_dir)
+
+             t = Template(tin.read())
+             tout.write(t.render(tvars))
+     # Make executable
+     platform.update_script_mode(sim_script)
@@ -0,0 +1,44 @@
+ #!/bin/bash
+
+ #SBATCH --signal=B:SIGTERM@30
+
+ # define the handler function
+ term_handler()
+ {
+     # do whatever cleanup you want here
+     echo "-1" > job_status.txt
+     exit -1
+ }
+
+ # associate the function "term_handler" with the TERM signal
+ trap 'term_handler' TERM
+
+ echo ${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID} > job_id.txt
+
+ n=0
+ until [ "$n" -ge {{retries}} ]
+ do
+     echo "100" > job_status.txt
+     {% if simulation.task.sif_path is defined and simulation.task.sif_path %}
+     {% if simulation.task.command.cmd.startswith('singularity') %}
+     {{simulation.task.command.cmd}}
+     {% else %}
+     {% if experiment_dir is defined and experiment_dir %}
+     singularity exec --bind {{experiment_dir}} {{simulation.task.sif_path}} {{simulation.task.command.cmd}}
+     {% else %}
+     singularity exec {{simulation.task.sif_path}} {{simulation.task.command.cmd}}
+     {% endif %}
+     {% endif %}
+     {% else %}
+     {{simulation.task.command.cmd}}
+     {% endif %}
+     RESULT=$?
+     if [ $RESULT -eq 0 ]; then
+         echo "0" > job_status.txt
+         exit $RESULT
+     fi
+     n=$((n+1))
+     sleep 15
+ done
+ echo "-1" > job_status.txt
+ exit $RESULT