toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/test/provisioners/clusterScalerTest.py

@@ -21,62 +21,55 @@ from argparse import Namespace
 from collections import defaultdict
 from queue import Empty, Queue
 from threading import Event, Thread
-from typing import …
+from typing import Optional
 from unittest.mock import MagicMock
 
-from toil.batchSystems.abstractBatchSystem import (…
+from toil.batchSystems.abstractBatchSystem import (
+    AbstractBatchSystem,
+    AbstractScalableBatchSystem,
+    NodeInfo,
+)
 from toil.common import Config
-from toil.options.common import defaultTargetTime
 from toil.job import JobDescription
 from toil.lib.conversions import human2bytes as h2b
+from toil.options.common import defaultTargetTime
 from toil.provisioners.abstractProvisioner import AbstractProvisioner, Shape
-from toil.provisioners.clusterScaler import (…
+from toil.provisioners.clusterScaler import (
+    BinPackedFit,
+    ClusterScaler,
+    NodeReservation,
+    ScalerThread,
+)
 from toil.provisioners.node import Node
 from toil.test import ToilTest, slow
 
 logger = logging.getLogger(__name__)
 
 # simplified c4.8xlarge (preemptible)
-c4_8xlarge_preemptible = Shape(…
-    disk=h2b('100G'),
-    preemptible=True)
+c4_8xlarge_preemptible = Shape(
+    wallTime=3600, memory=h2b("60G"), cores=36, disk=h2b("100G"), preemptible=True
+)
 # simplified c4.8xlarge (non-preemptible)
-c4_8xlarge = Shape(…
-    disk=h2b('100G'),
-    preemptible=False)
+c4_8xlarge = Shape(
+    wallTime=3600, memory=h2b("60G"), cores=36, disk=h2b("100G"), preemptible=False
+)
 # simplified r3.8xlarge (non-preemptible)
-r3_8xlarge = Shape(…
-    disk=h2b('600G'),
-    preemptible=False)
+r3_8xlarge = Shape(
+    wallTime=3600, memory=h2b("260G"), cores=32, disk=h2b("600G"), preemptible=False
+)
 # simplified r5.2xlarge (non-preemptible)
-r5_2xlarge = Shape(…
-    disk=h2b('50G'),
-    preemptible=False)
+r5_2xlarge = Shape(
+    wallTime=3600, memory=h2b("64Gi"), cores=8, disk=h2b("50G"), preemptible=False
+)
 # simplified r5.4xlarge (non-preemptible)
-r5_4xlarge = Shape(…
-    disk=h2b('50G'),
-    preemptible=False)
+r5_4xlarge = Shape(
+    wallTime=3600, memory=h2b("128Gi"), cores=16, disk=h2b("50G"), preemptible=False
+)
 # simplified t2.micro (non-preemptible)
-t2_micro = Shape(…
-    preemptible=False)
+t2_micro = Shape(
+    wallTime=3600, memory=h2b("1G"), cores=1, disk=h2b("8G"), preemptible=False
+)
+
 
 class BinPackingTest(ToilTest):
     def setUp(self):
@@ -85,56 +78,104 @@ class BinPackingTest(ToilTest):
 
     def testPackingOneShape(self):
         """Pack one shape and check that the resulting reservations look sane."""
-        self.bpf.nodeReservations[c4_8xlarge_preemptible] = […
+        self.bpf.nodeReservations[c4_8xlarge_preemptible] = [
+            NodeReservation(c4_8xlarge_preemptible)
+        ]
+        self.bpf.addJobShape(
+            Shape(
+                wallTime=1000,
+                cores=2,
+                memory=h2b("1G"),
+                disk=h2b("2G"),
+                preemptible=True,
+            )
+        )
         self.assertEqual(self.bpf.nodeReservations[r3_8xlarge], [])
-        self.assertEqual(…
+        self.assertEqual(
+            [x.shapes() for x in self.bpf.nodeReservations[c4_8xlarge_preemptible]],
+            [
+                [
+                    Shape(
+                        wallTime=1000,
+                        memory=h2b("59G"),
+                        cores=34,
+                        disk=h2b("98G"),
+                        preemptible=True,
+                    ),
+                    Shape(
+                        wallTime=2600,
+                        memory=h2b("60G"),
+                        cores=36,
+                        disk=h2b("100G"),
+                        preemptible=True,
+                    ),
+                ]
+            ],
+        )
 
     def testSorting(self):
         """
         Test that sorting is correct: preemptible, then memory, then cores, then disk,
         then wallTime.
         """
-        shapeList = […
+        shapeList = [
+            c4_8xlarge_preemptible,
+            r3_8xlarge,
+            c4_8xlarge,
+            c4_8xlarge,
+            t2_micro,
+            t2_micro,
+            c4_8xlarge,
+            r3_8xlarge,
+            r3_8xlarge,
+            t2_micro,
+        ]
         shapeList.sort()
-        assert shapeList == […
+        assert shapeList == [
+            c4_8xlarge_preemptible,
+            t2_micro,
+            t2_micro,
+            t2_micro,
+            c4_8xlarge,
+            c4_8xlarge,
+            c4_8xlarge,
+            r3_8xlarge,
+            r3_8xlarge,
+            r3_8xlarge,
+        ]
 
     def testAddingInitialNode(self):
         """Pack one shape when no nodes are available and confirm that we fit one node properly."""
-        self.bpf.addJobShape(…
+        self.bpf.addJobShape(
+            Shape(
+                wallTime=1000,
+                cores=2,
+                memory=h2b("1G"),
+                disk=h2b("2G"),
+                preemptible=True,
+            )
+        )
+        self.assertEqual(
+            [x.shapes() for x in self.bpf.nodeReservations[c4_8xlarge_preemptible]],
+            [
+                [
+                    Shape(
+                        wallTime=1000,
+                        memory=h2b("59G"),
+                        cores=34,
+                        disk=h2b("98G"),
+                        preemptible=True,
+                    ),
+                    Shape(
+                        wallTime=2600,
+                        memory=h2b("60G"),
+                        cores=36,
+                        disk=h2b("100G"),
+                        preemptible=True,
+                    ),
+                ]
+            ],
+        )
 
     def testLowTargetTime(self):
         """
@@ -150,11 +191,13 @@ class BinPackingTest(ToilTest):
         Each job is parametrized to take 300 seconds, so (the minimum of) 1 of them should fit into
         each node's 0 second window, so we expect 1000 nodes.
         """
-        allocation = self.run1000JobsOnMicros(…
+        allocation = self.run1000JobsOnMicros(
+            jobCores=1,
+            jobMem=h2b("1G"),
+            jobDisk=h2b("1G"),
+            jobTime=300,
+            globalTargetTime=0,
+        )
         self.assertEqual(allocation, {t2_micro: 1000})
 
     def testHighTargetTime(self):
@@ -170,11 +213,13 @@ class BinPackingTest(ToilTest):
         Each job is parametrized to take 300 seconds, so 12 of them should fit into each node's
         3600 second window. 1000/12 = 83.33, so we expect 84 nodes.
         """
-        allocation = self.run1000JobsOnMicros(…
+        allocation = self.run1000JobsOnMicros(
+            jobCores=1,
+            jobMem=h2b("1G"),
+            jobDisk=h2b("1G"),
+            jobTime=300,
+            globalTargetTime=3600,
+        )
         self.assertEqual(allocation, {t2_micro: 84})
 
     def testZeroResourceJobs(self):
@@ -188,11 +233,9 @@ class BinPackingTest(ToilTest):
         Since all jobs should pack cpu/disk/mem-wise on a t2.micro, we expect only one t2.micro to
         be provisioned. If we raise this, as in testLowTargetTime, it will launch 1000 t2.micros.
         """
-        allocation = self.run1000JobsOnMicros(…
-            jobTime=300,
-            globalTargetTime=0)
+        allocation = self.run1000JobsOnMicros(
+            jobCores=0, jobMem=0, jobDisk=0, jobTime=300, globalTargetTime=0
+        )
         self.assertEqual(allocation, {t2_micro: 1})
 
     def testLongRunningJobs(self):
@@ -206,11 +249,13 @@ class BinPackingTest(ToilTest):
         Despite setting globalTargetTime=3600, this should launch 1000 t2.micros because each job's
         estimated runtime (30000 seconds) extends well beyond 3600 seconds.
         """
-        allocation = self.run1000JobsOnMicros(…
+        allocation = self.run1000JobsOnMicros(
+            jobCores=1,
+            jobMem=h2b("1G"),
+            jobDisk=h2b("1G"),
+            jobTime=30000,
+            globalTargetTime=3600,
+        )
         self.assertEqual(allocation, {t2_micro: 1000})
 
     def run1000JobsOnMicros(self, jobCores, jobMem, jobDisk, jobTime, globalTargetTime):
@@ -221,11 +266,15 @@ class BinPackingTest(ToilTest):
         bpf = BinPackedFit(node_shapes_for_testing, targetTime=globalTargetTime)
 
        for _ in range(1000):
-            bpf.addJobShape(…
+            bpf.addJobShape(
+                Shape(
+                    wallTime=jobTime,
+                    memory=jobMem,
+                    cores=jobCores,
+                    disk=jobDisk,
+                    preemptible=False,
+                )
+            )
         return bpf.getRequiredNodes()
 
     def testPathologicalCase(self):
@@ -238,20 +287,30 @@ class BinPackingTest(ToilTest):
         the future.
         """
         # Add one job that partially fills an r3.8xlarge for 1000 hours
-        self.bpf.addJobShape(…
+        self.bpf.addJobShape(
+            Shape(
+                wallTime=3600000,
+                memory=h2b("10G"),
+                cores=0,
+                disk=h2b("10G"),
+                preemptible=False,
+            )
+        )
         for _ in range(500):
             # Add 500 CPU-hours worth of jobs that fill an r3.8xlarge
-            self.bpf.addJobShape(…
+            self.bpf.addJobShape(
+                Shape(
+                    wallTime=3600,
+                    memory=h2b("26G"),
+                    cores=32,
+                    disk=h2b("60G"),
+                    preemptible=False,
+                )
+            )
         # Hopefully we didn't assign just one node to cover all those jobs.
-        self.assertNotEqual(…
+        self.assertNotEqual(
+            self.bpf.getRequiredNodes(), {r3_8xlarge: 1, c4_8xlarge_preemptible: 0}
+        )
 
     def testJobTooLargeForAllNodes(self):
         """
@@ -259,14 +318,17 @@ class BinPackingTest(ToilTest):
         warning, but definitely not crash.
         """
         # Takes more RAM than an r3.8xlarge
-        largerThanR3 = Shape(…
+        largerThanR3 = Shape(
+            wallTime=3600,
+            memory=h2b("360G"),
+            cores=32,
+            disk=h2b("600G"),
+            preemptible=False,
+        )
         self.bpf.addJobShape(largerThanR3)
         # If we got here we didn't crash.
 
+
 class ClusterScalerTest(ToilTest):
     def setUp(self):
         super().setUp()
@@ -279,7 +341,9 @@ class ClusterScalerTest(ToilTest):
         # It is also a full mock provisioner, so configure it to be that as well
         self.provisioner = self.leader
         # Pretend that Shapes are actually strings we can use for instance type names.
-        self.provisioner.setAutoscaledNodeTypes(…
+        self.provisioner.setAutoscaledNodeTypes(
+            [({t}, None) for t in self.config.nodeTypes]
+        )
 
     def testRounding(self):
         """
@@ -299,8 +363,8 @@ class ClusterScalerTest(ToilTest):
         self.assertEqual(scaler._round(123456789101112.13), 123456789101112)
 
         # Decimals other than X.5 round to the side they are closer to
-        self.assertEqual(scaler._round(…
-        self.assertEqual(scaler._round(0.5 + …
+        self.assertEqual(scaler._round(1e-10), 0)
+        self.assertEqual(scaler._round(0.5 + 1e-15), 1)
         self.assertEqual(scaler._round(-0.9), -1)
         self.assertEqual(scaler._round(-0.4), 0)
 
@@ -322,17 +386,30 @@ class ClusterScalerTest(ToilTest):
         self.config.betaInertia = 0.0
         self.config.maxNodes = [2, 3]
         scaler = ClusterScaler(self.provisioner, self.leader, self.config)
-        jobShapes = […
+        jobShapes = [
+            Shape(
+                wallTime=3600,
+                cores=2,
+                memory=h2b("1G"),
+                disk=h2b("2G"),
+                preemptible=True,
+            )
+        ] * 1000
+        jobShapes.extend(
+            [
+                Shape(
+                    wallTime=3600,
+                    cores=2,
+                    memory=h2b("1G"),
+                    disk=h2b("2G"),
+                    preemptible=False,
+                )
+            ]
+            * 1000
+        )
+        estimatedNodeCounts, could_not_fit = scaler.getEstimatedNodeCounts(
+            jobShapes, defaultdict(int)
+        )
         self.assertEqual(estimatedNodeCounts[r3_8xlarge], 2)
         self.assertEqual(estimatedNodeCounts[c4_8xlarge_preemptible], 3)
         self.assertEqual(len(could_not_fit), 0)
@@ -345,7 +422,9 @@ class ClusterScalerTest(ToilTest):
         self.config.minNodes = [2, 3]
         scaler = ClusterScaler(self.provisioner, self.leader, self.config)
         jobShapes = []
-        estimatedNodeCounts, could_not_fit = scaler.getEstimatedNodeCounts(…
+        estimatedNodeCounts, could_not_fit = scaler.getEstimatedNodeCounts(
+            jobShapes, defaultdict(int)
+        )
         self.assertEqual(estimatedNodeCounts[r3_8xlarge], 2)
         self.assertEqual(estimatedNodeCounts[c4_8xlarge_preemptible], 3)
         self.assertEqual(len(could_not_fit), 0)
@@ -367,7 +446,9 @@ class ClusterScalerTest(ToilTest):
         # the same type. That is the only situation where
         # preemptibleCompensation applies.
         self.config.nodeTypes = [c4_8xlarge_preemptible, c4_8xlarge]
-        self.provisioner.setAutoscaledNodeTypes(…
+        self.provisioner.setAutoscaledNodeTypes(
+            [({t}, None) for t in self.config.nodeTypes]
+        )
 
         scaler = ClusterScaler(self.provisioner, self.leader, self.config)
         # Simulate a situation where a previous run caused a
@@ -375,16 +456,24 @@ class ClusterScalerTest(ToilTest):
         scaler.preemptibleNodeDeficit[c4_8xlarge] = 5
         # Add a bunch of preemptible jobs (so the bin-packing
         # estimate for the non-preemptible node should still be 0)
-        jobShapes = […
+        jobShapes = [
+            Shape(
+                wallTime=3600,
+                cores=2,
+                memory=h2b("1G"),
+                disk=h2b("2G"),
+                preemptible=True,
+            )
+        ] * 1000
+        estimatedNodeCounts, could_not_fit = scaler.getEstimatedNodeCounts(
+            jobShapes, defaultdict(int)
+        )
         # We don't care about the estimated size of the preemptible
         # nodes. All we want to know is if we responded to the deficit
         # properly: 0.5 * 5 (preemptibleCompensation * the deficit) = 3 (rounded up).
-        self.assertEqual(…
+        self.assertEqual(
+            estimatedNodeCounts[self.provisioner.node_shapes_for_testing[1]], 3
+        )
         self.assertEqual(len(could_not_fit), 0)
 
     def testPreemptibleDeficitIsSet(self):
@@ -404,7 +493,9 @@ class ClusterScalerTest(ToilTest):
         # the same type. That is the only situation where
         # preemptibleCompensation applies.
         self.config.nodeTypes = [c4_8xlarge_preemptible, c4_8xlarge]
-        self.provisioner.setAutoscaledNodeTypes(…
+        self.provisioner.setAutoscaledNodeTypes(
+            [({t}, None) for t in self.config.nodeTypes]
+        )
         scaler = ClusterScaler(self.provisioner, self.leader, self.config)
         estimatedNodeCounts = {c4_8xlarge_preemptible: 5, c4_8xlarge: 0}
         scaler.updateClusterSize(estimatedNodeCounts)
@@ -427,18 +518,30 @@ class ClusterScalerTest(ToilTest):
         scaler = ClusterScaler(self.provisioner, self.leader, self.config)
         # Pretend there is one ignored worker in the cluster
         self.provisioner.getProvisionedWorkers = MagicMock(
-            return_value=[…
+            return_value=[
+                Node(
+                    "127.0.0.1",
+                    "127.0.0.1",
+                    "testNode",
+                    datetime.datetime.now().isoformat(),
+                    nodeType=c4_8xlarge,
+                    preemptible=True,
+                )
+            ]
+        )
+        scaler.ignoredNodes.add("127.0.0.1")
         # Exercise the updateClusterSize logic
         self.provisioner.addNodes = MagicMock()
         scaler.updateClusterSize({c4_8xlarge: 1})
-        self.assertFalse(…
+        self.assertFalse(
+            self.provisioner.addNodes.called,
+            "addNodes was called when no new nodes were needed",
+        )
+        self.assertEqual(
+            len(scaler.ignoredNodes),
+            0,
+            "The scaler didn't unignore an ignored node when " "scaling up",
+        )
 
     def testBetaInertia(self):
         # This is really high, but makes things easy to calculate.
@@ -466,25 +569,29 @@ class ClusterScalerTest(ToilTest):
 
         # If the job needs 100% of the memory of the instance type, it won't
         # fit and will need a bigger node.
-        self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], memory=h2b(…
+        self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], memory=h2b("60G"))
 
         # If the job needs 98% of the memory of the instance type, it won't
         # fit and will need a bigger node.
-        self._check_job_estimate(…
+        self._check_job_estimate(
+            [(c4_8xlarge, 0), (r3_8xlarge, 1)], memory=int(h2b("60G") * 0.98)
+        )
 
         # If the job needs 90% of the memory of the instance type, it will fit.
-        self._check_job_estimate(…
+        self._check_job_estimate(
+            [(c4_8xlarge, 1), (r3_8xlarge, 0)], memory=int(h2b("60G") * 0.90)
+        )
 
         # If the job needs 100% of the disk of the instance type, it won't
         # fit and will need a bigger node.
-        self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], disk=h2b(…
+        self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], disk=h2b("100G"))
 
         # If the job needs all but 7G of the disk of the instance type, it won't
         # fit and will need a bigger node.
-        self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], disk=h2b(…
+        self._check_job_estimate([(c4_8xlarge, 0), (r3_8xlarge, 1)], disk=h2b("93G"))
 
         # If the job leaves 10% and 10G of the disk free, it fits
-        self._check_job_estimate([(c4_8xlarge, 1), (r3_8xlarge, 0)], disk=h2b(…
+        self._check_job_estimate([(c4_8xlarge, 1), (r3_8xlarge, 0)], disk=h2b("90G"))
 
     def test_overhead_accounting_small(self):
         """
@@ -499,11 +606,13 @@ class ClusterScalerTest(ToilTest):
 
         # If the job needs 100% of the memory of the instance type, it won't
         # fit and will need a bigger node.
-        self._check_job_estimate([(t2_micro, 0), (r3_8xlarge, 1)], memory=h2b(…
+        self._check_job_estimate([(t2_micro, 0), (r3_8xlarge, 1)], memory=h2b("1G"))
 
         # If the job needs all but 100M of the memory of the instance type, it
         # won't fit and will need a bigger node.
-        self._check_job_estimate(…
+        self._check_job_estimate(
+            [(t2_micro, 0), (r3_8xlarge, 1)], memory=h2b("1G") - h2b("100M")
+        )
 
         # If the job needs no more than 90% of the memory on the node *and*
         # leaves at least 384M free for overhead, we can rely on it fitting on a 1G
@@ -512,12 +621,14 @@ class ClusterScalerTest(ToilTest):
             Shape(
                 wallTime=3600,
                 cores=1,
-                memory=h2b(…
-                disk=h2b(…
-                preemptible=True
+                memory=h2b("1G") - h2b("384M"),
+                disk=h2b("2G"),
+                preemptible=True,
             )
         ]
-        self._check_job_estimate(…
+        self._check_job_estimate(
+            [(t2_micro, 1), (r3_8xlarge, 0)], memory=h2b("1G") - h2b("384M")
+        )
 
     def test_overhead_accounting_observed(self):
         """
@@ -536,9 +647,13 @@ class ClusterScalerTest(ToilTest):
         # not clear if Mesos is thinking in actual GB or GiB here.
 
         # A 62.5Gi job is sent to the larger node
-        self._check_job_estimate(…
+        self._check_job_estimate(
+            [(r5_2xlarge, 0), (r5_4xlarge, 1)], memory=h2b("62.5 Gi")
+        )
 
-    def _check_job_estimate(…
+    def _check_job_estimate(
+        self, nodes: list[tuple[Shape, int]], cores=1, memory=1, disk=1
+    ) -> None:
         """
         Make sure that a job with the given requirements, when run on the given
         nodes, produces the given numbers of them.
@@ -553,23 +668,20 @@ class ClusterScalerTest(ToilTest):
 
         jobs = [
             Shape(
-                wallTime=3600,
-                cores=cores,
-                memory=memory,
-                disk=disk,
-                preemptible=True
+                wallTime=3600, cores=cores, memory=memory, disk=disk, preemptible=True
             )
         ]
 
-        logger.debug(…
+        logger.debug("Try and fit jobs: %s", jobs)
         counts, could_not_fit = scaler.getEstimatedNodeCounts(jobs, defaultdict(int))
         for node, count in nodes:
             seen_count = counts.get(node, 0)
             if seen_count != count:
-                logger.error(…
+                logger.error("Saw %s/%s instances of node %s", seen_count, count, node)
             self.assertEqual(seen_count, count)
         self.assertEqual(len(could_not_fit), 0)
 
+
 class ScalerThreadTest(ToilTest):
     def _testClusterScaling(self, config, numJobs, numPreemptibleJobs, jobShape):
         """
@@ -587,49 +699,77 @@ class ScalerThreadTest(ToilTest):
         clusterScaler.start()
         try:
             # Add 100 jobs to complete
-            list(map(lambda x: mock.addJob(jobShape=jobShape), …
+            list(map(lambda x: mock.addJob(jobShape=jobShape), list(range(numJobs))))
+            list(
+                map(
+                    lambda x: mock.addJob(jobShape=jobShape, preemptible=True),
+                    list(range(numPreemptibleJobs)),
+                )
+            )
 
             # Add some completed jobs
             for preemptible in (True, False):
-                if …
+                if (
+                    preemptible
+                    and numPreemptibleJobs > 0
+                    or not preemptible
+                    and numJobs > 0
+                ):
                     # Add 1000 random jobs
                     for _ in range(1000):
                         x = mock.getNodeShape(nodeType=jobShape)
-                        iJ = JobDescription(…
+                        iJ = JobDescription(
+                            requirements=dict(
+                                memory=random.randrange(1, x.memory),
+                                cores=random.randrange(1, x.cores),
+                                disk=random.randrange(1, x.disk),
+                                preemptible=preemptible,
+                            ),
+                            jobName="testClusterScaling",
+                            unitName="",
+                        )
+                        clusterScaler.addCompletedJob(
+                            iJ, random.choice(list(range(1, x.wallTime)))
+                        )
 
             startTime = time.time()
             # Wait while the cluster processes the jobs
-            while (…
+            while (
+                mock.getNumberOfJobsIssued(preemptible=False) > 0
+                or mock.getNumberOfJobsIssued(preemptible=True) > 0
+                or mock.getNumberOfNodes() > 0
+                or mock.getNumberOfNodes(preemptible=True) > 0
+            ):
+                logger.debug(
+                    "Running, non-preemptible queue size: %s, non-preemptible workers: %s, "
+                    "preemptible queue size: %s, preemptible workers: %s"
+                    % (
+                        mock.getNumberOfJobsIssued(preemptible=False),
+                        mock.getNumberOfNodes(preemptible=False),
+                        mock.getNumberOfJobsIssued(preemptible=True),
+                        mock.getNumberOfNodes(preemptible=True),
+                    )
+                )
                 clusterScaler.check()
                 time.sleep(0.5)
-            logger.debug(…
+            logger.debug(
+                "We waited %s for cluster to finish" % (time.time() - startTime)
+            )
         finally:
             clusterScaler.shutdown()
             mock.shutDown()
 
         # Print some info about the autoscaling
-        logger.debug(…
+        logger.debug(
+            "Total-jobs: %s: Max-workers: %s, "
+            "Total-worker-time: %s, Worker-time-per-job: %s"
+            % (
+                mock.totalJobs,
+                sum(mock.maxWorkers.values()),
+                mock.totalWorkerTime,
+                mock.totalWorkerTime // mock.totalJobs if mock.totalJobs > 0 else 0.0,
+            )
+        )
 
     @slow
     def testClusterScaling(self):
@@ -640,15 +780,15 @@ class ScalerThreadTest(ToilTest):
         config = Config()
 
         # Make defaults dummy values
-        config.defaultMemory = h2b(…
+        config.defaultMemory = h2b("1Gi")
         config.defaultCores = 1
-        config.defaultDisk = h2b(…
+        config.defaultDisk = h2b("1Gi")
 
         # No preemptible nodes/jobs
         config.maxPreemptibleNodes = []  # No preemptible nodes
 
         # Non-preemptible parameters
-        config.nodeTypes = [Shape(20, h2b(…
+        config.nodeTypes = [Shape(20, h2b("10Gi"), 10, h2b("100Gi"), False)]
         config.minNodes = [0]
         config.maxNodes = [10]
 
@@ -657,27 +797,31 @@ class ScalerThreadTest(ToilTest):
         config.betaInertia = 0.1
         config.scaleInterval = 3
 
-        self._testClusterScaling(…
+        self._testClusterScaling(
+            config,
+            numJobs=100,
+            numPreemptibleJobs=0,
+            jobShape=Shape(20, h2b("7Gi"), 10, h2b("80Gi"), False),
+        )
 
     @slow
     def testClusterScalingMultipleNodeTypes(self):
 
-        small_node = Shape(20, h2b(…
-        small_job = Shape(20, h2b(…
-        medium_node = Shape(20, h2b(…
-        medium_job = Shape(20, h2b(…
-        large_node = Shape(20, h2b(…
-        large_job = Shape(20, h2b(…
+        small_node = Shape(20, h2b("5Gi"), 10, h2b("20Gi"), False)
+        small_job = Shape(20, h2b("3Gi"), 10, h2b("4Gi"), False)
+        medium_node = Shape(20, h2b("10Gi"), 10, h2b("20Gi"), False)
+        medium_job = Shape(20, h2b("7Gi"), 10, h2b("4Gi"), False)
+        large_node = Shape(20, h2b("20Gi"), 10, h2b("20Gi"), False)
+        large_job = Shape(20, h2b("16Gi"), 10, h2b("4Gi"), False)
 
         numJobs = 100
 
         config = Config()
 
         # Make defaults dummy values
-        config.defaultMemory = h2b(…
+        config.defaultMemory = h2b("1Gi")
         config.defaultCores = 1
-        config.defaultDisk = h2b(…
+        config.defaultDisk = h2b("1Gi")
 
         # No preemptible nodes/jobs
         config.preemptibleNodeTypes = []
@@ -707,12 +851,18 @@ class ScalerThreadTest(ToilTest):
 
         # Add medium completed jobs
         for i in range(1000):
-            iJ = JobDescription(…
+            iJ = JobDescription(
+                requirements=dict(
+                    memory=random.choice(
+                        range(small_job.memory, medium_job.memory)
+                    ),
+                    cores=medium_job.cores,
+                    disk=large_job.disk,
+                    preemptible=False,
+                ),
+                jobName="testClusterScaling",
+                unitName="",
+            )
             clusterScaler.addCompletedJob(iJ, random.choice(range(1, 10)))
 
         while mock.getNumberOfJobsIssued() > 0 or mock.getNumberOfNodes() > 0:
@@ -739,15 +889,15 @@ class ScalerThreadTest(ToilTest):
         """
         config = Config()
 
-        node_shape = Shape(20, h2b(…
-        preemptible_node_shape = Shape(20, h2b(…
-        job_shape = Shape(20, h2b(…
-        preemptible_job_shape = Shape(20, h2b(…
+        node_shape = Shape(20, h2b("10Gi"), 10, h2b("20Gi"), False)
+        preemptible_node_shape = Shape(20, h2b("10Gi"), 10, h2b("20Gi"), True)
+        job_shape = Shape(20, h2b("7Gi"), 10, h2b("2Gi"), False)
+        preemptible_job_shape = Shape(20, h2b("7Gi"), 10, h2b("2Gi"), True)
 
         # Make defaults dummy values
-        config.defaultMemory = h2b(…
+        config.defaultMemory = h2b("1Gi")
         config.defaultCores = 1
-        config.defaultDisk = h2b(…
+        config.defaultDisk = h2b("1Gi")
 
         # non-preemptible node parameters
         config.nodeTypes = [node_shape, preemptible_node_shape]
@@ -759,13 +909,16 @@ class ScalerThreadTest(ToilTest):
         config.betaInertia = 0.9
         config.scaleInterval = 3
 
-        self._testClusterScaling(…
+        self._testClusterScaling(
+            config, numJobs=100, numPreemptibleJobs=100, jobShape=job_shape
+        )
 
 
 class MockBatchSystemAndProvisioner(AbstractScalableBatchSystem, AbstractProvisioner):
     """Mimics a leader, job batcher, provisioner and scalable batch system."""
+
     def __init__(self, config, secondsPerJob):
-        super().__init__(clusterName=…
+        super().__init__(clusterName="clusterName", clusterType="mesos")
         # To mimic parallel preemptible and non-preemptible queues
         # for jobs we create two parallel instances of the following class
         self.config = config
@@ -797,8 +950,8 @@ class MockBatchSystemAndProvisioner(AbstractScalableBatchSystem, AbstractProvisi
 
         # Stub out all AbstractBatchSystem methods since they are never called
         for name, value in AbstractBatchSystem.__dict__.items():
-            if getattr(value, …
-                exec(…
+            if getattr(value, "__isabstractmethod__", False):
+                exec("def %s(): pass" % name)
         # Without this, the class would end up with .name and .value attributes
         del name, value
@@ -813,7 +966,7 @@ class MockBatchSystemAndProvisioner(AbstractScalableBatchSystem, AbstractProvisi
         pass
 
     def supportedClusterTypes(self):
-        return {…
+        return {"mesos"}
 
     def createClusterSettings(self):
         pass
@@ -822,7 +975,9 @@ class MockBatchSystemAndProvisioner(AbstractScalableBatchSystem, AbstractProvisi
         pass
 
     # AbstractProvisioner methods
-    def setAutoscaledNodeTypes(…
+    def setAutoscaledNodeTypes(
+        self, node_types: list[tuple[set[Shape], Optional[float]]]
+    ):
         self.node_shapes_for_testing = sorted(it for t in node_types for it in t[0])
         super().setAutoscaledNodeTypes(node_types)
 
@@ -856,18 +1011,25 @@ class MockBatchSystemAndProvisioner(AbstractScalableBatchSystem, AbstractProvisi
         """
         self.totalJobs += 1
         jobID = uuid.uuid4()
-        self.jobBatchSystemIDToIssuedJob[jobID] = JobDescription(…
+        self.jobBatchSystemIDToIssuedJob[jobID] = JobDescription(
+            requirements={
+                "memory": jobShape.memory,
+                "cores": jobShape.cores,
+                "disk": jobShape.disk,
+                "preemptible": preemptible,
+            },
+            jobName=f"job{self.totalJobs}",
+        )
         self.jobQueue.put(jobID)
 
     # JobBatcher functionality
     def getNumberOfJobsIssued(self, preemptible=None):
         if preemptible is not None:
-            jobList = […
+            jobList = [
+                job
+                for job in list(self.jobQueue.queue)
+                if self.jobBatchSystemIDToIssuedJob[job].preemptible == preemptible
+            ]
             return len(jobList)
         else:
             return self.jobQueue.qsize()
@@ -883,13 +1045,19 @@ class MockBatchSystemAndProvisioner(AbstractScalableBatchSystem, AbstractProvisi
         for node in self.nodesToWorker:
             if node.preemptible == preemptible:
                 worker = self.nodesToWorker[node]
-                nodes[node.privateIP] = NodeInfo(…
+                nodes[node.privateIP] = NodeInfo(
+                    coresTotal=0,
+                    coresUsed=0,
+                    requestedCores=1,
+                    memoryTotal=0,
+                    memoryUsed=0,
+                    requestedMemory=1,
+                    workers=1 if worker.busyEvent.is_set() else 0,
+                )
         return nodes
 
     # AbstractProvisioner functionality
-    def addNodes(self, nodeTypes: …
+    def addNodes(self, nodeTypes: set[str], numNodes, preemptible) -> int:
         nodeType = next(iter(nodeTypes))
         self._addNodes(numNodes=numNodes, nodeType=nodeType, preemptible=preemptible)
         return self.getNumberOfNodes(nodeType=nodeType, preemptible=preemptible)
@@ -902,8 +1070,17 @@ class MockBatchSystemAndProvisioner(AbstractScalableBatchSystem, AbstractProvisi
     def getWorkersInCluster(self, nodeShape):
         return self.workers[nodeShape]
 
-    def launchCluster(…
+    def launchCluster(
+        self,
+        leaderNodeType,
+        keyName,
+        userTags=None,
+        vpcSubnet=None,
+        leaderStorage=50,
+        nodeStorage=50,
+        botoPath=None,
+        **kwargs,
+    ):
         pass
 
     def destroyCluster(self) -> None:
@@ -912,7 +1089,6 @@ class MockBatchSystemAndProvisioner(AbstractScalableBatchSystem, AbstractProvisi
     def getLeader(self):
         pass
 
-
     def _leaderFn(self):
         while self.running:
             updatedJobID = None
@@ -955,14 +1131,28 @@ class MockBatchSystemAndProvisioner(AbstractScalableBatchSystem, AbstractProvisi
                 return time.time() - self.startTime
 
        for _ in range(numNodes):
-            node = Node(…
+            node = Node(
+                "127.0.0.1",
+                uuid.uuid4(),
+                "testNode",
+                datetime.datetime.now().isoformat() + "Z",
+                nodeType=nodeType,
+                preemptible=preemptible,
+            )
+            self.nodesToWorker[node] = Worker(
+                self.jobQueue, self.updatedJobsQueue, self.secondsPerJob
+            )
             self.workers[nodeShape].append(self.nodesToWorker[node])
-            self.maxWorkers[nodeShape] = max(…
+            self.maxWorkers[nodeShape] = max(
+                self.maxWorkers[nodeShape], len(self.workers[nodeShape])
+            )
 
     def _removeNodes(self, nodes):
-        logger.debug(…
+        logger.debug(
+            "Removing nodes. %s workers and %s to terminate.",
+            len(self.nodesToWorker),
+            len(nodes),
+        )
         for node in nodes:
             try:
                 nodeShape = self.getNodeShape(node.nodeType, node.preemptible)