toil 7.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +124 -86
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +39 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +651 -155
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +784 -397
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1137 -534
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +1031 -349
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +772 -412
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +204 -58
- toil/lib/aws/utils.py +290 -213
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/dockstore.py +379 -0
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -105
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/history.py +1271 -0
- toil/lib/history_submission.py +681 -0
- toil/lib/humanize.py +6 -2
- toil/lib/io.py +121 -12
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +83 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +125 -87
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/lib/trs.py +390 -0
- toil/lib/web.py +38 -0
- toil/options/common.py +850 -402
- toil/options/cwl.py +185 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +283 -180
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +84 -55
- toil/server/utils.py +56 -31
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +183 -65
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +265 -49
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/conftest.py +39 -0
- toil/test/cwl/cwlTest.py +375 -72
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/optional-file.cwl +18 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_history.py +212 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/lib/test_trs.py +161 -0
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +6 -6
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3528 -1053
- toil/worker.py +370 -149
- toil-8.1.0b1.dist-info/METADATA +178 -0
- toil-8.1.0b1.dist-info/RECORD +259 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
|
@@ -20,29 +20,36 @@ import subprocess
|
|
|
20
20
|
import time
|
|
21
21
|
import traceback
|
|
22
22
|
from argparse import ArgumentParser, _ArgumentGroup
|
|
23
|
+
from collections.abc import Sequence
|
|
23
24
|
from queue import Empty, Queue
|
|
24
25
|
from threading import Event, Lock, Thread
|
|
25
|
-
from typing import
|
|
26
|
+
from typing import Optional, Union
|
|
26
27
|
|
|
27
28
|
import toil
|
|
28
29
|
from toil import worker as toil_worker
|
|
29
|
-
from toil.batchSystems.abstractBatchSystem import (
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
30
|
+
from toil.batchSystems.abstractBatchSystem import (
|
|
31
|
+
EXIT_STATUS_UNAVAILABLE_VALUE,
|
|
32
|
+
BatchSystemSupport,
|
|
33
|
+
InsufficientSystemResources,
|
|
34
|
+
ResourcePool,
|
|
35
|
+
ResourceSet,
|
|
36
|
+
UpdatedBatchJobInfo,
|
|
37
|
+
)
|
|
35
38
|
from toil.batchSystems.options import OptionSetter
|
|
36
39
|
from toil.bus import ExternalBatchIdMessage
|
|
37
40
|
from toil.common import Config, Toil
|
|
38
|
-
from toil.
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
41
|
+
from toil.job import (
|
|
42
|
+
AcceleratorRequirement,
|
|
43
|
+
JobDescription,
|
|
44
|
+
Requirer,
|
|
45
|
+
accelerator_satisfies,
|
|
46
|
+
)
|
|
47
|
+
from toil.lib.accelerators import (
|
|
48
|
+
get_individual_local_accelerators,
|
|
49
|
+
get_restrictive_environment_for_local_accelerators,
|
|
50
|
+
)
|
|
45
51
|
from toil.lib.threading import cpu_count
|
|
52
|
+
from toil.options.common import SYS_MAX_SIZE, make_open_interval_action
|
|
46
53
|
|
|
47
54
|
logger = logging.getLogger(__name__)
|
|
48
55
|
|
|
@@ -84,7 +91,12 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
84
91
|
physicalMemory = toil.physicalMemory()
|
|
85
92
|
|
|
86
93
|
def __init__(
|
|
87
|
-
self,
|
|
94
|
+
self,
|
|
95
|
+
config: Config,
|
|
96
|
+
maxCores: float,
|
|
97
|
+
maxMemory: int,
|
|
98
|
+
maxDisk: int,
|
|
99
|
+
max_jobs: Optional[int] = None,
|
|
88
100
|
) -> None:
|
|
89
101
|
self.config = config
|
|
90
102
|
|
|
@@ -102,22 +114,38 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
102
114
|
# If we don't have up to the limit of the resource (and the resource
|
|
103
115
|
# isn't the unlimited sentinel), warn.
|
|
104
116
|
if maxCores > self.numCores:
|
|
105
|
-
if maxCores != SYS_MAX_SIZE and maxCores != float(
|
|
117
|
+
if maxCores != SYS_MAX_SIZE and maxCores != float("inf"):
|
|
106
118
|
# We have an actually specified limit and not the default
|
|
107
|
-
logger.warning(
|
|
119
|
+
logger.warning(
|
|
120
|
+
"Not enough cores! User limited to %i but we only have %i.",
|
|
121
|
+
maxCores,
|
|
122
|
+
self.numCores,
|
|
123
|
+
)
|
|
108
124
|
maxCores = self.numCores
|
|
109
125
|
if maxMemory > self.physicalMemory:
|
|
110
|
-
if
|
|
126
|
+
if (
|
|
127
|
+
maxMemory < SYS_MAX_SIZE
|
|
128
|
+
): # todo: looks like humans2bytes converts SYS_MAX_SIZE to SYS_MAX_SIZE+1
|
|
111
129
|
# We have an actually specified limit and not the default
|
|
112
|
-
logger.warning(
|
|
130
|
+
logger.warning(
|
|
131
|
+
"Not enough memory! User limited to %i bytes but we only have %i bytes.",
|
|
132
|
+
maxMemory,
|
|
133
|
+
self.physicalMemory,
|
|
134
|
+
)
|
|
113
135
|
maxMemory = self.physicalMemory
|
|
114
136
|
|
|
115
|
-
workdir = Toil.getLocalWorkflowDir(
|
|
137
|
+
workdir = Toil.getLocalWorkflowDir(
|
|
138
|
+
config.workflowID, config.workDir
|
|
139
|
+
) # config.workDir may be None; this sets a real directory
|
|
116
140
|
self.physicalDisk = toil.physicalDisk(workdir)
|
|
117
141
|
if maxDisk > self.physicalDisk:
|
|
118
142
|
if maxDisk < SYS_MAX_SIZE: # same as maxMemory logger.warning
|
|
119
143
|
# We have an actually specified limit and not the default
|
|
120
|
-
logger.warning(
|
|
144
|
+
logger.warning(
|
|
145
|
+
"Not enough disk space! User limited to %i bytes but we only have %i bytes.",
|
|
146
|
+
maxDisk,
|
|
147
|
+
self.physicalDisk,
|
|
148
|
+
)
|
|
121
149
|
maxDisk = self.physicalDisk
|
|
122
150
|
|
|
123
151
|
super().__init__(config, maxCores, maxMemory, maxDisk)
|
|
@@ -132,8 +160,10 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
132
160
|
if config.badWorker > 0 and config.debugWorker:
|
|
133
161
|
# We can't throw SIGUSR1 at the worker because it is also going to
|
|
134
162
|
# be the leader and/or test harness.
|
|
135
|
-
raise RuntimeError(
|
|
136
|
-
"
|
|
163
|
+
raise RuntimeError(
|
|
164
|
+
"Cannot use badWorker and debugWorker together; "
|
|
165
|
+
"worker would have to kill the leader"
|
|
166
|
+
)
|
|
137
167
|
|
|
138
168
|
self.debugWorker = config.debugWorker
|
|
139
169
|
|
|
@@ -143,7 +173,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
143
173
|
self.jobIndexLock = Lock()
|
|
144
174
|
|
|
145
175
|
# A dictionary mapping batch system IDs of submitted jobs to the command line
|
|
146
|
-
self.jobs:
|
|
176
|
+
self.jobs: dict[int, JobDescription] = {}
|
|
147
177
|
|
|
148
178
|
# A queue of jobs waiting to be executed. Consumed by the daddy thread.
|
|
149
179
|
self.inputQueue = Queue()
|
|
@@ -152,15 +182,15 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
152
182
|
self.outputQueue = Queue()
|
|
153
183
|
|
|
154
184
|
# A dictionary mapping batch system IDs of currently running jobs to their Info objects
|
|
155
|
-
self.runningJobs:
|
|
185
|
+
self.runningJobs: dict[int, Info] = {}
|
|
156
186
|
|
|
157
187
|
# These next two are only used outside debug-worker mode
|
|
158
188
|
|
|
159
189
|
# A dict mapping PIDs to Popen objects for running jobs.
|
|
160
190
|
# Jobs that don't fork are executed one at a time in the main thread.
|
|
161
|
-
self.children:
|
|
191
|
+
self.children: dict[int, subprocess.Popen] = {}
|
|
162
192
|
# A dict mapping child PIDs to the Job IDs they are supposed to be running.
|
|
163
|
-
self.childToJob:
|
|
193
|
+
self.childToJob: dict[int, str] = {}
|
|
164
194
|
|
|
165
195
|
# For accelerators, we need a collection of what each accelerator is, and an acquirable set of them.
|
|
166
196
|
self.accelerator_identities = get_individual_local_accelerators()
|
|
@@ -168,15 +198,15 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
168
198
|
# Put them all organized by resource type
|
|
169
199
|
self.resource_sources = [
|
|
170
200
|
# A pool representing available job slots
|
|
171
|
-
ResourcePool(self.max_jobs,
|
|
201
|
+
ResourcePool(self.max_jobs, "job slots"),
|
|
172
202
|
# A pool representing available CPU in units of minCores
|
|
173
|
-
ResourcePool(int(self.maxCores / self.minCores),
|
|
203
|
+
ResourcePool(int(self.maxCores / self.minCores), "cores"),
|
|
174
204
|
# A pool representing available memory in bytes
|
|
175
|
-
ResourcePool(self.maxMemory,
|
|
205
|
+
ResourcePool(self.maxMemory, "memory"),
|
|
176
206
|
# A pool representing the available space in bytes
|
|
177
|
-
ResourcePool(self.maxDisk,
|
|
207
|
+
ResourcePool(self.maxDisk, "disk"),
|
|
178
208
|
# And a set for acquiring individual accelerators
|
|
179
|
-
ResourceSet(set(range(len(self.accelerator_identities))),
|
|
209
|
+
ResourceSet(set(range(len(self.accelerator_identities))), "accelerators"),
|
|
180
210
|
]
|
|
181
211
|
|
|
182
212
|
# If we can't schedule something, we fill this in with a reason why
|
|
@@ -192,11 +222,11 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
192
222
|
self.daddyException: Optional[Exception] = None
|
|
193
223
|
|
|
194
224
|
if self.debugWorker:
|
|
195
|
-
logger.debug(
|
|
225
|
+
logger.debug("Started batch system %s in worker debug mode.", id(self))
|
|
196
226
|
else:
|
|
197
227
|
self.daddyThread = Thread(target=self.daddy, daemon=True)
|
|
198
228
|
self.daddyThread.start()
|
|
199
|
-
logger.debug(
|
|
229
|
+
logger.debug("Started batch system %s in normal mode.", id(self))
|
|
200
230
|
|
|
201
231
|
def daddy(self):
|
|
202
232
|
"""
|
|
@@ -214,7 +244,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
214
244
|
"""
|
|
215
245
|
|
|
216
246
|
try:
|
|
217
|
-
logger.debug(
|
|
247
|
+
logger.debug("Started daddy thread for batch system %s.", id(self))
|
|
218
248
|
|
|
219
249
|
while not self.shuttingDown.is_set():
|
|
220
250
|
# Main loop
|
|
@@ -224,13 +254,28 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
224
254
|
try:
|
|
225
255
|
# Grab something from the input queue if available.
|
|
226
256
|
args = self.inputQueue.get_nowait()
|
|
227
|
-
|
|
257
|
+
(
|
|
258
|
+
jobCommand,
|
|
259
|
+
jobID,
|
|
260
|
+
jobCores,
|
|
261
|
+
jobMemory,
|
|
262
|
+
jobDisk,
|
|
263
|
+
job_accelerators,
|
|
264
|
+
environment,
|
|
265
|
+
) = args
|
|
228
266
|
|
|
229
267
|
coreFractions = int(jobCores / self.minCores)
|
|
230
268
|
|
|
231
269
|
# Try to start the child
|
|
232
|
-
result = self._startChild(
|
|
233
|
-
|
|
270
|
+
result = self._startChild(
|
|
271
|
+
jobCommand,
|
|
272
|
+
jobID,
|
|
273
|
+
coreFractions,
|
|
274
|
+
jobMemory,
|
|
275
|
+
jobDisk,
|
|
276
|
+
job_accelerators,
|
|
277
|
+
environment,
|
|
278
|
+
)
|
|
234
279
|
|
|
235
280
|
if result is None:
|
|
236
281
|
# We did not get the resources to run this job.
|
|
@@ -241,12 +286,15 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
241
286
|
self.inputQueue.put(args)
|
|
242
287
|
break
|
|
243
288
|
elif result is not False:
|
|
244
|
-
#Result is a PID
|
|
289
|
+
# Result is a PID
|
|
245
290
|
|
|
246
291
|
if self._outbox is not None:
|
|
247
292
|
# Annotate the job with the PID generated.
|
|
248
293
|
self._outbox.publish(
|
|
249
|
-
|
|
294
|
+
ExternalBatchIdMessage(
|
|
295
|
+
jobID, str(result), self.__class__.__name__
|
|
296
|
+
)
|
|
297
|
+
)
|
|
250
298
|
|
|
251
299
|
# Otherwise False
|
|
252
300
|
|
|
@@ -265,18 +313,28 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
265
313
|
# For now we just sleep and loop.
|
|
266
314
|
time.sleep(0.01)
|
|
267
315
|
|
|
268
|
-
|
|
269
316
|
# When we get here, we are shutting down.
|
|
270
|
-
logger.debug(
|
|
317
|
+
logger.debug(
|
|
318
|
+
"Daddy thread cleaning up %d remaining children for batch system %s...",
|
|
319
|
+
len(self.children),
|
|
320
|
+
id(self),
|
|
321
|
+
)
|
|
271
322
|
|
|
272
323
|
self._stop_and_wait(self.children.values())
|
|
273
324
|
|
|
274
|
-
logger.debug(
|
|
325
|
+
logger.debug(
|
|
326
|
+
"Daddy thread for batch system %s finishing because no children should now exist",
|
|
327
|
+
id(self),
|
|
328
|
+
)
|
|
275
329
|
|
|
276
330
|
# Then exit the thread.
|
|
277
331
|
return
|
|
278
332
|
except Exception as e:
|
|
279
|
-
logger.critical(
|
|
333
|
+
logger.critical(
|
|
334
|
+
"Unhandled exception in daddy thread for batch system %s: %s",
|
|
335
|
+
id(self),
|
|
336
|
+
traceback.format_exc(),
|
|
337
|
+
)
|
|
280
338
|
# Pass the exception back to the main thread so it can stop the next person who calls into us.
|
|
281
339
|
self.daddyException = e
|
|
282
340
|
raise
|
|
@@ -284,15 +342,17 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
284
342
|
def _checkOnDaddy(self):
|
|
285
343
|
if self.daddyException is not None:
|
|
286
344
|
# The daddy thread broke and we cannot do our job
|
|
287
|
-
logger.critical(
|
|
345
|
+
logger.critical(
|
|
346
|
+
"Propagating unhandled exception in daddy thread to main thread"
|
|
347
|
+
)
|
|
288
348
|
exc = self.daddyException
|
|
289
349
|
self.daddyException = None
|
|
290
350
|
if isinstance(exc, Exception):
|
|
291
351
|
raise exc
|
|
292
352
|
else:
|
|
293
|
-
raise TypeError(f
|
|
353
|
+
raise TypeError(f"Daddy thread failed with non-exception: {exc}")
|
|
294
354
|
|
|
295
|
-
def _stop_now(self, popens: Sequence[subprocess.Popen]) ->
|
|
355
|
+
def _stop_now(self, popens: Sequence[subprocess.Popen]) -> list[int]:
|
|
296
356
|
"""
|
|
297
357
|
Stop the given child processes and all their children. Does not reap them.
|
|
298
358
|
|
|
@@ -322,7 +382,11 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
322
382
|
# The child process really is in its own group, and not ours.
|
|
323
383
|
|
|
324
384
|
# Kill the group, which hopefully hasn't been reused
|
|
325
|
-
logger.debug(
|
|
385
|
+
logger.debug(
|
|
386
|
+
"Send shutdown kill to process group %s known to batch system %s",
|
|
387
|
+
pgid,
|
|
388
|
+
id(self),
|
|
389
|
+
)
|
|
326
390
|
try:
|
|
327
391
|
os.killpg(pgid, signal.SIGKILL)
|
|
328
392
|
pgids.append(pgid)
|
|
@@ -339,7 +403,9 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
339
403
|
|
|
340
404
|
return pgids
|
|
341
405
|
|
|
342
|
-
def _stop_and_wait(
|
|
406
|
+
def _stop_and_wait(
|
|
407
|
+
self, popens: Sequence[subprocess.Popen], timeout: int = 5
|
|
408
|
+
) -> None:
|
|
343
409
|
"""
|
|
344
410
|
Stop the given child processes and all their children. Blocks until the
|
|
345
411
|
processes are gone or timeout is passed.
|
|
@@ -354,13 +420,17 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
354
420
|
for popen in popens:
|
|
355
421
|
# Wait on all the children
|
|
356
422
|
popen.wait()
|
|
357
|
-
logger.debug(
|
|
358
|
-
|
|
423
|
+
logger.debug(
|
|
424
|
+
"Process %s known to batch system %s is stopped; it returned %s",
|
|
425
|
+
popen.pid,
|
|
426
|
+
id(self),
|
|
427
|
+
popen.returncode,
|
|
428
|
+
)
|
|
359
429
|
|
|
360
430
|
# Make sure all child processes have received their kill signal
|
|
361
431
|
self._wait_for_death(pgids, timeout)
|
|
362
432
|
|
|
363
|
-
def _wait_for_death(self, pgids:
|
|
433
|
+
def _wait_for_death(self, pgids: list[int], timeout: int = 5):
|
|
364
434
|
"""
|
|
365
435
|
Wait for the process groups to be killed. Blocks until the processes
|
|
366
436
|
are gone or timeout is passed.
|
|
@@ -373,8 +443,11 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
373
443
|
# process and its PGID may have been re-used.
|
|
374
444
|
|
|
375
445
|
start = datetime.datetime.now()
|
|
376
|
-
while
|
|
377
|
-
|
|
446
|
+
while (
|
|
447
|
+
len(pgids) > 0
|
|
448
|
+
and (datetime.datetime.now() - start).total_seconds() < timeout
|
|
449
|
+
):
|
|
450
|
+
new_pgids: list[int] = []
|
|
378
451
|
for pgid in pgids:
|
|
379
452
|
try:
|
|
380
453
|
# Send a kill to the group again, to see if anything in it
|
|
@@ -399,9 +472,11 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
399
472
|
if len(pgids) > 0:
|
|
400
473
|
# If any processes are still alive, let user know that we may leave
|
|
401
474
|
# behind dead but unreaped processes.
|
|
402
|
-
logger.warning(
|
|
403
|
-
logger.warning(
|
|
404
|
-
|
|
475
|
+
logger.warning("Processes were not reaped in groups: %s.", str(pgids))
|
|
476
|
+
logger.warning(
|
|
477
|
+
"Make sure your jobs are cleaning up child processes appropriately to avoid zombie "
|
|
478
|
+
"processes possibly being left behind."
|
|
479
|
+
)
|
|
405
480
|
|
|
406
481
|
def _pollForDoneChildrenIn(self, pid_to_popen):
|
|
407
482
|
"""
|
|
@@ -420,7 +495,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
420
495
|
ready = set()
|
|
421
496
|
|
|
422
497
|
# Find the waitid function
|
|
423
|
-
waitid = getattr(os,
|
|
498
|
+
waitid = getattr(os, "waitid", None)
|
|
424
499
|
|
|
425
500
|
if callable(waitid):
|
|
426
501
|
# waitid exists (not Mac)
|
|
@@ -439,7 +514,11 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
439
514
|
# instead of the weird C behavior of overwriting a field in
|
|
440
515
|
# a pointed-to struct.
|
|
441
516
|
siginfo = None
|
|
442
|
-
if
|
|
517
|
+
if (
|
|
518
|
+
siginfo is not None
|
|
519
|
+
and siginfo.si_pid in pid_to_popen
|
|
520
|
+
and siginfo.si_pid not in ready
|
|
521
|
+
):
|
|
443
522
|
# Something new finished
|
|
444
523
|
ready.add(siginfo.si_pid)
|
|
445
524
|
else:
|
|
@@ -454,7 +533,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
454
533
|
if popen.poll() is not None:
|
|
455
534
|
# Process is done
|
|
456
535
|
ready.add(pid)
|
|
457
|
-
logger.debug(
|
|
536
|
+
logger.debug("Child %d has stopped", pid)
|
|
458
537
|
|
|
459
538
|
# Return all the done processes we found
|
|
460
539
|
return ready
|
|
@@ -473,19 +552,33 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
473
552
|
|
|
474
553
|
if jobCommand.startswith("_toil_worker "):
|
|
475
554
|
# We can actually run in this thread
|
|
476
|
-
jobName, jobStoreLocator, jobStoreID = jobCommand.split()[
|
|
555
|
+
jobName, jobStoreLocator, jobStoreID = jobCommand.split()[
|
|
556
|
+
1:4
|
|
557
|
+
] # Parse command
|
|
477
558
|
jobStore = Toil.resumeJobStore(jobStoreLocator)
|
|
478
|
-
statusCode = toil_worker.workerScript(
|
|
479
|
-
|
|
559
|
+
statusCode = toil_worker.workerScript(
|
|
560
|
+
jobStore,
|
|
561
|
+
jobStore.config,
|
|
562
|
+
jobName,
|
|
563
|
+
jobStoreID,
|
|
564
|
+
redirect_output_to_log_file=not self.debugWorker,
|
|
565
|
+
) # Call the worker
|
|
480
566
|
else:
|
|
481
567
|
# Run synchronously. If starting or running the command fails, let the exception stop us.
|
|
482
|
-
statusCode = subprocess.check_call(
|
|
483
|
-
|
|
484
|
-
|
|
568
|
+
statusCode = subprocess.check_call(
|
|
569
|
+
jobCommand, shell=True, env=dict(os.environ, **environment)
|
|
570
|
+
)
|
|
485
571
|
|
|
486
572
|
self.runningJobs.pop(jobID)
|
|
487
573
|
if not info.killIntended:
|
|
488
|
-
self.outputQueue.put(
|
|
574
|
+
self.outputQueue.put(
|
|
575
|
+
UpdatedBatchJobInfo(
|
|
576
|
+
jobID=jobID,
|
|
577
|
+
exitStatus=statusCode,
|
|
578
|
+
wallTime=time.time() - info.time,
|
|
579
|
+
exitReason=None,
|
|
580
|
+
)
|
|
581
|
+
)
|
|
489
582
|
|
|
490
583
|
def getSchedulingStatusMessage(self):
|
|
491
584
|
# Implement the abstractBatchSystem's scheduling status message API
|
|
@@ -505,19 +598,25 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
505
598
|
super().check_resource_request(requirer)
|
|
506
599
|
except InsufficientSystemResources as e:
|
|
507
600
|
# Tack the scale onto the exception
|
|
508
|
-
e.details.append(f
|
|
601
|
+
e.details.append(f"Scale is set to {self.scale}.")
|
|
509
602
|
raise e
|
|
510
603
|
|
|
511
604
|
def _check_accelerator_request(self, requirer: Requirer) -> None:
|
|
512
|
-
_, problem = self._identify_sufficient_accelerators(
|
|
605
|
+
_, problem = self._identify_sufficient_accelerators(
|
|
606
|
+
requirer.accelerators, set(range(len(self.accelerator_identities)))
|
|
607
|
+
)
|
|
513
608
|
if problem is not None:
|
|
514
609
|
# We can't get the accelerators
|
|
515
|
-
raise InsufficientSystemResources(
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
610
|
+
raise InsufficientSystemResources(
|
|
611
|
+
requirer,
|
|
612
|
+
"accelerators",
|
|
613
|
+
self.accelerator_identities,
|
|
614
|
+
details=[f"The accelerator {problem} could not be provided."],
|
|
615
|
+
)
|
|
616
|
+
|
|
617
|
+
def _release_acquired_resources(
|
|
618
|
+
self, resources: list[Union[int, set[int]]]
|
|
619
|
+
) -> None:
|
|
521
620
|
"""
|
|
522
621
|
Release all resources acquired for a job.
|
|
523
622
|
Assumes resources are in the order: core fractions, memory, disk, accelerators.
|
|
@@ -526,11 +625,16 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
526
625
|
# What pools and sets do we want resources from
|
|
527
626
|
|
|
528
627
|
for resource, request in zip(self.resource_sources, resources):
|
|
529
|
-
assert (
|
|
530
|
-
|
|
628
|
+
assert (
|
|
629
|
+
isinstance(resource, ResourcePool) and isinstance(request, int)
|
|
630
|
+
) or (isinstance(resource, ResourceSet) and isinstance(request, set))
|
|
531
631
|
resource.release(request)
|
|
532
632
|
|
|
533
|
-
def _identify_sufficient_accelerators(
|
|
633
|
+
def _identify_sufficient_accelerators(
|
|
634
|
+
self,
|
|
635
|
+
needed_accelerators: list[AcceleratorRequirement],
|
|
636
|
+
available_accelerator_ids: set[int],
|
|
637
|
+
) -> tuple[Optional[set[int]], Optional[AcceleratorRequirement]]:
|
|
534
638
|
"""
|
|
535
639
|
Given the accelerator requirements of a job, and the set of available
|
|
536
640
|
accelerators out of our associated collection of accelerators, find a
|
|
@@ -547,17 +651,17 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
547
651
|
|
|
548
652
|
Ignores accelerator model constraints.
|
|
549
653
|
"""
|
|
550
|
-
accelerators_needed:
|
|
654
|
+
accelerators_needed: set[int] = set()
|
|
551
655
|
accelerators_still_available = set(available_accelerator_ids)
|
|
552
656
|
for requirement in needed_accelerators:
|
|
553
|
-
for i in range(requirement[
|
|
657
|
+
for i in range(requirement["count"]):
|
|
554
658
|
# For each individual accelerator we need
|
|
555
659
|
satisfied = False
|
|
556
660
|
for candidate_index in accelerators_still_available:
|
|
557
661
|
# Check all the ones we haven't grabbed yet
|
|
558
662
|
# TODO: We'll re-check early ones against this requirement if it has a count of more than one.
|
|
559
663
|
candidate = self.accelerator_identities[candidate_index]
|
|
560
|
-
if accelerator_satisfies(candidate, requirement, ignore=[
|
|
664
|
+
if accelerator_satisfies(candidate, requirement, ignore=["model"]):
|
|
561
665
|
# If this accelerator can satisfy one unit of this requirement.
|
|
562
666
|
# We ignore model constraints because as a single
|
|
563
667
|
# machine we can't really determine the models of
|
|
@@ -577,7 +681,16 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
577
681
|
# If we get here we satisfied everything
|
|
578
682
|
return accelerators_needed, None
|
|
579
683
|
|
|
580
|
-
def _startChild(
|
|
684
|
+
def _startChild(
|
|
685
|
+
self,
|
|
686
|
+
jobCommand,
|
|
687
|
+
jobID,
|
|
688
|
+
coreFractions,
|
|
689
|
+
jobMemory,
|
|
690
|
+
jobDisk,
|
|
691
|
+
job_accelerators: list[AcceleratorRequirement],
|
|
692
|
+
environment,
|
|
693
|
+
):
|
|
581
694
|
"""
|
|
582
695
|
Start a child process for the given job.
|
|
583
696
|
|
|
@@ -596,7 +709,12 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
596
709
|
|
|
597
710
|
# And what do we want from each resource in self.resource_sources?
|
|
598
711
|
# We know they go job slot, cores, memory, disk, accelerators.
|
|
599
|
-
resource_requests:
|
|
712
|
+
resource_requests: list[Union[int, set[int]]] = [
|
|
713
|
+
1,
|
|
714
|
+
coreFractions,
|
|
715
|
+
jobMemory,
|
|
716
|
+
jobDisk,
|
|
717
|
+
]
|
|
600
718
|
|
|
601
719
|
# Keep a reference to the accelerators separately
|
|
602
720
|
accelerators_needed = None
|
|
@@ -604,31 +722,37 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
604
722
|
if job_accelerators:
|
|
605
723
|
# Try and find some accelerators to use.
|
|
606
724
|
# Start with all the accelerators that are free right now
|
|
607
|
-
accelerator_set
|
|
725
|
+
accelerator_set: ResourceSet = self.resource_sources[-1]
|
|
608
726
|
snapshot = accelerator_set.get_free_snapshot()
|
|
609
727
|
# And build a plan of the ones we want
|
|
610
|
-
accelerators_needed, problem = self._identify_sufficient_accelerators(
|
|
728
|
+
accelerators_needed, problem = self._identify_sufficient_accelerators(
|
|
729
|
+
job_accelerators, snapshot
|
|
730
|
+
)
|
|
611
731
|
if accelerators_needed is not None:
|
|
612
732
|
# Now we have a plan to get the accelerators we need.
|
|
613
733
|
resource_requests.append(accelerators_needed)
|
|
614
734
|
else:
|
|
615
735
|
# We couldn't make a plan; the accelerators are busy
|
|
616
736
|
assert problem is not None
|
|
617
|
-
logger.debug(
|
|
618
|
-
self._setSchedulingStatusMessage(
|
|
737
|
+
logger.debug("Accelerators are busy: %s", problem)
|
|
738
|
+
self._setSchedulingStatusMessage(
|
|
739
|
+
"Not enough accelerators to run job %s" % jobID
|
|
740
|
+
)
|
|
619
741
|
return None
|
|
620
742
|
|
|
621
|
-
|
|
622
743
|
acquired = []
|
|
623
744
|
for source, request in zip(self.resource_sources, resource_requests):
|
|
624
745
|
# For each kind of resource we want, go get it
|
|
625
|
-
assert (
|
|
626
|
-
|
|
746
|
+
assert (isinstance(source, ResourcePool) and isinstance(request, int)) or (
|
|
747
|
+
isinstance(source, ResourceSet) and isinstance(request, set)
|
|
748
|
+
)
|
|
627
749
|
if source.acquireNow(request):
|
|
628
750
|
acquired.append(request)
|
|
629
751
|
else:
|
|
630
752
|
# We can't get everything
|
|
631
|
-
self._setSchedulingStatusMessage(
|
|
753
|
+
self._setSchedulingStatusMessage(
|
|
754
|
+
f"Not enough {source.resource_type} to run job {jobID}"
|
|
755
|
+
)
|
|
632
756
|
self._release_acquired_resources(acquired)
|
|
633
757
|
return None
|
|
634
758
|
|
|
@@ -639,8 +763,12 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
639
763
|
|
|
640
764
|
# Communicate the accelerator resources, if any, to the child process
|
|
641
765
|
# by modifying the environment
|
|
642
|
-
accelerators_acquired:
|
|
643
|
-
|
|
766
|
+
accelerators_acquired: set[int] = (
|
|
767
|
+
accelerators_needed if accelerators_needed is not None else set()
|
|
768
|
+
)
|
|
769
|
+
child_environment.update(
|
|
770
|
+
get_restrictive_environment_for_local_accelerators(accelerators_acquired)
|
|
771
|
+
)
|
|
644
772
|
|
|
645
773
|
# Actually run the job.
|
|
646
774
|
# When it finishes we will release what it was using.
|
|
@@ -656,18 +784,24 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
656
784
|
# process group ID will equal the PID of the process we
|
|
657
785
|
# are starting.
|
|
658
786
|
logger.debug("Attempting to run job command: %s", jobCommand)
|
|
659
|
-
popen = subprocess.Popen(
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
start_new_session=True)
|
|
787
|
+
popen = subprocess.Popen(
|
|
788
|
+
jobCommand, shell=True, env=child_environment, start_new_session=True
|
|
789
|
+
)
|
|
663
790
|
except Exception:
|
|
664
791
|
# If the job can't start, make sure we release resources now
|
|
665
792
|
self._release_acquired_resources(acquired)
|
|
666
793
|
|
|
667
|
-
logger.error(
|
|
794
|
+
logger.error("Could not start job %s: %s", jobID, traceback.format_exc())
|
|
668
795
|
|
|
669
796
|
# Report as failed.
|
|
670
|
-
self.outputQueue.put(
|
|
797
|
+
self.outputQueue.put(
|
|
798
|
+
UpdatedBatchJobInfo(
|
|
799
|
+
jobID=jobID,
|
|
800
|
+
exitStatus=EXIT_STATUS_UNAVAILABLE_VALUE,
|
|
801
|
+
wallTime=0,
|
|
802
|
+
exitReason=None,
|
|
803
|
+
)
|
|
804
|
+
)
|
|
671
805
|
|
|
672
806
|
# Complain it broke.
|
|
673
807
|
return False
|
|
@@ -680,7 +814,7 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
680
814
|
info = Info(startTime, popen, acquired, killIntended=False)
|
|
681
815
|
self.runningJobs[jobID] = info
|
|
682
816
|
|
|
683
|
-
logger.debug(
|
|
817
|
+
logger.debug("Launched job %s as child %d", jobID, popen.pid)
|
|
684
818
|
|
|
685
819
|
# Report success starting the job
|
|
686
820
|
# Note that if a PID were somehow 0 it would look like False
|
|
@@ -704,13 +838,12 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
704
838
|
# Get the job resources reserved by the job
|
|
705
839
|
acquired = info.resources
|
|
706
840
|
|
|
707
|
-
|
|
708
841
|
# Clean up our records of the job.
|
|
709
842
|
self.runningJobs.pop(jobID)
|
|
710
843
|
self.childToJob.pop(pid)
|
|
711
844
|
self.children.pop(pid)
|
|
712
845
|
|
|
713
|
-
if popen.returncode is None or not callable(getattr(os,
|
|
846
|
+
if popen.returncode is None or not callable(getattr(os, "waitid", None)):
|
|
714
847
|
# It isn't reaped yet, or we have to reap all children to see if they're done.
|
|
715
848
|
# Before we reap it (if possible), kill its PID as a PGID to make sure
|
|
716
849
|
# it isn't leaving children behind.
|
|
@@ -728,12 +861,22 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
728
861
|
# See how the child did, and reap it.
|
|
729
862
|
statusCode = popen.wait()
|
|
730
863
|
if statusCode != 0 and not info.killIntended:
|
|
731
|
-
logger.error(
|
|
732
|
-
|
|
864
|
+
logger.error(
|
|
865
|
+
"Got exit code %i (indicating failure) " "from job %s.",
|
|
866
|
+
statusCode,
|
|
867
|
+
self.jobs[jobID],
|
|
868
|
+
)
|
|
733
869
|
if not info.killIntended:
|
|
734
870
|
# Report if the job failed and we didn't kill it.
|
|
735
871
|
# If we killed it then it shouldn't show up in the queue.
|
|
736
|
-
self.outputQueue.put(
|
|
872
|
+
self.outputQueue.put(
|
|
873
|
+
UpdatedBatchJobInfo(
|
|
874
|
+
jobID=jobID,
|
|
875
|
+
exitStatus=statusCode,
|
|
876
|
+
wallTime=time.time() - info.time,
|
|
877
|
+
exitReason=None,
|
|
878
|
+
)
|
|
879
|
+
)
|
|
737
880
|
|
|
738
881
|
# Last attempt to make sure all processes in the group have received
|
|
739
882
|
# their kill signals.
|
|
@@ -742,22 +885,31 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
742
885
|
# Free up the job's resources.
|
|
743
886
|
self._release_acquired_resources(acquired)
|
|
744
887
|
|
|
745
|
-
logger.debug(
|
|
888
|
+
logger.debug("Child %d for job %s succeeded", pid, jobID)
|
|
746
889
|
|
|
747
|
-
def issueBatchJob(
|
|
890
|
+
def issueBatchJob(
|
|
891
|
+
self,
|
|
892
|
+
command: str,
|
|
893
|
+
job_desc: JobDescription,
|
|
894
|
+
job_environment: Optional[dict[str, str]] = None,
|
|
895
|
+
) -> int:
|
|
748
896
|
"""Adds the command and resources to a queue to be run."""
|
|
749
897
|
|
|
750
898
|
self._checkOnDaddy()
|
|
751
899
|
|
|
752
900
|
# Apply scale in cores
|
|
753
|
-
scaled_desc = job_desc.scale(
|
|
901
|
+
scaled_desc = job_desc.scale("cores", self.scale)
|
|
754
902
|
# Round cores up to multiples of minCores
|
|
755
|
-
scaled_desc.cores = max(
|
|
903
|
+
scaled_desc.cores = max(
|
|
904
|
+
math.ceil(scaled_desc.cores / self.minCores) * self.minCores, self.minCores
|
|
905
|
+
)
|
|
756
906
|
|
|
757
907
|
# Don't do our own assertions about job size vs. our configured size.
|
|
758
908
|
# The abstract batch system can handle it.
|
|
759
909
|
self.check_resource_request(scaled_desc)
|
|
760
|
-
logger.debug(
|
|
910
|
+
logger.debug(
|
|
911
|
+
f"Issuing the command: {command} with {scaled_desc.requirements_string()}"
|
|
912
|
+
)
|
|
761
913
|
with self.jobIndexLock:
|
|
762
914
|
jobID = self.jobIndex
|
|
763
915
|
self.jobIndex += 1
|
|
@@ -773,20 +925,29 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
773
925
|
self._runDebugJob(command, jobID, environment)
|
|
774
926
|
else:
|
|
775
927
|
# Queue the job for later
|
|
776
|
-
self.inputQueue.put(
|
|
777
|
-
|
|
928
|
+
self.inputQueue.put(
|
|
929
|
+
(
|
|
930
|
+
command,
|
|
931
|
+
jobID,
|
|
932
|
+
scaled_desc.cores,
|
|
933
|
+
scaled_desc.memory,
|
|
934
|
+
scaled_desc.disk,
|
|
935
|
+
scaled_desc.accelerators,
|
|
936
|
+
environment,
|
|
937
|
+
)
|
|
938
|
+
)
|
|
778
939
|
|
|
779
940
|
return jobID
|
|
780
941
|
|
|
781
|
-
def killBatchJobs(self, jobIDs:
|
|
942
|
+
def killBatchJobs(self, jobIDs: list[int]) -> None:
|
|
782
943
|
"""Kills jobs by ID."""
|
|
783
944
|
|
|
784
945
|
self._checkOnDaddy()
|
|
785
946
|
|
|
786
|
-
logger.debug(f
|
|
947
|
+
logger.debug(f"Killing jobs: {jobIDs}")
|
|
787
948
|
|
|
788
949
|
# Collect the popen handles for the jobs we have to stop
|
|
789
|
-
popens:
|
|
950
|
+
popens: list[subprocess.Popen] = []
|
|
790
951
|
|
|
791
952
|
for jobID in jobIDs:
|
|
792
953
|
if jobID in self.runningJobs:
|
|
@@ -808,19 +969,21 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
808
969
|
# Wait for the daddy thread to collect them.
|
|
809
970
|
time.sleep(0.01)
|
|
810
971
|
|
|
811
|
-
def getIssuedBatchJobIDs(self) ->
|
|
972
|
+
def getIssuedBatchJobIDs(self) -> list[int]:
|
|
812
973
|
"""Just returns all the jobs that have been run, but not yet returned as updated."""
|
|
813
974
|
|
|
814
975
|
self._checkOnDaddy()
|
|
815
976
|
|
|
816
977
|
return list(self.jobs.keys())
|
|
817
978
|
|
|
818
|
-
def getRunningBatchJobIDs(self) ->
|
|
979
|
+
def getRunningBatchJobIDs(self) -> dict[int, float]:
|
|
819
980
|
|
|
820
981
|
self._checkOnDaddy()
|
|
821
982
|
|
|
822
983
|
now = time.time()
|
|
823
|
-
return {
|
|
984
|
+
return {
|
|
985
|
+
jobID: now - info.time for jobID, info in list(self.runningJobs.items())
|
|
986
|
+
}
|
|
824
987
|
|
|
825
988
|
def shutdown(self) -> None:
|
|
826
989
|
"""Terminate cleanly and join daddy thread."""
|
|
@@ -847,11 +1010,17 @@ class SingleMachineBatchSystem(BatchSystemSupport):
|
|
|
847
1010
|
|
|
848
1011
|
@classmethod
|
|
849
1012
|
def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
|
|
850
|
-
parser.add_argument(
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
1013
|
+
parser.add_argument(
|
|
1014
|
+
"--scale",
|
|
1015
|
+
dest="scale",
|
|
1016
|
+
type=float,
|
|
1017
|
+
default=1,
|
|
1018
|
+
action=make_open_interval_action(0.0),
|
|
1019
|
+
help="A scaling factor to change the value of all submitted tasks's submitted cores. "
|
|
1020
|
+
"Used in the single_machine batch system. Useful for running workflows on "
|
|
1021
|
+
"smaller machines than they were designed for, by setting a value less than 1. "
|
|
1022
|
+
"(default: %(default)s)",
|
|
1023
|
+
)
|
|
855
1024
|
|
|
856
1025
|
@classmethod
|
|
857
1026
|
def setOptions(cls, setOption: OptionSetter):
|
|
@@ -866,6 +1035,7 @@ class Info:
|
|
|
866
1035
|
(or None), the tuple of (coreFractions, memory, disk) it is using (or
|
|
867
1036
|
None), and whether the job is supposed to be being killed.
|
|
868
1037
|
"""
|
|
1038
|
+
|
|
869
1039
|
# Can't use namedtuple here since killIntended needs to be mutable
|
|
870
1040
|
def __init__(self, startTime, popen, resources, killIntended):
|
|
871
1041
|
self.time = startTime
|