toil 9.0.0__py3-none-any.whl → 9.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/batchSystems/abstractBatchSystem.py +13 -5
- toil/batchSystems/abstractGridEngineBatchSystem.py +17 -5
- toil/batchSystems/kubernetes.py +13 -2
- toil/batchSystems/mesos/batchSystem.py +33 -2
- toil/batchSystems/slurm.py +191 -16
- toil/cwl/cwltoil.py +17 -82
- toil/fileStores/__init__.py +1 -1
- toil/fileStores/abstractFileStore.py +5 -2
- toil/fileStores/cachingFileStore.py +1 -1
- toil/job.py +30 -14
- toil/jobStores/abstractJobStore.py +24 -19
- toil/jobStores/aws/jobStore.py +862 -1963
- toil/jobStores/aws/utils.py +24 -270
- toil/jobStores/googleJobStore.py +25 -9
- toil/jobStores/utils.py +0 -327
- toil/leader.py +27 -22
- toil/lib/aws/config.py +22 -0
- toil/lib/aws/s3.py +477 -9
- toil/lib/aws/utils.py +22 -33
- toil/lib/checksum.py +88 -0
- toil/lib/conversions.py +33 -31
- toil/lib/directory.py +217 -0
- toil/lib/ec2.py +97 -29
- toil/lib/exceptions.py +2 -1
- toil/lib/expando.py +2 -2
- toil/lib/generatedEC2Lists.py +73 -16
- toil/lib/io.py +33 -2
- toil/lib/memoize.py +21 -7
- toil/lib/pipes.py +385 -0
- toil/lib/retry.py +1 -1
- toil/lib/threading.py +1 -1
- toil/lib/web.py +4 -5
- toil/provisioners/__init__.py +5 -2
- toil/provisioners/aws/__init__.py +43 -36
- toil/provisioners/aws/awsProvisioner.py +22 -13
- toil/provisioners/node.py +60 -12
- toil/resource.py +3 -13
- toil/test/__init__.py +14 -16
- toil/test/batchSystems/test_slurm.py +103 -14
- toil/test/cwl/staging_cat.cwl +27 -0
- toil/test/cwl/staging_make_file.cwl +25 -0
- toil/test/cwl/staging_workflow.cwl +43 -0
- toil/test/cwl/zero_default.cwl +61 -0
- toil/test/docs/scripts/tutorial_staging.py +17 -8
- toil/test/jobStores/jobStoreTest.py +23 -133
- toil/test/lib/aws/test_iam.py +7 -7
- toil/test/lib/aws/test_s3.py +30 -33
- toil/test/lib/aws/test_utils.py +9 -9
- toil/test/provisioners/aws/awsProvisionerTest.py +59 -6
- toil/test/src/autoDeploymentTest.py +2 -3
- toil/test/src/fileStoreTest.py +89 -87
- toil/test/utils/ABCWorkflowDebug/ABC.txt +1 -0
- toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +4 -4
- toil/test/utils/toilKillTest.py +35 -28
- toil/test/wdl/md5sum/md5sum.json +1 -1
- toil/test/wdl/wdltoil_test.py +98 -38
- toil/test/wdl/wdltoil_test_kubernetes.py +9 -0
- toil/utils/toilDebugFile.py +6 -3
- toil/utils/toilStats.py +17 -2
- toil/version.py +6 -6
- toil/wdl/wdltoil.py +1032 -546
- toil/worker.py +5 -2
- {toil-9.0.0.dist-info → toil-9.1.0.dist-info}/METADATA +12 -12
- {toil-9.0.0.dist-info → toil-9.1.0.dist-info}/RECORD +68 -61
- toil/lib/iterables.py +0 -112
- toil/test/docs/scripts/stagingExampleFiles/in.txt +0 -1
- {toil-9.0.0.dist-info → toil-9.1.0.dist-info}/WHEEL +0 -0
- {toil-9.0.0.dist-info → toil-9.1.0.dist-info}/entry_points.txt +0 -0
- {toil-9.0.0.dist-info → toil-9.1.0.dist-info}/licenses/LICENSE +0 -0
- {toil-9.0.0.dist-info → toil-9.1.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/abstractBatchSystem.py
CHANGED

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from dataclasses import dataclass
 import enum
 import logging
 import os

@@ -72,10 +73,13 @@ class BatchJobExitReason(enum.IntEnum):
         except ValueError:
             return str(value)
 
-
-class UpdatedBatchJobInfo(NamedTuple):
+@dataclass
+class UpdatedBatchJobInfo:
     jobID: int
-    exitStatus: int
+    """
+    The Toil batch system ID of the job.
+    """
+    exitStatus: int = EXIT_STATUS_UNAVAILABLE_VALUE
     """
     The exit status (integer value) of the job. 0 implies successful.
 

@@ -83,8 +87,12 @@ class UpdatedBatchJobInfo(NamedTuple):
     (e.g. job is lost, or otherwise died but actual exit code was not reported).
     """
 
-    exitReason: Optional[BatchJobExitReason]
-    wallTime: Union[float, int, None]
+    exitReason: Optional[BatchJobExitReason] = None
+    wallTime: Union[float, int, None] = None
+    backing_id: Optional[str] = None
+    """
+    The identifier for the job in the backing scheduler, if available.
+    """
 
 
 # Information required for worker cleanup on shutdown of the batch system.

toil/batchSystems/abstractGridEngineBatchSystem.py
CHANGED

@@ -159,14 +159,21 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             logger.debug("Running %r", subLine)
             batchJobID = self.boss.with_retries(self.submitJob, subLine)
             if self.boss._outbox is not None:
-                # JobID corresponds to the toil version of the jobID,
+                # JobID corresponds to the toil version of the jobID,
+                # different from the jobstore's idea of the id. batchjobid
+                # is what we get from e.g. slurm
                 self.boss._outbox.publish(
                     ExternalBatchIdMessage(
                         jobID, batchJobID, self.boss.__class__.__name__
                     )
                 )
 
-            logger.
+            logger.info(
+                "Job %s with batch system ID %s queued as job %s",
+                jobName,
+                jobID,
+                str(batchJobID)
+            )
 
             # Store dict for mapping Toil job ID to batch job ID
             # TODO: Note that this currently stores a tuple of (batch system

@@ -251,8 +258,8 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
                         self.coalesce_job_exit_codes, batch_job_id_list
                     )
                     # We got the statuses as a batch
-                    for running_job_id, status in zip(running_job_list, statuses):
-                        activity = self._handle_job_status(running_job_id, status, activity)
+                    for running_job_id, status, backing_id in zip(running_job_list, statuses, batch_job_id_list):
+                        activity = self._handle_job_status(running_job_id, status, activity, backing_id)
 
         self._checkOnJobsCache = activity
         self._checkOnJobsTimestamp = datetime.now()

@@ -263,6 +270,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         job_id: int,
         status: Union[int, tuple[int, Optional[BatchJobExitReason]], None],
         activity: bool,
+        backing_id: str,
     ) -> bool:
         """
         Helper method for checkOnJobs to handle job statuses

@@ -275,7 +283,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             code, reason = status
             self.updatedJobsQueue.put(
                 UpdatedBatchJobInfo(
-                    jobID=job_id, exitStatus=code, exitReason=reason, wallTime=None
+                    jobID=job_id,
+                    exitStatus=code,
+                    exitReason=reason,
+                    wallTime=None,
+                    backing_id=backing_id,
                 )
             )
             self.forgetJob(job_id)
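The hunks above reshape UpdatedBatchJobInfo from a NamedTuple into a dataclass whose fields default to "unknown", and add a backing_id slot for the scheduler-side job identifier that the grid-engine worker now threads through _handle_job_status. A minimal standalone sketch of that shape (the sentinel value and the integer stand-in for BatchJobExitReason are illustrative placeholders, not Toil's definitions):

from dataclasses import dataclass
from typing import Optional, Union

# Assumed stand-in; Toil defines its own sentinel for "exit status unknown".
EXIT_STATUS_UNAVAILABLE_VALUE = 255


@dataclass
class UpdatedBatchJobInfo:
    jobID: int
    exitStatus: int = EXIT_STATUS_UNAVAILABLE_VALUE
    exitReason: Optional[int] = None  # Toil uses a BatchJobExitReason enum here
    wallTime: Union[float, int, None] = None
    backing_id: Optional[str] = None  # scheduler-side ID, e.g. a Slurm job number


# With defaults, a caller that only knows the Toil job ID can still build an
# update record, and a scheduler that knows its own ID can attach it.
print(UpdatedBatchJobInfo(jobID=42, exitStatus=0, backing_id="12345"))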
toil/batchSystems/kubernetes.py
CHANGED

@@ -37,6 +37,7 @@ from threading import Condition, Event, RLock, Thread
 from typing import Any, Callable, Literal, Optional, TypeVar, Union, cast, overload
 
 from toil.lib.conversions import opt_strtobool
+from toil.lib.throttle import LocalThrottle
 
 if sys.version_info < (3, 10):
     from typing_extensions import ParamSpec

@@ -281,6 +282,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
         # in the queue or any resource becomes available.
         self._work_available: Condition = Condition(lock=self._mutex)
 
+        # To make sure we don't spam the log when the metrics server is down,
+        # we use a throttle
+        self._metrics_throttle: LocalThrottle = LocalThrottle(600)
+
         self.schedulingThread: Thread = Thread(target=self._scheduler, daemon=True)
         self.schedulingThread.start()
 

@@ -1363,7 +1368,8 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
                 # This is the sort of error we would expect from an overloaded
                 # Kubernetes or a dead metrics service.
                 # We can't tell that the pod is stuck, so say that it isn't.
-
+                if self._metrics_throttle.throttle(False):
+                    logger.warning("Kubernetes metrics service is not available: %s", e)
                 return False
             else:
                 raise

@@ -1602,6 +1608,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
                     exitStatus=exitCode,
                     wallTime=runtime,
                     exitReason=exitReason,
+                    backing_id=jobObject.metadata.name,
                 )
 
                 if (exitReason == BatchJobExitReason.FAILED) or (

@@ -1855,7 +1862,11 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
 
             # Return the one finished job we found
             return UpdatedBatchJobInfo(
-                jobID=jobID, exitStatus=exitCode, wallTime=runtime, exitReason=None
+                jobID=jobID,
+                exitStatus=exitCode,
+                wallTime=runtime,
+                exitReason=None,
+                backing_id=jobObject.metadata.name,
             )
 
     def _waitForJobDeath(self, jobName: str) -> None:

toil/batchSystems/mesos/batchSystem.py
CHANGED

@@ -103,6 +103,9 @@ class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Scheduler):
         if config.mesos_framework_id is not None:
             self.mesos_framework_id = config.mesos_framework_id
 
+        # How long in seconds to wait to register before declaring Mesos unreachable.
+        self.mesos_timeout = 60
+
         # Written to when Mesos kills tasks, as directed by Toil.
         # Jobs must not enter this set until they are removed from runningJobMap.
         self.killedJobIds = set()

@@ -345,17 +348,38 @@ class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Scheduler):
         framework.roles = config.mesos_role
         framework.capabilities = [dict(type="MULTI_ROLE")]
 
+        endpoint = self._resolveAddress(self.mesos_endpoint)
+        log.info("Connecting to Mesos at %s...", self.mesos_endpoint)
+
         # Make the driver which implements most of the scheduler logic and calls back to us for the user-defined parts.
         # Make sure it will call us with nice namespace-y addicts
         self.driver = MesosSchedulerDriver(
             self,
             framework,
-            self._resolveAddress(self.mesos_endpoint),
+            endpoint,
             use_addict=True,
             implicit_acknowledgements=True,
         )
         self.driver.start()
 
+        wait_count = 0
+        while self.frameworkId is None:
+            # Wait to register with Mesos, and eventually fail if it just isn't
+            # responding.
+
+            # TODO: Use a condition instead of a spin wait.
+
+            if wait_count >= self.mesos_timeout:
+                error_message = f"Could not connect to Mesos endpoint at {self.mesos_endpoint}"
+                log.error(error_message)
+                self.shutdown()
+                raise RuntimeError(error_message)
+            elif wait_count > 1 and wait_count % 10 == 0:
+                log.warning("Waiting for Mesos registration (try %s/%s)", wait_count, self.mesos_timeout)
+            time.sleep(1)
+            wait_count += 1
+
+
     @staticmethod
     def _resolveAddress(address):
         """

@@ -394,10 +418,17 @@ class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Scheduler):
         """
         Invoked when the scheduler successfully registers with a Mesos master
         """
-        log.
+        log.info("Registered with Mesos as framework ID %s", frameworkId.value)
         # Save the framework ID
         self.frameworkId = frameworkId.value
 
+    def error(self, driver, message):
+        """
+        Invoked when Mesos reports an unrecoverable error.
+        """
+        log.error("Mesos error: %s", message)
+        super().error(driver, message)
+
     def _declineAllOffers(self, driver, offers):
         for offer in offers:
             driver.declineOffer(offer.id)
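The Kubernetes change rate-limits the "metrics service is not available" warning with a LocalThrottle(600) so a dead metrics server cannot flood the log. A generic sketch of that pattern, assuming only that throttle(wait=False) reports whether enough time has passed since the last allowed action (MinIntervalThrottle and report_metrics_error are illustrative names, not Toil APIs):

import time


class MinIntervalThrottle:
    """Allow an action at most once every min_interval seconds."""

    def __init__(self, min_interval: float) -> None:
        self.min_interval = min_interval
        self._last_allowed = float("-inf")

    def throttle(self, wait: bool = True) -> bool:
        """Return True if the caller may act now; never block when wait is False."""
        now = time.monotonic()
        elapsed = now - self._last_allowed
        if elapsed >= self.min_interval:
            self._last_allowed = now
            return True
        if wait:
            time.sleep(self.min_interval - elapsed)
            self._last_allowed = time.monotonic()
            return True
        return False


# Usage mirroring the batch system change: warn at most once per 10 minutes.
metrics_throttle = MinIntervalThrottle(600)


def report_metrics_error(err: Exception) -> None:
    if metrics_throttle.throttle(False):
        print(f"Kubernetes metrics service is not available: {err}")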
toil/batchSystems/slurm.py
CHANGED

@@ -18,9 +18,11 @@ import logging
 import math
 import os
 import sys
-from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
 import shlex
-
+
+from argparse import SUPPRESS, ArgumentParser, _ArgumentGroup
+from datetime import datetime, timedelta, timezone
+from typing import Callable, NamedTuple, Optional, TypeVar
 
 from toil.batchSystems.abstractBatchSystem import (
     EXIT_STATUS_UNAVAILABLE_VALUE,

@@ -350,9 +352,18 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
     ) -> list[int | tuple[int, BatchJobExitReason | None] | None]:
         """
         Collect all job exit codes in a single call.
-
-
-
+
+        :param batch_job_id_list: list of Job ID strings, where each string
+            has the form ``<job>[.<task>]``.
+
+        :return: list of job exit codes or exit code, exit reason pairs
+            associated with the list of job IDs.
+
+        :raises CalledProcessErrorStderr: if communicating with Slurm went
+            wrong.
+
+        :raises OSError: if job details are not available becasue a Slurm
+            command could not start.
         """
         logger.log(
             TRACE, "Getting exit codes for slurm jobs: %s", batch_job_id_list

@@ -387,15 +398,54 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
         Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
         Fetch job details from Slurm's accounting system or job control system.
         :param job_id_list: list of integer Job IDs.
-        :return: dict of job statuses, where key is the integer job ID, and
-
+        :return: dict of job statuses, where key is the integer job ID, and
+            value is a tuple containing the job's state and exit code.
+        :raises CalledProcessErrorStderr: if communicating with Slurm went
+            wrong.
+        :raises OSError: if job details are not available becasue a Slurm
+            command could not start.
         """
+
+        status_dict = {}
+        scontrol_problem: Optional[Exception] = None
+
+        try:
+            # Get all the job details we can from scontrol, which we think
+            # might be faster/less dangerous than sacct searching, even
+            # though it can't be aimed at more than one job.
+            status_dict.update(self._getJobDetailsFromScontrol(job_id_list))
+        except (CalledProcessErrorStderr, OSError) as e:
+            if isinstance(e, OSError):
+                logger.warning("Could not run scontrol: %s", e)
+            else:
+                logger.warning("Error from scontrol: %s", e)
+            scontrol_problem = e
+
+        logger.debug("After scontrol, got statuses: %s", status_dict)
+
+        # See what's not handy in scontrol (or everything if we couldn't
+        # call it).
+        sacct_job_id_list = self._remaining_jobs(job_id_list, status_dict)
+
+        logger.debug("Remaining jobs to find out about: %s", sacct_job_id_list)
+
         try:
-
+            # Ask sacct about those jobs
+            status_dict.update(self._getJobDetailsFromSacct(sacct_job_id_list))
         except (CalledProcessErrorStderr, OSError) as e:
             if isinstance(e, OSError):
                 logger.warning("Could not run sacct: %s", e)
-
+            else:
+                logger.warning("Error from sacct: %s", e)
+            if scontrol_problem is not None:
+                # Neither approach worked at all
+                raise
+
+        # One of the methods worked, so we have at least (None, None)
+        # values filled in for all jobs.
+        assert len(status_dict) == len(job_id_list)
+
+
         return status_dict
 
     def _get_job_return_code(

@@ -466,15 +516,123 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
 
         return state_token
 
+    def _remaining_jobs(self, job_id_list: list[int], job_details: dict[int, tuple[str | None, int | None]]) -> list[int]:
+        """
+        Given a list of job IDs and a list of job details (state and exit
+        code), get the list of job IDs where the details are (None, None)
+        (or are missing).
+        """
+        return [
+            j
+            for j in job_id_list
+            if job_details.get(j, (None, None)) == (None, None)
+        ]
+
     def _getJobDetailsFromSacct(
-        self, job_id_list: list[int]
+        self,
+        job_id_list: list[int],
+    ) -> dict[int, tuple[str | None, int | None]]:
+        """
+        Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
+
+        Handles querying manageable time periods until all jobs have information.
+
+        There is no guarantee of inter-job consistency: one job may really
+        finish after another, but we might see the earlier-finishing job
+        still running and the later-finishing job finished.
+
+        :param job_id_list: list of integer batch job IDs.
+        :return: dict of job statuses, where key is the job-id, and value
+            is a tuple containing the job's state and exit code. Jobs with
+            no information reported from Slurm will have (None, None).
+        """
+
+        # Pick a now
+        now = datetime.now().astimezone(None)
+        # Decide when to start the search (first copy of past midnight)
+        begin_time = now.replace(
+            hour=0,
+            minute=0,
+            second=0,
+            microsecond=0,
+            fold=0
+        )
+        # And when to end (a day after that)
+        end_time = begin_time + timedelta(days=1)
+        while end_time < now:
+            # If something goes really weird, advance up to our chosen now
+            end_time += timedelta(days=1)
+        # If we don't go around the loop at least once, we might end up
+        # with an empty dict being returned, which shouldn't happen. We
+        # need the (None, None) entries for jobs we can't find.
+        assert end_time >= self.boss.start_time
+
+        results: dict[int, tuple[str | None, int | None]] = {}
+
+        while len(job_id_list) > 0 and end_time >= self.boss.start_time:
+            # There are still jobs to look for and our search isn't
+            # exclusively for stuff that only existed before our workflow
+            # started.
+            results.update(
+                self._get_job_details_from_sacct_for_range(
+                    job_id_list,
+                    begin_time,
+                    end_time
+                )
+            )
+            job_id_list = self._remaining_jobs(job_id_list, results)
+            # If we have to search again, search the previous day. But
+            # overlap a tiny bit so the endpoints don't exactly match, in
+            # case Slurm is not working with inclusive intervals.
+            # TODO: is Slurm working with inclusive intervals?
+            end_time = begin_time + timedelta(seconds=1)
+            begin_time = end_time - timedelta(days=1, seconds=1)
+
+
+        if end_time < self.boss.start_time and len(job_id_list) > 0:
+            # This is suspicious.
+            logger.warning(
+                "Could not find any information from sacct after "
+                "workflow start at %s about jobs: %s",
+                self.boss.start_time.isoformat(),
+                job_id_list
+            )
+
+        return results
+
+    def _get_job_details_from_sacct_for_range(
+        self,
+        job_id_list: list[int],
+        begin_time: datetime,
+        end_time: datetime,
     ) -> dict[int, tuple[str | None, int | None]]:
         """
         Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
+
+        Internally, Slurm's accounting thinks in wall clock time, so for
+        efficiency you need to only search relevant real-time periods.
+
         :param job_id_list: list of integer batch job IDs.
-        :
-
+        :param begin_time: An aware datetime of the earliest time to search
+        :param end_time: An aware datetime of the latest time to search
+        :return: dict of job statuses, where key is the job-id, and value
+            is a tuple containing the job's state and exit code. Jobs with
+            no information reported from Slurm will have (None, None).
         """
+
+        assert begin_time.tzinfo is not None, "begin_time must be aware"
+        assert end_time.tzinfo is not None, "end_time must be aware"
+        def stringify(t: datetime) -> str:
+            """
+            Convert an aware time local time, and format it *without* a
+            trailing time zone indicator.
+            """
+            # TODO: What happens when we get an aware time that's ambiguous
+            # in local time? Or when the local timezone changes while we're
+            # sending things to Slurm or doing a progressive search back?
+            naive_t = t.astimezone(None).replace(tzinfo=None)
+            return naive_t.isoformat(timespec="seconds")
+
         job_ids = ",".join(str(id) for id in job_id_list)
         args = [
             "sacct",

@@ -485,8 +643,10 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             "JobIDRaw,State,ExitCode", # specify output columns
             "-P", # separate columns with pipes
             "-S",
-
-
+            stringify(begin_time),
+            "-E",
+            stringify(end_time),
+        ]
 
         # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
         # job state and exit status. Initialize dict before processing output of `sacct`.

@@ -500,8 +660,20 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
             if len(job_id_list) == 1:
                 # 1 is too big, we can't recurse further, bail out
                 raise
-            job_statuses.update(
-
+            job_statuses.update(
+                self._get_job_details_from_sacct_for_range(
+                    job_id_list[:len(job_id_list)//2],
+                    begin_time,
+                    end_time,
+                )
+            )
+            job_statuses.update(
+                self._get_job_details_from_sacct_for_range(
+                    job_id_list[len(job_id_list)//2:],
+                    begin_time,
+                    end_time,
+                )
+            )
             return job_statuses
         else:
             raise

@@ -847,6 +1019,9 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
     ) -> None:
         super().__init__(config, maxCores, maxMemory, maxDisk)
         self.partitions = SlurmBatchSystem.PartitionSet()
+        # Record when the workflow started, so we know when to stop looking for
+        # jobs we ran.
+        self.start_time = datetime.now().astimezone(None)
 
         # Override issuing jobs so we can check if we need to use Slurm's magic
         # whole-node-memory feature.
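The new sacct path searches Slurm's accounting data one day-sized wall-clock window at a time, stepping backwards from today until every job is accounted for or the window predates the workflow's recorded start_time. A condensed sketch of that search loop, assuming an aware start_time and a query callback standing in for the sacct invocation (search_backwards and query are illustrative names; the shipped code splits this across _getJobDetailsFromSacct and _get_job_details_from_sacct_for_range):

from datetime import datetime, timedelta
from typing import Callable, Optional

JobDetails = dict[int, tuple[Optional[str], Optional[int]]]


def search_backwards(
    job_ids: list[int],
    start_time: datetime,  # aware datetime of when the workflow started
    query: Callable[[list[int], datetime, datetime], JobDetails],
) -> JobDetails:
    """Walk day-sized windows back from today until every job has details
    or the window falls entirely before start_time."""
    now = datetime.now().astimezone(None)
    begin = now.replace(hour=0, minute=0, second=0, microsecond=0)
    end = begin + timedelta(days=1)

    results: JobDetails = {}
    remaining = list(job_ids)
    while remaining and end >= start_time:
        results.update(query(remaining, begin, end))
        remaining = [j for j in remaining if results.get(j, (None, None)) == (None, None)]
        # Step back one day, overlapping a second so nothing falls between windows.
        end = begin + timedelta(seconds=1)
        begin = end - timedelta(days=1, seconds=1)
    return results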
toil/cwl/cwltoil.py
CHANGED

@@ -110,6 +110,11 @@ from toil.batchSystems.abstractBatchSystem import InsufficientSystemResources
 from toil.batchSystems.registry import DEFAULT_BATCH_SYSTEM
 from toil.common import Config, Toil, addOptions
 from toil.cwl import check_cwltool_version
+from toil.lib.directory import (
+    DirectoryContents,
+    decode_directory,
+    encode_directory,
+)
 from toil.lib.trs import resolve_workflow
 from toil.lib.misc import call_command
 from toil.provisioners.clusterScaler import JobTooBigError

@@ -1156,7 +1161,7 @@ class ToilCommandLineTool(ToilTool, cwltool.command_line_tool.CommandLineTool):
     """Subclass the cwltool command line tool to provide the custom ToilPathMapper."""
 
     def _initialworkdir(
-        self, j: cwltool.job.JobBase, builder: cwltool.builder.Builder
+        self, j: Optional[cwltool.job.JobBase], builder: cwltool.builder.Builder
     ) -> None:
         """
         Hook the InitialWorkDirRequirement setup to make sure that there are no

@@ -1166,6 +1171,9 @@ class ToilCommandLineTool(ToilTool, cwltool.command_line_tool.CommandLineTool):
         # Set up the initial work dir with all its files
         super()._initialworkdir(j, builder)
 
+        if j is None:
+            return  # Only testing
+
         # The initial work dir listing is now in j.generatefiles["listing"]
         # Also j.generatefiles is a CWL Directory.
         # So check the initial working directory.

@@ -1219,79 +1227,6 @@ def toil_make_tool(
 # URI instead of raising an error right away, in case it is optional.
 MISSING_FILE = "missing://"
 
-DirectoryContents = dict[str, Union[str, "DirectoryContents"]]
-
-
-def check_directory_dict_invariants(contents: DirectoryContents) -> None:
-    """
-    Make sure a directory structure dict makes sense. Throws an error
-    otherwise.
-
-    Currently just checks to make sure no empty-string keys exist.
-    """
-
-    for name, item in contents.items():
-        if name == "":
-            raise RuntimeError(
-                "Found nameless entry in directory: " + json.dumps(contents, indent=2)
-            )
-        if isinstance(item, dict):
-            check_directory_dict_invariants(item)
-
-
-def decode_directory(
-    dir_path: str,
-) -> tuple[DirectoryContents, Optional[str], str]:
-    """
-    Decode a directory from a "toildir:" path to a directory (or a file in it).
-
-    Returns the decoded directory dict, the remaining part of the path (which may be
-    None), and the deduplication key string that uniquely identifies the
-    directory.
-    """
-    if not dir_path.startswith("toildir:"):
-        raise RuntimeError(f"Cannot decode non-directory path: {dir_path}")
-
-    # We will decode the directory and then look inside it
-
-    # Since this was encoded by upload_directory we know the
-    # next piece is encoded JSON describing the directory structure,
-    # and it can't contain any slashes.
-    parts = dir_path[len("toildir:") :].split("/", 1)
-
-    # Before the first slash is the encoded data describing the directory contents
-    dir_data = parts[0]
-
-    # Decode what to download
-    contents = json.loads(
-        base64.urlsafe_b64decode(dir_data.encode("utf-8")).decode("utf-8")
-    )
-
-    check_directory_dict_invariants(contents)
-
-    if len(parts) == 1 or parts[1] == "/":
-        # We didn't have any subdirectory
-        return contents, None, dir_data
-    else:
-        # We have a path below this
-        return contents, parts[1], dir_data
-
-
-def encode_directory(contents: DirectoryContents) -> str:
-    """
-    Encode a directory from a "toildir:" path to a directory (or a file in it).
-
-    Takes the directory dict, which is a dict from name to URI for a file or
-    dict for a subdirectory.
-    """
-
-    check_directory_dict_invariants(contents)
-
-    return "toildir:" + base64.urlsafe_b64encode(
-        json.dumps(contents).encode("utf-8")
-    ).decode("utf-8")
-
-
 class ToilFsAccess(StdFsAccess):
     """
     Custom filesystem access class which handles toil filestore references.

@@ -1360,7 +1295,7 @@ class ToilFsAccess(StdFsAccess):
 
         # Decode its contents, the path inside it to the file (if any), and
         # the key to use for caching the directory.
-        contents, subpath, cache_key = decode_directory(path)
+        contents, subpath, cache_key, _, _ = decode_directory(path)
         logger.debug("Decoded directory contents: %s", contents)
 
         if cache_key not in self.dir_to_download:

@@ -1462,7 +1397,7 @@ class ToilFsAccess(StdFsAccess):
             # Handle local files
             return open(self._abs(fn), mode)
         elif parse.scheme == "toildir":
-            contents, subpath, cache_key = decode_directory(fn)
+            contents, subpath, cache_key, _, _ = decode_directory(fn)
             if cache_key in self.dir_to_download:
                 # This is already available locally, so fall back on the local copy
                 return open(self._abs(fn), mode)

@@ -1503,7 +1438,7 @@ class ToilFsAccess(StdFsAccess):
             except NoSuchFileException:
                 return False
         elif parse.scheme == "toildir":
-            contents, subpath, cache_key = decode_directory(path)
+            contents, subpath, cache_key, _, _ = decode_directory(path)
             if subpath is None:
                 # The toildir directory itself exists
                 return True

@@ -1530,7 +1465,7 @@ class ToilFsAccess(StdFsAccess):
         elif parse.scheme == "toildir":
             # Decode its contents, the path inside it to the file (if any), and
             # the key to use for caching the directory.
-            contents, subpath, cache_key = decode_directory(path)
+            contents, subpath, cache_key, _, _ = decode_directory(path)
 
             # We can't get the size of just a directory.
             if subpath is None:

@@ -1564,7 +1499,7 @@ class ToilFsAccess(StdFsAccess):
             # TODO: we assume CWL can't call deleteGlobalFile and so the file always exists
             return True
         elif parse.scheme == "toildir":
-            contents, subpath, cache_key = decode_directory(fn)
+            contents, subpath, cache_key, _, _ = decode_directory(fn)
             if subpath is None:
                 # This is the toildir directory itself
                 return False

@@ -1583,7 +1518,7 @@ class ToilFsAccess(StdFsAccess):
         elif parse.scheme == "toilfile":
             return False
         elif parse.scheme == "toildir":
-            contents, subpath, cache_key = decode_directory(fn)
+            contents, subpath, cache_key, _, _ = decode_directory(fn)
             if subpath is None:
                 # This is the toildir directory itself.
                 # TODO: We assume directories can't be deleted.

@@ -1611,7 +1546,7 @@ class ToilFsAccess(StdFsAccess):
         elif parse.scheme == "toilfile":
             raise RuntimeError(f"Cannot list a file: {fn}")
         elif parse.scheme == "toildir":
-            contents, subpath, cache_key = decode_directory(fn)
+            contents, subpath, cache_key, _, _ = decode_directory(fn)
             here = contents
             if subpath is not None:
                 got = get_from_structure(contents, subpath)

@@ -2402,7 +2337,7 @@ def toilStageFiles(
 
         if file_id_or_contents.startswith("toildir:"):
             # Get the directory contents and the path into them, if any
-            here, subpath, _ = decode_directory(file_id_or_contents)
+            here, subpath, _, _, _ = decode_directory(file_id_or_contents)
             if subpath is not None:
                 for part in subpath.split("/"):
                     here = cast(DirectoryContents, here[part])
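The directory helpers deleted above move into the new toil/lib/directory.py module added in this release, and decode_directory now returns five values: every call site in this file switches to unpacking contents, subpath, cache_key, _, _. A sketch of the basic "toildir:" encode/decode round trip, based on the removed three-value version shown above; the two extra return values of the 9.1.0 helper are not reproduced here:

import base64
import json
from typing import Optional, Union

DirectoryContents = dict[str, Union[str, "DirectoryContents"]]


def encode_directory(contents: DirectoryContents) -> str:
    """Pack a name -> URI (or nested dict) structure into a "toildir:" reference."""
    return "toildir:" + base64.urlsafe_b64encode(
        json.dumps(contents).encode("utf-8")
    ).decode("utf-8")


def decode_directory(dir_path: str) -> tuple[DirectoryContents, Optional[str], str]:
    """Unpack a "toildir:" reference into (contents, path inside it or None, dedup key)."""
    if not dir_path.startswith("toildir:"):
        raise RuntimeError(f"Cannot decode non-directory path: {dir_path}")
    dir_data, _, subpath = dir_path[len("toildir:"):].partition("/")
    contents = json.loads(base64.urlsafe_b64decode(dir_data.encode("utf-8")).decode("utf-8"))
    return contents, subpath or None, dir_data


ref = encode_directory({"a.txt": "toilfile:xyz", "sub": {"b.txt": "toilfile:abc"}})
print(decode_directory(ref + "/sub/b.txt"))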
toil/fileStores/__init__.py
CHANGED

@@ -28,7 +28,7 @@ class FileID(str):
     the job store if unavailable in the ID.
     """
 
-    def __new__(cls, fileStoreID: str, *args: Any) -> "FileID":
+    def __new__(cls, fileStoreID: str, *args: Any, **kwargs: dict[str, Any]) -> "FileID":
        return super().__new__(cls, fileStoreID)
 
     def __init__(self, fileStoreID: str, size: int, executable: bool = False) -> None: