toil 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +18 -13
- toil/batchSystems/abstractBatchSystem.py +39 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +24 -24
- toil/batchSystems/awsBatch.py +14 -14
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/contained_executor.py +3 -3
- toil/batchSystems/htcondor.py +0 -1
- toil/batchSystems/kubernetes.py +34 -31
- toil/batchSystems/local_support.py +3 -1
- toil/batchSystems/lsf.py +7 -7
- toil/batchSystems/mesos/batchSystem.py +7 -7
- toil/batchSystems/options.py +32 -83
- toil/batchSystems/registry.py +104 -23
- toil/batchSystems/singleMachine.py +16 -13
- toil/batchSystems/slurm.py +87 -16
- toil/batchSystems/torque.py +0 -1
- toil/bus.py +44 -8
- toil/common.py +544 -753
- toil/cwl/__init__.py +28 -32
- toil/cwl/cwltoil.py +595 -574
- toil/cwl/utils.py +55 -10
- toil/exceptions.py +1 -1
- toil/fileStores/__init__.py +2 -2
- toil/fileStores/abstractFileStore.py +88 -14
- toil/fileStores/cachingFileStore.py +610 -549
- toil/fileStores/nonCachingFileStore.py +46 -22
- toil/job.py +182 -101
- toil/jobStores/abstractJobStore.py +161 -95
- toil/jobStores/aws/jobStore.py +23 -9
- toil/jobStores/aws/utils.py +6 -6
- toil/jobStores/fileJobStore.py +116 -18
- toil/jobStores/googleJobStore.py +16 -7
- toil/jobStores/utils.py +5 -6
- toil/leader.py +87 -56
- toil/lib/accelerators.py +10 -5
- toil/lib/aws/__init__.py +3 -14
- toil/lib/aws/ami.py +22 -9
- toil/lib/aws/iam.py +21 -13
- toil/lib/aws/session.py +2 -16
- toil/lib/aws/utils.py +4 -5
- toil/lib/compatibility.py +1 -1
- toil/lib/conversions.py +26 -3
- toil/lib/docker.py +22 -23
- toil/lib/ec2.py +10 -6
- toil/lib/ec2nodes.py +106 -100
- toil/lib/encryption/_nacl.py +2 -1
- toil/lib/generatedEC2Lists.py +325 -18
- toil/lib/io.py +49 -2
- toil/lib/misc.py +1 -1
- toil/lib/resources.py +9 -2
- toil/lib/threading.py +101 -38
- toil/options/common.py +736 -0
- toil/options/cwl.py +336 -0
- toil/options/wdl.py +37 -0
- toil/provisioners/abstractProvisioner.py +9 -4
- toil/provisioners/aws/__init__.py +3 -6
- toil/provisioners/aws/awsProvisioner.py +6 -0
- toil/provisioners/clusterScaler.py +3 -2
- toil/provisioners/gceProvisioner.py +2 -2
- toil/realtimeLogger.py +2 -1
- toil/resource.py +24 -18
- toil/server/app.py +2 -3
- toil/server/cli/wes_cwl_runner.py +4 -4
- toil/server/utils.py +1 -1
- toil/server/wes/abstract_backend.py +3 -2
- toil/server/wes/amazon_wes_utils.py +5 -4
- toil/server/wes/tasks.py +2 -3
- toil/server/wes/toil_backend.py +2 -10
- toil/server/wsgi_app.py +2 -0
- toil/serviceManager.py +12 -10
- toil/statsAndLogging.py +41 -9
- toil/test/__init__.py +29 -54
- toil/test/batchSystems/batchSystemTest.py +11 -111
- toil/test/batchSystems/test_slurm.py +24 -8
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +438 -223
- toil/test/cwl/glob_dir.cwl +15 -0
- toil/test/cwl/preemptible.cwl +21 -0
- toil/test/cwl/preemptible_expression.cwl +28 -0
- toil/test/cwl/revsort.cwl +1 -1
- toil/test/cwl/revsort2.cwl +1 -1
- toil/test/docs/scriptsTest.py +2 -3
- toil/test/jobStores/jobStoreTest.py +34 -21
- toil/test/lib/aws/test_iam.py +4 -14
- toil/test/lib/aws/test_utils.py +0 -3
- toil/test/lib/dockerTest.py +4 -4
- toil/test/lib/test_ec2.py +12 -17
- toil/test/mesos/helloWorld.py +4 -5
- toil/test/mesos/stress.py +1 -1
- toil/test/{wdl/conftest.py → options/__init__.py} +0 -10
- toil/test/options/options.py +37 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
- toil/test/provisioners/clusterScalerTest.py +6 -4
- toil/test/provisioners/clusterTest.py +23 -11
- toil/test/provisioners/gceProvisionerTest.py +0 -6
- toil/test/provisioners/restartScript.py +3 -2
- toil/test/server/serverTest.py +1 -1
- toil/test/sort/restart_sort.py +2 -1
- toil/test/sort/sort.py +2 -1
- toil/test/sort/sortTest.py +2 -13
- toil/test/src/autoDeploymentTest.py +45 -45
- toil/test/src/busTest.py +5 -5
- toil/test/src/checkpointTest.py +2 -2
- toil/test/src/deferredFunctionTest.py +1 -1
- toil/test/src/fileStoreTest.py +32 -16
- toil/test/src/helloWorldTest.py +1 -1
- toil/test/src/importExportFileTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +2 -1
- toil/test/src/jobServiceTest.py +1 -1
- toil/test/src/jobTest.py +18 -18
- toil/test/src/miscTests.py +5 -3
- toil/test/src/promisedRequirementTest.py +3 -3
- toil/test/src/realtimeLoggerTest.py +1 -1
- toil/test/src/resourceTest.py +2 -2
- toil/test/src/restartDAGTest.py +1 -1
- toil/test/src/resumabilityTest.py +36 -2
- toil/test/src/retainTempDirTest.py +1 -1
- toil/test/src/systemTest.py +2 -2
- toil/test/src/toilContextManagerTest.py +2 -2
- toil/test/src/userDefinedJobArgTypeTest.py +1 -1
- toil/test/utils/toilDebugTest.py +98 -32
- toil/test/utils/toilKillTest.py +2 -2
- toil/test/utils/utilsTest.py +23 -3
- toil/test/wdl/wdltoil_test.py +223 -45
- toil/toilState.py +7 -6
- toil/utils/toilClean.py +1 -1
- toil/utils/toilConfig.py +36 -0
- toil/utils/toilDebugFile.py +60 -33
- toil/utils/toilDebugJob.py +39 -12
- toil/utils/toilDestroyCluster.py +1 -1
- toil/utils/toilKill.py +1 -1
- toil/utils/toilLaunchCluster.py +13 -2
- toil/utils/toilMain.py +3 -2
- toil/utils/toilRsyncCluster.py +1 -1
- toil/utils/toilSshCluster.py +1 -1
- toil/utils/toilStats.py +445 -305
- toil/utils/toilStatus.py +2 -5
- toil/version.py +10 -10
- toil/wdl/utils.py +2 -122
- toil/wdl/wdltoil.py +1257 -492
- toil/worker.py +55 -46
- toil-6.1.0.dist-info/METADATA +124 -0
- toil-6.1.0.dist-info/RECORD +241 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/WHEEL +1 -1
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -1
- toil/batchSystems/parasol.py +0 -379
- toil/batchSystems/tes.py +0 -459
- toil/test/batchSystems/parasolTestSupport.py +0 -117
- toil/test/wdl/builtinTest.py +0 -506
- toil/test/wdl/toilwdlTest.py +0 -522
- toil/wdl/toilwdl.py +0 -141
- toil/wdl/versions/dev.py +0 -107
- toil/wdl/versions/draft2.py +0 -980
- toil/wdl/versions/v1.py +0 -794
- toil/wdl/wdl_analysis.py +0 -116
- toil/wdl/wdl_functions.py +0 -997
- toil/wdl/wdl_synthesis.py +0 -1011
- toil/wdl/wdl_types.py +0 -243
- toil-5.12.0.dist-info/METADATA +0 -118
- toil-5.12.0.dist-info/RECORD +0 -244
- /toil/{wdl/versions → options}/__init__.py +0 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
toil/leader.py
CHANGED
|
@@ -28,22 +28,24 @@ import enlighten
|
|
|
28
28
|
from toil import resolveEntryPoint
|
|
29
29
|
from toil.batchSystems import DeadlockException
|
|
30
30
|
from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem,
|
|
31
|
-
BatchJobExitReason
|
|
32
|
-
|
|
33
|
-
|
|
31
|
+
BatchJobExitReason,
|
|
32
|
+
EXIT_STATUS_UNAVAILABLE_VALUE)
|
|
33
|
+
from toil.bus import (JobCompletedMessage,
|
|
34
34
|
JobFailedMessage,
|
|
35
35
|
JobIssuedMessage,
|
|
36
36
|
JobMissingMessage,
|
|
37
37
|
JobUpdatedMessage,
|
|
38
|
-
QueueSizeMessage
|
|
39
|
-
|
|
38
|
+
QueueSizeMessage,
|
|
39
|
+
gen_message_bus_path,
|
|
40
|
+
get_job_kind)
|
|
41
|
+
from toil.common import Config, ToilMetrics
|
|
40
42
|
from toil.cwl.utils import CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
|
|
43
|
+
from toil.exceptions import FailedJobsException
|
|
41
44
|
from toil.job import (CheckpointJobDescription,
|
|
42
45
|
JobDescription,
|
|
43
46
|
ServiceJobDescription,
|
|
44
47
|
TemporaryID)
|
|
45
48
|
from toil.jobStores.abstractJobStore import (AbstractJobStore,
|
|
46
|
-
NoSuchFileException,
|
|
47
49
|
NoSuchJobException)
|
|
48
50
|
from toil.lib.throttle import LocalThrottle
|
|
49
51
|
from toil.provisioners.abstractProvisioner import AbstractProvisioner
|
|
@@ -51,7 +53,6 @@ from toil.provisioners.clusterScaler import ScalerThread
|
|
|
51
53
|
from toil.serviceManager import ServiceManager
|
|
52
54
|
from toil.statsAndLogging import StatsAndLogging
|
|
53
55
|
from toil.toilState import ToilState
|
|
54
|
-
from toil.exceptions import FailedJobsException
|
|
55
56
|
|
|
56
57
|
logger = logging.getLogger(__name__)
|
|
57
58
|
|
|
@@ -115,10 +116,14 @@ class Leader:
|
|
|
115
116
|
# state change information about jobs.
|
|
116
117
|
self.toilState = ToilState(self.jobStore)
|
|
117
118
|
|
|
118
|
-
if self.config.write_messages is
|
|
119
|
-
#
|
|
120
|
-
#
|
|
121
|
-
self.
|
|
119
|
+
if self.config.write_messages is None:
|
|
120
|
+
# The user hasn't specified a place for the message bus so we
|
|
121
|
+
# should make one.
|
|
122
|
+
self.config.write_messages = gen_message_bus_path()
|
|
123
|
+
|
|
124
|
+
# Message bus messages need to go to the given file.
|
|
125
|
+
# Keep a reference to the return value so the listener stays alive.
|
|
126
|
+
self._message_subscription = self.toilState.bus.connect_output_file(self.config.write_messages)
|
|
122
127
|
|
|
123
128
|
# Connect to the message bus, so we will get all the messages of these
|
|
124
129
|
# types in an inbox.
|
|
@@ -138,7 +143,8 @@ class Leader:
|
|
|
138
143
|
|
|
139
144
|
# Batch system
|
|
140
145
|
self.batchSystem = batchSystem
|
|
141
|
-
|
|
146
|
+
if len(self.batchSystem.getIssuedBatchJobIDs()) != 0:
|
|
147
|
+
raise RuntimeError("The initialized batchsystem did not start with 0 active jobs.")
|
|
142
148
|
logger.debug("Checked batch system has no running jobs and no updated jobs")
|
|
143
149
|
|
|
144
150
|
# Map of batch system IDs to job store IDs
|
|
@@ -370,7 +376,8 @@ class Leader:
|
|
|
370
376
|
|
|
371
377
|
# If the successor job's predecessors have all not all completed then
|
|
372
378
|
# ignore the successor as is not yet ready to run
|
|
373
|
-
|
|
379
|
+
if len(successor.predecessorsFinished) > successor.predecessorNumber:
|
|
380
|
+
raise RuntimeError("There are more finished predecessors than possible.")
|
|
374
381
|
if len(successor.predecessorsFinished) == successor.predecessorNumber:
|
|
375
382
|
# All the successor's predecessors are done now.
|
|
376
383
|
# Remove the successor job from the set of waiting multi-predecessor jobs.
|
|
@@ -391,8 +398,10 @@ class Leader:
|
|
|
391
398
|
#Build map from successor to predecessors.
|
|
392
399
|
if successor_id not in self.toilState.successor_to_predecessors:
|
|
393
400
|
self.toilState.successor_to_predecessors[successor_id] = set()
|
|
394
|
-
|
|
395
|
-
|
|
401
|
+
if not isinstance(successor_id, str):
|
|
402
|
+
raise RuntimeError("The given successor ID is invalid.")
|
|
403
|
+
if not isinstance(predecessor_id, str):
|
|
404
|
+
raise RuntimeError("The given predecessor ID is invalid.")
|
|
396
405
|
self.toilState.successor_to_predecessors[successor_id].add(predecessor_id)
|
|
397
406
|
|
|
398
407
|
# Grab the successor
|
|
@@ -423,7 +432,8 @@ class Leader:
|
|
|
423
432
|
predecessor_id, len(next_successors))
|
|
424
433
|
#Record the number of successors that must be completed before
|
|
425
434
|
#the job can be considered again
|
|
426
|
-
|
|
435
|
+
if self.toilState.count_pending_successors(predecessor_id) != 0:
|
|
436
|
+
raise RuntimeError('Attempted to schedule successors of the same job twice!')
|
|
427
437
|
self.toilState.successors_pending(predecessor_id, len(next_successors))
|
|
428
438
|
|
|
429
439
|
# For each successor schedule if all predecessors have been completed
|
|
@@ -534,11 +544,13 @@ class Leader:
|
|
|
534
544
|
# the job has services to run, which have not been started, start them
|
|
535
545
|
# Build a map from the service jobs to the job and a map
|
|
536
546
|
# of the services created for the job
|
|
537
|
-
|
|
547
|
+
if readyJob.jobStoreID in self.toilState.servicesIssued:
|
|
548
|
+
raise RuntimeError(f"The ready job: {readyJob.jobStoreID} was already issued.")
|
|
538
549
|
self.toilState.servicesIssued[readyJob.jobStoreID] = set()
|
|
539
550
|
for serviceJobList in readyJob.serviceHostIDsInBatches():
|
|
540
551
|
for serviceID in serviceJobList:
|
|
541
|
-
|
|
552
|
+
if serviceID in self.toilState.service_to_client:
|
|
553
|
+
raise RuntimeError(f"The ready service ID: {serviceID} was already added.")
|
|
542
554
|
self.toilState.reset_job(serviceID)
|
|
543
555
|
serviceHost = self.toilState.get_job(serviceID)
|
|
544
556
|
self.toilState.service_to_client[serviceID] = readyJob.jobStoreID
|
|
@@ -675,7 +687,8 @@ class Leader:
|
|
|
675
687
|
client = self.toilState.get_job(client_id)
|
|
676
688
|
|
|
677
689
|
# Make sure services still want to run
|
|
678
|
-
|
|
690
|
+
if next(client.serviceHostIDsInBatches(), None) is None:
|
|
691
|
+
raise RuntimeError("No more services want to run.")
|
|
679
692
|
|
|
680
693
|
# Mark the service job updated so we don't stop here.
|
|
681
694
|
self._messages.publish(JobUpdatedMessage(client_id, 1))
|
|
@@ -694,8 +707,9 @@ class Leader:
|
|
|
694
707
|
if exitStatus == 0:
|
|
695
708
|
logger.debug('Job ended: %s', updatedJob)
|
|
696
709
|
else:
|
|
697
|
-
|
|
698
|
-
|
|
710
|
+
status_string = str(exitStatus) if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE else "<UNAVAILABLE>"
|
|
711
|
+
logger.warning(f'Job failed with exit value {status_string}: {updatedJob}\n'
|
|
712
|
+
f'Exit reason: {BatchJobExitReason.to_string(exitReason)}')
|
|
699
713
|
if exitStatus == CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE:
|
|
700
714
|
# This is a CWL job informing us that the workflow is
|
|
701
715
|
# asking things of us that Toil can't do. When we raise an
|
|
@@ -704,7 +718,7 @@ class Leader:
|
|
|
704
718
|
logger.warning("This indicates an unsupported CWL requirement!")
|
|
705
719
|
self.recommended_fail_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
|
|
706
720
|
# Tell everyone it stopped running.
|
|
707
|
-
self._messages.publish(JobCompletedMessage(updatedJob.
|
|
721
|
+
self._messages.publish(JobCompletedMessage(get_job_kind(updatedJob.get_names()), updatedJob.jobStoreID, exitStatus))
|
|
708
722
|
self.process_finished_job(bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason)
|
|
709
723
|
|
|
710
724
|
def _processLostJobs(self):
|
|
@@ -784,13 +798,16 @@ class Leader:
|
|
|
784
798
|
logger.debug("Finished the main loop: no jobs left to run.")
|
|
785
799
|
|
|
786
800
|
# Consistency check the toil state
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
801
|
+
if not self._messages.empty():
|
|
802
|
+
raise RuntimeError(f"Pending messages at shutdown: {self._messages}")
|
|
803
|
+
if self.toilState.successorCounts != {}:
|
|
804
|
+
raise RuntimeError(f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}")
|
|
805
|
+
if self.toilState.successor_to_predecessors != {}:
|
|
806
|
+
raise RuntimeError(f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}")
|
|
807
|
+
if self.toilState.service_to_client != {}:
|
|
808
|
+
raise RuntimeError(f"Services pending for their clients at shutdown: {self.toilState.service_to_client}")
|
|
809
|
+
if self.toilState.servicesIssued != {}:
|
|
810
|
+
raise RuntimeError(f"Services running at shutdown: {self.toilState.servicesIssued}")
|
|
794
811
|
|
|
795
812
|
def checkForDeadlocks(self):
|
|
796
813
|
"""Check if the system is deadlocked running service jobs."""
|
|
@@ -865,8 +882,8 @@ class Leader:
|
|
|
865
882
|
def issueJob(self, jobNode: JobDescription) -> None:
|
|
866
883
|
"""Add a job to the queue of jobs currently trying to run."""
|
|
867
884
|
# Never issue the same job multiple times simultaneously
|
|
868
|
-
|
|
869
|
-
f"Attempted to issue {jobNode} multiple times simultaneously!"
|
|
885
|
+
if jobNode.jobStoreID in self.toilState.jobs_issued:
|
|
886
|
+
raise RuntimeError(f"Attempted to issue {jobNode} multiple times simultaneously!")
|
|
870
887
|
|
|
871
888
|
workerCommand = [resolveEntryPoint('_toil_worker'),
|
|
872
889
|
jobNode.jobName,
|
|
@@ -907,7 +924,7 @@ class Leader:
|
|
|
907
924
|
"%s and %s",
|
|
908
925
|
jobNode, str(jobBatchSystemID), jobNode.requirements_string())
|
|
909
926
|
# Tell everyone it is issued and the queue size changed
|
|
910
|
-
self._messages.publish(JobIssuedMessage(jobNode.
|
|
927
|
+
self._messages.publish(JobIssuedMessage(get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID))
|
|
911
928
|
self._messages.publish(QueueSizeMessage(self.getNumberOfJobsIssued()))
|
|
912
929
|
# Tell the user there's another job to do
|
|
913
930
|
self.progress_overall.total += 1
|
|
@@ -926,7 +943,8 @@ class Leader:
|
|
|
926
943
|
"""
|
|
927
944
|
# Grab the service job description
|
|
928
945
|
service = self.toilState.get_job(service_id)
|
|
929
|
-
|
|
946
|
+
if not isinstance(service, ServiceJobDescription):
|
|
947
|
+
raise RuntimeError("The grabbed service job description is not the right type.")
|
|
930
948
|
|
|
931
949
|
if service.preemptible:
|
|
932
950
|
self.preemptibleServiceJobsToBeIssued.append(service_id)
|
|
@@ -956,7 +974,8 @@ class Leader:
|
|
|
956
974
|
elif preemptible:
|
|
957
975
|
return self.preemptibleJobsIssued
|
|
958
976
|
else:
|
|
959
|
-
|
|
977
|
+
if len(self.issued_jobs_by_batch_system_id) < self.preemptibleJobsIssued:
|
|
978
|
+
raise RuntimeError("Number of jobs issued cannot be negative.")
|
|
960
979
|
return len(self.issued_jobs_by_batch_system_id) - self.preemptibleJobsIssued
|
|
961
980
|
|
|
962
981
|
def _getStatusHint(self) -> str:
|
|
@@ -990,16 +1009,19 @@ class Leader:
|
|
|
990
1009
|
|
|
991
1010
|
:return: Job description as it was issued.
|
|
992
1011
|
"""
|
|
993
|
-
|
|
1012
|
+
if jobBatchSystemID not in self.issued_jobs_by_batch_system_id:
|
|
1013
|
+
raise RuntimeError("Job was already removed or was never issued.")
|
|
994
1014
|
issuedDesc = self.toilState.get_job(self.issued_jobs_by_batch_system_id[jobBatchSystemID])
|
|
995
1015
|
if issuedDesc.preemptible:
|
|
996
1016
|
# len(issued_jobs_by_batch_system_id) should always be greater than or equal to preemptibleJobsIssued,
|
|
997
1017
|
# so decrement this value before removing the job from the issuedJob map
|
|
998
|
-
|
|
1018
|
+
if self.preemptibleJobsIssued <= 0:
|
|
1019
|
+
raise RuntimeError("The number of preemptive issued jobs cannot be negative.")
|
|
999
1020
|
self.preemptibleJobsIssued -= 1
|
|
1000
1021
|
# It's not issued anymore.
|
|
1001
1022
|
del self.issued_jobs_by_batch_system_id[jobBatchSystemID]
|
|
1002
|
-
|
|
1023
|
+
if issuedDesc.jobStoreID not in self.toilState.jobs_issued:
|
|
1024
|
+
raise RuntimeError(f"Job {issuedDesc} came back without being issued")
|
|
1003
1025
|
self.toilState.jobs_issued.remove(issuedDesc.jobStoreID)
|
|
1004
1026
|
# If service job
|
|
1005
1027
|
if issuedDesc.jobStoreID in self.toilState.service_to_client:
|
|
@@ -1090,8 +1112,9 @@ class Leader:
|
|
|
1090
1112
|
for jobBatchSystemID in missingJobIDsSet.difference(jobBatchSystemIDsSet):
|
|
1091
1113
|
self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
|
|
1092
1114
|
logger.warning("Batch system id: %s is no longer missing", str(jobBatchSystemID))
|
|
1093
|
-
|
|
1094
|
-
|
|
1115
|
+
# checks we have no unexpected jobs running
|
|
1116
|
+
if not issuedJobs.issubset(jobBatchSystemIDsSet):
|
|
1117
|
+
raise RuntimeError("An unexpected job is still running.")
|
|
1095
1118
|
jobsToKill = []
|
|
1096
1119
|
for jobBatchSystemID in set(jobBatchSystemIDsSet.difference(issuedJobs)):
|
|
1097
1120
|
jobStoreID = self.issued_jobs_by_batch_system_id[jobBatchSystemID]
|
|
@@ -1137,7 +1160,7 @@ class Leader:
|
|
|
1137
1160
|
self.progress_overall.update(incr=-1)
|
|
1138
1161
|
self.progress_failed.update(incr=1)
|
|
1139
1162
|
|
|
1140
|
-
# Delegate to the
|
|
1163
|
+
# Delegate to the version that uses a JobDescription
|
|
1141
1164
|
return self.process_finished_job_description(issued_job, result_status, wall_time, exit_reason, batch_system_id)
|
|
1142
1165
|
|
|
1143
1166
|
def process_finished_job_description(self, finished_job: JobDescription, result_status: int,
|
|
@@ -1188,11 +1211,12 @@ class Leader:
|
|
|
1188
1211
|
# more memory efficient than read().striplines() while leaving off the
|
|
1189
1212
|
# trailing \n left when using readlines()
|
|
1190
1213
|
# http://stackoverflow.com/a/15233739
|
|
1191
|
-
StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
|
|
1214
|
+
StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
|
|
1192
1215
|
message='The job seems to have left a log file, indicating failure: %s' % replacement_job)
|
|
1193
1216
|
if self.config.writeLogs or self.config.writeLogsGzip:
|
|
1194
1217
|
with replacement_job.getLogFileHandle(self.jobStore) as log_stream:
|
|
1195
|
-
|
|
1218
|
+
# Send log data from the job store to each per-job log file involved.
|
|
1219
|
+
StatsAndLogging.writeLogFiles([names.stats_name for names in replacement_job.get_chain()], log_stream, self.config, failed=True)
|
|
1196
1220
|
if result_status != 0:
|
|
1197
1221
|
# If the batch system returned a non-zero exit code then the worker
|
|
1198
1222
|
# is assumed not to have captured the failure of the job, so we
|
|
@@ -1216,13 +1240,12 @@ class Leader:
|
|
|
1216
1240
|
else:
|
|
1217
1241
|
with log_stream:
|
|
1218
1242
|
if os.path.getsize(log_file) > 0:
|
|
1219
|
-
StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
|
|
1243
|
+
StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
|
|
1220
1244
|
message='The batch system left a non-empty file %s:' % log_file)
|
|
1221
1245
|
if self.config.writeLogs or self.config.writeLogsGzip:
|
|
1222
1246
|
file_root, _ = os.path.splitext(os.path.basename(log_file))
|
|
1223
|
-
job_names = replacement_job.
|
|
1224
|
-
|
|
1225
|
-
job_names = [str(replacement_job)]
|
|
1247
|
+
job_names = [names.stats_name for names in replacement_job.get_chain()]
|
|
1248
|
+
# Tack the batch system log file name onto each job's name
|
|
1226
1249
|
job_names = [j + '_' + file_root for j in job_names]
|
|
1227
1250
|
log_stream.seek(0)
|
|
1228
1251
|
StatsAndLogging.writeLogFiles(job_names, log_stream, self.config, failed=True)
|
|
@@ -1289,18 +1312,21 @@ class Leader:
|
|
|
1289
1312
|
|
|
1290
1313
|
# Tell everyone it failed
|
|
1291
1314
|
|
|
1292
|
-
self._messages.publish(JobFailedMessage(job_desc.
|
|
1315
|
+
self._messages.publish(JobFailedMessage(get_job_kind(job_desc.get_names()), job_id))
|
|
1293
1316
|
|
|
1294
1317
|
if job_id in self.toilState.service_to_client:
|
|
1295
1318
|
# Is a service job
|
|
1296
1319
|
logger.debug("Service job is being processed as a totally failed job: %s", job_desc)
|
|
1297
1320
|
|
|
1298
|
-
|
|
1321
|
+
|
|
1322
|
+
if not isinstance(job_desc, ServiceJobDescription):
|
|
1323
|
+
raise RuntimeError("The service job description type is incorrect.")
|
|
1299
1324
|
|
|
1300
1325
|
# Grab the client, which is the predecessor.
|
|
1301
1326
|
client_id = self.toilState.service_to_client[job_id]
|
|
1302
1327
|
|
|
1303
|
-
|
|
1328
|
+
if client_id not in self.toilState.servicesIssued:
|
|
1329
|
+
raise RuntimeError("The client was never issued.")
|
|
1304
1330
|
|
|
1305
1331
|
# Leave the service job as a service of its predecessor, because it
|
|
1306
1332
|
# didn't work.
|
|
@@ -1331,8 +1357,10 @@ class Leader:
|
|
|
1331
1357
|
self.jobStore.delete_file(job_desc.startJobStoreID)
|
|
1332
1358
|
else:
|
|
1333
1359
|
# Is a non-service job
|
|
1334
|
-
|
|
1335
|
-
|
|
1360
|
+
if job_id in self.toilState.servicesIssued:
|
|
1361
|
+
raise RuntimeError("The non-service job should not have been issued.")
|
|
1362
|
+
if isinstance(job_desc, ServiceJobDescription):
|
|
1363
|
+
raise RuntimeError("The job description type is incorrect.")
|
|
1336
1364
|
|
|
1337
1365
|
# Traverse failed job's successor graph and get the jobStoreID of new successors.
|
|
1338
1366
|
# Any successor already in toilState.failedSuccessors will not be traversed
|
|
@@ -1401,11 +1429,13 @@ class Leader:
|
|
|
1401
1429
|
len(self.toilState.servicesIssued[client_id]))
|
|
1402
1430
|
elif jobStoreID not in self.toilState.successor_to_predecessors:
|
|
1403
1431
|
#We have reach the root job
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1432
|
+
if self._messages.count(JobUpdatedMessage) != 0:
|
|
1433
|
+
raise RuntimeError("Root job is done but other jobs are still updated")
|
|
1434
|
+
if len(self.toilState.successor_to_predecessors) != 0:
|
|
1435
|
+
raise RuntimeError("Job {} is finished and had no predecessor, but we have other outstanding jobs "
|
|
1407
1436
|
"with predecessors: {}".format(jobStoreID, self.toilState.successor_to_predecessors.keys()))
|
|
1408
|
-
|
|
1437
|
+
if len(self.toilState.successorCounts) != 0:
|
|
1438
|
+
raise RuntimeError("Root job is done but jobs waiting on successors: {self.toilState.successorCounts}")
|
|
1409
1439
|
logger.debug("Reached root job %s so no predecessors to clean up" % jobStoreID)
|
|
1410
1440
|
|
|
1411
1441
|
else:
|
|
@@ -1414,7 +1444,8 @@ class Leader:
|
|
|
1414
1444
|
|
|
1415
1445
|
# For each predecessor
|
|
1416
1446
|
for predecessor_id in self.toilState.successor_to_predecessors.pop(jobStoreID):
|
|
1417
|
-
|
|
1447
|
+
if not isinstance(predecessor_id, str):
|
|
1448
|
+
raise RuntimeError("Predecessor ID should be str but is {type(predecessor_id)}")
|
|
1418
1449
|
predecessor = self.toilState.get_job(predecessor_id)
|
|
1419
1450
|
|
|
1420
1451
|
# Tell the predecessor that this job is done (keep only other successor jobs)
|
toil/lib/accelerators.py
CHANGED
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
|
|
17
17
|
import os
|
|
18
18
|
import subprocess
|
|
19
|
-
from typing import Dict, List,
|
|
19
|
+
from typing import Dict, List, Set, Union, cast
|
|
20
20
|
from xml.dom import minidom
|
|
21
21
|
|
|
22
22
|
from toil.job import AcceleratorRequirement
|
|
@@ -92,10 +92,15 @@ def count_nvidia_gpus() -> int:
|
|
|
92
92
|
# <https://github.com/common-workflow-language/cwltool/blob/6f29c59fb1b5426ef6f2891605e8fa2d08f1a8da/cwltool/cuda.py>
|
|
93
93
|
# Some example output is here: <https://gist.github.com/loretoparisi/2620b777562c2dfd50d6b618b5f20867>
|
|
94
94
|
try:
|
|
95
|
-
return int(
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
95
|
+
return int(
|
|
96
|
+
cast(
|
|
97
|
+
minidom.Text,
|
|
98
|
+
minidom.parseString(subprocess.check_output(["nvidia-smi", "-q", "-x"]))
|
|
99
|
+
.getElementsByTagName("attached_gpus")[0]
|
|
100
|
+
.firstChild,
|
|
101
|
+
).data
|
|
102
|
+
)
|
|
103
|
+
except:
|
|
99
104
|
return 0
|
|
100
105
|
|
|
101
106
|
# TODO: Parse each gpu > product_name > text content and convert to some
|
toil/lib/aws/__init__.py
CHANGED
|
@@ -11,27 +11,15 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
import collections
|
|
15
|
-
import inspect
|
|
16
14
|
import json
|
|
17
15
|
import logging
|
|
18
16
|
import os
|
|
19
17
|
import re
|
|
20
18
|
import socket
|
|
21
|
-
import
|
|
22
|
-
from
|
|
23
|
-
from typing import (Any,
|
|
24
|
-
Callable,
|
|
25
|
-
Dict,
|
|
26
|
-
Iterable,
|
|
27
|
-
List,
|
|
28
|
-
MutableMapping,
|
|
29
|
-
Optional,
|
|
30
|
-
TypeVar,
|
|
31
|
-
Union)
|
|
19
|
+
from http.client import HTTPException
|
|
20
|
+
from typing import Dict, MutableMapping, Optional
|
|
32
21
|
from urllib.error import URLError
|
|
33
22
|
from urllib.request import urlopen
|
|
34
|
-
from http.client import HTTPException
|
|
35
23
|
|
|
36
24
|
logger = logging.getLogger(__name__)
|
|
37
25
|
|
|
@@ -80,6 +68,7 @@ def get_aws_zone_from_metadata() -> Optional[str]:
|
|
|
80
68
|
try:
|
|
81
69
|
# Use the EC2 metadata service
|
|
82
70
|
import boto
|
|
71
|
+
str(boto) # to prevent removal of the import
|
|
83
72
|
from boto.utils import get_instance_metadata
|
|
84
73
|
logger.debug("Fetch AZ from EC2 metadata")
|
|
85
74
|
return get_instance_metadata()['placement']['availability-zone']
|
toil/lib/aws/ami.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
3
|
import os
|
|
4
|
-
import time
|
|
5
4
|
import urllib.request
|
|
6
|
-
from
|
|
7
|
-
from
|
|
5
|
+
from typing import Dict, Iterator, Optional, cast
|
|
6
|
+
from urllib.error import HTTPError, URLError
|
|
8
7
|
|
|
9
8
|
from botocore.client import BaseClient
|
|
9
|
+
from botocore.exceptions import ClientError
|
|
10
10
|
|
|
11
11
|
from toil.lib.retry import retry
|
|
12
12
|
|
|
@@ -110,6 +110,12 @@ def flatcar_release_feed_amis(region: str, architecture: str = 'amd64', source:
|
|
|
110
110
|
# Try again
|
|
111
111
|
try_number += 1
|
|
112
112
|
continue
|
|
113
|
+
except URLError:
|
|
114
|
+
# Could be a connection timeout
|
|
115
|
+
logger.exception(f'Failed to retrieve {source} Flatcar release feed JSON')
|
|
116
|
+
# Try again
|
|
117
|
+
try_number += 1
|
|
118
|
+
continue
|
|
113
119
|
if try_number == MAX_TRIES:
|
|
114
120
|
# We could not get the JSON
|
|
115
121
|
logger.error(f'Could not get a readable {source} Flatcar release feed JSON')
|
|
@@ -150,11 +156,18 @@ def feed_flatcar_ami_release(ec2_client: BaseClient, architecture: str = 'amd64'
|
|
|
150
156
|
|
|
151
157
|
for ami in flatcar_release_feed_amis(region, architecture, source):
|
|
152
158
|
# verify it exists on AWS
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
159
|
+
try:
|
|
160
|
+
response = ec2_client.describe_images(Filters=[{'Name': 'image-id', 'Values': [ami]}]) # type: ignore
|
|
161
|
+
if len(response['Images']) == 1 and response['Images'][0]['State'] == 'available':
|
|
162
|
+
return ami
|
|
163
|
+
else:
|
|
164
|
+
logger.warning(f'Flatcar release feed suggests image {ami} which does not exist on AWS in {region}')
|
|
165
|
+
except ClientError:
|
|
166
|
+
# Sometimes we get back nonsense like:
|
|
167
|
+
# botocore.exceptions.ClientError: An error occurred (AuthFailure) when calling the DescribeImages operation: AWS was not able to validate the provided access credentials
|
|
168
|
+
# Don't hold that against the AMI.
|
|
169
|
+
logger.exception(f'Unable to check if AMI {ami} exists on AWS in {region}; assuming it does')
|
|
170
|
+
return ami
|
|
158
171
|
# We didn't find it
|
|
159
172
|
logger.warning(f'Flatcar release feed does not have an image for region {region} that exists on AWS')
|
|
160
173
|
return None
|
|
@@ -162,7 +175,7 @@ def feed_flatcar_ami_release(ec2_client: BaseClient, architecture: str = 'amd64'
|
|
|
162
175
|
|
|
163
176
|
@retry() # TODO: What errors do we get for timeout, JSON parse failure, etc?
|
|
164
177
|
def aws_marketplace_flatcar_ami_search(ec2_client: BaseClient, architecture: str = 'amd64') -> Optional[str]:
|
|
165
|
-
"""Query AWS for all AMI names matching
|
|
178
|
+
"""Query AWS for all AMI names matching ``Flatcar-stable-*`` and return the most recent one."""
|
|
166
179
|
|
|
167
180
|
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.describe_images
|
|
168
181
|
# Possible arch choices on AWS: 'i386'|'x86_64'|'arm64'|'x86_64_mac'
|
toil/lib/aws/iam.py
CHANGED
|
@@ -3,16 +3,15 @@ import json
|
|
|
3
3
|
import logging
|
|
4
4
|
from collections import defaultdict
|
|
5
5
|
from functools import lru_cache
|
|
6
|
-
from typing import
|
|
6
|
+
from typing import Dict, List, Optional, Union, cast
|
|
7
7
|
|
|
8
8
|
import boto3
|
|
9
9
|
from mypy_boto3_iam import IAMClient
|
|
10
|
-
from mypy_boto3_iam.type_defs import AttachedPolicyTypeDef
|
|
10
|
+
from mypy_boto3_iam.type_defs import (AttachedPolicyTypeDef,
|
|
11
|
+
PolicyDocumentDictTypeDef)
|
|
11
12
|
from mypy_boto3_sts import STSClient
|
|
12
13
|
|
|
13
|
-
from toil.lib.aws import zone_to_region
|
|
14
14
|
from toil.lib.aws.session import client as get_client
|
|
15
|
-
from toil.provisioners.aws import get_best_aws_zone
|
|
16
15
|
|
|
17
16
|
logger = logging.getLogger(__name__)
|
|
18
17
|
|
|
@@ -121,7 +120,7 @@ def permission_matches_any(perm: str, list_perms: List[str]) -> bool:
|
|
|
121
120
|
return True
|
|
122
121
|
return False
|
|
123
122
|
|
|
124
|
-
def get_actions_from_policy_document(policy_doc: Dict[str, Any]) -> AllowedActionCollection:
|
|
123
|
+
def get_actions_from_policy_document(policy_doc: PolicyDocumentDictTypeDef) -> AllowedActionCollection:
|
|
125
124
|
'''
|
|
126
125
|
Given a policy document, go through each statement and create an AllowedActionCollection representing the
|
|
127
126
|
permissions granted in the policy document.
|
|
@@ -138,11 +137,16 @@ def get_actions_from_policy_document(policy_doc: Dict[str, Any]) -> AllowedActio
|
|
|
138
137
|
for resource in statement["Resource"]:
|
|
139
138
|
for key in ["Action", "NotAction"]:
|
|
140
139
|
if key in statement.keys():
|
|
141
|
-
|
|
142
|
-
|
|
140
|
+
# mypy_boto3_iam declares policy document as a TypedDict
|
|
141
|
+
# This type expects 4 string keys, of which NotAction is not an option
|
|
142
|
+
# Thus mypy complains. NotAction seems to be valid according to Amazon:
|
|
143
|
+
# https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_elements_notaction.html
|
|
144
|
+
# so type: ignore for now
|
|
145
|
+
if isinstance(statement[key], list): # type: ignore[literal-required]
|
|
146
|
+
allowed_actions[resource][key] += statement[key] # type: ignore[literal-required]
|
|
143
147
|
else:
|
|
144
148
|
#Assumes that if it isn't a list it's probably a string
|
|
145
|
-
allowed_actions[resource][key].append(statement[key])
|
|
149
|
+
allowed_actions[resource][key].append(statement[key]) # type: ignore[literal-required]
|
|
146
150
|
|
|
147
151
|
return allowed_actions
|
|
148
152
|
def allowed_actions_attached(iam: IAMClient, attached_policies: List[AttachedPolicyTypeDef]) -> AllowedActionCollection:
|
|
@@ -181,24 +185,28 @@ def allowed_actions_roles(iam: IAMClient, policy_names: List[str], role_name: st
|
|
|
181
185
|
PolicyName=policy_name
|
|
182
186
|
)
|
|
183
187
|
logger.debug("Checking role policy")
|
|
184
|
-
|
|
188
|
+
# PolicyDocument is now a TypedDict, but an instance of TypedDict is not an instance of dict?
|
|
189
|
+
if isinstance(role_policy["PolicyDocument"], str):
|
|
190
|
+
policy_document = json.loads(role_policy["PolicyDocument"])
|
|
191
|
+
else:
|
|
192
|
+
policy_document = role_policy["PolicyDocument"]
|
|
185
193
|
|
|
186
194
|
allowed_actions = add_to_action_collection(allowed_actions, get_actions_from_policy_document(policy_document))
|
|
187
195
|
|
|
188
196
|
return allowed_actions
|
|
189
197
|
|
|
190
198
|
|
|
191
|
-
def collect_policy_actions(policy_documents:
|
|
199
|
+
def collect_policy_actions(policy_documents: List[Union[str, PolicyDocumentDictTypeDef]]) -> AllowedActionCollection:
|
|
192
200
|
"""
|
|
193
201
|
Collect all of the actions allowed by the given policy documents into one AllowedActionCollection.
|
|
194
202
|
"""
|
|
195
203
|
allowed_actions: AllowedActionCollection = init_action_collection()
|
|
196
204
|
for policy_str in policy_documents:
|
|
197
205
|
# sometimes a string is returned from the api, so convert to a dictionary
|
|
198
|
-
if isinstance(policy_str, dict):
|
|
199
|
-
policy_dict = policy_str
|
|
200
|
-
else:
|
|
206
|
+
if isinstance(policy_str, str):
|
|
201
207
|
policy_dict = json.loads(policy_str)
|
|
208
|
+
else:
|
|
209
|
+
policy_dict = policy_str
|
|
202
210
|
allowed_actions = add_to_action_collection(allowed_actions, get_actions_from_policy_document(policy_dict))
|
|
203
211
|
return allowed_actions
|
|
204
212
|
|
toil/lib/aws/session.py
CHANGED
|
@@ -12,24 +12,10 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import collections
|
|
15
|
-
import inspect
|
|
16
15
|
import logging
|
|
17
16
|
import os
|
|
18
|
-
import re
|
|
19
|
-
import socket
|
|
20
17
|
import threading
|
|
21
|
-
from typing import
|
|
22
|
-
Callable,
|
|
23
|
-
Dict,
|
|
24
|
-
Iterable,
|
|
25
|
-
List,
|
|
26
|
-
Optional,
|
|
27
|
-
Tuple,
|
|
28
|
-
TypeVar,
|
|
29
|
-
Union,
|
|
30
|
-
cast)
|
|
31
|
-
from urllib.error import URLError
|
|
32
|
-
from urllib.request import urlopen
|
|
18
|
+
from typing import Dict, Optional, Tuple, cast
|
|
33
19
|
|
|
34
20
|
import boto3
|
|
35
21
|
import boto3.resources.base
|
|
@@ -37,8 +23,8 @@ import boto.connection
|
|
|
37
23
|
import botocore
|
|
38
24
|
from boto3 import Session
|
|
39
25
|
from botocore.client import Config
|
|
40
|
-
from botocore.utils import JSONFileCache
|
|
41
26
|
from botocore.session import get_session
|
|
27
|
+
from botocore.utils import JSONFileCache
|
|
42
28
|
|
|
43
29
|
logger = logging.getLogger(__name__)
|
|
44
30
|
|
toil/lib/aws/utils.py
CHANGED
|
@@ -12,7 +12,6 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import errno
|
|
15
|
-
import json
|
|
16
15
|
import logging
|
|
17
16
|
import os
|
|
18
17
|
import socket
|
|
@@ -21,15 +20,13 @@ from typing import (Any,
|
|
|
21
20
|
Callable,
|
|
22
21
|
ContextManager,
|
|
23
22
|
Dict,
|
|
24
|
-
Hashable,
|
|
25
23
|
Iterable,
|
|
26
24
|
Iterator,
|
|
27
25
|
List,
|
|
28
26
|
Optional,
|
|
29
27
|
Set,
|
|
30
28
|
Union,
|
|
31
|
-
cast
|
|
32
|
-
MutableMapping)
|
|
29
|
+
cast)
|
|
33
30
|
from urllib.parse import ParseResult
|
|
34
31
|
|
|
35
32
|
from toil.lib.aws import session
|
|
@@ -345,6 +342,8 @@ def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "Ob
|
|
|
345
342
|
"""
|
|
346
343
|
Extracts a key (object) from a given parsed s3:// URL.
|
|
347
344
|
|
|
345
|
+
If existing is true and the object does not exist, raises FileNotFoundError.
|
|
346
|
+
|
|
348
347
|
:param bool existing: If True, key is expected to exist. If False, key is expected not to
|
|
349
348
|
exists and it will be created. If None, the key will be created if it doesn't exist.
|
|
350
349
|
"""
|
|
@@ -386,7 +385,7 @@ def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "Ob
|
|
|
386
385
|
else:
|
|
387
386
|
raise
|
|
388
387
|
if existing is True and not objExists:
|
|
389
|
-
raise
|
|
388
|
+
raise FileNotFoundError(f"Key '{key_name}' does not exist in bucket '{bucket_name}'.")
|
|
390
389
|
elif existing is False and objExists:
|
|
391
390
|
raise RuntimeError(f"Key '{key_name}' exists in bucket '{bucket_name}'.")
|
|
392
391
|
|