PyPI - toil - Versions diffs - 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl - Mend

toil 5.12.0-py3-none-any.whl → 6.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (164) hide show

toil/__init__.py +18 -13
toil/batchSystems/abstractBatchSystem.py +39 -13
toil/batchSystems/abstractGridEngineBatchSystem.py +24 -24
toil/batchSystems/awsBatch.py +14 -14
toil/batchSystems/cleanup_support.py +7 -3
toil/batchSystems/contained_executor.py +3 -3
toil/batchSystems/htcondor.py +0 -1
toil/batchSystems/kubernetes.py +34 -31
toil/batchSystems/local_support.py +3 -1
toil/batchSystems/lsf.py +7 -7
toil/batchSystems/mesos/batchSystem.py +7 -7
toil/batchSystems/options.py +32 -83
toil/batchSystems/registry.py +104 -23
toil/batchSystems/singleMachine.py +16 -13
toil/batchSystems/slurm.py +87 -16
toil/batchSystems/torque.py +0 -1
toil/bus.py +44 -8
toil/common.py +544 -753
toil/cwl/__init__.py +28 -32
toil/cwl/cwltoil.py +595 -574
toil/cwl/utils.py +55 -10
toil/exceptions.py +1 -1
toil/fileStores/__init__.py +2 -2
toil/fileStores/abstractFileStore.py +88 -14
toil/fileStores/cachingFileStore.py +610 -549
toil/fileStores/nonCachingFileStore.py +46 -22
toil/job.py +182 -101
toil/jobStores/abstractJobStore.py +161 -95
toil/jobStores/aws/jobStore.py +23 -9
toil/jobStores/aws/utils.py +6 -6
toil/jobStores/fileJobStore.py +116 -18
toil/jobStores/googleJobStore.py +16 -7
toil/jobStores/utils.py +5 -6
toil/leader.py +87 -56
toil/lib/accelerators.py +10 -5
toil/lib/aws/__init__.py +3 -14
toil/lib/aws/ami.py +22 -9
toil/lib/aws/iam.py +21 -13
toil/lib/aws/session.py +2 -16
toil/lib/aws/utils.py +4 -5
toil/lib/compatibility.py +1 -1
toil/lib/conversions.py +26 -3
toil/lib/docker.py +22 -23
toil/lib/ec2.py +10 -6
toil/lib/ec2nodes.py +106 -100
toil/lib/encryption/_nacl.py +2 -1
toil/lib/generatedEC2Lists.py +325 -18
toil/lib/io.py +49 -2
toil/lib/misc.py +1 -1
toil/lib/resources.py +9 -2
toil/lib/threading.py +101 -38
toil/options/common.py +736 -0
toil/options/cwl.py +336 -0
toil/options/wdl.py +37 -0
toil/provisioners/abstractProvisioner.py +9 -4
toil/provisioners/aws/__init__.py +3 -6
toil/provisioners/aws/awsProvisioner.py +6 -0
toil/provisioners/clusterScaler.py +3 -2
toil/provisioners/gceProvisioner.py +2 -2
toil/realtimeLogger.py +2 -1
toil/resource.py +24 -18
toil/server/app.py +2 -3
toil/server/cli/wes_cwl_runner.py +4 -4
toil/server/utils.py +1 -1
toil/server/wes/abstract_backend.py +3 -2
toil/server/wes/amazon_wes_utils.py +5 -4
toil/server/wes/tasks.py +2 -3
toil/server/wes/toil_backend.py +2 -10
toil/server/wsgi_app.py +2 -0
toil/serviceManager.py +12 -10
toil/statsAndLogging.py +41 -9
toil/test/__init__.py +29 -54
toil/test/batchSystems/batchSystemTest.py +11 -111
toil/test/batchSystems/test_slurm.py +24 -8
toil/test/cactus/__init__.py +0 -0
toil/test/cactus/test_cactus_integration.py +58 -0
toil/test/cwl/cwlTest.py +438 -223
toil/test/cwl/glob_dir.cwl +15 -0
toil/test/cwl/preemptible.cwl +21 -0
toil/test/cwl/preemptible_expression.cwl +28 -0
toil/test/cwl/revsort.cwl +1 -1
toil/test/cwl/revsort2.cwl +1 -1
toil/test/docs/scriptsTest.py +2 -3
toil/test/jobStores/jobStoreTest.py +34 -21
toil/test/lib/aws/test_iam.py +4 -14
toil/test/lib/aws/test_utils.py +0 -3
toil/test/lib/dockerTest.py +4 -4
toil/test/lib/test_ec2.py +12 -17
toil/test/mesos/helloWorld.py +4 -5
toil/test/mesos/stress.py +1 -1
toil/test/{wdl/conftest.py → options/__init__.py} +0 -10
toil/test/options/options.py +37 -0
toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
toil/test/provisioners/clusterScalerTest.py +6 -4
toil/test/provisioners/clusterTest.py +23 -11
toil/test/provisioners/gceProvisionerTest.py +0 -6
toil/test/provisioners/restartScript.py +3 -2
toil/test/server/serverTest.py +1 -1
toil/test/sort/restart_sort.py +2 -1
toil/test/sort/sort.py +2 -1
toil/test/sort/sortTest.py +2 -13
toil/test/src/autoDeploymentTest.py +45 -45
toil/test/src/busTest.py +5 -5
toil/test/src/checkpointTest.py +2 -2
toil/test/src/deferredFunctionTest.py +1 -1
toil/test/src/fileStoreTest.py +32 -16
toil/test/src/helloWorldTest.py +1 -1
toil/test/src/importExportFileTest.py +1 -1
toil/test/src/jobDescriptionTest.py +2 -1
toil/test/src/jobServiceTest.py +1 -1
toil/test/src/jobTest.py +18 -18
toil/test/src/miscTests.py +5 -3
toil/test/src/promisedRequirementTest.py +3 -3
toil/test/src/realtimeLoggerTest.py +1 -1
toil/test/src/resourceTest.py +2 -2
toil/test/src/restartDAGTest.py +1 -1
toil/test/src/resumabilityTest.py +36 -2
toil/test/src/retainTempDirTest.py +1 -1
toil/test/src/systemTest.py +2 -2
toil/test/src/toilContextManagerTest.py +2 -2
toil/test/src/userDefinedJobArgTypeTest.py +1 -1
toil/test/utils/toilDebugTest.py +98 -32
toil/test/utils/toilKillTest.py +2 -2
toil/test/utils/utilsTest.py +23 -3
toil/test/wdl/wdltoil_test.py +223 -45
toil/toilState.py +7 -6
toil/utils/toilClean.py +1 -1
toil/utils/toilConfig.py +36 -0
toil/utils/toilDebugFile.py +60 -33
toil/utils/toilDebugJob.py +39 -12
toil/utils/toilDestroyCluster.py +1 -1
toil/utils/toilKill.py +1 -1
toil/utils/toilLaunchCluster.py +13 -2
toil/utils/toilMain.py +3 -2
toil/utils/toilRsyncCluster.py +1 -1
toil/utils/toilSshCluster.py +1 -1
toil/utils/toilStats.py +445 -305
toil/utils/toilStatus.py +2 -5
toil/version.py +10 -10
toil/wdl/utils.py +2 -122
toil/wdl/wdltoil.py +1257 -492
toil/worker.py +55 -46
toil-6.1.0.dist-info/METADATA +124 -0
toil-6.1.0.dist-info/RECORD +241 -0
{toil-5.12.0.dist-info → toil-6.1.0.dist-info}/WHEEL +1 -1
{toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -1
toil/batchSystems/parasol.py +0 -379
toil/batchSystems/tes.py +0 -459
toil/test/batchSystems/parasolTestSupport.py +0 -117
toil/test/wdl/builtinTest.py +0 -506
toil/test/wdl/toilwdlTest.py +0 -522
toil/wdl/toilwdl.py +0 -141
toil/wdl/versions/dev.py +0 -107
toil/wdl/versions/draft2.py +0 -980
toil/wdl/versions/v1.py +0 -794
toil/wdl/wdl_analysis.py +0 -116
toil/wdl/wdl_functions.py +0 -997
toil/wdl/wdl_synthesis.py +0 -1011
toil/wdl/wdl_types.py +0 -243
toil-5.12.0.dist-info/METADATA +0 -118
toil-5.12.0.dist-info/RECORD +0 -244
/toil/{wdl/versions → options}/__init__.py +0 -0
{toil-5.12.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
{toil-5.12.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0

toil/leader.py CHANGED Viewed

@@ -28,22 +28,24 @@ import enlighten
 from toil import resolveEntryPoint
 from toil.batchSystems import DeadlockException
 from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem,
-                                                   BatchJobExitReason)
-from toil.bus import (JobAnnotationMessage,
-                      JobCompletedMessage,
+                                                   BatchJobExitReason,
+                                                   EXIT_STATUS_UNAVAILABLE_VALUE)
+from toil.bus import (JobCompletedMessage,
                       JobFailedMessage,
                       JobIssuedMessage,
                       JobMissingMessage,
                       JobUpdatedMessage,
-                      QueueSizeMessage)
-from toil.common import Config, Toil, ToilMetrics
+                      QueueSizeMessage,
+                      gen_message_bus_path,
+                      get_job_kind)
+from toil.common import Config, ToilMetrics
 from toil.cwl.utils import CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
+from toil.exceptions import FailedJobsException
 from toil.job import (CheckpointJobDescription,
                       JobDescription,
                       ServiceJobDescription,
                       TemporaryID)
 from toil.jobStores.abstractJobStore import (AbstractJobStore,
-                                             NoSuchFileException,
                                              NoSuchJobException)
 from toil.lib.throttle import LocalThrottle
 from toil.provisioners.abstractProvisioner import AbstractProvisioner
@@ -51,7 +53,6 @@ from toil.provisioners.clusterScaler import ScalerThread
 from toil.serviceManager import ServiceManager
 from toil.statsAndLogging import StatsAndLogging
 from toil.toilState import ToilState
-from toil.exceptions import FailedJobsException
 logger = logging.getLogger(__name__)
@@ -115,10 +116,14 @@ class Leader:
         # state change information about jobs.
         self.toilState = ToilState(self.jobStore)
-        if self.config.write_messages is not None:
-            # Message bus messages need to go to the given file.
-            # Keep a reference to the return value so the listener stays alive.
-            self._message_subscription = self.toilState.bus.connect_output_file(self.config.write_messages)
+        if self.config.write_messages is None:
+            # The user hasn't specified a place for the message bus so we
+            # should make one.
+            self.config.write_messages = gen_message_bus_path()
+        # Message bus messages need to go to the given file.
+        # Keep a reference to the return value so the listener stays alive.
+        self._message_subscription = self.toilState.bus.connect_output_file(self.config.write_messages)
         # Connect to the message bus, so we will get all the messages of these
         # types in an inbox.
@@ -138,7 +143,8 @@ class Leader:
         # Batch system
         self.batchSystem = batchSystem
-        assert len(self.batchSystem.getIssuedBatchJobIDs()) == 0  # Batch system must start with no active jobs!
+        if len(self.batchSystem.getIssuedBatchJobIDs()) != 0:
+            raise RuntimeError("The initialized batchsystem did not start with 0 active jobs.")
         logger.debug("Checked batch system has no running jobs and no updated jobs")
         # Map of batch system IDs to job store IDs
@@ -370,7 +376,8 @@ class Leader:
         # If the successor job's predecessors have all not all completed then
         # ignore the successor as is not yet ready to run
-        assert len(successor.predecessorsFinished) <= successor.predecessorNumber
+        if len(successor.predecessorsFinished) > successor.predecessorNumber:
+            raise RuntimeError("There are more finished predecessors than possible.")
         if len(successor.predecessorsFinished) == successor.predecessorNumber:
             # All the successor's predecessors are done now.
             # Remove the successor job from the set of waiting multi-predecessor jobs.
@@ -391,8 +398,10 @@ class Leader:
         #Build map from successor to predecessors.
         if successor_id not in self.toilState.successor_to_predecessors:
             self.toilState.successor_to_predecessors[successor_id] = set()
-        assert isinstance(successor_id, str)
-        assert isinstance(predecessor_id, str)
+        if not isinstance(successor_id, str):
+            raise RuntimeError("The given successor ID is invalid.")
+        if not isinstance(predecessor_id, str):
+            raise RuntimeError("The given predecessor ID is invalid.")
         self.toilState.successor_to_predecessors[successor_id].add(predecessor_id)
         # Grab the successor
@@ -423,7 +432,8 @@ class Leader:
                      predecessor_id, len(next_successors))
         #Record the number of successors that must be completed before
         #the job can be considered again
-        assert self.toilState.count_pending_successors(predecessor_id) == 0, 'Attempted to schedule successors of the same job twice!'
+        if self.toilState.count_pending_successors(predecessor_id) != 0:
+            raise RuntimeError('Attempted to schedule successors of the same job twice!')
         self.toilState.successors_pending(predecessor_id, len(next_successors))
         # For each successor schedule if all predecessors have been completed
@@ -534,11 +544,13 @@ class Leader:
             # the job has services to run, which have not been started, start them
             # Build a map from the service jobs to the job and a map
             # of the services created for the job
-            assert readyJob.jobStoreID not in self.toilState.servicesIssued
+            if readyJob.jobStoreID in self.toilState.servicesIssued:
+                raise RuntimeError(f"The ready job: {readyJob.jobStoreID} was already issued.")
             self.toilState.servicesIssued[readyJob.jobStoreID] = set()
             for serviceJobList in readyJob.serviceHostIDsInBatches():
                 for serviceID in serviceJobList:
-                    assert serviceID not in self.toilState.service_to_client
+                    if serviceID in self.toilState.service_to_client:
+                        raise RuntimeError(f"The ready service ID: {serviceID} was already added.")
                     self.toilState.reset_job(serviceID)
                     serviceHost = self.toilState.get_job(serviceID)
                     self.toilState.service_to_client[serviceID] = readyJob.jobStoreID
@@ -675,7 +687,8 @@ class Leader:
             client = self.toilState.get_job(client_id)
             # Make sure services still want to run
-            assert next(client.serviceHostIDsInBatches(), None) is not None
+            if next(client.serviceHostIDsInBatches(), None) is None:
+                raise RuntimeError("No more services want to run.")
             # Mark the service job updated so we don't stop here.
             self._messages.publish(JobUpdatedMessage(client_id, 1))
@@ -694,8 +707,9 @@ class Leader:
             if exitStatus == 0:
                 logger.debug('Job ended: %s', updatedJob)
             else:
-                logger.warning(f'Job failed with exit value {exitStatus}: {updatedJob}\n'
-                               f'Exit reason: {exitReason}')
+                status_string = str(exitStatus) if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE else "<UNAVAILABLE>"
+                logger.warning(f'Job failed with exit value {status_string}: {updatedJob}\n'
+                               f'Exit reason: {BatchJobExitReason.to_string(exitReason)}')
                 if exitStatus == CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE:
                     # This is a CWL job informing us that the workflow is
                     # asking things of us that Toil can't do. When we raise an
@@ -704,7 +718,7 @@ class Leader:
                     logger.warning("This indicates an unsupported CWL requirement!")
                     self.recommended_fail_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
             # Tell everyone it stopped running.
-            self._messages.publish(JobCompletedMessage(updatedJob.get_job_kind(), updatedJob.jobStoreID, exitStatus))
+            self._messages.publish(JobCompletedMessage(get_job_kind(updatedJob.get_names()), updatedJob.jobStoreID, exitStatus))
             self.process_finished_job(bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason)
     def _processLostJobs(self):
@@ -784,13 +798,16 @@ class Leader:
         logger.debug("Finished the main loop: no jobs left to run.")
         # Consistency check the toil state
-        assert self._messages.empty(), f"Pending messages at shutdown: {self._messages}"
-        assert self.toilState.successorCounts == {}, f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}"
-        assert self.toilState.successor_to_predecessors == {}, f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}"
-        assert self.toilState.service_to_client == {}, f"Services pending for their clients at shutdown: {self.toilState.service_to_client}"
-        assert self.toilState.servicesIssued == {}, f"Services running at shutdown: {self.toilState.servicesIssued}"
-        # assert self.toilState.jobsToBeScheduledWithMultiplePredecessors # These are not properly emptied yet
-        # assert self.toilState.hasFailedSuccessors == set() # These are not properly emptied yet
+        if not self._messages.empty():
+            raise RuntimeError(f"Pending messages at shutdown: {self._messages}")
+        if self.toilState.successorCounts != {}:
+            raise RuntimeError(f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}")
+        if self.toilState.successor_to_predecessors != {}:
+            raise RuntimeError(f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}")
+        if self.toilState.service_to_client != {}:
+            raise RuntimeError(f"Services pending for their clients at shutdown: {self.toilState.service_to_client}")
+        if self.toilState.servicesIssued != {}:
+            raise RuntimeError(f"Services running at shutdown: {self.toilState.servicesIssued}")
     def checkForDeadlocks(self):
         """Check if the system is deadlocked running service jobs."""
@@ -865,8 +882,8 @@ class Leader:
     def issueJob(self, jobNode: JobDescription) -> None:
         """Add a job to the queue of jobs currently trying to run."""
         # Never issue the same job multiple times simultaneously
-        assert jobNode.jobStoreID not in self.toilState.jobs_issued, \
-            f"Attempted to issue {jobNode} multiple times simultaneously!"
+        if jobNode.jobStoreID in self.toilState.jobs_issued:
+            raise RuntimeError(f"Attempted to issue {jobNode} multiple times simultaneously!")
         workerCommand = [resolveEntryPoint('_toil_worker'),
                          jobNode.jobName,
@@ -907,7 +924,7 @@ class Leader:
                    "%s and %s",
                    jobNode, str(jobBatchSystemID), jobNode.requirements_string())
         # Tell everyone it is issued and the queue size changed
-        self._messages.publish(JobIssuedMessage(jobNode.get_job_kind(), jobNode.jobStoreID, jobBatchSystemID))
+        self._messages.publish(JobIssuedMessage(get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID))
         self._messages.publish(QueueSizeMessage(self.getNumberOfJobsIssued()))
         # Tell the user there's another job to do
         self.progress_overall.total += 1
@@ -926,7 +943,8 @@ class Leader:
         """
         # Grab the service job description
         service = self.toilState.get_job(service_id)
-        assert isinstance(service, ServiceJobDescription)
+        if not isinstance(service, ServiceJobDescription):
+            raise RuntimeError("The grabbed service job description is not the right type.")
         if service.preemptible:
             self.preemptibleServiceJobsToBeIssued.append(service_id)
@@ -956,7 +974,8 @@ class Leader:
         elif preemptible:
             return self.preemptibleJobsIssued
         else:
-            assert len(self.issued_jobs_by_batch_system_id) >= self.preemptibleJobsIssued
+            if len(self.issued_jobs_by_batch_system_id) < self.preemptibleJobsIssued:
+                raise RuntimeError("Number of jobs issued cannot be negative.")
             return len(self.issued_jobs_by_batch_system_id) - self.preemptibleJobsIssued
     def _getStatusHint(self) -> str:
@@ -990,16 +1009,19 @@ class Leader:
         :return: Job description as it was issued.
         """
-        assert jobBatchSystemID in self.issued_jobs_by_batch_system_id
+        if jobBatchSystemID not in self.issued_jobs_by_batch_system_id:
+            raise RuntimeError("Job was already removed or was never issued.")
         issuedDesc = self.toilState.get_job(self.issued_jobs_by_batch_system_id[jobBatchSystemID])
         if issuedDesc.preemptible:
             # len(issued_jobs_by_batch_system_id) should always be greater than or equal to preemptibleJobsIssued,
             # so decrement this value before removing the job from the issuedJob map
-            assert self.preemptibleJobsIssued > 0
+            if self.preemptibleJobsIssued <= 0:
+                raise RuntimeError("The number of preemptive issued jobs cannot be negative.")
             self.preemptibleJobsIssued -= 1
         # It's not issued anymore.
         del self.issued_jobs_by_batch_system_id[jobBatchSystemID]
-        assert issuedDesc.jobStoreID in self.toilState.jobs_issued, f"Job {issuedDesc} came back without being issued"
+        if issuedDesc.jobStoreID not in self.toilState.jobs_issued:
+            raise RuntimeError(f"Job {issuedDesc} came back without being issued")
         self.toilState.jobs_issued.remove(issuedDesc.jobStoreID)
         # If service job
         if issuedDesc.jobStoreID in self.toilState.service_to_client:
@@ -1090,8 +1112,9 @@ class Leader:
         for jobBatchSystemID in missingJobIDsSet.difference(jobBatchSystemIDsSet):
             self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
             logger.warning("Batch system id: %s is no longer missing", str(jobBatchSystemID))
-        assert issuedJobs.issubset(jobBatchSystemIDsSet) #Assert checks we have
-        #no unexpected jobs running
+        # checks we have no unexpected jobs running
+        if not issuedJobs.issubset(jobBatchSystemIDsSet):
+            raise RuntimeError("An unexpected job is still running.")
         jobsToKill = []
         for jobBatchSystemID in set(jobBatchSystemIDsSet.difference(issuedJobs)):
             jobStoreID = self.issued_jobs_by_batch_system_id[jobBatchSystemID]
@@ -1137,7 +1160,7 @@ class Leader:
             self.progress_overall.update(incr=-1)
             self.progress_failed.update(incr=1)
-        # Delegate to the vers
+        # Delegate to the version that uses a JobDescription
         return self.process_finished_job_description(issued_job, result_status, wall_time, exit_reason, batch_system_id)
     def process_finished_job_description(self, finished_job: JobDescription, result_status: int,
@@ -1188,11 +1211,12 @@ class Leader:
                     # more memory efficient than read().striplines() while leaving off the
                     # trailing \n left when using readlines()
                     # http://stackoverflow.com/a/15233739
-                    StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
+                    StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
                                                       message='The job seems to have left a log file, indicating failure: %s' % replacement_job)
                 if self.config.writeLogs or self.config.writeLogsGzip:
                     with replacement_job.getLogFileHandle(self.jobStore) as log_stream:
-                        StatsAndLogging.writeLogFiles(replacement_job.chainedJobs, log_stream, self.config, failed=True)
+                        # Send log data from the job store to each per-job log file involved.
+                        StatsAndLogging.writeLogFiles([names.stats_name for names in replacement_job.get_chain()], log_stream, self.config, failed=True)
             if result_status != 0:
                 # If the batch system returned a non-zero exit code then the worker
                 # is assumed not to have captured the failure of the job, so we
@@ -1216,13 +1240,12 @@ class Leader:
                         else:
                             with log_stream:
                                 if os.path.getsize(log_file) > 0:
-                                    StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
+                                    StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
                                                                       message='The batch system left a non-empty file %s:' % log_file)
                                     if self.config.writeLogs or self.config.writeLogsGzip:
                                         file_root, _ = os.path.splitext(os.path.basename(log_file))
-                                        job_names = replacement_job.chainedJobs
-                                        if job_names is None:   # For jobs that fail this way, replacement_job.chainedJobs is not guaranteed to be set
-                                            job_names = [str(replacement_job)]
+                                        job_names = [names.stats_name for names in replacement_job.get_chain()]
+                                        # Tack the batch system log file name onto each job's name
                                         job_names = [j + '_' + file_root for j in job_names]
                                         log_stream.seek(0)
                                         StatsAndLogging.writeLogFiles(job_names, log_stream, self.config, failed=True)
@@ -1289,18 +1312,21 @@ class Leader:
         # Tell everyone it failed
-        self._messages.publish(JobFailedMessage(job_desc.get_job_kind(), job_id))
+        self._messages.publish(JobFailedMessage(get_job_kind(job_desc.get_names()), job_id))
         if job_id in self.toilState.service_to_client:
             # Is a service job
             logger.debug("Service job is being processed as a totally failed job: %s", job_desc)
-            assert isinstance(job_desc, ServiceJobDescription)
+            if not isinstance(job_desc, ServiceJobDescription):
+                raise RuntimeError("The service job description type is incorrect.")
             # Grab the client, which is the predecessor.
             client_id = self.toilState.service_to_client[job_id]
-            assert client_id in self.toilState.servicesIssued
+            if client_id not in self.toilState.servicesIssued:
+                raise RuntimeError("The client was never issued.")
             # Leave the service job as a service of its predecessor, because it
             # didn't work.
@@ -1331,8 +1357,10 @@ class Leader:
             self.jobStore.delete_file(job_desc.startJobStoreID)
         else:
             # Is a non-service job
-            assert job_id not in self.toilState.servicesIssued
-            assert not isinstance(job_desc, ServiceJobDescription)
+            if job_id in self.toilState.servicesIssued:
+                raise RuntimeError("The non-service job should not have been issued.")
+            if isinstance(job_desc, ServiceJobDescription):
+                raise RuntimeError("The job description type is incorrect.")
             # Traverse failed job's successor graph and get the jobStoreID of new successors.
             # Any successor already in toilState.failedSuccessors will not be traversed
@@ -1401,11 +1429,13 @@ class Leader:
                              len(self.toilState.servicesIssued[client_id]))
         elif jobStoreID not in self.toilState.successor_to_predecessors:
             #We have reach the root job
-            assert self._messages.count(JobUpdatedMessage) == 0, "Root job is done but other jobs are still updated"
-            assert len(self.toilState.successor_to_predecessors) == 0, \
-                ("Job {} is finished and had no predecessor, but we have other outstanding jobs "
+            if self._messages.count(JobUpdatedMessage) != 0:
+                raise RuntimeError("Root job is done but other jobs are still updated")
+            if len(self.toilState.successor_to_predecessors) != 0:
+                raise RuntimeError("Job {} is finished and had no predecessor, but we have other outstanding jobs "
                  "with predecessors: {}".format(jobStoreID, self.toilState.successor_to_predecessors.keys()))
-            assert len(self.toilState.successorCounts) == 0, f"Root job is done but jobs waiting on successors: {self.toilState.successorCounts}"
+            if len(self.toilState.successorCounts) != 0:
+                raise RuntimeError("Root job is done but jobs waiting on successors: {self.toilState.successorCounts}")
             logger.debug("Reached root job %s so no predecessors to clean up" % jobStoreID)
         else:
@@ -1414,7 +1444,8 @@ class Leader:
             # For each predecessor
             for predecessor_id in self.toilState.successor_to_predecessors.pop(jobStoreID):
-                assert isinstance(predecessor_id, str), f"Predecessor ID should be str but is {type(predecessor_id)}"
+                if not isinstance(predecessor_id, str):
+                    raise RuntimeError("Predecessor ID should be str but is {type(predecessor_id)}")
                 predecessor = self.toilState.get_job(predecessor_id)
                 # Tell the predecessor that this job is done (keep only other successor jobs)

toil/lib/accelerators.py CHANGED Viewed

@@ -16,7 +16,7 @@
 import os
 import subprocess
-from typing import Dict, List, Optional, Set, Union
+from typing import Dict, List, Set, Union, cast
 from xml.dom import minidom
 from toil.job import AcceleratorRequirement
@@ -92,10 +92,15 @@ def count_nvidia_gpus() -> int:
     # <https://github.com/common-workflow-language/cwltool/blob/6f29c59fb1b5426ef6f2891605e8fa2d08f1a8da/cwltool/cuda.py>
     # Some example output is here: <https://gist.github.com/loretoparisi/2620b777562c2dfd50d6b618b5f20867>
     try:
-        return int(minidom.parseString(
-            subprocess.check_output(["nvidia-smi", "-q", "-x"])
-        ).getElementsByTagName("attached_gpus")[0].firstChild.data)
-    except (FileNotFoundError, subprocess.CalledProcessError, IndexError, ValueError, PermissionError):
+        return int(
+            cast(
+                minidom.Text,
+                minidom.parseString(subprocess.check_output(["nvidia-smi", "-q", "-x"]))
+                .getElementsByTagName("attached_gpus")[0]
+                .firstChild,
+            ).data
+        )
+    except:
         return 0
     # TODO: Parse each gpu > product_name > text content and convert to some

toil/lib/aws/__init__.py CHANGED Viewed

@@ -11,27 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import collections
-import inspect
 import json
 import logging
 import os
 import re
 import socket
-import threading
-from functools import lru_cache
-from typing import (Any,
-                    Callable,
-                    Dict,
-                    Iterable,
-                    List,
-                    MutableMapping,
-                    Optional,
-                    TypeVar,
-                    Union)
+from http.client import HTTPException
+from typing import Dict, MutableMapping, Optional
 from urllib.error import URLError
 from urllib.request import urlopen
-from http.client import HTTPException
 logger = logging.getLogger(__name__)
@@ -80,6 +68,7 @@ def get_aws_zone_from_metadata() -> Optional[str]:
         try:
             # Use the EC2 metadata service
             import boto
+            str(boto)  # to prevent removal of the import
             from boto.utils import get_instance_metadata
             logger.debug("Fetch AZ from EC2 metadata")
             return get_instance_metadata()['placement']['availability-zone']

toil/lib/aws/ami.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import json
 import logging
 import os
-import time
 import urllib.request
-from urllib.error import HTTPError
-from typing import Dict, Optional, Iterator, cast
+from typing import Dict, Iterator, Optional, cast
+from urllib.error import HTTPError, URLError
 from botocore.client import BaseClient
+from botocore.exceptions import ClientError
 from toil.lib.retry import retry
@@ -110,6 +110,12 @@ def flatcar_release_feed_amis(region: str, architecture: str = 'amd64', source:
             # Try again
             try_number += 1
             continue
+        except URLError:
+            # Could be a connection timeout
+            logger.exception(f'Failed to retrieve {source} Flatcar release feed JSON')
+            # Try again
+            try_number += 1
+            continue
     if try_number == MAX_TRIES:
         # We could not get the JSON
         logger.error(f'Could not get a readable {source} Flatcar release feed JSON')
@@ -150,11 +156,18 @@ def feed_flatcar_ami_release(ec2_client: BaseClient, architecture: str = 'amd64'
     for ami in flatcar_release_feed_amis(region, architecture, source):
         # verify it exists on AWS
-        response = ec2_client.describe_images(Filters=[{'Name': 'image-id', 'Values': [ami]}])  # type: ignore
-        if len(response['Images']) == 1 and response['Images'][0]['State'] == 'available':
-            return ami
-        else:
-            logger.warning(f'Flatcar release feed suggests image {ami} which does not exist on AWS in {region}')
+        try:
+            response = ec2_client.describe_images(Filters=[{'Name': 'image-id', 'Values': [ami]}])  # type: ignore
+            if len(response['Images']) == 1 and response['Images'][0]['State'] == 'available':
+                return ami
+            else:
+                logger.warning(f'Flatcar release feed suggests image {ami} which does not exist on AWS in {region}')
+        except ClientError:
+            # Sometimes we get back nonsense like:
+            # botocore.exceptions.ClientError: An error occurred (AuthFailure) when calling the DescribeImages operation: AWS was not able to validate the provided access credentials
+            # Don't hold that against the AMI.
+            logger.exception(f'Unable to check if AMI {ami} exists on AWS in {region}; assuming it does')
+            return ami
     # We didn't find it
     logger.warning(f'Flatcar release feed does not have an image for region {region} that exists on AWS')
     return None
@@ -162,7 +175,7 @@ def feed_flatcar_ami_release(ec2_client: BaseClient, architecture: str = 'amd64'
 @retry()  # TODO: What errors do we get for timeout, JSON parse failure, etc?
 def aws_marketplace_flatcar_ami_search(ec2_client: BaseClient, architecture: str = 'amd64') -> Optional[str]:
-    """Query AWS for all AMI names matching 'Flatcar-stable-*' and return the most recent one."""
+    """Query AWS for all AMI names matching ``Flatcar-stable-*`` and return the most recent one."""
     # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.describe_images
     # Possible arch choices on AWS: 'i386'|'x86_64'|'arm64'|'x86_64_mac'

toil/lib/aws/iam.py CHANGED Viewed

@@ -3,16 +3,15 @@ import json
 import logging
 from collections import defaultdict
 from functools import lru_cache
-from typing import Any, Dict, List, Optional, Set, cast, Union, Sequence
+from typing import Dict, List, Optional, Union, cast
 import boto3
 from mypy_boto3_iam import IAMClient
-from mypy_boto3_iam.type_defs import AttachedPolicyTypeDef
+from mypy_boto3_iam.type_defs import (AttachedPolicyTypeDef,
+                                      PolicyDocumentDictTypeDef)
 from mypy_boto3_sts import STSClient
-from toil.lib.aws import zone_to_region
 from toil.lib.aws.session import client as get_client
-from toil.provisioners.aws import get_best_aws_zone
 logger = logging.getLogger(__name__)
@@ -121,7 +120,7 @@ def permission_matches_any(perm: str, list_perms: List[str]) -> bool:
             return True
     return False
-def get_actions_from_policy_document(policy_doc: Dict[str, Any]) -> AllowedActionCollection:
+def get_actions_from_policy_document(policy_doc: PolicyDocumentDictTypeDef) -> AllowedActionCollection:
     '''
     Given a policy document, go through each statement and create an AllowedActionCollection representing the
     permissions granted in the policy document.
@@ -138,11 +137,16 @@ def get_actions_from_policy_document(policy_doc: Dict[str, Any]) -> AllowedActio
             for resource in statement["Resource"]:
                 for key in ["Action", "NotAction"]:
                     if key in statement.keys():
-                        if isinstance(statement[key], list):
-                            allowed_actions[resource][key] += statement[key]
+                        # mypy_boto3_iam declares policy document as a TypedDict
+                        # This type expects 4 string keys, of which NotAction is not an option
+                        # Thus mypy complains. NotAction seems to be valid according to Amazon:
+                        # https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_elements_notaction.html
+                        # so type: ignore for now
+                        if isinstance(statement[key], list):  # type: ignore[literal-required]
+                            allowed_actions[resource][key] += statement[key]  # type: ignore[literal-required]
                         else:
                             #Assumes that if it isn't a list it's probably a string
-                            allowed_actions[resource][key].append(statement[key])
+                            allowed_actions[resource][key].append(statement[key])  # type: ignore[literal-required]
     return allowed_actions
 def allowed_actions_attached(iam: IAMClient, attached_policies: List[AttachedPolicyTypeDef]) -> AllowedActionCollection:
@@ -181,24 +185,28 @@ def allowed_actions_roles(iam: IAMClient, policy_names: List[str], role_name: st
             PolicyName=policy_name
         )
         logger.debug("Checking role policy")
-        policy_document = json.loads(role_policy["PolicyDocument"])
+        # PolicyDocument is now a TypedDict, but an instance of TypedDict is not an instance of dict?
+        if isinstance(role_policy["PolicyDocument"], str):
+            policy_document = json.loads(role_policy["PolicyDocument"])
+        else:
+            policy_document = role_policy["PolicyDocument"]
         allowed_actions = add_to_action_collection(allowed_actions, get_actions_from_policy_document(policy_document))
     return allowed_actions
-def collect_policy_actions(policy_documents: Sequence[Union[str, Dict[str, Any]]]) -> AllowedActionCollection:
+def collect_policy_actions(policy_documents: List[Union[str, PolicyDocumentDictTypeDef]]) -> AllowedActionCollection:
     """
     Collect all of the actions allowed by the given policy documents into one AllowedActionCollection.
     """
     allowed_actions: AllowedActionCollection = init_action_collection()
     for policy_str in policy_documents:
         # sometimes a string is returned from the api, so convert to a dictionary
-        if isinstance(policy_str, dict):
-            policy_dict = policy_str
-        else:
+        if isinstance(policy_str, str):
             policy_dict = json.loads(policy_str)
+        else:
+            policy_dict = policy_str
         allowed_actions = add_to_action_collection(allowed_actions, get_actions_from_policy_document(policy_dict))
     return allowed_actions

toil/lib/aws/session.py CHANGED Viewed

@@ -12,24 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import collections
-import inspect
 import logging
 import os
-import re
-import socket
 import threading
-from typing import (Any,
-                    Callable,
-                    Dict,
-                    Iterable,
-                    List,
-                    Optional,
-                    Tuple,
-                    TypeVar,
-                    Union,
-                    cast)
-from urllib.error import URLError
-from urllib.request import urlopen
+from typing import Dict, Optional, Tuple, cast
 import boto3
 import boto3.resources.base
@@ -37,8 +23,8 @@ import boto.connection
 import botocore
 from boto3 import Session
 from botocore.client import Config
-from botocore.utils import JSONFileCache
 from botocore.session import get_session
+from botocore.utils import JSONFileCache
 logger = logging.getLogger(__name__)

toil/lib/aws/utils.py CHANGED Viewed

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import errno
-import json
 import logging
 import os
 import socket
@@ -21,15 +20,13 @@ from typing import (Any,
                     Callable,
                     ContextManager,
                     Dict,
-                    Hashable,
                     Iterable,
                     Iterator,
                     List,
                     Optional,
                     Set,
                     Union,
-                    cast,
-                    MutableMapping)
+                    cast)
 from urllib.parse import ParseResult
 from toil.lib.aws import session
@@ -345,6 +342,8 @@ def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "Ob
         """
         Extracts a key (object) from a given parsed s3:// URL.
+        If existing is true and the object does not exist, raises FileNotFoundError.
         :param bool existing: If True, key is expected to exist. If False, key is expected not to
                 exists and it will be created. If None, the key will be created if it doesn't exist.
         """
@@ -386,7 +385,7 @@ def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "Ob
             else:
                 raise
         if existing is True and not objExists:
-            raise RuntimeError(f"Key '{key_name}' does not exist in bucket '{bucket_name}'.")
+            raise FileNotFoundError(f"Key '{key_name}' does not exist in bucket '{bucket_name}'.")
         elif existing is False and objExists:
             raise RuntimeError(f"Key '{key_name}' exists in bucket '{bucket_name}'.")

toil/lib/compatibility.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import functools
 import warnings
-from typing import Any, Dict, Callable, Union, TypeVar, overload
+from typing import Any, Callable, Union
 def deprecated(new_function_name: str) -> Callable[..., Any]:

toil 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl

toil 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl