PyPI - toil - Versions diffs - 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl - Mend

toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (104) hide show

toil/__init__.py +1 -232
toil/batchSystems/abstractBatchSystem.py +41 -17
toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
toil/batchSystems/awsBatch.py +8 -8
toil/batchSystems/cleanup_support.py +7 -3
toil/batchSystems/contained_executor.py +4 -5
toil/batchSystems/gridengine.py +1 -1
toil/batchSystems/htcondor.py +5 -5
toil/batchSystems/kubernetes.py +25 -11
toil/batchSystems/local_support.py +3 -3
toil/batchSystems/lsf.py +9 -9
toil/batchSystems/mesos/batchSystem.py +4 -4
toil/batchSystems/mesos/executor.py +3 -2
toil/batchSystems/options.py +9 -0
toil/batchSystems/singleMachine.py +11 -10
toil/batchSystems/slurm.py +129 -16
toil/batchSystems/torque.py +1 -1
toil/bus.py +45 -3
toil/common.py +56 -31
toil/cwl/cwltoil.py +442 -371
toil/deferred.py +1 -1
toil/exceptions.py +1 -1
toil/fileStores/abstractFileStore.py +69 -20
toil/fileStores/cachingFileStore.py +6 -22
toil/fileStores/nonCachingFileStore.py +6 -15
toil/job.py +270 -86
toil/jobStores/abstractJobStore.py +37 -31
toil/jobStores/aws/jobStore.py +280 -218
toil/jobStores/aws/utils.py +60 -31
toil/jobStores/conftest.py +2 -2
toil/jobStores/fileJobStore.py +3 -3
toil/jobStores/googleJobStore.py +3 -4
toil/leader.py +89 -38
toil/lib/aws/__init__.py +26 -10
toil/lib/aws/iam.py +2 -2
toil/lib/aws/session.py +62 -22
toil/lib/aws/utils.py +73 -37
toil/lib/conversions.py +24 -1
toil/lib/ec2.py +118 -69
toil/lib/expando.py +1 -1
toil/lib/generatedEC2Lists.py +8 -8
toil/lib/io.py +42 -4
toil/lib/misc.py +1 -3
toil/lib/resources.py +57 -16
toil/lib/retry.py +12 -5
toil/lib/threading.py +29 -14
toil/lib/throttle.py +1 -1
toil/options/common.py +31 -30
toil/options/wdl.py +5 -0
toil/provisioners/__init__.py +9 -3
toil/provisioners/abstractProvisioner.py +12 -2
toil/provisioners/aws/__init__.py +20 -15
toil/provisioners/aws/awsProvisioner.py +406 -329
toil/provisioners/gceProvisioner.py +2 -2
toil/provisioners/node.py +13 -5
toil/server/app.py +1 -1
toil/statsAndLogging.py +93 -23
toil/test/__init__.py +27 -12
toil/test/batchSystems/batchSystemTest.py +40 -33
toil/test/batchSystems/batch_system_plugin_test.py +79 -0
toil/test/batchSystems/test_slurm.py +22 -7
toil/test/cactus/__init__.py +0 -0
toil/test/cactus/test_cactus_integration.py +58 -0
toil/test/cwl/cwlTest.py +245 -236
toil/test/cwl/seqtk_seq.cwl +1 -1
toil/test/docs/scriptsTest.py +11 -14
toil/test/jobStores/jobStoreTest.py +40 -54
toil/test/lib/aws/test_iam.py +2 -2
toil/test/lib/test_ec2.py +1 -1
toil/test/options/__init__.py +13 -0
toil/test/options/options.py +37 -0
toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
toil/test/provisioners/clusterTest.py +99 -16
toil/test/server/serverTest.py +2 -2
toil/test/src/autoDeploymentTest.py +1 -1
toil/test/src/dockerCheckTest.py +2 -1
toil/test/src/environmentTest.py +125 -0
toil/test/src/fileStoreTest.py +1 -1
toil/test/src/jobDescriptionTest.py +18 -8
toil/test/src/jobTest.py +1 -1
toil/test/src/realtimeLoggerTest.py +4 -0
toil/test/src/workerTest.py +52 -19
toil/test/utils/toilDebugTest.py +62 -4
toil/test/utils/utilsTest.py +23 -21
toil/test/wdl/wdltoil_test.py +49 -21
toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
toil/toilState.py +68 -9
toil/utils/toilDebugFile.py +1 -1
toil/utils/toilDebugJob.py +153 -26
toil/utils/toilLaunchCluster.py +12 -2
toil/utils/toilRsyncCluster.py +7 -2
toil/utils/toilSshCluster.py +7 -3
toil/utils/toilStats.py +310 -266
toil/utils/toilStatus.py +98 -52
toil/version.py +11 -11
toil/wdl/wdltoil.py +644 -225
toil/worker.py +125 -83
{toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
toil-7.0.0.dist-info/METADATA +158 -0
{toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
{toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
toil-6.1.0a1.dist-info/METADATA +0 -125
{toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
{toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0

toil/jobStores/aws/utils.py CHANGED Viewed

@@ -17,25 +17,25 @@ import logging
 import os
 import types
 from ssl import SSLError
-from typing import Optional, cast
+from typing import Optional, cast, TYPE_CHECKING, Dict, List, Tuple
 from boto3.s3.transfer import TransferConfig
-from boto.exception import SDBResponseError
 from botocore.client import Config
 from botocore.exceptions import ClientError
-from mypy_boto3_s3 import S3Client, S3ServiceResource
+from mypy_boto3_sdb.type_defs import ItemTypeDef, AttributeTypeDef
-from toil.lib.aws import session
-from toil.lib.aws.utils import connection_reset, get_bucket_region
+from toil.lib.aws import session, AWSServerErrors
+from toil.lib.aws.utils import connection_error, get_bucket_region
 from toil.lib.compatibility import compat_bytes
 from toil.lib.retry import (DEFAULT_DELAYS,
                             DEFAULT_TIMEOUT,
-                            ErrorCondition,
                             get_error_code,
                             get_error_message,
                             get_error_status,
                             old_retry,
                             retry)
+if TYPE_CHECKING:
+    from mypy_boto3_s3 import S3ServiceResource
 logger = logging.getLogger(__name__)
@@ -125,11 +125,11 @@ class SDBHelper:
         return cls._maxChunks() * cls.maxValueSize
     @classmethod
-    def binaryToAttributes(cls, binary):
+    def binaryToAttributes(cls, binary) -> Dict[str, str]:
         """
         Turn a bytestring, or None, into SimpleDB attributes.
         """
-        if binary is None: return {'numChunks': 0}
+        if binary is None: return {'numChunks': '0'}
         assert isinstance(binary, bytes)
         assert len(binary) <= cls.maxBinarySize()
         # The use of compression is just an optimization. We can't include it in the maxValueSize
@@ -143,10 +143,41 @@ class SDBHelper:
         assert len(encoded) <= cls._maxEncodedSize()
         n = cls.maxValueSize
         chunks = (encoded[i:i + n] for i in range(0, len(encoded), n))
-        attributes = {cls._chunkName(i): chunk for i, chunk in enumerate(chunks)}
-        attributes.update({'numChunks': len(attributes)})
+        attributes = {cls._chunkName(i): chunk.decode("utf-8") for i, chunk in enumerate(chunks)}
+        attributes.update({'numChunks': str(len(attributes))})
         return attributes
+    @classmethod
+    def attributeDictToList(cls, attributes: Dict[str, str]) -> List[AttributeTypeDef]:
+        """
+        Convert the attribute dict (ex: from binaryToAttributes) into a list of attribute typed dicts
+        to be compatible with boto3 argument syntax
+        :param attributes: Dict[str, str], attribute in object form
+        :return: List[AttributeTypeDef], list of attributes in typed dict form
+        """
+        return [{"Name": name, "Value": value} for name, value in attributes.items()]
+    @classmethod
+    def attributeListToDict(cls, attributes: List[AttributeTypeDef]) -> Dict[str, str]:
+        """
+        Convert the attribute boto3 representation of list of attribute typed dicts
+        back to a dictionary with name, value pairs
+        :param attribute: List[AttributeTypeDef, attribute in typed dict form
+        :return: Dict[str, str], attribute in dict form
+        """
+        return {attribute["Name"]: attribute["Value"] for attribute in attributes}
+    @classmethod
+    def get_attributes_from_item(cls, item: ItemTypeDef, keys: List[str]) -> List[Optional[str]]:
+        return_values: List[Optional[str]] = [None for _ in keys]
+        mapped_indices: Dict[str, int] = {name: index for index, name in enumerate(keys)}
+        for attribute in item["Attributes"]:
+            name = attribute["Name"]
+            value = attribute["Value"]
+            if name in mapped_indices:
+                return_values[mapped_indices[name]] = value
+        return return_values
     @classmethod
     def _chunkName(cls, i):
         return str(i).zfill(3)
@@ -165,14 +196,21 @@ class SDBHelper:
         return 'numChunks'
     @classmethod
-    def attributesToBinary(cls, attributes):
+    def attributesToBinary(cls, attributes: List[AttributeTypeDef]) -> Tuple[bytes, int]:
         """
         :rtype: (str|None,int)
         :return: the binary data and the number of chunks it was composed from
         """
-        chunks = [(int(k), v) for k, v in attributes.items() if cls._isValidChunkName(k)]
+        chunks = []
+        numChunks: int = 0
+        for attribute in attributes:
+            name = attribute["Name"]
+            value = attribute["Value"]
+            if cls._isValidChunkName(name):
+                chunks.append((int(name), value))
+            if name == "numChunks":
+                numChunks = int(value)
         chunks.sort()
-        numChunks = int(attributes['numChunks'])
         if numChunks:
             serializedJob = b''.join(v.encode() for k, v in chunks)
             compressed = base64.b64decode(serializedJob)
@@ -192,10 +230,7 @@ def fileSizeAndTime(localFilePath):
     return file_stat.st_size, file_stat.st_mtime
-@retry(errors=[ErrorCondition(
-    error=ClientError,
-    error_codes=[404, 500, 502, 503, 504]
-)])
+@retry(errors=[AWSServerErrors])
 def uploadFromPath(localFilePath: str,
                    resource,
                    bucketName: str,
@@ -231,10 +266,7 @@ def uploadFromPath(localFilePath: str,
     return version
-@retry(errors=[ErrorCondition(
-    error=ClientError,
-    error_codes=[404, 500, 502, 503, 504]
-)])
+@retry(errors=[AWSServerErrors])
 def uploadFile(readable,
                resource,
                bucketName: str,
@@ -286,11 +318,8 @@ class ServerSideCopyProhibitedError(RuntimeError):
     insists that you pay to download and upload the data yourself instead.
     """
-@retry(errors=[ErrorCondition(
-    error=ClientError,
-    error_codes=[404, 500, 502, 503, 504]
-)])
-def copyKeyMultipart(resource: S3ServiceResource,
+@retry(errors=[AWSServerErrors])
+def copyKeyMultipart(resource: "S3ServiceResource",
                      srcBucketName: str,
                      srcKeyName: str,
                      srcKeyVersion: str,
@@ -346,7 +375,7 @@ def copyKeyMultipart(resource: S3ServiceResource,
     # not wherever the bucket virtual hostnames go.
     source_region = get_bucket_region(srcBucketName)
     source_client = cast(
-        S3Client,
+        "S3Client",
         session.client(
             's3',
             region_name=source_region,
@@ -438,9 +467,9 @@ def sdb_unavailable(e):
 def no_such_sdb_domain(e):
-    return (isinstance(e, SDBResponseError)
-            and e.error_code
-            and e.error_code.endswith('NoSuchDomain'))
+    return (isinstance(e, ClientError)
+            and get_error_code(e)
+            and get_error_code(e).endswith('NoSuchDomain'))
 def retryable_ssl_error(e):
@@ -451,7 +480,7 @@ def retryable_ssl_error(e):
 def retryable_sdb_errors(e):
     return (sdb_unavailable(e)
             or no_such_sdb_domain(e)
-            or connection_reset(e)
+            or connection_error(e)
             or retryable_ssl_error(e))

toil/jobStores/conftest.py CHANGED Viewed

@@ -17,7 +17,7 @@
 collect_ignore = []
 try:
-    import boto
-    print(boto.__file__)  # prevent this import from being removed
+    import boto3
+    print(boto3.__file__)  # prevent this import from being removed
 except ImportError:
     collect_ignore.append("aws")

toil/jobStores/fileJobStore.py CHANGED Viewed

@@ -113,7 +113,7 @@ class FileJobStore(AbstractJobStore):
             os.mkdir(self.jobStoreDir)
         except OSError as e:
             if e.errno == errno.EEXIST:
-                raise JobStoreExistsException(self.jobStoreDir)
+                raise JobStoreExistsException(self.jobStoreDir, "file")
             else:
                 raise
         os.makedirs(self.jobsDir, exist_ok=True)
@@ -127,7 +127,7 @@ class FileJobStore(AbstractJobStore):
     def resume(self):
         if not os.path.isdir(self.jobStoreDir):
-            raise NoSuchJobStoreException(self.jobStoreDir)
+            raise NoSuchJobStoreException(self.jobStoreDir, "file")
         super().resume()
     def destroy(self):
@@ -920,7 +920,7 @@ class FileJobStore(AbstractJobStore):
         :raise NoSuchFileException: if the file with ID jobStoreFileID does
                                     not exist or is not a file
         """
-        if not self.file_exists(unquote(jobStoreFileID)):
+        if not self.file_exists(jobStoreFileID):
             raise NoSuchFileException(jobStoreFileID)
     def _get_arbitrary_jobs_dir_for_name(self, jobNameSlug):

toil/jobStores/googleJobStore.py CHANGED Viewed

@@ -164,7 +164,7 @@ class GoogleJobStore(AbstractJobStore):
         try:
             self.bucket = self.storageClient.create_bucket(self.bucketName)
         except exceptions.Conflict:
-            raise JobStoreExistsException(self.locator)
+            raise JobStoreExistsException(self.locator, "google")
         super().initialize(config)
         # set up sever side encryption after we set up config in super
@@ -178,7 +178,7 @@ class GoogleJobStore(AbstractJobStore):
         try:
             self.bucket = self.storageClient.get_bucket(self.bucketName)
         except exceptions.NotFound:
-            raise NoSuchJobStoreException(self.locator)
+            raise NoSuchJobStoreException(self.locator, "google")
         super().resume()
     @google_retry
@@ -209,8 +209,7 @@ class GoogleJobStore(AbstractJobStore):
     def assign_job_id(self, job_description):
         jobStoreID = self._new_job_id()
-        log.debug("Assigning ID to job %s for '%s'",
-                  jobStoreID, '<no command>' if job_description.command is None else job_description.command)
+        log.debug("Assigning ID to job %s", jobStoreID)
         job_description.jobStoreID = jobStoreID
     @contextmanager

toil/leader.py CHANGED Viewed

@@ -28,14 +28,16 @@ import enlighten
 from toil import resolveEntryPoint
 from toil.batchSystems import DeadlockException
 from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem,
-                                                   BatchJobExitReason)
+                                                   BatchJobExitReason,
+                                                   EXIT_STATUS_UNAVAILABLE_VALUE)
 from toil.bus import (JobCompletedMessage,
                       JobFailedMessage,
                       JobIssuedMessage,
                       JobMissingMessage,
                       JobUpdatedMessage,
                       QueueSizeMessage,
-                      gen_message_bus_path)
+                      gen_message_bus_path,
+                      get_job_kind)
 from toil.common import Config, ToilMetrics
 from toil.cwl.utils import CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
 from toil.exceptions import FailedJobsException
@@ -117,7 +119,12 @@ class Leader:
         if self.config.write_messages is None:
             # The user hasn't specified a place for the message bus so we
             # should make one.
-            self.config.write_messages = gen_message_bus_path()
+            # pass in coordination_dir for toil-cwl-runner; we want to obey --tmpdir-prefix
+            # from cwltool and we change the coordination_dir when detected. we don't want
+            # to make another config attribute so put the message bus in the already prefixed dir
+            # if a coordination_dir is provided normally, we can still put the bus in there
+            # as the coordination dir should serve a similar purpose to the tmp directory
+            self.config.write_messages = gen_message_bus_path(config.coordination_dir)
         # Message bus messages need to go to the given file.
         # Keep a reference to the return value so the listener stays alive.
@@ -287,7 +294,11 @@ class Leader:
                 for job_id in self.toilState.totalFailedJobs:
                     # Refresh all the failed jobs to get e.g. the log file IDs that the workers wrote
                     self.toilState.reset_job(job_id)
-                    failed_jobs.append(self.toilState.get_job(job_id))
+                    try:
+                        failed_jobs.append(self.toilState.get_job(job_id))
+                    except NoSuchJobException:
+                        # Job actually finished and was removed
+                        pass
                 logger.info("Failed jobs at end of the run: %s", ' '.join(str(j) for j in failed_jobs))
                 raise FailedJobsException(self.jobStore, failed_jobs, exit_code=self.recommended_fail_exit_code)
@@ -520,10 +531,10 @@ class Leader:
                          "manager: %s", readyJob.jobStoreID)
         elif readyJob.jobStoreID in self.toilState.hasFailedSuccessors:
             self._processFailedSuccessors(job_id)
-        elif readyJob.command is not None or result_status != 0:
-            # The job has a command it must be run before any successors.
+        elif readyJob.has_body() or result_status != 0:
+            # The job has a body it must be run before any successors.
             # Similarly, if the job previously failed we rerun it, even if it doesn't have a
-            # command to run, to eliminate any parts of the stack now completed.
+            # body to run, to eliminate any parts of the stack now completed.
             isServiceJob = readyJob.jobStoreID in self.toilState.service_to_client
             # We want to run the job, and expend one of its "tries" (possibly
@@ -549,6 +560,7 @@ class Leader:
                 for serviceID in serviceJobList:
                     if serviceID in self.toilState.service_to_client:
                         raise RuntimeError(f"The ready service ID: {serviceID} was already added.")
+                    # TODO: Why do we refresh here?
                     self.toilState.reset_job(serviceID)
                     serviceHost = self.toilState.get_job(serviceID)
                     self.toilState.service_to_client[serviceID] = readyJob.jobStoreID
@@ -705,8 +717,9 @@ class Leader:
             if exitStatus == 0:
                 logger.debug('Job ended: %s', updatedJob)
             else:
-                logger.warning(f'Job failed with exit value {exitStatus}: {updatedJob}\n'
-                               f'Exit reason: {exitReason}')
+                status_string = str(exitStatus) if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE else "<UNAVAILABLE>"
+                logger.warning(f'Job failed with exit value {status_string}: {updatedJob}\n'
+                               f'Exit reason: {BatchJobExitReason.to_string(exitReason)}')
                 if exitStatus == CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE:
                     # This is a CWL job informing us that the workflow is
                     # asking things of us that Toil can't do. When we raise an
@@ -715,7 +728,7 @@ class Leader:
                     logger.warning("This indicates an unsupported CWL requirement!")
                     self.recommended_fail_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
             # Tell everyone it stopped running.
-            self._messages.publish(JobCompletedMessage(updatedJob.get_job_kind(), updatedJob.jobStoreID, exitStatus))
+            self._messages.publish(JobCompletedMessage(get_job_kind(updatedJob.get_names()), updatedJob.jobStoreID, exitStatus))
             self.process_finished_job(bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason)
     def _processLostJobs(self):
@@ -893,11 +906,6 @@ class Leader:
             workerCommand.append('--context')
             workerCommand.append(base64.b64encode(pickle.dumps(context)).decode('utf-8'))
-        # We locally override the command. This shouldn't get persisted back to
-        # the job store, or we will detach the job body from the job
-        # description. TODO: Don't do it this way! It's weird!
-        jobNode.command = ' '.join(workerCommand)
         omp_threads = os.environ.get('OMP_NUM_THREADS') \
             or str(max(1, int(jobNode.cores)))  # make sure OMP_NUM_THREADS is a positive integer
@@ -907,7 +915,7 @@ class Leader:
         }
         # jobBatchSystemID is an int for each job
-        jobBatchSystemID = self.batchSystem.issueBatchJob(jobNode, job_environment=job_environment)
+        jobBatchSystemID = self.batchSystem.issueBatchJob(' '.join(workerCommand), jobNode, job_environment=job_environment)
         # Record the job by the ID the batch system will use to talk about it with us
         self.issued_jobs_by_batch_system_id[jobBatchSystemID] = jobNode.jobStoreID
         # Record that this job is issued right now and shouldn't e.g. be issued again.
@@ -921,7 +929,7 @@ class Leader:
                    "%s and %s",
                    jobNode, str(jobBatchSystemID), jobNode.requirements_string())
         # Tell everyone it is issued and the queue size changed
-        self._messages.publish(JobIssuedMessage(jobNode.get_job_kind(), jobNode.jobStoreID, jobBatchSystemID))
+        self._messages.publish(JobIssuedMessage(get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID))
         self._messages.publish(QueueSizeMessage(self.getNumberOfJobsIssued()))
         # Tell the user there's another job to do
         self.progress_overall.total += 1
@@ -1045,7 +1053,7 @@ class Leader:
             jobs = [job for job in jobs if job.preemptible == preemptible]
         return jobs
-    def killJobs(self, jobsToKill):
+    def killJobs(self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitReason.KILLED):
         """
         Kills the given set of jobs and then sends them for processing.
@@ -1059,7 +1067,7 @@ class Leader:
             self.batchSystem.killBatchJobs(jobsToKill)
             for jobBatchSystemID in jobsToKill:
                 # Reissue immediately, noting that we killed the job
-                willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=BatchJobExitReason.KILLED)
+                willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=exit_reason)
                 if willRerun:
                     # Compose a list of all the jobs that will run again
@@ -1089,7 +1097,7 @@ class Leader:
                                 str(runningJobs[jobBatchSystemID]),
                                 str(maxJobDuration))
                     jobsToKill.append(jobBatchSystemID)
-            reissued = self.killJobs(jobsToKill)
+            reissued = self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MAXJOBDURATION)
             if len(jobsToKill) > 0:
                 # Summarize our actions
                 logger.info("Killed %d over long jobs and reissued %d of them", len(jobsToKill), len(reissued))
@@ -1127,7 +1135,7 @@ class Leader:
             if timesMissing == killAfterNTimesMissing:
                 self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
                 jobsToKill.append(jobBatchSystemID)
-        self.killJobs(jobsToKill)
+        self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MISSING)
         return len( self.reissueMissingJobs_missingHash ) == 0 #We use this to inform
         #if there are missing jobs
@@ -1157,7 +1165,7 @@ class Leader:
             self.progress_overall.update(incr=-1)
             self.progress_failed.update(incr=1)
-        # Delegate to the vers
+        # Delegate to the version that uses a JobDescription
         return self.process_finished_job_description(issued_job, result_status, wall_time, exit_reason, batch_system_id)
     def process_finished_job_description(self, finished_job: JobDescription, result_status: int,
@@ -1165,7 +1173,7 @@ class Leader:
                                          exit_reason: Optional[BatchJobExitReason] = None,
                                          batch_system_id: Optional[int] = None) -> bool:
         """
-        Process a finished JobDescription based upon its succees or failure.
+        Process a finished JobDescription based upon its success or failure.
         If wall-clock time is available, informs the cluster scaler about the
         job finishing.
@@ -1188,19 +1196,62 @@ class Leader:
             logger.debug("Job %s continues to exist (i.e. has more to do)", finished_job)
             try:
                 # Reload the job as modified by the worker
-                self.toilState.reset_job(job_store_id)
-                replacement_job = self.toilState.get_job(job_store_id)
+                if finished_job.has_body():
+                    # The worker was expected to do some work. We expect the
+                    # worker to have updated the job description.
+                    # If the job succeeded, we wait around to see the update
+                    # and fail the job if we don't see it.
+                    if result_status == 0:
+                        timeout = self.config.job_store_timeout
+                        complaint = (
+                            f"has no new version available after {timeout} "
+                            "seconds. Either worker updates to "
+                            "the job store are delayed longer than your "
+                            "--jobStoreTimeout, or the worker trying to run the "
+                            "job was killed (or never started)."
+                        )
+                    else:
+                        timeout = 0
+                        complaint = (
+                            "has no new version available immediately. The "
+                            "batch system may have killed (or never started) "
+                            "the Toil worker."
+                        )
+                    change_detected = self.toilState.reset_job_expecting_change(job_store_id, timeout)
+                    replacement_job = self.toilState.get_job(job_store_id)
+                    if not change_detected:
+                        logger.warning(
+                            'Job %s %s',
+                            replacement_job,
+                            complaint
+                        )
+                        if result_status == 0:
+                            # Make the job fail because we ran it and it finished
+                            # and we never heard back.
+                            logger.error(
+                                'Marking ostensibly successful job %s that did '
+                                'not report in to the job store before '
+                                '--jobStoreTimeout as having been partitioned '
+                                'from us.',
+                                replacement_job
+                            )
+                            result_status = EXIT_STATUS_UNAVAILABLE_VALUE
+                            exit_reason = BatchJobExitReason.PARTITION
+                else:
+                    # If there was no body sent, the worker won't commit any
+                    # changes to the job description. So don't wait around for
+                    # any and don't complain if we don't see them.
+                    self.toilState.reset_job(job_store_id)
+                    replacement_job = self.toilState.get_job(job_store_id)
             except NoSuchJobException:
                 # We have a ghost job - the job has been deleted but a stale
                 # read from e.g. a non-POSIX-compliant filesystem gave us a
                 # false positive when we checked for its existence. Process the
                 # job from here as any other job removed from the job store.
-                # This is a hack until we can figure out how to actually always
-                # have a strongly-consistent communications channel. See
-                # https://github.com/BD2KGenomics/toil/issues/1091
-                logger.warning('Got a stale read for job %s; caught its '
-                'completion in time, but other jobs may try to run twice! Fix '
-                'the consistency of your job store storage!', finished_job)
+                logger.debug("Job %s is actually complete upon closer inspection", finished_job)
                 self.processRemovedJob(finished_job, result_status)
                 return False
             if replacement_job.logJobStoreFileID is not None:
@@ -1208,11 +1259,12 @@ class Leader:
                     # more memory efficient than read().striplines() while leaving off the
                     # trailing \n left when using readlines()
                     # http://stackoverflow.com/a/15233739
-                    StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
+                    StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
                                                       message='The job seems to have left a log file, indicating failure: %s' % replacement_job)
                 if self.config.writeLogs or self.config.writeLogsGzip:
                     with replacement_job.getLogFileHandle(self.jobStore) as log_stream:
-                        StatsAndLogging.writeLogFiles(replacement_job.chainedJobs, log_stream, self.config, failed=True)
+                        # Send log data from the job store to each per-job log file involved.
+                        StatsAndLogging.writeLogFiles([names.stats_name for names in replacement_job.get_chain()], log_stream, self.config, failed=True)
             if result_status != 0:
                 # If the batch system returned a non-zero exit code then the worker
                 # is assumed not to have captured the failure of the job, so we
@@ -1236,13 +1288,12 @@ class Leader:
                         else:
                             with log_stream:
                                 if os.path.getsize(log_file) > 0:
-                                    StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
+                                    StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
                                                                       message='The batch system left a non-empty file %s:' % log_file)
                                     if self.config.writeLogs or self.config.writeLogsGzip:
                                         file_root, _ = os.path.splitext(os.path.basename(log_file))
-                                        job_names = replacement_job.chainedJobs
-                                        if job_names is None:   # For jobs that fail this way, replacement_job.chainedJobs is not guaranteed to be set
-                                            job_names = [str(replacement_job)]
+                                        job_names = [names.stats_name for names in replacement_job.get_chain()]
+                                        # Tack the batch system log file name onto each job's name
                                         job_names = [j + '_' + file_root for j in job_names]
                                         log_stream.seek(0)
                                         StatsAndLogging.writeLogFiles(job_names, log_stream, self.config, failed=True)
@@ -1309,7 +1360,7 @@ class Leader:
         # Tell everyone it failed
-        self._messages.publish(JobFailedMessage(job_desc.get_job_kind(), job_id))
+        self._messages.publish(JobFailedMessage(get_job_kind(job_desc.get_names()), job_id))
         if job_id in self.toilState.service_to_client:
             # Is a service job

toil/lib/aws/__init__.py CHANGED Viewed

@@ -16,11 +16,25 @@ import logging
 import os
 import re
 import socket
+import toil.lib.retry
 from http.client import HTTPException
-from typing import Dict, MutableMapping, Optional
+from typing import Dict, MutableMapping, Optional, Union, Literal
 from urllib.error import URLError
 from urllib.request import urlopen
+from botocore.exceptions import ClientError
+from mypy_boto3_s3.literals import BucketLocationConstraintType
+AWSRegionName = Union[BucketLocationConstraintType, Literal["us-east-1"]]
+# These are errors where we think something randomly
+# went wrong on the AWS side and we ought to retry.
+AWSServerErrors = toil.lib.retry.ErrorCondition(
+    error=ClientError,
+    error_codes=[404, 500, 502, 503, 504]
+)
 logger = logging.getLogger(__name__)
 # This file isn't allowed to import anything that depends on Boto or Boto3,
@@ -67,11 +81,10 @@ def get_aws_zone_from_metadata() -> Optional[str]:
         # metadata.
         try:
             # Use the EC2 metadata service
-            import boto
-            str(boto)  # to prevent removal of the import
-            from boto.utils import get_instance_metadata
+            from ec2_metadata import ec2_metadata
             logger.debug("Fetch AZ from EC2 metadata")
-            return get_instance_metadata()['placement']['availability-zone']
+            return ec2_metadata.availability_zone
         except ImportError:
             # This is expected to happen a lot
             logger.debug("No boto to fetch ECS metadata")
@@ -82,12 +95,15 @@ def get_aws_zone_from_metadata() -> Optional[str]:
 def get_aws_zone_from_boto() -> Optional[str]:
     """
-    Get the AWS zone from the Boto config file, if it is configured and the
-    boto module is available.
+    Get the AWS zone from the Boto3 config file or from AWS_DEFAULT_REGION, if it is configured and the
+    boto3 module is available.
     """
     try:
-        import boto
-        zone = boto.config.get('Boto', 'ec2_region_name')
+        import boto3
+        from session import client
+        boto3_session = boto3.session.Session()
+        # this should check AWS_DEFAULT_REGION and ~/.aws/config
+        zone = boto3_session.region_name
         if zone is not None:
             zone += 'a'  # derive an availability zone in the region
         return zone
@@ -128,7 +144,7 @@ def get_current_aws_zone() -> Optional[str]:
         get_aws_zone_from_environment_region() or \
         get_aws_zone_from_boto()
-def zone_to_region(zone: str) -> str:
+def zone_to_region(zone: str) -> AWSRegionName:
     """Get a region (e.g. us-west-2) from a zone (e.g. us-west-1c)."""
     # re.compile() caches the regex internally so we don't have to
     availability_zone = re.compile(r'^([a-z]{2}-[a-z]+-[1-9][0-9]*)([a-z])$')

toil/lib/aws/iam.py CHANGED Viewed

@@ -257,8 +257,8 @@ def get_policy_permissions(region: str) -> AllowedActionCollection:
     :param zone: AWS zone to connect to
     """
-    iam: IAMClient = cast(IAMClient, get_client('iam', region))
-    sts: STSClient = cast(STSClient, get_client('sts', region))
+    iam: IAMClient = get_client('iam', region)
+    sts: STSClient = get_client('sts', region)
     #TODO Condider effect: deny at some point
     allowed_actions: AllowedActionCollection = defaultdict(lambda: {'Action': [], 'NotAction': []})
     try:

toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl

toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl