toil 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. toil/__init__.py +18 -13
  2. toil/batchSystems/abstractBatchSystem.py +39 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +24 -24
  4. toil/batchSystems/awsBatch.py +14 -14
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +3 -3
  7. toil/batchSystems/htcondor.py +0 -1
  8. toil/batchSystems/kubernetes.py +34 -31
  9. toil/batchSystems/local_support.py +3 -1
  10. toil/batchSystems/lsf.py +7 -7
  11. toil/batchSystems/mesos/batchSystem.py +7 -7
  12. toil/batchSystems/options.py +32 -83
  13. toil/batchSystems/registry.py +104 -23
  14. toil/batchSystems/singleMachine.py +16 -13
  15. toil/batchSystems/slurm.py +87 -16
  16. toil/batchSystems/torque.py +0 -1
  17. toil/bus.py +44 -8
  18. toil/common.py +544 -753
  19. toil/cwl/__init__.py +28 -32
  20. toil/cwl/cwltoil.py +595 -574
  21. toil/cwl/utils.py +55 -10
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/__init__.py +2 -2
  24. toil/fileStores/abstractFileStore.py +88 -14
  25. toil/fileStores/cachingFileStore.py +610 -549
  26. toil/fileStores/nonCachingFileStore.py +46 -22
  27. toil/job.py +182 -101
  28. toil/jobStores/abstractJobStore.py +161 -95
  29. toil/jobStores/aws/jobStore.py +23 -9
  30. toil/jobStores/aws/utils.py +6 -6
  31. toil/jobStores/fileJobStore.py +116 -18
  32. toil/jobStores/googleJobStore.py +16 -7
  33. toil/jobStores/utils.py +5 -6
  34. toil/leader.py +87 -56
  35. toil/lib/accelerators.py +10 -5
  36. toil/lib/aws/__init__.py +3 -14
  37. toil/lib/aws/ami.py +22 -9
  38. toil/lib/aws/iam.py +21 -13
  39. toil/lib/aws/session.py +2 -16
  40. toil/lib/aws/utils.py +4 -5
  41. toil/lib/compatibility.py +1 -1
  42. toil/lib/conversions.py +26 -3
  43. toil/lib/docker.py +22 -23
  44. toil/lib/ec2.py +10 -6
  45. toil/lib/ec2nodes.py +106 -100
  46. toil/lib/encryption/_nacl.py +2 -1
  47. toil/lib/generatedEC2Lists.py +325 -18
  48. toil/lib/io.py +49 -2
  49. toil/lib/misc.py +1 -1
  50. toil/lib/resources.py +9 -2
  51. toil/lib/threading.py +101 -38
  52. toil/options/common.py +736 -0
  53. toil/options/cwl.py +336 -0
  54. toil/options/wdl.py +37 -0
  55. toil/provisioners/abstractProvisioner.py +9 -4
  56. toil/provisioners/aws/__init__.py +3 -6
  57. toil/provisioners/aws/awsProvisioner.py +6 -0
  58. toil/provisioners/clusterScaler.py +3 -2
  59. toil/provisioners/gceProvisioner.py +2 -2
  60. toil/realtimeLogger.py +2 -1
  61. toil/resource.py +24 -18
  62. toil/server/app.py +2 -3
  63. toil/server/cli/wes_cwl_runner.py +4 -4
  64. toil/server/utils.py +1 -1
  65. toil/server/wes/abstract_backend.py +3 -2
  66. toil/server/wes/amazon_wes_utils.py +5 -4
  67. toil/server/wes/tasks.py +2 -3
  68. toil/server/wes/toil_backend.py +2 -10
  69. toil/server/wsgi_app.py +2 -0
  70. toil/serviceManager.py +12 -10
  71. toil/statsAndLogging.py +41 -9
  72. toil/test/__init__.py +29 -54
  73. toil/test/batchSystems/batchSystemTest.py +11 -111
  74. toil/test/batchSystems/test_slurm.py +24 -8
  75. toil/test/cactus/__init__.py +0 -0
  76. toil/test/cactus/test_cactus_integration.py +58 -0
  77. toil/test/cwl/cwlTest.py +438 -223
  78. toil/test/cwl/glob_dir.cwl +15 -0
  79. toil/test/cwl/preemptible.cwl +21 -0
  80. toil/test/cwl/preemptible_expression.cwl +28 -0
  81. toil/test/cwl/revsort.cwl +1 -1
  82. toil/test/cwl/revsort2.cwl +1 -1
  83. toil/test/docs/scriptsTest.py +2 -3
  84. toil/test/jobStores/jobStoreTest.py +34 -21
  85. toil/test/lib/aws/test_iam.py +4 -14
  86. toil/test/lib/aws/test_utils.py +0 -3
  87. toil/test/lib/dockerTest.py +4 -4
  88. toil/test/lib/test_ec2.py +12 -17
  89. toil/test/mesos/helloWorld.py +4 -5
  90. toil/test/mesos/stress.py +1 -1
  91. toil/test/{wdl/conftest.py → options/__init__.py} +0 -10
  92. toil/test/options/options.py +37 -0
  93. toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
  94. toil/test/provisioners/clusterScalerTest.py +6 -4
  95. toil/test/provisioners/clusterTest.py +23 -11
  96. toil/test/provisioners/gceProvisionerTest.py +0 -6
  97. toil/test/provisioners/restartScript.py +3 -2
  98. toil/test/server/serverTest.py +1 -1
  99. toil/test/sort/restart_sort.py +2 -1
  100. toil/test/sort/sort.py +2 -1
  101. toil/test/sort/sortTest.py +2 -13
  102. toil/test/src/autoDeploymentTest.py +45 -45
  103. toil/test/src/busTest.py +5 -5
  104. toil/test/src/checkpointTest.py +2 -2
  105. toil/test/src/deferredFunctionTest.py +1 -1
  106. toil/test/src/fileStoreTest.py +32 -16
  107. toil/test/src/helloWorldTest.py +1 -1
  108. toil/test/src/importExportFileTest.py +1 -1
  109. toil/test/src/jobDescriptionTest.py +2 -1
  110. toil/test/src/jobServiceTest.py +1 -1
  111. toil/test/src/jobTest.py +18 -18
  112. toil/test/src/miscTests.py +5 -3
  113. toil/test/src/promisedRequirementTest.py +3 -3
  114. toil/test/src/realtimeLoggerTest.py +1 -1
  115. toil/test/src/resourceTest.py +2 -2
  116. toil/test/src/restartDAGTest.py +1 -1
  117. toil/test/src/resumabilityTest.py +36 -2
  118. toil/test/src/retainTempDirTest.py +1 -1
  119. toil/test/src/systemTest.py +2 -2
  120. toil/test/src/toilContextManagerTest.py +2 -2
  121. toil/test/src/userDefinedJobArgTypeTest.py +1 -1
  122. toil/test/utils/toilDebugTest.py +98 -32
  123. toil/test/utils/toilKillTest.py +2 -2
  124. toil/test/utils/utilsTest.py +23 -3
  125. toil/test/wdl/wdltoil_test.py +223 -45
  126. toil/toilState.py +7 -6
  127. toil/utils/toilClean.py +1 -1
  128. toil/utils/toilConfig.py +36 -0
  129. toil/utils/toilDebugFile.py +60 -33
  130. toil/utils/toilDebugJob.py +39 -12
  131. toil/utils/toilDestroyCluster.py +1 -1
  132. toil/utils/toilKill.py +1 -1
  133. toil/utils/toilLaunchCluster.py +13 -2
  134. toil/utils/toilMain.py +3 -2
  135. toil/utils/toilRsyncCluster.py +1 -1
  136. toil/utils/toilSshCluster.py +1 -1
  137. toil/utils/toilStats.py +445 -305
  138. toil/utils/toilStatus.py +2 -5
  139. toil/version.py +10 -10
  140. toil/wdl/utils.py +2 -122
  141. toil/wdl/wdltoil.py +1257 -492
  142. toil/worker.py +55 -46
  143. toil-6.1.0.dist-info/METADATA +124 -0
  144. toil-6.1.0.dist-info/RECORD +241 -0
  145. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/WHEEL +1 -1
  146. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -1
  147. toil/batchSystems/parasol.py +0 -379
  148. toil/batchSystems/tes.py +0 -459
  149. toil/test/batchSystems/parasolTestSupport.py +0 -117
  150. toil/test/wdl/builtinTest.py +0 -506
  151. toil/test/wdl/toilwdlTest.py +0 -522
  152. toil/wdl/toilwdl.py +0 -141
  153. toil/wdl/versions/dev.py +0 -107
  154. toil/wdl/versions/draft2.py +0 -980
  155. toil/wdl/versions/v1.py +0 -794
  156. toil/wdl/wdl_analysis.py +0 -116
  157. toil/wdl/wdl_functions.py +0 -997
  158. toil/wdl/wdl_synthesis.py +0 -1011
  159. toil/wdl/wdl_types.py +0 -243
  160. toil-5.12.0.dist-info/METADATA +0 -118
  161. toil-5.12.0.dist-info/RECORD +0 -244
  162. /toil/{wdl/versions → options}/__init__.py +0 -0
  163. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
  164. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
toil/__init__.py CHANGED
@@ -20,6 +20,7 @@ import sys
20
20
  import time
21
21
  from datetime import datetime
22
22
  from typing import TYPE_CHECKING, Optional, Tuple
23
+
23
24
  import requests
24
25
  from pytz import timezone
25
26
 
@@ -106,7 +107,8 @@ def toilPackageDirPath() -> str:
106
107
  The return value is guaranteed to end in '/toil'.
107
108
  """
108
109
  result = os.path.dirname(os.path.realpath(__file__))
109
- assert result.endswith('/toil')
110
+ if not result.endswith('/toil'):
111
+ raise RuntimeError("The top-level toil package is not named Toil.")
110
112
  return result
111
113
 
112
114
 
@@ -132,7 +134,8 @@ def resolveEntryPoint(entryPoint: str) -> str:
132
134
  # opposed to being included via --system-site-packages). For clusters this means that
133
135
  # if Toil is installed in a virtualenv on the leader, it must be installed in
134
136
  # a virtualenv located at the same path on each worker as well.
135
- assert os.access(path, os.X_OK)
137
+ if not os.access(path, os.X_OK):
138
+ raise RuntimeError("Cannot access the Toil virtualenv. If installed in a virtualenv on a cluster, make sure that the virtualenv path is the same for the leader and workers.")
136
139
  return path
137
140
  # Otherwise, we aren't in a virtualenv, or we're in a virtualenv but Toil
138
141
  # came in via --system-site-packages, or we think the virtualenv might not
@@ -238,7 +241,8 @@ def customInitCmd() -> str:
238
241
 
239
242
  def _check_custom_bash_cmd(cmd_str):
240
243
  """Ensure that the Bash command doesn't contain invalid characters."""
241
- assert not re.search(r'[\n\r\t]', cmd_str), f'"{cmd_str}" contains invalid characters (newline and/or tab).'
244
+ if re.search(r'[\n\r\t]', cmd_str):
245
+ raise RuntimeError(f'"{cmd_str}" contains invalid characters (newline and/or tab).')
242
246
 
243
247
 
244
248
  def lookupEnvVar(name: str, envName: str, defaultValue: str) -> str:
@@ -370,11 +374,10 @@ def requestCheckRegularDocker(origAppliance: str, registryName: str, imageName:
370
374
  separate check is done for docker.io images.
371
375
 
372
376
  :param origAppliance: The full url of the docker image originally
373
- specified by the user (or the default).
374
- e.g. ``quay.io/ucsc_cgl/toil:latest``
375
- :param registryName: The url of a docker image's registry. e.g. ``quay.io``
376
- :param imageName: The image, including path and excluding the tag. e.g. ``ucsc_cgl/toil``
377
- :param tag: The tag used at that docker image's registry. e.g. ``latest``
377
+ specified by the user (or the default). For example, ``quay.io/ucsc_cgl/toil:latest``.
378
+ :param registryName: The url of a docker image's registry. For example, ``quay.io``.
379
+ :param imageName: The image, including path and excluding the tag. For example, ``ucsc_cgl/toil``.
380
+ :param tag: The tag used at that docker image's registry. For example, ``latest``.
378
381
  :raises: ApplianceImageNotFound if no match is found.
379
382
  :return: Return True if match found.
380
383
  """
@@ -399,9 +402,9 @@ def requestCheckDockerIo(origAppliance: str, imageName: str, tag: str) -> bool:
399
402
  URL is based on the docker v2 schema. Requires that an access token be fetched first.
400
403
 
401
404
  :param origAppliance: The full url of the docker image originally
402
- specified by the user (or the default). e.g. "ubuntu:latest"
403
- :param imageName: The image, including path and excluding the tag. e.g. "ubuntu"
404
- :param tag: The tag used at that docker image's registry. e.g. "latest"
405
+ specified by the user (or the default). For example, ``ubuntu:latest``.
406
+ :param imageName: The image, including path and excluding the tag. For example, ``ubuntu``.
407
+ :param tag: The tag used at that docker image's registry. For example, ``latest``.
405
408
  :raises: ApplianceImageNotFound if no match is found.
406
409
  :return: Return True if match found.
407
410
  """
@@ -548,7 +551,8 @@ try:
548
551
  So if we ever want to refresh, Boto 3 wants to refresh too.
549
552
  """
550
553
  # This should only happen if we have expiring credentials, which we should only get from boto3
551
- assert (self._boto3_resolver is not None)
554
+ if self._boto3_resolver is None:
555
+ raise RuntimeError("The Boto3 resolver should not be None.")
552
556
 
553
557
  self._obtain_credentials_from_cache_or_boto3()
554
558
 
@@ -612,7 +616,8 @@ try:
612
616
  content = f.read()
613
617
  if content:
614
618
  record = content.split('\n')
615
- assert len(record) == 4
619
+ if len(record) != 4:
620
+ raise RuntimeError("Number of cached credentials is not 4.")
616
621
  self._access_key = record[0]
617
622
  self._secret_key = record[1]
618
623
  self._security_token = record[2]
@@ -15,19 +15,19 @@ import enum
15
15
  import logging
16
16
  import os
17
17
  import shutil
18
+ import time
18
19
  from abc import ABC, abstractmethod
19
20
  from argparse import ArgumentParser, _ArgumentGroup
20
21
  from contextlib import contextmanager
21
22
  from threading import Condition
22
- import time
23
23
  from typing import (Any,
24
24
  ContextManager,
25
25
  Dict,
26
- List,
27
- Set,
28
26
  Iterator,
27
+ List,
29
28
  NamedTuple,
30
29
  Optional,
30
+ Set,
31
31
  Union,
32
32
  cast)
33
33
 
@@ -37,6 +37,7 @@ from toil.common import Config, Toil, cacheDirName
37
37
  from toil.deferred import DeferredFunctionManager
38
38
  from toil.fileStores.abstractFileStore import AbstractFileStore
39
39
  from toil.job import JobDescription, ParsedRequirement, Requirer
40
+ from toil.lib.memoize import memoize
40
41
  from toil.resource import Resource
41
42
 
42
43
  logger = logging.getLogger(__name__)
@@ -58,13 +59,28 @@ class BatchJobExitReason(enum.IntEnum):
58
59
  MEMLIMIT: int = 6
59
60
  """Job hit batch system imposed memory limit."""
60
61
 
62
+ @classmethod
63
+ def to_string(cls, value: int) -> str:
64
+ """
65
+ Convert to human-readable string.
66
+
67
+ Given an int that may be or may be equal to a value from the enum,
68
+ produce the string value of its matching enum entry, or a stringified
69
+ int.
70
+ """
71
+ try:
72
+ return cls(value).name
73
+ except ValueError:
74
+ return str(value)
75
+
61
76
  class UpdatedBatchJobInfo(NamedTuple):
62
77
  jobID: int
63
78
  exitStatus: int
64
79
  """
65
80
  The exit status (integer value) of the job. 0 implies successful.
66
81
 
67
- EXIT_STATUS_UNAVAILABLE_VALUE is used when the exit status is not available (e.g. job is lost).
82
+ EXIT_STATUS_UNAVAILABLE_VALUE is used when the exit status is not available
83
+ (e.g. job is lost, or otherwise died but actual exit code was not reported).
68
84
  """
69
85
 
70
86
  exitReason: Optional[BatchJobExitReason]
@@ -106,6 +122,8 @@ class AbstractBatchSystem(ABC):
106
122
  @abstractmethod
107
123
  def supportsWorkerCleanup(cls) -> bool:
108
124
  """
125
+ Whether this batch system supports worker cleanup.
126
+
109
127
  Indicates whether this batch system invokes
110
128
  :meth:`BatchSystemSupport.workerCleanup` after the last job for a
111
129
  particular workflow invocation finishes. Note that the term *worker*
@@ -119,7 +137,9 @@ class AbstractBatchSystem(ABC):
119
137
 
120
138
  def setUserScript(self, userScript: Resource) -> None:
121
139
  """
122
- Set the user script for this workflow. This method must be called before the first job is
140
+ Set the user script for this workflow.
141
+
142
+ This method must be called before the first job is
123
143
  issued to this batch system, and only if :meth:`.supportsAutoDeployment` returns True,
124
144
  otherwise it will raise an exception.
125
145
 
@@ -134,7 +154,6 @@ class AbstractBatchSystem(ABC):
134
154
  bus, so that it can send informational messages about the jobs it is
135
155
  running to other Toil components.
136
156
  """
137
- pass
138
157
 
139
158
  @abstractmethod
140
159
  def issueBatchJob(self, jobDesc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
@@ -263,7 +282,6 @@ class AbstractBatchSystem(ABC):
263
282
  setOption(option_name, parsing_function=None, check_function=None, default=None, env=None)
264
283
  returning nothing, used to update run configuration as a side effect.
265
284
  """
266
- pass
267
285
 
268
286
  def getWorkerContexts(self) -> List[ContextManager[Any]]:
269
287
  """
@@ -372,7 +390,7 @@ class BatchSystemSupport(AbstractBatchSystem):
372
390
  :param name: the environment variable to be set on the worker.
373
391
 
374
392
  :param value: if given, the environment variable given by name will be set to this value.
375
- if None, the variable's current value will be used as the value on the worker
393
+ If None, the variable's current value will be used as the value on the worker
376
394
 
377
395
  :raise RuntimeError: if value is None and the name cannot be found in the environment
378
396
  """
@@ -392,6 +410,7 @@ class BatchSystemSupport(AbstractBatchSystem):
392
410
  # We do in fact send messages to the message bus.
393
411
  self._outbox = message_bus.outbox()
394
412
 
413
+ @memoize
395
414
  def get_batch_logs_dir(self) -> str:
396
415
  """
397
416
  Get the directory where the backing batch system should save its logs.
@@ -404,6 +423,9 @@ class BatchSystemSupport(AbstractBatchSystem):
404
423
  """
405
424
  if self.config.batch_logs_dir:
406
425
  # Use what is specified
426
+ if not os.path.isdir(self.config.batch_logs_dir):
427
+ # But if it doesn't exist, make it exist
428
+ os.makedirs(self.config.batch_logs_dir, exist_ok=True)
407
429
  return self.config.batch_logs_dir
408
430
  # And if nothing is specified use the workDir.
409
431
  return Toil.getToilWorkDir(self.config.workDir)
@@ -430,7 +452,7 @@ class BatchSystemSupport(AbstractBatchSystem):
430
452
  file_name: str = f'toil_{self.config.workflowID}.{toil_job_id}.{cluster_job_id}.{std}.log'
431
453
  logs_dir: str = self.get_batch_logs_dir()
432
454
  return os.path.join(logs_dir, file_name)
433
-
455
+
434
456
  def format_std_out_err_glob(self, toil_job_id: int) -> str:
435
457
  """
436
458
  Get a glob string that will match all file paths generated by format_std_out_err_path for a job.
@@ -438,11 +460,13 @@ class BatchSystemSupport(AbstractBatchSystem):
438
460
  file_glob: str = f'toil_{self.config.workflowID}.{toil_job_id}.*.log'
439
461
  logs_dir: str = self.get_batch_logs_dir()
440
462
  return os.path.join(logs_dir, file_glob)
441
-
463
+
442
464
  @staticmethod
443
465
  def workerCleanup(info: WorkerCleanupInfo) -> None:
444
466
  """
445
- Cleans up the worker node on batch system shutdown. Also see :meth:`supportsWorkerCleanup`.
467
+ Cleans up the worker node on batch system shutdown.
468
+
469
+ Also see :meth:`supportsWorkerCleanup`.
446
470
 
447
471
  :param WorkerCleanupInfo info: A named tuple consisting of all the relevant information
448
472
  for cleaning up the worker.
@@ -498,8 +522,10 @@ class NodeInfo:
498
522
 
499
523
  class AbstractScalableBatchSystem(AbstractBatchSystem):
500
524
  """
501
- A batch system that supports a variable number of worker nodes. Used by :class:`toil.
502
- provisioners.clusterScaler.ClusterScaler` to scale the number of worker nodes in the cluster
525
+ A batch system that supports a variable number of worker nodes.
526
+
527
+ Used by :class:`toil.provisioners.clusterScaler.ClusterScaler`
528
+ to scale the number of worker nodes in the cluster
503
529
  up or down depending on overall load.
504
530
  """
505
531
 
@@ -17,13 +17,13 @@ from abc import ABCMeta, abstractmethod
17
17
  from datetime import datetime
18
18
  from queue import Empty, Queue
19
19
  from threading import Lock, Thread
20
- from typing import Any, Dict, List, Optional, Tuple, Union
20
+ from typing import Dict, List, Optional, Tuple, Union
21
21
 
22
22
  from toil.batchSystems.abstractBatchSystem import (BatchJobExitReason,
23
23
  UpdatedBatchJobInfo)
24
24
  from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
25
- from toil.bus import ExternalBatchIdMessage
26
- from toil.job import JobDescription, AcceleratorRequirement
25
+ from toil.bus import ExternalBatchIdMessage, get_job_kind
26
+ from toil.job import AcceleratorRequirement
27
27
  from toil.lib.misc import CalledProcessErrorStderr
28
28
 
29
29
  logger = logging.getLogger(__name__)
@@ -225,23 +225,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
225
225
  return activity
226
226
 
227
227
  def _handle_job_status(
228
- self, job_id: int, status: Union[int, None], activity: bool
228
+ self, job_id: int, status: Union[int, Tuple[int, Optional[BatchJobExitReason]], None], activity: bool
229
229
  ) -> bool:
230
230
  """
231
231
  Helper method for checkOnJobs to handle job statuses
232
232
  """
233
233
  if status is not None:
234
+ if isinstance(status, int):
235
+ code = status
236
+ reason = None
237
+ else:
238
+ code, reason = status
234
239
  self.updatedJobsQueue.put(
235
240
  UpdatedBatchJobInfo(
236
- jobID=job_id, exitStatus=status, exitReason=None, wallTime=None
237
- )
238
- )
239
- self.forgetJob(job_id)
240
- return True
241
- if status is not None and isinstance(status, BatchJobExitReason):
242
- self.updatedJobsQueue.put(
243
- UpdatedBatchJobInfo(
244
- jobID=job_id, exitStatus=1, exitReason=status, wallTime=None
241
+ jobID=job_id, exitStatus=code, exitReason=reason, wallTime=None
245
242
  )
246
243
  )
247
244
  self.forgetJob(job_id)
@@ -279,9 +276,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
279
276
  logger.error("GridEngine like batch system failure", exc_info=ex)
280
277
  raise
281
278
 
282
- def coalesce_job_exit_codes(self, batch_job_id_list: list) -> list:
279
+ def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
283
280
  """
284
- Returns exit codes for a list of jobs.
281
+ Returns exit codes and possibly exit reasons for a list of jobs, or None if they are running.
285
282
 
286
283
  Called by AbstractGridEngineWorker.checkOnJobs().
287
284
 
@@ -351,16 +348,19 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
351
348
  raise NotImplementedError()
352
349
 
353
350
  @abstractmethod
354
- def getJobExitCode(self, batchJobID):
351
+ def getJobExitCode(self, batchJobID) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
355
352
  """
356
- Returns job exit code or an instance of abstractBatchSystem.BatchJobExitReason.
357
- if something else happened other than the job exiting.
358
- Implementation-specific; called by AbstractGridEngineWorker.checkOnJobs()
353
+ Returns job exit code and possibly an instance of abstractBatchSystem.BatchJobExitReason.
359
354
 
360
- :param string batchjobID: batch system job ID
355
+ Returns None if the job is still running.
356
+
357
+ If the job is not running but the exit code is not available, it
358
+ will be EXIT_STATUS_UNAVAILABLE_VALUE. Implementation-specific;
359
+ called by AbstractGridEngineWorker.checkOnJobs().
361
360
 
362
- :rtype: int|toil.batchSystems.abstractBatchSystem.BatchJobExitReason: exit code int
363
- or BatchJobExitReason if something else happened other than job exiting.
361
+ The exit code will only be 0 if the job affirmatively succeeded.
362
+
363
+ :param string batchjobID: batch system job ID
364
364
  """
365
365
  raise NotImplementedError()
366
366
 
@@ -407,10 +407,10 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
407
407
  else:
408
408
  gpus = jobDesc.accelerators
409
409
 
410
- self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory, jobDesc.command, jobDesc.get_job_kind(),
410
+ self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory, jobDesc.command, get_job_kind(jobDesc.get_names()),
411
411
  job_environment, gpus))
412
412
  logger.debug("Issued the job command: %s with job id: %s and job name %s", jobDesc.command, str(jobID),
413
- jobDesc.get_job_kind())
413
+ get_job_kind(jobDesc.get_names()))
414
414
  return jobID
415
415
 
416
416
  def killBatchJobs(self, jobIDs):
@@ -34,25 +34,25 @@ import tempfile
34
34
  import time
35
35
  import uuid
36
36
  from argparse import ArgumentParser, _ArgumentGroup
37
- from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Union
37
+ from typing import Any, Dict, Iterator, List, Optional, Set, Union
38
38
 
39
39
  from boto.exception import BotoServerError
40
40
 
41
41
  from toil import applianceSelf
42
42
  from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE,
43
43
  BatchJobExitReason,
44
- UpdatedBatchJobInfo,
45
- InsufficientSystemResources)
46
- from toil.batchSystems.options import OptionSetter
44
+ InsufficientSystemResources,
45
+ UpdatedBatchJobInfo)
47
46
  from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
48
47
  from toil.batchSystems.contained_executor import pack_job
49
- from toil.bus import ExternalBatchIdMessage, MessageBus, MessageOutbox
48
+ from toil.batchSystems.options import OptionSetter
49
+ from toil.bus import ExternalBatchIdMessage
50
50
  from toil.common import Config, Toil
51
51
  from toil.job import JobDescription, Requirer
52
- from toil.lib.aws import get_current_aws_region, zone_to_region
52
+ from toil.lib.aws import get_current_aws_region
53
53
  from toil.lib.aws.session import establish_boto3_session
54
- from toil.lib.conversions import b_to_mib, mib_to_b
55
- from toil.lib.misc import slow_down, unix_now_ms, utc_now
54
+ from toil.lib.conversions import b_to_mib
55
+ from toil.lib.misc import slow_down, unix_now_ms
56
56
  from toil.lib.retry import retry
57
57
  from toil.resource import Resource
58
58
 
@@ -559,17 +559,17 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
559
559
 
560
560
  @classmethod
561
561
  def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
562
- parser.add_argument("--awsBatchRegion", dest="aws_batch_region", default=None,
562
+ parser.add_argument("--awsBatchRegion", dest="aws_batch_region", default=None, env_var="TOIL_AWS_REGION",
563
563
  help="The AWS region containing the AWS Batch queue to submit to.")
564
- parser.add_argument("--awsBatchQueue", dest="aws_batch_queue", default=None,
564
+ parser.add_argument("--awsBatchQueue", dest="aws_batch_queue", default=None, env_var="TOIL_AWS_BATCH_QUEUE",
565
565
  help="The name or ARN of the AWS Batch queue to submit to.")
566
- parser.add_argument("--awsBatchJobRoleArn", dest="aws_batch_job_role_arn", default=None,
566
+ parser.add_argument("--awsBatchJobRoleArn", dest="aws_batch_job_role_arn", default=None, env_var="TOIL_AWS_BATCH_JOB_ROLE_ARN",
567
567
  help=("The ARN of an IAM role to run AWS Batch jobs as, so they "
568
568
  "can e.g. access a job store. Must be assumable by "
569
569
  "ecs-tasks.amazonaws.com."))
570
570
 
571
571
  @classmethod
572
572
  def setOptions(cls, setOption: OptionSetter) -> None:
573
- setOption("aws_batch_region", default=None)
574
- setOption("aws_batch_queue", default=None, env=["TOIL_AWS_BATCH_QUEUE"])
575
- setOption("aws_batch_job_role_arn", default=None, env=["TOIL_AWS_BATCH_JOB_ROLE_ARN"])
573
+ setOption("aws_batch_region")
574
+ setOption("aws_batch_queue")
575
+ setOption("aws_batch_job_role_arn")
@@ -69,8 +69,13 @@ class WorkerCleanupContext:
69
69
 
70
70
  def __enter__(self) -> None:
71
71
  # Set up an arena so we know who is the last worker to leave
72
- self.arena = LastProcessStandingArena(Toil.get_toil_coordination_dir(self.workerCleanupInfo.work_dir, self.workerCleanupInfo.coordination_dir),
73
- self.workerCleanupInfo.workflow_id + '-cleanup')
72
+ self.arena = LastProcessStandingArena(
73
+ Toil.get_toil_coordination_dir(
74
+ self.workerCleanupInfo.work_dir,
75
+ self.workerCleanupInfo.coordination_dir
76
+ ),
77
+ Toil.get_workflow_path_component(self.workerCleanupInfo.workflow_id) + "-cleanup"
78
+ )
74
79
  logger.debug('Entering cleanup arena')
75
80
  self.arena.enter()
76
81
  logger.debug('Cleanup arena entered')
@@ -90,4 +95,3 @@ class WorkerCleanupContext:
90
95
  # Now the coordination_dir is allowed to no longer exist on the node.
91
96
  logger.debug('Cleanup arena left')
92
97
 
93
-
@@ -14,7 +14,7 @@
14
14
  """
15
15
  Executor for running inside a container.
16
16
 
17
- Useful for Kubernetes and TES batch systems.
17
+ Useful for Kubernetes batch system and TES batch system plugin.
18
18
  """
19
19
  import base64
20
20
  import logging
@@ -39,10 +39,10 @@ def pack_job(job_desc: JobDescription, user_script: Optional[Resource] = None, e
39
39
  :param job_desc: Job description for the job to run.
40
40
  :param user_script: User script that will be loaded before the job is run.
41
41
  :param environment: Environment variable dict that will be applied before
42
- the job is run.
42
+ the job is run.
43
43
 
44
44
  :returns: Command to run the job, as an argument list that can be run
45
- inside the Toil appliance container.
45
+ inside the Toil appliance container.
46
46
  """
47
47
  # Make a job dict to send to the executor.
48
48
  # TODO: Factor out executor setup from here and Kubernetes and TES
@@ -24,7 +24,6 @@ import htcondor
24
24
 
25
25
  from toil.batchSystems.abstractGridEngineBatchSystem import \
26
26
  AbstractGridEngineBatchSystem
27
-
28
27
  from toil.job import AcceleratorRequirement
29
28
  from toil.lib.retry import retry
30
29
 
@@ -24,22 +24,22 @@ import datetime
24
24
  import logging
25
25
  import math
26
26
  import os
27
- from queue import Empty, Queue
28
27
  import string
29
28
  import sys
30
29
  import tempfile
31
- from threading import Event, Thread, Condition, RLock
32
30
  import time
33
31
  import uuid
34
32
  from argparse import ArgumentParser, _ArgumentGroup
33
+ from queue import Empty, Queue
34
+ from threading import Condition, Event, RLock, Thread
35
35
  from typing import (Any,
36
36
  Callable,
37
37
  Dict,
38
38
  Iterator,
39
39
  List,
40
- Set,
41
40
  Literal,
42
41
  Optional,
42
+ Set,
43
43
  Tuple,
44
44
  Type,
45
45
  TypeVar,
@@ -104,7 +104,8 @@ from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE
104
104
  from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
105
105
  from toil.batchSystems.contained_executor import pack_job
106
106
  from toil.batchSystems.options import OptionSetter
107
- from toil.common import Config, Toil, SYS_MAX_SIZE
107
+ from toil.common import Config, Toil
108
+ from toil.options.common import SYS_MAX_SIZE
108
109
  from toil.job import JobDescription, Requirer
109
110
  from toil.lib.conversions import human2bytes
110
111
  from toil.lib.misc import get_user_name, slow_down, utc_now
@@ -152,6 +153,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
152
153
  super().__init__(config, maxCores, maxMemory, maxDisk)
153
154
 
154
155
  # Re-type the config to make sure it has all the fields we need.
156
+ # This convinces MyPy we really do have this type.
155
157
  assert isinstance(config, KubernetesBatchSystem.KubernetesConfig)
156
158
 
157
159
  # Turn down log level for Kubernetes modules and dependencies.
@@ -167,26 +169,26 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
167
169
  self._apis: KubernetesBatchSystem._ApiStorageDict = {}
168
170
 
169
171
  # Get our namespace (and our Kubernetes credentials to make sure they exist)
170
- self.namespace = self._api('namespace')
172
+ self.namespace: str = self._api('namespace')
171
173
 
172
174
  # Decide if we are going to mount a Kubernetes host path as the Toil
173
175
  # work dir in the workers, for shared caching.
174
- self.host_path = config.kubernetes_host_path
176
+ self.host_path: Optional[str] = config.kubernetes_host_path
175
177
 
176
178
  # Get the service account name to use, if any.
177
- self.service_account = config.kubernetes_service_account
179
+ self.service_account: Optional[str] = config.kubernetes_service_account
178
180
 
179
181
  # Get how long we should wait for a pod that lands on a node to
180
182
  # actually start.
181
- self.pod_timeout = config.kubernetes_pod_timeout
183
+ self.pod_timeout: float = config.kubernetes_pod_timeout
182
184
 
183
185
  # Get the username to mark jobs with
184
- username = config.kubernetes_owner
186
+ username = config.kubernetes_owner or self.get_default_kubernetes_owner()
185
187
  # And a unique ID for the run
186
188
  self.unique_id = uuid.uuid4()
187
189
 
188
190
  # Create a prefix for jobs, starting with our username
189
- self.job_prefix = f'{username}-toil-{self.unique_id}-'
191
+ self.job_prefix: str = f'{username}-toil-{self.unique_id}-'
190
192
  # Instead of letting Kubernetes assign unique job names, we assign our
191
193
  # own based on a numerical job ID. This functionality is managed by the
192
194
  # BatchSystemLocalSupport.
@@ -199,17 +201,17 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
199
201
  # conformance tests. To work around this, we tag all our jobs with an
200
202
  # explicit TTL that is long enough that we're sure we can deal with all
201
203
  # the finished jobs before they expire.
202
- self.finished_job_ttl = 3600 # seconds
204
+ self.finished_job_ttl: int = 3600 # seconds
203
205
 
204
206
  # Here is where we will store the user script resource object if we get one.
205
207
  self.user_script: Optional[Resource] = None
206
208
 
207
209
  # Get the image to deploy from Toil's configuration
208
- self.docker_image = applianceSelf()
210
+ self.docker_image: str = applianceSelf()
209
211
 
210
212
  # Try and guess what Toil work dir the workers will use.
211
213
  # We need to be able to provision (possibly shared) space there.
212
- self.worker_work_dir = Toil.getToilWorkDir(config.workDir)
214
+ self.worker_work_dir: str = Toil.getToilWorkDir(config.workDir)
213
215
  if (config.workDir is None and
214
216
  os.getenv('TOIL_WORKDIR') is None and
215
217
  self.worker_work_dir == tempfile.gettempdir()):
@@ -226,17 +228,17 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
226
228
  self.environment['TMPDIR'] = '/var/tmp'
227
229
 
228
230
  # Get the name of the AWS secret, if any, to mount in containers.
229
- self.aws_secret_name = os.environ.get("TOIL_AWS_SECRET_NAME", None)
231
+ self.aws_secret_name: Optional[str] = os.environ.get("TOIL_AWS_SECRET_NAME", None)
230
232
 
231
233
  # Set this to True to enable the experimental wait-for-job-update code
232
- self.enable_watching = os.environ.get("KUBE_WATCH_ENABLED", False)
234
+ self.enable_watching: bool = os.environ.get("KUBE_WATCH_ENABLED", False)
233
235
 
234
236
  # This will be a label to select all our jobs.
235
- self.run_id = f'toil-{self.unique_id}'
237
+ self.run_id: str = f'toil-{self.unique_id}'
236
238
 
237
239
  # Keep track of available resources.
238
240
  maxMillicores = int(SYS_MAX_SIZE if self.maxCores == SYS_MAX_SIZE else self.maxCores * 1000)
239
- self.resource_sources = [
241
+ self.resource_sources: List[ResourcePool] = [
240
242
  # A pool representing available job slots
241
243
  ResourcePool(self.config.max_jobs, 'job slots'),
242
244
  # A pool representing available CPU in units of millicores (1 CPU
@@ -261,16 +263,16 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
261
263
  self._killed_queue_jobs: Set[int] = set()
262
264
 
263
265
  # We use this event to signal shutdown
264
- self._shutting_down = Event()
266
+ self._shutting_down: Event = Event()
265
267
 
266
268
  # A lock to protect critical regions when working with queued jobs.
267
- self._mutex = RLock()
269
+ self._mutex: RLock = RLock()
268
270
 
269
271
  # A condition set to true when there is more work to do. e.g.: new job
270
272
  # in the queue or any resource becomes available.
271
- self._work_available = Condition(lock=self._mutex)
273
+ self._work_available: Condition = Condition(lock=self._mutex)
272
274
 
273
- self.schedulingThread = Thread(target=self._scheduler, daemon=True)
275
+ self.schedulingThread: Thread = Thread(target=self._scheduler, daemon=True)
274
276
  self.schedulingThread.start()
275
277
 
276
278
  def _pretty_print(self, kubernetes_object: Any) -> str:
@@ -1864,24 +1866,25 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
1864
1866
 
1865
1867
  @classmethod
1866
1868
  def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
1867
- parser.add_argument("--kubernetesHostPath", dest="kubernetes_host_path", default=None,
1869
+ parser.add_argument("--kubernetesHostPath", dest="kubernetes_host_path", default=None, env_var="TOIL_KUBERNETES_HOST_PATH",
1868
1870
  help="Path on Kubernetes hosts to use as shared inter-pod temp directory. "
1869
1871
  "(default: %(default)s)")
1870
- parser.add_argument("--kubernetesOwner", dest="kubernetes_owner", default=cls.get_default_kubernetes_owner(),
1871
- help="Username to mark Kubernetes jobs with. "
1872
- "(default: %(default)s)")
1873
- parser.add_argument("--kubernetesServiceAccount", dest="kubernetes_service_account", default=None,
1872
+ parser.add_argument("--kubernetesOwner", dest="kubernetes_owner", default=None, env_var="TOIL_KUBERNETES_OWNER",
1873
+ help=f"Username to mark Kubernetes jobs with. If the provided value is None, the value will "
1874
+ f"be generated at runtime. "
1875
+ f"(Generated default: {cls.get_default_kubernetes_owner()})")
1876
+ parser.add_argument("--kubernetesServiceAccount", dest="kubernetes_service_account", default=None, env_var="TOIL_KUBERNETES_SERVICE_ACCOUNT",
1874
1877
  help="Service account to run jobs as. "
1875
1878
  "(default: %(default)s)")
1876
- parser.add_argument("--kubernetesPodTimeout", dest="kubernetes_pod_timeout", default=120,
1879
+ parser.add_argument("--kubernetesPodTimeout", dest="kubernetes_pod_timeout", default=120, env_var="TOIL_KUBERNETES_POD_TIMEOUT", type=float,
1877
1880
  help="Seconds to wait for a scheduled Kubernetes pod to start running. "
1878
1881
  "(default: %(default)s)")
1879
1882
 
1880
1883
  OptionType = TypeVar('OptionType')
1881
1884
  @classmethod
1882
1885
  def setOptions(cls, setOption: OptionSetter) -> None:
1883
- setOption("kubernetes_host_path", default=None, env=['TOIL_KUBERNETES_HOST_PATH'])
1884
- setOption("kubernetes_owner", default=cls.get_default_kubernetes_owner(), env=['TOIL_KUBERNETES_OWNER'])
1885
- setOption("kubernetes_service_account", default=None, env=['TOIL_KUBERNETES_SERVICE_ACCOUNT'])
1886
- setOption("kubernetes_pod_timeout", default=120, env=['TOIL_KUBERNETES_POD_TIMEOUT'])
1886
+ setOption("kubernetes_host_path")
1887
+ setOption("kubernetes_owner")
1888
+ setOption("kubernetes_service_account",)
1889
+ setOption("kubernetes_pod_timeout")
1887
1890
 
@@ -19,6 +19,7 @@ from toil.batchSystems.abstractBatchSystem import (BatchSystemSupport,
19
19
  from toil.batchSystems.singleMachine import SingleMachineBatchSystem
20
20
  from toil.common import Config
21
21
  from toil.job import JobDescription
22
+ from toil.lib.threading import cpu_count
22
23
 
23
24
  logger = logging.getLogger(__name__)
24
25
 
@@ -28,8 +29,9 @@ class BatchSystemLocalSupport(BatchSystemSupport):
28
29
 
29
30
  def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int) -> None:
30
31
  super().__init__(config, maxCores, maxMemory, maxDisk)
32
+ max_local_jobs = config.max_local_jobs if config.max_local_jobs is not None else cpu_count()
31
33
  self.localBatch: SingleMachineBatchSystem = SingleMachineBatchSystem(
32
- config, maxCores, maxMemory, maxDisk, max_jobs=config.max_local_jobs
34
+ config, maxCores, maxMemory, maxDisk, max_jobs=max_local_jobs
33
35
  )
34
36
 
35
37
  def handleLocalJob(self, jobDesc: JobDescription) -> Optional[int]: