toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +41 -17
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +4 -5
  7. toil/batchSystems/gridengine.py +1 -1
  8. toil/batchSystems/htcondor.py +5 -5
  9. toil/batchSystems/kubernetes.py +25 -11
  10. toil/batchSystems/local_support.py +3 -3
  11. toil/batchSystems/lsf.py +9 -9
  12. toil/batchSystems/mesos/batchSystem.py +4 -4
  13. toil/batchSystems/mesos/executor.py +3 -2
  14. toil/batchSystems/options.py +9 -0
  15. toil/batchSystems/singleMachine.py +11 -10
  16. toil/batchSystems/slurm.py +129 -16
  17. toil/batchSystems/torque.py +1 -1
  18. toil/bus.py +45 -3
  19. toil/common.py +56 -31
  20. toil/cwl/cwltoil.py +442 -371
  21. toil/deferred.py +1 -1
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/abstractFileStore.py +69 -20
  24. toil/fileStores/cachingFileStore.py +6 -22
  25. toil/fileStores/nonCachingFileStore.py +6 -15
  26. toil/job.py +270 -86
  27. toil/jobStores/abstractJobStore.py +37 -31
  28. toil/jobStores/aws/jobStore.py +280 -218
  29. toil/jobStores/aws/utils.py +60 -31
  30. toil/jobStores/conftest.py +2 -2
  31. toil/jobStores/fileJobStore.py +3 -3
  32. toil/jobStores/googleJobStore.py +3 -4
  33. toil/leader.py +89 -38
  34. toil/lib/aws/__init__.py +26 -10
  35. toil/lib/aws/iam.py +2 -2
  36. toil/lib/aws/session.py +62 -22
  37. toil/lib/aws/utils.py +73 -37
  38. toil/lib/conversions.py +24 -1
  39. toil/lib/ec2.py +118 -69
  40. toil/lib/expando.py +1 -1
  41. toil/lib/generatedEC2Lists.py +8 -8
  42. toil/lib/io.py +42 -4
  43. toil/lib/misc.py +1 -3
  44. toil/lib/resources.py +57 -16
  45. toil/lib/retry.py +12 -5
  46. toil/lib/threading.py +29 -14
  47. toil/lib/throttle.py +1 -1
  48. toil/options/common.py +31 -30
  49. toil/options/wdl.py +5 -0
  50. toil/provisioners/__init__.py +9 -3
  51. toil/provisioners/abstractProvisioner.py +12 -2
  52. toil/provisioners/aws/__init__.py +20 -15
  53. toil/provisioners/aws/awsProvisioner.py +406 -329
  54. toil/provisioners/gceProvisioner.py +2 -2
  55. toil/provisioners/node.py +13 -5
  56. toil/server/app.py +1 -1
  57. toil/statsAndLogging.py +93 -23
  58. toil/test/__init__.py +27 -12
  59. toil/test/batchSystems/batchSystemTest.py +40 -33
  60. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  61. toil/test/batchSystems/test_slurm.py +22 -7
  62. toil/test/cactus/__init__.py +0 -0
  63. toil/test/cactus/test_cactus_integration.py +58 -0
  64. toil/test/cwl/cwlTest.py +245 -236
  65. toil/test/cwl/seqtk_seq.cwl +1 -1
  66. toil/test/docs/scriptsTest.py +11 -14
  67. toil/test/jobStores/jobStoreTest.py +40 -54
  68. toil/test/lib/aws/test_iam.py +2 -2
  69. toil/test/lib/test_ec2.py +1 -1
  70. toil/test/options/__init__.py +13 -0
  71. toil/test/options/options.py +37 -0
  72. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  73. toil/test/provisioners/clusterTest.py +99 -16
  74. toil/test/server/serverTest.py +2 -2
  75. toil/test/src/autoDeploymentTest.py +1 -1
  76. toil/test/src/dockerCheckTest.py +2 -1
  77. toil/test/src/environmentTest.py +125 -0
  78. toil/test/src/fileStoreTest.py +1 -1
  79. toil/test/src/jobDescriptionTest.py +18 -8
  80. toil/test/src/jobTest.py +1 -1
  81. toil/test/src/realtimeLoggerTest.py +4 -0
  82. toil/test/src/workerTest.py +52 -19
  83. toil/test/utils/toilDebugTest.py +62 -4
  84. toil/test/utils/utilsTest.py +23 -21
  85. toil/test/wdl/wdltoil_test.py +49 -21
  86. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  87. toil/toilState.py +68 -9
  88. toil/utils/toilDebugFile.py +1 -1
  89. toil/utils/toilDebugJob.py +153 -26
  90. toil/utils/toilLaunchCluster.py +12 -2
  91. toil/utils/toilRsyncCluster.py +7 -2
  92. toil/utils/toilSshCluster.py +7 -3
  93. toil/utils/toilStats.py +310 -266
  94. toil/utils/toilStatus.py +98 -52
  95. toil/version.py +11 -11
  96. toil/wdl/wdltoil.py +644 -225
  97. toil/worker.py +125 -83
  98. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  99. toil-7.0.0.dist-info/METADATA +158 -0
  100. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
  101. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  102. toil-6.1.0a1.dist-info/METADATA +0 -125
  103. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  104. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/deferred.py CHANGED
@@ -178,7 +178,7 @@ class DeferredFunctionManager:
 
         try:
             def defer(deferredFunction):
-                # Just serialize defered functions one after the other.
+                # Just serialize deferred functions one after the other.
                 # If serializing later ones fails, eariler ones will still be intact.
                 # We trust dill to protect sufficiently against partial reads later.
                 logger.debug("Deferring function %s" % repr(deferredFunction))
toil/exceptions.py CHANGED
@@ -36,7 +36,7 @@ class FailedJobsException(Exception):
             for job_desc in failed_jobs:
                 if job_desc.logJobStoreFileID:
                     with job_desc.getLogFileHandle(job_store) as f:
-                        self.msg += "\n" + StatsAndLogging.formatLogStream(f, job_desc)
+                        self.msg += "\n" + StatsAndLogging.formatLogStream(f, f'Log from job "{job_desc}"')
         # catch failures to prepare more complex details and only return the basics
         except Exception:
             logger.exception("Exception when compiling information about failed jobs")
toil/fileStores/abstractFileStore.py CHANGED
@@ -37,11 +37,12 @@ from typing import (IO,
 
 import dill
 
-from toil.common import Toil, cacheDirName
+from toil.common import Toil, cacheDirName, getDirSizeRecursively
 from toil.fileStores import FileID
-from toil.job import Job, JobDescription
+from toil.job import Job, JobDescription, DebugStoppingPointReached
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.compatibility import deprecated
+from toil.lib.conversions import bytes2human
 from toil.lib.io import WriteWatchingStream, mkdtemp
 
 logger = logging.getLogger(__name__)
@@ -112,11 +113,10 @@ class AbstractFileStore(ABC):
         assert self.jobStore.config.workflowID is not None
         self.workflow_dir: str = Toil.getLocalWorkflowDir(self.jobStore.config.workflowID, self.jobStore.config.workDir)
         self.coordination_dir: str = Toil.get_local_workflow_coordination_dir(self.jobStore.config.workflowID, self.jobStore.config.workDir, self.jobStore.config.coordination_dir)
-        self.jobName: str = (
-            self.jobDesc.command.split()[1] if self.jobDesc.command else ""
-        )
+        self.jobName: str = str(self.jobDesc)
         self.waitForPreviousCommit = waitForPreviousCommit
-        self.loggingMessages: List[Dict[str, Union[int, str]]] = []
+        self.logging_messages: List[Dict[str, Union[int, str]]] = []
+        self.logging_user_streams: List[dict[str, str]] = []
         # Records file IDs of files deleted during the current job. Doesn't get
         # committed back until the job is completely successful, because if the
         # job is re-run it will need to be able to re-delete these files.
@@ -125,6 +125,8 @@ class AbstractFileStore(ABC):
         # Holds records of file ID, or file ID and local path, for reporting
         # the accessed files of failed jobs.
         self._accessLog: List[Tuple[str, ...]] = []
+        # Holds total bytes of observed disk usage for the last job run under open()
+        self._job_disk_used: Optional[int] = None
 
     @staticmethod
     def createFileStore(
@@ -187,15 +189,43 @@ class AbstractFileStore(ABC):
 
         :param job: The job instance of the toil job to run.
         """
-        failed = True
+        job_requested_disk = job.disk
         try:
             yield
             failed = False
-        finally:
-            # Do a finally instead of an except/raise because we don't want
-            # to appear as "another exception occurred" in the stack trace.
-            if failed:
+        except BaseException as e:
+            if isinstance(e, DebugStoppingPointReached):
+                self._dumpAccessLogs(job_type="Debugged", log_level=logging.INFO)
+            else:
                 self._dumpAccessLogs()
+            raise
+        finally:
+            # See how much disk space is used at the end of the job.
+            # Not a real peak disk usage, but close enough to be useful for warning the user.
+            self._job_disk_used = getDirSizeRecursively(self.localTempDir)
+
+            # Report disk usage
+            percent: float = 0.0
+            if job_requested_disk and job_requested_disk > 0:
+                percent = float(self._job_disk_used) / job_requested_disk * 100
+            disk_usage: str = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(self._job_disk_used)}B [{self._job_disk_used}B] used, "
+                               f"{bytes2human(job_requested_disk)}B [{job_requested_disk}B] requested).")
+            if self._job_disk_used > job_requested_disk:
+                self.log_to_leader("Job used more disk than requested. For CWL, consider increasing the outdirMin "
+                                   f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
+                                   level=logging.WARNING)
+            else:
+                self.log_to_leader(disk_usage, level=logging.DEBUG)
+
+    def get_disk_usage(self) -> Optional[int]:
+        """
+        Get the number of bytes of disk used by the last job run under open().
+
+        Disk usage is measured at the end of the job.
+        TODO: Sample periodically and record peak usage.
+        """
+        return self._job_disk_used
+
 
     # Functions related to temp files and directories
     def getLocalTempDir(self) -> str:
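The new finally block reports disk use whether or not the job failed. The reporting rule it implements can be sketched standalone; report_disk_usage below is an illustrative helper, not part of Toil, and the byte formatting is simplified from bytes2human:

    import logging

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger("disk-report")

    def report_disk_usage(job_name: str, used: int, requested: int) -> None:
        # Same rule as the hunk above: warn when observed usage exceeds
        # the request, otherwise log the figures at debug level.
        percent = float(used) / requested * 100 if requested > 0 else 0.0
        message = f"Job {job_name} used {percent:.2f}% disk ({used}B used, {requested}B requested)."
        if used > requested:
            logger.warning("Job used more disk than requested. %s", message)
        else:
            logger.debug(message)

    report_disk_usage("example", used=3 * 2**30, requested=2 * 2**30)  # 150.00%: warns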
@@ -331,14 +361,16 @@ class AbstractFileStore(ABC):
 
         yield wrappedStream, fileID
 
-    def _dumpAccessLogs(self) -> None:
+    def _dumpAccessLogs(self, job_type: str = "Failed", log_level: int = logging.WARNING) -> None:
         """
-        When something goes wrong, log a report.
+        Log a report of the files accessed.
 
         Includes the files that were accessed while the file store was open.
+
+        :param job_type: Adjective to describe the job in the report.
         """
         if len(self._accessLog) > 0:
-            logger.warning('Failed job accessed files:')
+            logger.log(log_level, '%s job accessed files:', job_type)
 
             for item in self._accessLog:
                 # For each access record
@@ -347,14 +379,14 @@ class AbstractFileStore(ABC):
                 file_id, dest_path = item
                 if os.path.exists(dest_path):
                     if os.path.islink(dest_path):
-                        logger.warning('Symlinked file \'%s\' to path \'%s\'', file_id, dest_path)
+                        logger.log(log_level, 'Symlinked file \'%s\' to path \'%s\'', file_id, dest_path)
                     else:
-                        logger.warning('Downloaded file \'%s\' to path \'%s\'', file_id, dest_path)
+                        logger.log(log_level, 'Downloaded file \'%s\' to path \'%s\'', file_id, dest_path)
                 else:
-                    logger.warning('Downloaded file \'%s\' to path \'%s\' (gone!)', file_id, dest_path)
+                    logger.log(log_level, 'Downloaded file \'%s\' to path \'%s\' (gone!)', file_id, dest_path)
             else:
                 # Otherwise dump without the name
-                logger.warning('Streamed file \'%s\'', *item)
+                logger.log(log_level, 'Streamed file \'%s\'', *item)
 
     def logAccess(
         self, fileStoreID: Union[FileID, str], destination: Union[str, None] = None
@@ -611,13 +643,30 @@ class AbstractFileStore(ABC):
         :param level: The logging level.
         """
         logger.log(level=level, msg=("LOG-TO-MASTER: " + text))
-        self.loggingMessages.append(dict(text=text, level=level))
+        self.logging_messages.append(dict(text=text, level=level))
 
 
     @deprecated(new_function_name='export_file')
     def logToMaster(self, text: str, level: int = logging.INFO) -> None:
         self.log_to_leader(text, level)
-
+
+    def log_user_stream(self, name: str, stream: IO[bytes]) -> None:
+        """
+        Send a stream of UTF-8 text to the leader as a named log stream.
+
+        Useful for things like the error logs of Docker containers. The leader
+        will show it to the user or organize it appropriately for user-level
+        log information.
+
+        :param name: A hierarchical, .-delimited string.
+        :param stream: A stream of encoded text. Encoding errors will be
+                       tolerated.
+        """
+
+        # Read the whole stream into memory
+        steam_data = stream.read().decode('utf-8', errors='replace')
+        # And remember it for the worker to fish out
+        self.logging_user_streams.append(dict(name=name, text=steam_data))
 
     # Functions run after the completion of the job.
     @abstractmethod
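The new log_user_stream API takes any readable bytes stream. One plausible use, with invented names and contents, is forwarding a tool's captured stderr from inside a job's run method:

    import io
    from toil.job import Job

    class AlignJob(Job):
        # Hypothetical job: the dotted stream name and captured bytes are
        # invented for illustration.
        def run(self, file_store):
            captured_stderr = io.BytesIO(b"tool: warning: low memory\n")
            # Ship the bytes to the leader as a named, user-facing log stream.
            file_store.log_user_stream("workflow.align_step.docker.stderr", captured_stderr)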
toil/fileStores/cachingFileStore.py CHANGED
@@ -32,13 +32,12 @@ from typing import (Any,
                     Sequence,
                     Tuple)
 
-from toil.common import cacheDirName, getDirSizeRecursively, getFileSystemSize
+from toil.common import cacheDirName, getFileSystemSize
 from toil.fileStores import FileID
 from toil.fileStores.abstractFileStore import AbstractFileStore
 from toil.job import Job, JobDescription
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.compatibility import deprecated
-from toil.lib.conversions import bytes2human
 from toil.lib.io import (atomic_copy,
                          atomic_copyobj,
                          make_public_dir,
@@ -1037,11 +1036,12 @@ class CachingFileStore(AbstractFileStore):
         # Create a working directory for the job
         startingDir = os.getcwd()
         # Move self.localTempDir from the worker directory set up in __init__ to a per-job directory.
-        self.localTempDir = make_public_dir(in_directory=self.localTempDir)
+        self.localTempDir = make_public_dir(self.localTempDir, suggested_name="job")
         # Check the status of all jobs on this node. If there are jobs that started and died before
         # cleaning up their presence from the database, clean them up ourselves.
         self._removeDeadJobs(self.coordination_dir, self.con)
-        # Get the requirements for the job.
+        # Get the disk requirement for the job, which we will use to know if we
+        # have filled the cache or not.
         self.jobDiskBytes = job.disk
 
         logger.debug('Actually running job (%s) with ID (%s) which wants %d of our %d bytes.',
@@ -1055,22 +1055,6 @@ class CachingFileStore(AbstractFileStore):
             with super().open(job):
                 yield
         finally:
-            # See how much disk space is used at the end of the job.
-            # Not a real peak disk usage, but close enough to be useful for warning the user.
-            # TODO: Push this logic into the abstract file store
-            disk: int = getDirSizeRecursively(self.localTempDir)
-            percent: float = 0.0
-            if self.jobDiskBytes and self.jobDiskBytes > 0:
-                percent = float(disk) / self.jobDiskBytes * 100
-            disk_usage: str = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(disk)}B [{disk}B] used, "
-                               f"{bytes2human(self.jobDiskBytes)}B [{self.jobDiskBytes}B] requested).")
-            if disk > self.jobDiskBytes:
-                self.log_to_leader("Job used more disk than requested. For CWL, consider increasing the outdirMin "
-                                   f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
-                                   level=logging.WARNING)
-            else:
-                self.log_to_leader(disk_usage, level=logging.DEBUG)
-
             # Go back up to the per-worker local temp directory.
             os.chdir(startingDir)
             self.cleanupInProgress = True
@@ -1095,7 +1079,7 @@ class CachingFileStore(AbstractFileStore):
         # Create an empty file to get an ID.
         # Make sure to pass along the file basename.
         # TODO: this empty file could leak if we die now...
-        fileID = self.jobStore.getEmptyFileStoreID(creatorID, cleanup, os.path.basename(localFileName))
+        fileID = self.jobStore.get_empty_file_store_id(creatorID, cleanup, os.path.basename(localFileName))
         # Work out who we are
         with self.as_process() as me:
 
@@ -1875,7 +1859,7 @@ class CachingFileStore(AbstractFileStore):
         logger.debug('Starting commit of %s forked from %s', state_to_commit, self.jobDesc)
         # Make sure the deep copy isn't summoning ghosts of old job
         # versions. It must be as new or newer at this point.
-        self.jobDesc.check_new_version(state_to_commit)
+        self.jobDesc.assert_is_not_newer_than(state_to_commit)
 
         # Bump the original's version since saving will do that too and we
         # don't want duplicate versions.
toil/fileStores/nonCachingFileStore.py CHANGED
@@ -35,13 +35,12 @@ from typing import (IO,
 
 import dill
 
-from toil.common import getDirSizeRecursively, getFileSystemSize
+from toil.common import getFileSystemSize
 from toil.fileStores import FileID
 from toil.fileStores.abstractFileStore import AbstractFileStore
 from toil.job import Job, JobDescription
 from toil.jobStores.abstractJobStore import AbstractJobStore
 from toil.lib.compatibility import deprecated
-from toil.lib.conversions import bytes2human
 from toil.lib.io import make_public_dir, robust_rmtree
 from toil.lib.retry import ErrorCondition, retry
 from toil.lib.threading import get_process_name, process_name_exists
@@ -102,9 +101,8 @@ class NonCachingFileStore(AbstractFileStore):
 
     @contextmanager
    def open(self, job: Job) -> Generator[None, None, None]:
-        jobReqs = job.disk
         startingDir = os.getcwd()
-        self.localTempDir: str = make_public_dir(in_directory=self.localTempDir)
+        self.localTempDir: str = make_public_dir(self.localTempDir, suggested_name="job")
         self._removeDeadJobs(self.coordination_dir)
         self.jobStateFile = self._createJobStateFile()
         self.check_for_state_corruption()
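Both file stores now call make_public_dir with the parent directory passed positionally plus a suggested_name hint, so the per-job temp directory gets a predictable "job" name instead of a purely random one. A sketch of the new call shape; the parent path is invented, and the fallback behavior when the suggested name is already taken is not shown in this diff:

    import os
    from toil.lib.io import make_public_dir

    parent = "/tmp/toil-demo"
    os.makedirs(parent, exist_ok=True)
    # New call shape per the hunks above: positional parent plus a name hint.
    job_dir = make_public_dir(parent, suggested_name="job")
    print(job_dir)  # e.g. /tmp/toil-demo/job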
@@ -116,16 +114,6 @@ class NonCachingFileStore(AbstractFileStore):
             with super().open(job):
                 yield
         finally:
-            disk = getDirSizeRecursively(self.localTempDir)
-            percent = float(disk) / jobReqs * 100 if jobReqs > 0 else 0.0
-            disk_usage = (f"Job {self.jobName} used {percent:.2f}% disk ({bytes2human(disk)}B [{disk}B] used, "
-                          f"{bytes2human(jobReqs)}B [{jobReqs}B] requested).")
-            if disk > jobReqs:
-                self.log_to_leader("Job used more disk than requested. For CWL, consider increasing the outdirMin "
-                                   f"requirement, otherwise, consider increasing the disk requirement. {disk_usage}",
-                                   level=logging.WARNING)
-            else:
-                self.log_to_leader(disk_usage, level=logging.DEBUG)
             os.chdir(startingDir)
             # Finally delete the job from the worker
             self.check_for_state_corruption()
@@ -362,7 +350,10 @@ class NonCachingFileStore(AbstractFileStore):
         jobState = {'jobProcessName': get_process_name(self.coordination_dir),
                     'jobName': self.jobName,
                     'jobDir': self.localTempDir}
-        (fd, jobStateFile) = tempfile.mkstemp(suffix='.jobState.tmp', dir=self.coordination_dir)
+        try:
+            (fd, jobStateFile) = tempfile.mkstemp(suffix='.jobState.tmp', dir=self.coordination_dir)
+        except Exception as e:
+            raise RuntimeError("Could not make state file in " + self.coordination_dir) from e
         with open(fd, 'wb') as fH:
             # Write data
             dill.dump(jobState, fH)
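The wrapped mkstemp call uses exception chaining so the error names the coordination directory that could not be written. The same pattern in isolation; make_state_file is an illustrative name, not Toil's:

    import tempfile

    def make_state_file(coordination_dir: str) -> tuple[int, str]:
        # Chain the original OSError onto a RuntimeError that names the
        # directory, so the traceback carries both pieces of context.
        try:
            return tempfile.mkstemp(suffix='.jobState.tmp', dir=coordination_dir)
        except Exception as e:
            raise RuntimeError("Could not make state file in " + coordination_dir) from e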