toil 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. toil/__init__.py +18 -13
  2. toil/batchSystems/abstractBatchSystem.py +39 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +24 -24
  4. toil/batchSystems/awsBatch.py +14 -14
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +3 -3
  7. toil/batchSystems/htcondor.py +0 -1
  8. toil/batchSystems/kubernetes.py +34 -31
  9. toil/batchSystems/local_support.py +3 -1
  10. toil/batchSystems/lsf.py +7 -7
  11. toil/batchSystems/mesos/batchSystem.py +7 -7
  12. toil/batchSystems/options.py +32 -83
  13. toil/batchSystems/registry.py +104 -23
  14. toil/batchSystems/singleMachine.py +16 -13
  15. toil/batchSystems/slurm.py +87 -16
  16. toil/batchSystems/torque.py +0 -1
  17. toil/bus.py +44 -8
  18. toil/common.py +544 -753
  19. toil/cwl/__init__.py +28 -32
  20. toil/cwl/cwltoil.py +595 -574
  21. toil/cwl/utils.py +55 -10
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/__init__.py +2 -2
  24. toil/fileStores/abstractFileStore.py +88 -14
  25. toil/fileStores/cachingFileStore.py +610 -549
  26. toil/fileStores/nonCachingFileStore.py +46 -22
  27. toil/job.py +182 -101
  28. toil/jobStores/abstractJobStore.py +161 -95
  29. toil/jobStores/aws/jobStore.py +23 -9
  30. toil/jobStores/aws/utils.py +6 -6
  31. toil/jobStores/fileJobStore.py +116 -18
  32. toil/jobStores/googleJobStore.py +16 -7
  33. toil/jobStores/utils.py +5 -6
  34. toil/leader.py +87 -56
  35. toil/lib/accelerators.py +10 -5
  36. toil/lib/aws/__init__.py +3 -14
  37. toil/lib/aws/ami.py +22 -9
  38. toil/lib/aws/iam.py +21 -13
  39. toil/lib/aws/session.py +2 -16
  40. toil/lib/aws/utils.py +4 -5
  41. toil/lib/compatibility.py +1 -1
  42. toil/lib/conversions.py +26 -3
  43. toil/lib/docker.py +22 -23
  44. toil/lib/ec2.py +10 -6
  45. toil/lib/ec2nodes.py +106 -100
  46. toil/lib/encryption/_nacl.py +2 -1
  47. toil/lib/generatedEC2Lists.py +325 -18
  48. toil/lib/io.py +49 -2
  49. toil/lib/misc.py +1 -1
  50. toil/lib/resources.py +9 -2
  51. toil/lib/threading.py +101 -38
  52. toil/options/common.py +736 -0
  53. toil/options/cwl.py +336 -0
  54. toil/options/wdl.py +37 -0
  55. toil/provisioners/abstractProvisioner.py +9 -4
  56. toil/provisioners/aws/__init__.py +3 -6
  57. toil/provisioners/aws/awsProvisioner.py +6 -0
  58. toil/provisioners/clusterScaler.py +3 -2
  59. toil/provisioners/gceProvisioner.py +2 -2
  60. toil/realtimeLogger.py +2 -1
  61. toil/resource.py +24 -18
  62. toil/server/app.py +2 -3
  63. toil/server/cli/wes_cwl_runner.py +4 -4
  64. toil/server/utils.py +1 -1
  65. toil/server/wes/abstract_backend.py +3 -2
  66. toil/server/wes/amazon_wes_utils.py +5 -4
  67. toil/server/wes/tasks.py +2 -3
  68. toil/server/wes/toil_backend.py +2 -10
  69. toil/server/wsgi_app.py +2 -0
  70. toil/serviceManager.py +12 -10
  71. toil/statsAndLogging.py +41 -9
  72. toil/test/__init__.py +29 -54
  73. toil/test/batchSystems/batchSystemTest.py +11 -111
  74. toil/test/batchSystems/test_slurm.py +24 -8
  75. toil/test/cactus/__init__.py +0 -0
  76. toil/test/cactus/test_cactus_integration.py +58 -0
  77. toil/test/cwl/cwlTest.py +438 -223
  78. toil/test/cwl/glob_dir.cwl +15 -0
  79. toil/test/cwl/preemptible.cwl +21 -0
  80. toil/test/cwl/preemptible_expression.cwl +28 -0
  81. toil/test/cwl/revsort.cwl +1 -1
  82. toil/test/cwl/revsort2.cwl +1 -1
  83. toil/test/docs/scriptsTest.py +2 -3
  84. toil/test/jobStores/jobStoreTest.py +34 -21
  85. toil/test/lib/aws/test_iam.py +4 -14
  86. toil/test/lib/aws/test_utils.py +0 -3
  87. toil/test/lib/dockerTest.py +4 -4
  88. toil/test/lib/test_ec2.py +12 -17
  89. toil/test/mesos/helloWorld.py +4 -5
  90. toil/test/mesos/stress.py +1 -1
  91. toil/test/{wdl/conftest.py → options/__init__.py} +0 -10
  92. toil/test/options/options.py +37 -0
  93. toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
  94. toil/test/provisioners/clusterScalerTest.py +6 -4
  95. toil/test/provisioners/clusterTest.py +23 -11
  96. toil/test/provisioners/gceProvisionerTest.py +0 -6
  97. toil/test/provisioners/restartScript.py +3 -2
  98. toil/test/server/serverTest.py +1 -1
  99. toil/test/sort/restart_sort.py +2 -1
  100. toil/test/sort/sort.py +2 -1
  101. toil/test/sort/sortTest.py +2 -13
  102. toil/test/src/autoDeploymentTest.py +45 -45
  103. toil/test/src/busTest.py +5 -5
  104. toil/test/src/checkpointTest.py +2 -2
  105. toil/test/src/deferredFunctionTest.py +1 -1
  106. toil/test/src/fileStoreTest.py +32 -16
  107. toil/test/src/helloWorldTest.py +1 -1
  108. toil/test/src/importExportFileTest.py +1 -1
  109. toil/test/src/jobDescriptionTest.py +2 -1
  110. toil/test/src/jobServiceTest.py +1 -1
  111. toil/test/src/jobTest.py +18 -18
  112. toil/test/src/miscTests.py +5 -3
  113. toil/test/src/promisedRequirementTest.py +3 -3
  114. toil/test/src/realtimeLoggerTest.py +1 -1
  115. toil/test/src/resourceTest.py +2 -2
  116. toil/test/src/restartDAGTest.py +1 -1
  117. toil/test/src/resumabilityTest.py +36 -2
  118. toil/test/src/retainTempDirTest.py +1 -1
  119. toil/test/src/systemTest.py +2 -2
  120. toil/test/src/toilContextManagerTest.py +2 -2
  121. toil/test/src/userDefinedJobArgTypeTest.py +1 -1
  122. toil/test/utils/toilDebugTest.py +98 -32
  123. toil/test/utils/toilKillTest.py +2 -2
  124. toil/test/utils/utilsTest.py +23 -3
  125. toil/test/wdl/wdltoil_test.py +223 -45
  126. toil/toilState.py +7 -6
  127. toil/utils/toilClean.py +1 -1
  128. toil/utils/toilConfig.py +36 -0
  129. toil/utils/toilDebugFile.py +60 -33
  130. toil/utils/toilDebugJob.py +39 -12
  131. toil/utils/toilDestroyCluster.py +1 -1
  132. toil/utils/toilKill.py +1 -1
  133. toil/utils/toilLaunchCluster.py +13 -2
  134. toil/utils/toilMain.py +3 -2
  135. toil/utils/toilRsyncCluster.py +1 -1
  136. toil/utils/toilSshCluster.py +1 -1
  137. toil/utils/toilStats.py +445 -305
  138. toil/utils/toilStatus.py +2 -5
  139. toil/version.py +10 -10
  140. toil/wdl/utils.py +2 -122
  141. toil/wdl/wdltoil.py +1257 -492
  142. toil/worker.py +55 -46
  143. toil-6.1.0.dist-info/METADATA +124 -0
  144. toil-6.1.0.dist-info/RECORD +241 -0
  145. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/WHEEL +1 -1
  146. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -1
  147. toil/batchSystems/parasol.py +0 -379
  148. toil/batchSystems/tes.py +0 -459
  149. toil/test/batchSystems/parasolTestSupport.py +0 -117
  150. toil/test/wdl/builtinTest.py +0 -506
  151. toil/test/wdl/toilwdlTest.py +0 -522
  152. toil/wdl/toilwdl.py +0 -141
  153. toil/wdl/versions/dev.py +0 -107
  154. toil/wdl/versions/draft2.py +0 -980
  155. toil/wdl/versions/v1.py +0 -794
  156. toil/wdl/wdl_analysis.py +0 -116
  157. toil/wdl/wdl_functions.py +0 -997
  158. toil/wdl/wdl_synthesis.py +0 -1011
  159. toil/wdl/wdl_types.py +0 -243
  160. toil-5.12.0.dist-info/METADATA +0 -118
  161. toil-5.12.0.dist-info/RECORD +0 -244
  162. /toil/{wdl/versions → options}/__init__.py +0 -0
  163. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
  164. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
@@ -16,13 +16,14 @@ import math
16
16
  import os
17
17
  from argparse import ArgumentParser, _ArgumentGroup
18
18
  from shlex import quote
19
- from typing import Callable, Dict, List, Optional, TypeVar, Union
19
+ from typing import Dict, List, Optional, Tuple, TypeVar, Union
20
20
 
21
+ from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE
21
22
  from toil.batchSystems.abstractGridEngineBatchSystem import \
22
23
  AbstractGridEngineBatchSystem
23
24
  from toil.batchSystems.options import OptionSetter
24
- from toil.lib.misc import CalledProcessErrorStderr, call_command
25
25
  from toil.job import Requirer
26
+ from toil.lib.misc import CalledProcessErrorStderr, call_command
26
27
 
27
28
  logger = logging.getLogger(__name__)
28
29
 
@@ -64,7 +65,9 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
64
65
  jobName: str,
65
66
  job_environment: Optional[Dict[str, str]] = None,
66
67
  gpus: Optional[int] = None) -> List[str]:
67
- return self.prepareSbatch(cpu, memory, jobID, jobName, job_environment, gpus) + [f'--wrap={command}']
68
+ # Make sure to use exec so we can get Slurm's signals in the Toil
69
+ # worker instead of having an intervening Bash
70
+ return self.prepareSbatch(cpu, memory, jobID, jobName, job_environment, gpus) + [f'--wrap=exec {command}']
68
71
 
69
72
  def submitJob(self, subLine):
70
73
  try:
@@ -95,12 +98,12 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
95
98
  logger.error("sbatch command failed")
96
99
  raise e
97
100
 
98
- def coalesce_job_exit_codes(self, batch_job_id_list: list) -> list:
101
+ def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
99
102
  """
100
103
  Collect all job exit codes in a single call.
101
104
  :param batch_job_id_list: list of Job ID strings, where each string has the form
102
105
  "<job>[.<task>]".
103
- :return: list of job exit codes, associated with the list of job IDs.
106
+ :return: list of job exit codes, or (exit code, exit reason) pairs, associated with the list of job IDs.
104
107
  """
105
108
  logger.debug("Getting exit codes for slurm jobs: %s", batch_job_id_list)
106
109
  # Convert batch_job_id_list to list of integer job IDs.
@@ -111,7 +114,7 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
111
114
  exit_codes.append(self._get_job_return_code(status))
112
115
  return exit_codes
113
116
 
114
- def getJobExitCode(self, batchJobID: str) -> int:
117
+ def getJobExitCode(self, batchJobID: str) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
115
118
  """
116
119
  Get job exit code for given batch job ID.
117
120
  :param batchJobID: string of the form "<job>[.<task>]".
@@ -138,18 +141,68 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
138
141
  status_dict = self._getJobDetailsFromScontrol(job_id_list)
139
142
  return status_dict
140
143
 
141
- def _get_job_return_code(self, status: tuple) -> list:
144
+ def _get_job_return_code(self, status: tuple) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
142
145
  """
146
+ Given a Slurm (state, return code) pair, summarize it into a Toil (return code, exit reason) pair.
147
+
148
+ The return code may have already been OR'd with the 128-offset
149
+ Slurm-reported signal.
150
+
151
+ Slurm will report return codes of 0 even if jobs time out instead
152
+ of succeeding:
153
+
154
+ 2093597|TIMEOUT|0:0
155
+ 2093597.batch|CANCELLED|0:15
156
+
157
+ So we guarantee here that, if the Slurm status string is not a
158
+ successful one as defined in
159
+ <https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES>, we
160
+ will not return a successful return code.
161
+
143
162
  Helper function for `getJobExitCode` and `coalesce_job_exit_codes`.
144
- :param status: tuple containing the job's state and it's return code.
145
- :return: the job's return code if it's completed, otherwise None.
163
+ :param status: tuple containing the job's state and its return code from Slurm.
164
+ :return: the job's return code for Toil if it's completed, otherwise None.
146
165
  """
147
166
  state, rc = status
148
- # If job is in a running state, set return code to None to indicate we don't have
149
- # an update.
150
- if state in ('PENDING', 'RUNNING', 'CONFIGURING', 'COMPLETING', 'RESIZING', 'SUSPENDED'):
151
- rc = None
152
- return rc
167
+
168
+ # If a job is in one of these states, Slurm can't run it anymore.
169
+ # We don't include states where the job is held or paused here;
170
+ # those mean it could run and needs to wait for someone to un-hold
171
+ # it, so Toil should wait for it.
172
+ #
173
+ # We map from each terminal state to the Toil-ontology exit reason.
174
+ TERMINAL_STATES: Dict[str, BatchJobExitReason] = {
175
+ "BOOT_FAIL": BatchJobExitReason.LOST,
176
+ "CANCELLED": BatchJobExitReason.KILLED,
177
+ "COMPLETED": BatchJobExitReason.FINISHED,
178
+ "DEADLINE": BatchJobExitReason.KILLED,
179
+ "FAILED": BatchJobExitReason.FAILED,
180
+ "NODE_FAIL": BatchJobExitReason.LOST,
181
+ "OUT_OF_MEMORY": BatchJobExitReason.MEMLIMIT,
182
+ "PREEMPTED": BatchJobExitReason.KILLED,
183
+ "TIMEOUT": BatchJobExitReason.KILLED
184
+ }
185
+
186
+ if state not in TERMINAL_STATES:
187
+ # Don't treat the job as exited yet
188
+ return None
189
+
190
+ exit_reason = TERMINAL_STATES[state]
191
+
192
+ if exit_reason == BatchJobExitReason.FINISHED:
193
+ # The only state that should produce a 0 ever is COMPLETED. So
194
+ # if the job is COMPLETED and the exit reason is thus FINISHED,
195
+ # pass along the code it has.
196
+ return (rc, exit_reason)
197
+
198
+ if rc == 0:
199
+ # The job claims to be in a state other than COMPLETED, but
200
+ # also to have not encountered a problem. Say the exit status
201
+ # is unavailable.
202
+ return (EXIT_STATUS_UNAVAILABLE_VALUE, exit_reason)
203
+
204
+ # If the code is nonzero, pass it along.
205
+ return (rc, exit_reason)
153
206
 
154
207
  def _getJobDetailsFromSacct(self, job_id_list: list) -> dict:
155
208
  """
@@ -283,8 +336,26 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
283
336
  job_environment: Optional[Dict[str, str]],
284
337
  gpus: Optional[int]) -> List[str]:
285
338
 
286
- # Returns the sbatch command line before the script to run
339
+ """
340
+ Returns the sbatch command line to run to queue the job.
341
+ """
342
+
343
+ # Start by naming the job
287
344
  sbatch_line = ['sbatch', '-J', f'toil_job_{jobID}_{jobName}']
345
+
346
+ # Make sure the job gets a signal before it disappears so that e.g.
347
+ # container cleanup finally blocks can run. Ask for SIGINT so we
348
+ # can get the default Python KeyboardInterrupt which third-party
349
+ # code is likely to plan for. Make sure to send it to the batch
350
+ # shell process with "B:", not to all the srun steps it launches
351
+ # (because there shouldn't be any). We cunningly replaced the batch
352
+ # shell process with the Toil worker process, so Toil should be
353
+ # able to get the signal.
354
+ #
355
+ # TODO: Add a way to detect when the job failed because it
356
+ # responded to this signal and use the right exit reason for it.
357
+ sbatch_line.append("--signal=B:INT@30")
358
+
288
359
  if gpus:
289
360
  sbatch_line = sbatch_line[:1] + [f'--gres=gpu:{gpus}'] + sbatch_line[1:]
290
361
  environment = {}
@@ -387,5 +458,5 @@ class SlurmBatchSystem(AbstractGridEngineBatchSystem):
387
458
  OptionType = TypeVar('OptionType')
388
459
  @classmethod
389
460
  def setOptions(cls, setOption: OptionSetter) -> None:
390
- setOption("allocate_mem", bool, default=False)
461
+ setOption("allocate_mem")
391
462
 
@@ -16,7 +16,6 @@ import math
16
16
  import os
17
17
  import shlex
18
18
  import tempfile
19
- import time
20
19
  from queue import Empty
21
20
  from shlex import quote
22
21
  from typing import Dict, List, Optional
toil/bus.py CHANGED
@@ -61,15 +61,13 @@ MessageBus.connect_output_file() and MessageBus.scan_bus_messages().
61
61
  """
62
62
 
63
63
  import collections
64
- from dataclasses import dataclass
65
- import inspect
64
+ import json
66
65
  import logging
67
-
68
66
  import os
69
67
  import queue
70
- import json
71
68
  import tempfile
72
69
  import threading
70
+ from dataclasses import dataclass
73
71
  from typing import (IO,
74
72
  Any,
75
73
  Callable,
@@ -80,7 +78,6 @@ from typing import (IO,
80
78
  Optional,
81
79
  Type,
82
80
  TypeVar,
83
- Union,
84
81
  cast)
85
82
 
86
83
  from pubsub.core import Publisher
@@ -90,6 +87,43 @@ from pubsub.core.topicutils import ALL_TOPICS
90
87
 
91
88
  logger = logging.getLogger( __name__ )
92
89
 
90
+ # We define some ways to talk about jobs.
91
+
92
+ class Names(NamedTuple):
93
+ """
94
+ Stores all the kinds of name a job can have.
95
+ """
96
+ # Name of the kind of job this is
97
+ job_name: str
98
+ # Name of this particular work unit
99
+ unit_name: str
100
+ # Human-readable name for the job
101
+ display_name: str
102
+ # What the job prints as, used for stats-and-logging log management
103
+ stats_name: str
104
+ # Job store ID of the job for the work unit
105
+ job_store_id: str
106
+
107
+ def get_job_kind(names: Names) -> str:
108
+ """
109
+ Return an identifying string for the job.
110
+
111
+ The result may contain spaces.
112
+
113
+ Returns: Either the unit name, job name, or display name, which identifies
114
+ the kind of job it is to toil.
115
+ Otherwise "Unknown Job" in case no identifier is available
116
+ """
117
+ if names.unit_name:
118
+ return names.unit_name
119
+ elif names.job_name:
120
+ return names.job_name
121
+ elif names.display_name:
122
+ return names.display_name
123
+ else:
124
+ return "Unknown Job"
125
+
126
+
93
127
  # We define a bunch of named tuple message types.
94
128
  # These all need to be plain data: only hold ints, strings, etc.
95
129
 
@@ -396,8 +430,8 @@ class MessageBus:
396
430
  given topic.
397
431
  """
398
432
  # There should always be a "message"
399
- assert len(message_data) == 1
400
- assert 'message' in message_data
433
+ if len(message_data) != 1 or 'message' not in message_data:
434
+ raise RuntimeError("Cannot log the bus message. The message is either empty/malformed or there are too many messages provided.")
401
435
  message = message_data['message']
402
436
  topic = topic_object.getName()
403
437
  stream.write(topic.encode('utf-8'))
@@ -572,7 +606,8 @@ class MessageInbox(MessageBusClient):
572
606
  handled = False
573
607
  try:
574
608
  # Emit the message
575
- assert isinstance(message, message_type), f"Unacceptable message type {type(message)} in list for type {message_type}"
609
+ if not isinstance(message, message_type):
610
+ raise RuntimeError(f"Unacceptable message type {type(message)} in list for type {message_type}")
576
611
  yield message
577
612
  # If we get here it was handled without error.
578
613
  handled = True
@@ -650,6 +685,7 @@ class JobStatus:
650
685
 
651
686
  def __repr__(self) -> str:
652
687
  return json.dumps(self, default= lambda o: o.__dict__, indent=4)
688
+
653
689
  def replay_message_bus(path: str) -> Dict[str, JobStatus]:
654
690
  """
655
691
  Replay all the messages and work out what they mean for jobs.