toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. toil/__init__.py +122 -315
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +173 -89
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
  5. toil/batchSystems/awsBatch.py +244 -135
  6. toil/batchSystems/cleanup_support.py +26 -16
  7. toil/batchSystems/contained_executor.py +31 -28
  8. toil/batchSystems/gridengine.py +86 -50
  9. toil/batchSystems/htcondor.py +166 -89
  10. toil/batchSystems/kubernetes.py +632 -382
  11. toil/batchSystems/local_support.py +20 -15
  12. toil/batchSystems/lsf.py +134 -81
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +290 -151
  16. toil/batchSystems/mesos/executor.py +79 -50
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +46 -28
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +296 -125
  21. toil/batchSystems/slurm.py +603 -138
  22. toil/batchSystems/torque.py +47 -33
  23. toil/bus.py +186 -76
  24. toil/common.py +664 -368
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1136 -483
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +63 -42
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +140 -60
  32. toil/fileStores/cachingFileStore.py +717 -269
  33. toil/fileStores/nonCachingFileStore.py +116 -87
  34. toil/job.py +1225 -368
  35. toil/jobStores/abstractJobStore.py +416 -266
  36. toil/jobStores/aws/jobStore.py +863 -477
  37. toil/jobStores/aws/utils.py +201 -120
  38. toil/jobStores/conftest.py +3 -2
  39. toil/jobStores/fileJobStore.py +292 -154
  40. toil/jobStores/googleJobStore.py +140 -74
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +668 -272
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +74 -31
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +214 -39
  49. toil/lib/aws/utils.py +287 -231
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +104 -47
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +361 -199
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +5 -3
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +141 -15
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +66 -21
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +68 -15
  71. toil/lib/retry.py +126 -81
  72. toil/lib/threading.py +299 -82
  73. toil/lib/throttle.py +16 -15
  74. toil/options/common.py +843 -409
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +73 -17
  78. toil/provisioners/__init__.py +117 -46
  79. toil/provisioners/abstractProvisioner.py +332 -157
  80. toil/provisioners/aws/__init__.py +70 -33
  81. toil/provisioners/aws/awsProvisioner.py +1145 -715
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +155 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +128 -62
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +224 -70
  98. toil/test/__init__.py +282 -183
  99. toil/test/batchSystems/batchSystemTest.py +460 -210
  100. toil/test/batchSystems/batch_system_plugin_test.py +90 -0
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +110 -49
  104. toil/test/cactus/__init__.py +0 -0
  105. toil/test/cactus/test_cactus_integration.py +56 -0
  106. toil/test/cwl/cwlTest.py +496 -287
  107. toil/test/cwl/measure_default_memory.cwl +12 -0
  108. toil/test/cwl/not_run_required_input.cwl +29 -0
  109. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  110. toil/test/cwl/seqtk_seq.cwl +1 -1
  111. toil/test/docs/scriptsTest.py +69 -46
  112. toil/test/jobStores/jobStoreTest.py +427 -264
  113. toil/test/lib/aws/test_iam.py +118 -50
  114. toil/test/lib/aws/test_s3.py +16 -9
  115. toil/test/lib/aws/test_utils.py +5 -6
  116. toil/test/lib/dockerTest.py +118 -141
  117. toil/test/lib/test_conversions.py +113 -115
  118. toil/test/lib/test_ec2.py +58 -50
  119. toil/test/lib/test_integration.py +104 -0
  120. toil/test/lib/test_misc.py +12 -5
  121. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  122. toil/test/mesos/helloWorld.py +7 -6
  123. toil/test/mesos/stress.py +25 -20
  124. toil/test/options/__init__.py +13 -0
  125. toil/test/options/options.py +42 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
  127. toil/test/provisioners/clusterScalerTest.py +440 -250
  128. toil/test/provisioners/clusterTest.py +166 -44
  129. toil/test/provisioners/gceProvisionerTest.py +174 -100
  130. toil/test/provisioners/provisionerTest.py +25 -13
  131. toil/test/provisioners/restartScript.py +5 -4
  132. toil/test/server/serverTest.py +188 -141
  133. toil/test/sort/restart_sort.py +137 -68
  134. toil/test/sort/sort.py +134 -66
  135. toil/test/sort/sortTest.py +91 -49
  136. toil/test/src/autoDeploymentTest.py +141 -101
  137. toil/test/src/busTest.py +20 -18
  138. toil/test/src/checkpointTest.py +8 -2
  139. toil/test/src/deferredFunctionTest.py +49 -35
  140. toil/test/src/dockerCheckTest.py +32 -24
  141. toil/test/src/environmentTest.py +135 -0
  142. toil/test/src/fileStoreTest.py +539 -272
  143. toil/test/src/helloWorldTest.py +7 -4
  144. toil/test/src/importExportFileTest.py +61 -31
  145. toil/test/src/jobDescriptionTest.py +46 -21
  146. toil/test/src/jobEncapsulationTest.py +2 -0
  147. toil/test/src/jobFileStoreTest.py +74 -50
  148. toil/test/src/jobServiceTest.py +187 -73
  149. toil/test/src/jobTest.py +121 -71
  150. toil/test/src/miscTests.py +19 -18
  151. toil/test/src/promisedRequirementTest.py +82 -36
  152. toil/test/src/promisesTest.py +7 -6
  153. toil/test/src/realtimeLoggerTest.py +10 -6
  154. toil/test/src/regularLogTest.py +71 -37
  155. toil/test/src/resourceTest.py +80 -49
  156. toil/test/src/restartDAGTest.py +36 -22
  157. toil/test/src/resumabilityTest.py +9 -2
  158. toil/test/src/retainTempDirTest.py +45 -14
  159. toil/test/src/systemTest.py +12 -8
  160. toil/test/src/threadingTest.py +44 -25
  161. toil/test/src/toilContextManagerTest.py +10 -7
  162. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  163. toil/test/src/workerTest.py +73 -23
  164. toil/test/utils/toilDebugTest.py +103 -33
  165. toil/test/utils/toilKillTest.py +4 -5
  166. toil/test/utils/utilsTest.py +245 -106
  167. toil/test/wdl/wdltoil_test.py +818 -149
  168. toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
  169. toil/toilState.py +120 -35
  170. toil/utils/toilConfig.py +13 -4
  171. toil/utils/toilDebugFile.py +44 -27
  172. toil/utils/toilDebugJob.py +214 -27
  173. toil/utils/toilDestroyCluster.py +11 -6
  174. toil/utils/toilKill.py +8 -3
  175. toil/utils/toilLaunchCluster.py +256 -140
  176. toil/utils/toilMain.py +37 -16
  177. toil/utils/toilRsyncCluster.py +32 -14
  178. toil/utils/toilSshCluster.py +49 -22
  179. toil/utils/toilStats.py +356 -273
  180. toil/utils/toilStatus.py +292 -139
  181. toil/utils/toilUpdateEC2Instances.py +3 -1
  182. toil/version.py +12 -12
  183. toil/wdl/utils.py +5 -5
  184. toil/wdl/wdltoil.py +3913 -1033
  185. toil/worker.py +367 -184
  186. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
  187. toil-8.0.0.dist-info/METADATA +173 -0
  188. toil-8.0.0.dist-info/RECORD +253 -0
  189. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  190. toil-6.1.0a1.dist-info/METADATA +0 -125
  191. toil-6.1.0a1.dist-info/RECORD +0 -237
  192. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  193. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
@@ -22,7 +22,7 @@ import time
22
22
  import traceback
23
23
  from argparse import ArgumentParser, _ArgumentGroup
24
24
  from queue import Empty, Queue
25
- from typing import Dict, Optional, Union
25
+ from typing import Optional, Union
26
26
  from urllib.parse import quote_plus
27
27
  from urllib.request import urlopen
28
28
 
@@ -30,11 +30,13 @@ import addict
30
30
  from pymesos import MesosSchedulerDriver, Scheduler, decode_data, encode_data
31
31
 
32
32
  from toil import resolveEntryPoint
33
- from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE,
34
- AbstractScalableBatchSystem,
35
- BatchJobExitReason,
36
- NodeInfo,
37
- UpdatedBatchJobInfo)
33
+ from toil.batchSystems.abstractBatchSystem import (
34
+ EXIT_STATUS_UNAVAILABLE_VALUE,
35
+ AbstractScalableBatchSystem,
36
+ BatchJobExitReason,
37
+ NodeInfo,
38
+ UpdatedBatchJobInfo,
39
+ )
38
40
  from toil.batchSystems.local_support import BatchSystemLocalSupport
39
41
  from toil.batchSystems.mesos import JobQueue, MesosShape, TaskData, ToilJob
40
42
  from toil.batchSystems.options import OptionSetter
@@ -46,9 +48,7 @@ from toil.lib.misc import get_public_ip, get_user_name
46
48
  log = logging.getLogger(__name__)
47
49
 
48
50
 
49
- class MesosBatchSystem(BatchSystemLocalSupport,
50
- AbstractScalableBatchSystem,
51
- Scheduler):
51
+ class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Scheduler):
52
52
  """
53
53
  A Toil batch system implementation that uses Apache Mesos to distribute toil jobs as Mesos
54
54
  tasks over a cluster of agent nodes. A Mesos framework consists of a scheduler and an
@@ -174,13 +174,18 @@ class MesosBatchSystem(BatchSystemLocalSupport,
174
174
  def unignoreNode(self, nodeAddress):
175
175
  self.ignoredNodes.remove(nodeAddress)
176
176
 
177
- def issueBatchJob(self, jobNode: JobDescription, job_environment: Optional[Dict[str, str]] = None):
177
+ def issueBatchJob(
178
+ self,
179
+ command: str,
180
+ jobNode: JobDescription,
181
+ job_environment: Optional[dict[str, str]] = None,
182
+ ):
178
183
  """
179
184
  Issues the following command returning a unique jobID. Command is the string to run, memory
180
185
  is an int giving the number of bytes the job needs to run in and cores is the number of cpus
181
186
  needed for the job and error-file is the path of the file to place any std-err/std-out in.
182
187
  """
183
- localID = self.handleLocalJob(jobNode)
188
+ localID = self.handleLocalJob(command, jobNode)
184
189
  if localID is not None:
185
190
  return localID
186
191
 
@@ -189,7 +194,7 @@ class MesosBatchSystem(BatchSystemLocalSupport,
189
194
  "memory": jobNode.memory,
190
195
  "cores": jobNode.cores,
191
196
  "disk": jobNode.disk,
192
- "preemptible": jobNode.preemptible
197
+ "preemptible": jobNode.preemptible,
193
198
  }
194
199
 
195
200
  jobID = self.getNextJobID()
@@ -197,15 +202,17 @@ class MesosBatchSystem(BatchSystemLocalSupport,
197
202
  if job_environment:
198
203
  environment.update(job_environment)
199
204
 
200
- job = ToilJob(jobID=jobID,
201
- name=str(jobNode),
202
- resources=MesosShape(wallTime=0, **mesos_resources),
203
- command=jobNode.command,
204
- userScript=self.userScript,
205
- environment=environment,
206
- workerCleanupInfo=self.workerCleanupInfo)
205
+ job = ToilJob(
206
+ jobID=jobID,
207
+ name=str(jobNode),
208
+ resources=MesosShape(wallTime=0, **mesos_resources),
209
+ command=command,
210
+ userScript=self.userScript,
211
+ environment=environment,
212
+ workerCleanupInfo=self.workerCleanupInfo,
213
+ )
207
214
  jobType = job.resources
208
- log.debug("Queueing the job command: %s with job id: %s ...", jobNode.command, str(jobID))
215
+ log.debug("Queueing the job %s with job id: %s ...", jobNode, str(jobID))
209
216
 
210
217
  # TODO: round all elements of resources
211
218
 
@@ -285,11 +292,17 @@ class MesosBatchSystem(BatchSystemLocalSupport,
285
292
  try:
286
293
  self.intendedKill.remove(item.jobID)
287
294
  except KeyError:
288
- log.debug('Job %s ended with status %i, took %s seconds.', item.jobID, item.exitStatus,
289
- '???' if item.wallTime is None else str(item.wallTime))
295
+ log.debug(
296
+ "Job %s ended with status %i, took %s seconds.",
297
+ item.jobID,
298
+ item.exitStatus,
299
+ "???" if item.wallTime is None else str(item.wallTime),
300
+ )
290
301
  return item
291
302
  else:
292
- log.debug('Job %s ended naturally before it could be killed.', item.jobID)
303
+ log.debug(
304
+ "Job %s ended naturally before it could be killed.", item.jobID
305
+ )
293
306
 
294
307
  def nodeInUse(self, nodeIP: str) -> bool:
295
308
  return nodeIP in self.hostToJobIDs
@@ -308,7 +321,7 @@ class MesosBatchSystem(BatchSystemLocalSupport,
308
321
  # The executor program is installed as a setuptools entry point by setup.py
309
322
  info = addict.Dict()
310
323
  info.name = "toil"
311
- info.command.value = resolveEntryPoint('_toil_mesos_executor')
324
+ info.command.value = resolveEntryPoint("_toil_mesos_executor")
312
325
  info.executor_id.value = "toil-%i" % os.getpid()
313
326
  info.source = pwd.getpwuid(os.getuid()).pw_name
314
327
  return info
@@ -318,18 +331,24 @@ class MesosBatchSystem(BatchSystemLocalSupport,
318
331
  The Mesos driver thread which handles the scheduler's communication with the Mesos master
319
332
  """
320
333
  framework = addict.Dict()
321
- framework.user = get_user_name() # We must determine the user name ourselves with pymesos
334
+ framework.user = (
335
+ get_user_name()
336
+ ) # We must determine the user name ourselves with pymesos
322
337
  framework.name = config.mesos_name
323
338
  framework.principal = framework.name
324
339
  if config.mesos_role is not None:
325
340
  framework.roles = config.mesos_role
326
- framework.capabilities = [dict(type='MULTI_ROLE')]
341
+ framework.capabilities = [dict(type="MULTI_ROLE")]
327
342
 
328
343
  # Make the driver which implements most of the scheduler logic and calls back to us for the user-defined parts.
329
344
  # Make sure it will call us with nice namespace-y addicts
330
- self.driver = MesosSchedulerDriver(self, framework,
331
- self._resolveAddress(self.mesos_endpoint),
332
- use_addict=True, implicit_acknowledgements=True)
345
+ self.driver = MesosSchedulerDriver(
346
+ self,
347
+ framework,
348
+ self._resolveAddress(self.mesos_endpoint),
349
+ use_addict=True,
350
+ implicit_acknowledgements=True,
351
+ )
333
352
  self.driver.start()
334
353
 
335
354
  @staticmethod
@@ -349,10 +368,10 @@ class MesosBatchSystem(BatchSystemLocalSupport,
349
368
  >>> f('127.0.0.1:123')
350
369
  '127.0.0.1:123'
351
370
  """
352
- address = address.split(':')
371
+ address = address.split(":")
353
372
  assert len(address) in (1, 2)
354
373
  address[0] = socket.gethostbyname(address[0])
355
- return ':'.join(address)
374
+ return ":".join(address)
356
375
 
357
376
  def shutdown(self) -> None:
358
377
  self.shutdownLocal()
@@ -361,7 +380,7 @@ class MesosBatchSystem(BatchSystemLocalSupport,
361
380
  log.debug("Joining Mesos driver")
362
381
  driver_result = self.driver.join()
363
382
  log.debug("Joined Mesos driver")
364
- if driver_result is not None and driver_result != 'DRIVER_STOPPED':
383
+ if driver_result is not None and driver_result != "DRIVER_STOPPED":
365
384
  # TODO: The docs say join should return a code, but it keeps returning
366
385
  # None when apparently successful. So tolerate that here too.
367
386
  raise RuntimeError("Mesos driver failed with %s" % driver_result)
@@ -384,11 +403,15 @@ class MesosBatchSystem(BatchSystemLocalSupport,
384
403
  disk = 0
385
404
  preemptible = None
386
405
  for attribute in offer.attributes:
387
- if attribute.name == 'preemptible':
388
- assert preemptible is None, "Attribute 'preemptible' occurs more than once."
406
+ if attribute.name == "preemptible":
407
+ assert (
408
+ preemptible is None
409
+ ), "Attribute 'preemptible' occurs more than once."
389
410
  preemptible = strict_bool(attribute.text.value)
390
411
  if preemptible is None:
391
- log.debug('Agent not marked as either preemptible or not. Assuming non-preemptible.')
412
+ log.debug(
413
+ "Agent not marked as either preemptible or not. Assuming non-preemptible."
414
+ )
392
415
  preemptible = False
393
416
  for resource in offer.resources:
394
417
  if resource.name == "cpus":
@@ -415,14 +438,16 @@ class MesosBatchSystem(BatchSystemLocalSupport,
415
438
  except KeyError:
416
439
  self.hostToJobIDs[agentIP] = [resourceKey]
417
440
 
418
- self.runningJobMap[int(task.task_id.value)] = TaskData(startTime=time.time(),
419
- agentID=offer.agent_id.value,
420
- agentIP=agentIP,
421
- executorID=task.executor.executor_id.value,
422
- cores=resources.cores,
423
- memory=resources.memory)
441
+ self.runningJobMap[int(task.task_id.value)] = TaskData(
442
+ startTime=time.time(),
443
+ agentID=offer.agent_id.value,
444
+ agentIP=agentIP,
445
+ executorID=task.executor.executor_id.value,
446
+ cores=resources.cores,
447
+ memory=resources.memory,
448
+ )
424
449
  del self.taskResources[resourceKey]
425
- log.debug('Launched Mesos task %s.', task.task_id.value)
450
+ log.debug("Launched Mesos task %s.", task.task_id.value)
426
451
 
427
452
  def resourceOffers(self, driver, offers):
428
453
  """
@@ -445,10 +470,18 @@ class MesosBatchSystem(BatchSystemLocalSupport,
445
470
  continue
446
471
  runnableTasks = []
447
472
  # TODO: In an offer, can there ever be more than one resource with the same name?
448
- offerCores, offerMemory, offerDisk, offerPreemptible = self._parseOffer(offer)
449
- log.debug('Got offer %s for a %spreemptible agent with %.2f MiB memory, %.2f core(s) '
450
- 'and %.2f MiB of disk.', offer.id.value, '' if offerPreemptible else 'non-',
451
- offerMemory, offerCores, offerDisk)
473
+ offerCores, offerMemory, offerDisk, offerPreemptible = self._parseOffer(
474
+ offer
475
+ )
476
+ log.debug(
477
+ "Got offer %s for a %spreemptible agent with %.2f MiB memory, %.2f core(s) "
478
+ "and %.2f MiB of disk.",
479
+ offer.id.value,
480
+ "" if offerPreemptible else "non-",
481
+ offerMemory,
482
+ offerCores,
483
+ offerDisk,
484
+ )
452
485
  remainingCores = offerCores
453
486
  remainingMemory = offerMemory
454
487
  remainingDisk = offerDisk
@@ -460,35 +493,47 @@ class MesosBatchSystem(BatchSystemLocalSupport,
460
493
  # loop.
461
494
  nextToLaunchIndex = 0
462
495
  # Toil specifies disk and memory in bytes but Mesos uses MiB
463
- while ( not self.jobQueues.typeEmpty(jobType)
464
- # On a non-preemptible node we can run any job, on a preemptible node we
465
- # can only run preemptible jobs:
466
- and (not offerPreemptible or jobType.preemptible)
467
- and remainingCores >= jobType.cores
468
- and remainingDisk >= b_to_mib(jobType.disk)
469
- and remainingMemory >= b_to_mib(jobType.memory)):
496
+ while (
497
+ not self.jobQueues.typeEmpty(jobType)
498
+ # On a non-preemptible node we can run any job, on a preemptible node we
499
+ # can only run preemptible jobs:
500
+ and (not offerPreemptible or jobType.preemptible)
501
+ and remainingCores >= jobType.cores
502
+ and remainingDisk >= b_to_mib(jobType.disk)
503
+ and remainingMemory >= b_to_mib(jobType.memory)
504
+ ):
470
505
  task = self._prepareToRun(jobType, offer)
471
506
  # TODO: this used to be a conditional but Hannes wanted it changed to an assert
472
507
  # TODO: ... so we can understand why it exists.
473
508
  assert int(task.task_id.value) not in self.runningJobMap
474
509
  runnableTasksOfType.append(task)
475
- log.debug("Preparing to launch Mesos task %s with %.2f cores, %.2f MiB memory, and %.2f MiB disk using offer %s ...",
476
- task.task_id.value, jobType.cores, b_to_mib(jobType.memory), b_to_mib(jobType.disk), offer.id.value)
510
+ log.debug(
511
+ "Preparing to launch Mesos task %s with %.2f cores, %.2f MiB memory, and %.2f MiB disk using offer %s ...",
512
+ task.task_id.value,
513
+ jobType.cores,
514
+ b_to_mib(jobType.memory),
515
+ b_to_mib(jobType.disk),
516
+ offer.id.value,
517
+ )
477
518
  remainingCores -= jobType.cores
478
519
  remainingMemory -= b_to_mib(jobType.memory)
479
520
  remainingDisk -= b_to_mib(jobType.disk)
480
521
  nextToLaunchIndex += 1
481
522
  if not self.jobQueues.typeEmpty(jobType):
482
523
  # report that remaining jobs cannot be run with the current resourcesq:
483
- log.debug('Offer %(offer)s not suitable to run the tasks with requirements '
484
- '%(requirements)r. Mesos offered %(memory)s memory, %(cores)s cores '
485
- 'and %(disk)s of disk on a %(non)spreemptible agent.',
486
- dict(offer=offer.id.value,
487
- requirements=jobType.__dict__,
488
- non='' if offerPreemptible else 'non-',
489
- memory=mib_to_b(offerMemory),
490
- cores=offerCores,
491
- disk=mib_to_b(offerDisk)))
524
+ log.debug(
525
+ "Offer %(offer)s not suitable to run the tasks with requirements "
526
+ "%(requirements)r. Mesos offered %(memory)s memory, %(cores)s cores "
527
+ "and %(disk)s of disk on a %(non)spreemptible agent.",
528
+ dict(
529
+ offer=offer.id.value,
530
+ requirements=jobType.__dict__,
531
+ non="" if offerPreemptible else "non-",
532
+ memory=mib_to_b(offerMemory),
533
+ cores=offerCores,
534
+ disk=mib_to_b(offerDisk),
535
+ ),
536
+ )
492
537
  runnableTasks.extend(runnableTasksOfType)
493
538
  # Launch all runnable tasks together so we only call launchTasks once per offer
494
539
  if runnableTasks:
@@ -496,21 +541,27 @@ class MesosBatchSystem(BatchSystemLocalSupport,
496
541
  driver.launchTasks(offer.id, runnableTasks)
497
542
  self._updateStateToRunning(offer, runnableTasks)
498
543
  else:
499
- log.debug('Although there are queued jobs, none of them could be run with offer %s '
500
- 'extended to the framework.', offer.id)
544
+ log.debug(
545
+ "Although there are queued jobs, none of them could be run with offer %s "
546
+ "extended to the framework.",
547
+ offer.id,
548
+ )
501
549
  driver.declineOffer(offer.id)
502
550
 
503
551
  if unableToRun and time.time() > (self.lastTimeOfferLogged + self.logPeriod):
504
552
  self.lastTimeOfferLogged = time.time()
505
- log.debug('Although there are queued jobs, none of them were able to run in '
506
- 'any of the offers extended to the framework. There are currently '
507
- '%i jobs running. Enable debug level logging to see more details about '
508
- 'job types and offers received.', len(self.runningJobMap))
553
+ log.debug(
554
+ "Although there are queued jobs, none of them were able to run in "
555
+ "any of the offers extended to the framework. There are currently "
556
+ "%i jobs running. Enable debug level logging to see more details about "
557
+ "job types and offers received.",
558
+ len(self.runningJobMap),
559
+ )
509
560
 
510
561
  def _trackOfferedNodes(self, offers):
511
562
  for offer in offers:
512
563
  # All AgentID messages are required to have a value according to the Mesos Protobuf file.
513
- assert 'value' in offer.agent_id
564
+ assert "value" in offer.agent_id
514
565
  try:
515
566
  nodeAddress = socket.gethostbyname(offer.hostname)
516
567
  except:
@@ -519,7 +570,7 @@ class MesosBatchSystem(BatchSystemLocalSupport,
519
570
  self._registerNode(nodeAddress, offer.agent_id.value)
520
571
  preemptible = False
521
572
  for attribute in offer.attributes:
522
- if attribute.name == 'preemptible':
573
+ if attribute.name == "preemptible":
523
574
  preemptible = strict_bool(attribute.text.value)
524
575
  if preemptible:
525
576
  try:
@@ -532,11 +583,17 @@ class MesosBatchSystem(BatchSystemLocalSupport,
532
583
  def _filterOfferedNodes(self, offers):
533
584
  if not self.nodeFilter:
534
585
  return offers
535
- executorInfoOrNone = [self.executors.get(socket.gethostbyname(offer.hostname)) for offer in offers]
586
+ executorInfoOrNone = [
587
+ self.executors.get(socket.gethostbyname(offer.hostname)) for offer in offers
588
+ ]
536
589
  executorInfos = [_f for _f in executorInfoOrNone if _f]
537
590
  executorsToConsider = list(filter(self.nodeFilter[0], executorInfos))
538
591
  ipsToConsider = {ex.nodeAddress for ex in executorsToConsider}
539
- return [offer for offer in offers if socket.gethostbyname(offer.hostname) in ipsToConsider]
592
+ return [
593
+ offer
594
+ for offer in offers
595
+ if socket.gethostbyname(offer.hostname) in ipsToConsider
596
+ ]
540
597
 
541
598
  def _newMesosTask(self, job, offer):
542
599
  """
@@ -553,30 +610,36 @@ class MesosBatchSystem(BatchSystemLocalSupport,
553
610
 
554
611
  task.resources.append(addict.Dict())
555
612
  cpus = task.resources[-1]
556
- cpus.name = 'cpus'
557
- cpus.type = 'SCALAR'
613
+ cpus.name = "cpus"
614
+ cpus.type = "SCALAR"
558
615
  cpus.scalar.value = job.resources.cores
559
616
 
560
617
  task.resources.append(addict.Dict())
561
618
  disk = task.resources[-1]
562
- disk.name = 'disk'
563
- disk.type = 'SCALAR'
619
+ disk.name = "disk"
620
+ disk.type = "SCALAR"
564
621
  if b_to_mib(job.resources.disk) > 1:
565
622
  disk.scalar.value = b_to_mib(job.resources.disk)
566
623
  else:
567
- log.warning("Job %s uses less disk than Mesos requires. Rounding %s up to 1 MiB.",
568
- job.jobID, job.resources.disk)
624
+ log.warning(
625
+ "Job %s uses less disk than Mesos requires. Rounding %s up to 1 MiB.",
626
+ job.jobID,
627
+ job.resources.disk,
628
+ )
569
629
  disk.scalar.value = 1
570
630
 
571
631
  task.resources.append(addict.Dict())
572
632
  mem = task.resources[-1]
573
- mem.name = 'mem'
574
- mem.type = 'SCALAR'
633
+ mem.name = "mem"
634
+ mem.type = "SCALAR"
575
635
  if b_to_mib(job.resources.memory) > 1:
576
636
  mem.scalar.value = b_to_mib(job.resources.memory)
577
637
  else:
578
- log.warning("Job %s uses less memory than Mesos requires. Rounding %s up to 1 MiB.",
579
- job.jobID, job.resources.memory)
638
+ log.warning(
639
+ "Job %s uses less memory than Mesos requires. Rounding %s up to 1 MiB.",
640
+ job.jobID,
641
+ job.resources.memory,
642
+ )
580
643
  mem.scalar.value = 1
581
644
  return task
582
645
 
@@ -590,19 +653,34 @@ class MesosBatchSystem(BatchSystemLocalSupport,
590
653
  agent sending the status update is lost/fails during that time).
591
654
  """
592
655
  jobID = int(update.task_id.value)
593
- log.debug("Job %i is in state '%s' due to reason '%s'.", jobID, update.state, update.reason)
656
+ log.debug(
657
+ "Job %i is in state '%s' due to reason '%s'.",
658
+ jobID,
659
+ update.state,
660
+ update.reason,
661
+ )
594
662
 
595
663
  def jobEnded(_exitStatus, wallTime=None, exitReason=None):
596
664
  """
597
665
  Notify external observers of the job ending.
598
666
  """
599
- self.updatedJobsQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=_exitStatus, wallTime=wallTime, exitReason=exitReason))
667
+ self.updatedJobsQueue.put(
668
+ UpdatedBatchJobInfo(
669
+ jobID=jobID,
670
+ exitStatus=_exitStatus,
671
+ wallTime=wallTime,
672
+ exitReason=exitReason,
673
+ )
674
+ )
600
675
  agentIP = None
601
676
  try:
602
677
  agentIP = self.runningJobMap[jobID].agentIP
603
678
  except KeyError:
604
- log.warning("Job %i returned exit code %i but isn't tracked as running.",
605
- jobID, _exitStatus)
679
+ log.warning(
680
+ "Job %i returned exit code %i but isn't tracked as running.",
681
+ jobID,
682
+ _exitStatus,
683
+ )
606
684
  else:
607
685
  # Mark the job as no longer running. We MUST do this BEFORE
608
686
  # saying we killed the job, or it will be possible for another
@@ -612,8 +690,11 @@ class MesosBatchSystem(BatchSystemLocalSupport,
612
690
  try:
613
691
  self.hostToJobIDs[agentIP].remove(jobID)
614
692
  except KeyError:
615
- log.warning("Job %i returned exit code %i from unknown host.",
616
- jobID, _exitStatus)
693
+ log.warning(
694
+ "Job %i returned exit code %i from unknown host.",
695
+ jobID,
696
+ _exitStatus,
697
+ )
617
698
 
618
699
  try:
619
700
  self.killJobIds.remove(jobID)
@@ -626,41 +707,62 @@ class MesosBatchSystem(BatchSystemLocalSupport,
626
707
  # state from other threads.
627
708
  self.killedJobIds.add(jobID)
628
709
 
629
- if update.state == 'TASK_FINISHED':
710
+ if update.state == "TASK_FINISHED":
630
711
  # We get the running time of the job via the timestamp, which is in job-local time in seconds
631
712
  labels = update.labels.labels
632
713
  wallTime = None
633
714
  for label in labels:
634
- if label['key'] == 'wallTime':
635
- wallTime = float(label['value'])
715
+ if label["key"] == "wallTime":
716
+ wallTime = float(label["value"])
636
717
  break
637
- assert(wallTime is not None)
718
+ assert wallTime is not None
638
719
  jobEnded(0, wallTime=wallTime, exitReason=BatchJobExitReason.FINISHED)
639
- elif update.state == 'TASK_FAILED':
720
+ elif update.state == "TASK_FAILED":
640
721
  try:
641
722
  exitStatus = int(update.message)
642
723
  except ValueError:
643
724
  exitStatus = EXIT_STATUS_UNAVAILABLE_VALUE
644
- log.warning("Job %i failed with message '%s' due to reason '%s' on executor '%s' on agent '%s'.",
645
- jobID, update.message, update.reason,
646
- update.executor_id, update.agent_id)
725
+ log.warning(
726
+ "Job %i failed with message '%s' due to reason '%s' on executor '%s' on agent '%s'.",
727
+ jobID,
728
+ update.message,
729
+ update.reason,
730
+ update.executor_id,
731
+ update.agent_id,
732
+ )
647
733
  else:
648
- log.warning("Job %i failed with exit status %i and message '%s' due to reason '%s' on executor '%s' on agent '%s'.",
649
- jobID, exitStatus,
650
- update.message, update.reason,
651
- update.executor_id, update.agent_id)
734
+ log.warning(
735
+ "Job %i failed with exit status %i and message '%s' due to reason '%s' on executor '%s' on agent '%s'.",
736
+ jobID,
737
+ exitStatus,
738
+ update.message,
739
+ update.reason,
740
+ update.executor_id,
741
+ update.agent_id,
742
+ )
652
743
 
653
744
  jobEnded(exitStatus, exitReason=BatchJobExitReason.FAILED)
654
- elif update.state == 'TASK_LOST':
745
+ elif update.state == "TASK_LOST":
655
746
  log.warning("Job %i is lost.", jobID)
656
747
  jobEnded(EXIT_STATUS_UNAVAILABLE_VALUE, exitReason=BatchJobExitReason.LOST)
657
- elif update.state in ('TASK_KILLED', 'TASK_ERROR'):
658
- log.warning("Job %i is in unexpected state %s with message '%s' due to reason '%s'.",
659
- jobID, update.state, update.message, update.reason)
660
- jobEnded(EXIT_STATUS_UNAVAILABLE_VALUE,
661
- exitReason=(BatchJobExitReason.KILLED if update.state == 'TASK_KILLED' else BatchJobExitReason.ERROR))
662
-
663
- if 'limitation' in update:
748
+ elif update.state in ("TASK_KILLED", "TASK_ERROR"):
749
+ log.warning(
750
+ "Job %i is in unexpected state %s with message '%s' due to reason '%s'.",
751
+ jobID,
752
+ update.state,
753
+ update.message,
754
+ update.reason,
755
+ )
756
+ jobEnded(
757
+ EXIT_STATUS_UNAVAILABLE_VALUE,
758
+ exitReason=(
759
+ BatchJobExitReason.KILLED
760
+ if update.state == "TASK_KILLED"
761
+ else BatchJobExitReason.ERROR
762
+ ),
763
+ )
764
+
765
+ if "limitation" in update:
664
766
  log.warning("Job limit info: %s" % update.limitation)
665
767
 
666
768
  def frameworkMessage(self, driver, executorId, agentId, message):
@@ -671,22 +773,31 @@ class MesosBatchSystem(BatchSystemLocalSupport,
671
773
  # Take it out of base 64 encoding from Protobuf
672
774
  message = decode_data(message).decode()
673
775
 
674
- log.debug('Got framework message from executor %s running on agent %s: %s',
675
- executorId.value, agentId.value, message)
776
+ log.debug(
777
+ "Got framework message from executor %s running on agent %s: %s",
778
+ executorId.value,
779
+ agentId.value,
780
+ message,
781
+ )
676
782
  message = ast.literal_eval(message)
677
783
  assert isinstance(message, dict)
678
784
  # Handle the mandatory fields of a message
679
- nodeAddress = message.pop('address')
785
+ nodeAddress = message.pop("address")
680
786
  executor = self._registerNode(nodeAddress, agentId.value)
681
787
  # Handle optional message fields
682
788
  for k, v in message.items():
683
- if k == 'nodeInfo':
789
+ if k == "nodeInfo":
684
790
  assert isinstance(v, dict)
685
- resources = [taskData for taskData in self.runningJobMap.values()
686
- if taskData.executorID == executorId.value]
791
+ resources = [
792
+ taskData
793
+ for taskData in self.runningJobMap.values()
794
+ if taskData.executorID == executorId.value
795
+ ]
687
796
  requestedCores = sum(taskData.cores for taskData in resources)
688
797
  requestedMemory = sum(taskData.memory for taskData in resources)
689
- executor.nodeInfo = NodeInfo(requestedCores=requestedCores, requestedMemory=requestedMemory, **v)
798
+ executor.nodeInfo = NodeInfo(
799
+ requestedCores=requestedCores, requestedMemory=requestedMemory, **v
800
+ )
690
801
  self.executors[nodeAddress] = executor
691
802
  else:
692
803
  raise RuntimeError("Unknown message field '%s'." % k)
@@ -699,10 +810,12 @@ class MesosBatchSystem(BatchSystemLocalSupport,
699
810
  """
700
811
  executor = self.executors.get(nodeAddress)
701
812
  if executor is None or executor.agentId != agentId:
702
- executor = self.ExecutorInfo(nodeAddress=nodeAddress,
703
- agentId=agentId,
704
- nodeInfo=None,
705
- lastSeen=time.time())
813
+ executor = self.ExecutorInfo(
814
+ nodeAddress=nodeAddress,
815
+ agentId=agentId,
816
+ nodeInfo=None,
817
+ lastSeen=time.time(),
818
+ )
706
819
  self.executors[nodeAddress] = executor
707
820
  else:
708
821
  executor.lastSeen = time.time()
@@ -712,9 +825,9 @@ class MesosBatchSystem(BatchSystemLocalSupport,
712
825
 
713
826
  return executor
714
827
 
715
- def getNodes(self,
716
- preemptible: Optional[bool] = None,
717
- timeout: Optional[int] = None) -> Dict[str, NodeInfo]:
828
+ def getNodes(
829
+ self, preemptible: Optional[bool] = None, timeout: Optional[int] = None
830
+ ) -> dict[str, NodeInfo]:
718
831
  """
719
832
  Return all nodes that match:
720
833
  - preemptible status (None includes all)
@@ -722,7 +835,9 @@ class MesosBatchSystem(BatchSystemLocalSupport,
722
835
  """
723
836
  nodes = dict()
724
837
  for node_ip, executor in self.executors.items():
725
- if preemptible is None or (preemptible == (executor.agentId not in self.nonPreemptibleNodes)):
838
+ if preemptible is None or (
839
+ preemptible == (executor.agentId not in self.nonPreemptibleNodes)
840
+ ):
726
841
  if timeout is None or (time.time() - executor.lastSeen < timeout):
727
842
  nodes[node_ip] = executor.nodeInfo
728
843
  return nodes
@@ -731,7 +846,7 @@ class MesosBatchSystem(BatchSystemLocalSupport,
731
846
  """
732
847
  Invoked when the scheduler re-registers with a newly elected Mesos master.
733
848
  """
734
- log.debug('Registered with new master')
849
+ log.debug("Registered with new master")
735
850
 
736
851
  def _handleFailedExecutor(self, agentID, executorID=None):
737
852
  """
@@ -746,8 +861,9 @@ class MesosBatchSystem(BatchSystemLocalSupport,
746
861
  Useful for debugging failing executor code.
747
862
  """
748
863
 
749
- log.warning("Handling failure of executor '%s' on agent '%s'.",
750
- executorID, agentID)
864
+ log.warning(
865
+ "Handling failure of executor '%s' on agent '%s'.", executorID, agentID
866
+ )
751
867
 
752
868
  try:
753
869
  # Look up the IP. We should always know it unless we get answers
@@ -763,22 +879,27 @@ class MesosBatchSystem(BatchSystemLocalSupport,
763
879
  # it, and I can't find a good way to list it, because the API only
764
880
  # seems to report running containers. So we dump all the available
765
881
  # files with /files/debug and look for one that looks right.
766
- filesQueryURL = errorLogURL = "http://%s:%d/files/debug" % \
767
- (agentAddress, agentPort)
882
+ filesQueryURL = errorLogURL = "http://%s:%d/files/debug" % (
883
+ agentAddress,
884
+ agentPort,
885
+ )
768
886
 
769
887
  # Download all the root mount points, which are in an object from
770
888
  # mounted name to real name
771
889
  filesDict = json.loads(urlopen(filesQueryURL).read())
772
890
 
773
- log.debug('Available files: %s', repr(filesDict.keys()))
891
+ log.debug("Available files: %s", repr(filesDict.keys()))
774
892
 
775
893
  # Generate filenames for each container pointing to where stderr should be
776
894
  stderrFilenames = []
777
895
  # And look for the actual agent logs.
778
896
  agentLogFilenames = []
779
897
  for filename in filesDict:
780
- if (self.frameworkId in filename and agentID in filename and
781
- (executorID is None or executorID in filename)):
898
+ if (
899
+ self.frameworkId in filename
900
+ and agentID in filename
901
+ and (executorID is None or executorID in filename)
902
+ ):
782
903
 
783
904
  stderrFilenames.append("%s/stderr" % filename)
784
905
  elif filename.endswith("log"):
@@ -793,10 +914,15 @@ class MesosBatchSystem(BatchSystemLocalSupport,
793
914
  # According to
794
915
  # http://mesos.apache.org/documentation/latest/sandbox/ we can use
795
916
  # the web API to fetch the error log.
796
- errorLogURL = "http://%s:%d/files/download?path=%s" % \
797
- (agentAddress, agentPort, quote_plus(stderrFilename))
917
+ errorLogURL = "http://%s:%d/files/download?path=%s" % (
918
+ agentAddress,
919
+ agentPort,
920
+ quote_plus(stderrFilename),
921
+ )
798
922
 
799
- log.warning("Attempting to retrieve executor error log: %s", errorLogURL)
923
+ log.warning(
924
+ "Attempting to retrieve executor error log: %s", errorLogURL
925
+ )
800
926
 
801
927
  for line in urlopen(errorLogURL):
802
928
  # Warn all the lines of the executor's error log
@@ -808,8 +934,11 @@ class MesosBatchSystem(BatchSystemLocalSupport,
808
934
 
809
935
  for agentLogFilename in agentLogFilenames:
810
936
  try:
811
- agentLogURL = "http://%s:%d/files/download?path=%s" % \
812
- (agentAddress, agentPort, quote_plus(agentLogFilename))
937
+ agentLogURL = "http://%s:%d/files/download?path=%s" % (
938
+ agentAddress,
939
+ agentPort,
940
+ quote_plus(agentLogFilename),
941
+ )
813
942
 
814
943
  log.warning("Attempting to retrieve agent log: %s", agentLogURL)
815
944
 
@@ -829,7 +958,7 @@ class MesosBatchSystem(BatchSystemLocalSupport,
829
958
  Invoked when an executor has exited/terminated abnormally.
830
959
  """
831
960
 
832
- failedId = executorId.get('value', None)
961
+ failedId = executorId.get("value", None)
833
962
 
834
963
  log.warning("Executor '%s' reported lost with status '%s'.", failedId, status)
835
964
 
@@ -840,20 +969,31 @@ class MesosBatchSystem(BatchSystemLocalSupport,
840
969
  """
841
970
  Get the default IP/hostname and port that we will look for Mesos at.
842
971
  """
843
- return f'{get_public_ip()}:5050'
972
+ return f"{get_public_ip()}:5050"
844
973
 
845
974
  @classmethod
846
975
  def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
847
- parser.add_argument("--mesosEndpoint", "--mesosMaster", dest="mesos_endpoint", default=None,
848
- help=f"The host and port of the Mesos master separated by colon. If the provided value "
849
- f"is None, the value will be generated at runtime. "
850
- f"(Generated default: {cls.get_default_mesos_endpoint})")
851
- parser.add_argument("--mesosFrameworkId", dest="mesos_framework_id",
852
- help="Use a specific Mesos framework ID.")
853
- parser.add_argument("--mesosRole", dest="mesos_role",
854
- help="Use a Mesos role.")
855
- parser.add_argument("--mesosName", dest="mesos_name", default="toil",
856
- help="The Mesos name to use. (default: %(default)s)")
976
+ parser.add_argument(
977
+ "--mesosEndpoint",
978
+ "--mesosMaster",
979
+ dest="mesos_endpoint",
980
+ default=None,
981
+ help=f"The host and port of the Mesos master separated by colon. If the provided value "
982
+ f"is None, the value will be generated at runtime. "
983
+ f"(Generated default: {cls.get_default_mesos_endpoint})",
984
+ )
985
+ parser.add_argument(
986
+ "--mesosFrameworkId",
987
+ dest="mesos_framework_id",
988
+ help="Use a specific Mesos framework ID.",
989
+ )
990
+ parser.add_argument("--mesosRole", dest="mesos_role", help="Use a Mesos role.")
991
+ parser.add_argument(
992
+ "--mesosName",
993
+ dest="mesos_name",
994
+ default="toil",
995
+ help="The Mesos name to use. (default: %(default)s)",
996
+ )
857
997
 
858
998
  @classmethod
859
999
  def setOptions(cls, setOption: OptionSetter):
@@ -861,4 +1001,3 @@ class MesosBatchSystem(BatchSystemLocalSupport,
861
1001
  setOption("mesos_name")
862
1002
  setOption("mesos_role")
863
1003
  setOption("mesos_framework_id")
864
-