toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
|
@@ -22,7 +22,7 @@ import time
|
|
|
22
22
|
import traceback
|
|
23
23
|
from argparse import ArgumentParser, _ArgumentGroup
|
|
24
24
|
from queue import Empty, Queue
|
|
25
|
-
from typing import
|
|
25
|
+
from typing import Optional, Union
|
|
26
26
|
from urllib.parse import quote_plus
|
|
27
27
|
from urllib.request import urlopen
|
|
28
28
|
|
|
@@ -30,11 +30,13 @@ import addict
|
|
|
30
30
|
from pymesos import MesosSchedulerDriver, Scheduler, decode_data, encode_data
|
|
31
31
|
|
|
32
32
|
from toil import resolveEntryPoint
|
|
33
|
-
from toil.batchSystems.abstractBatchSystem import (
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
33
|
+
from toil.batchSystems.abstractBatchSystem import (
|
|
34
|
+
EXIT_STATUS_UNAVAILABLE_VALUE,
|
|
35
|
+
AbstractScalableBatchSystem,
|
|
36
|
+
BatchJobExitReason,
|
|
37
|
+
NodeInfo,
|
|
38
|
+
UpdatedBatchJobInfo,
|
|
39
|
+
)
|
|
38
40
|
from toil.batchSystems.local_support import BatchSystemLocalSupport
|
|
39
41
|
from toil.batchSystems.mesos import JobQueue, MesosShape, TaskData, ToilJob
|
|
40
42
|
from toil.batchSystems.options import OptionSetter
|
|
@@ -46,9 +48,7 @@ from toil.lib.misc import get_public_ip, get_user_name
|
|
|
46
48
|
log = logging.getLogger(__name__)
|
|
47
49
|
|
|
48
50
|
|
|
49
|
-
class MesosBatchSystem(BatchSystemLocalSupport,
|
|
50
|
-
AbstractScalableBatchSystem,
|
|
51
|
-
Scheduler):
|
|
51
|
+
class MesosBatchSystem(BatchSystemLocalSupport, AbstractScalableBatchSystem, Scheduler):
|
|
52
52
|
"""
|
|
53
53
|
A Toil batch system implementation that uses Apache Mesos to distribute toil jobs as Mesos
|
|
54
54
|
tasks over a cluster of agent nodes. A Mesos framework consists of a scheduler and an
|
|
@@ -174,13 +174,18 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
174
174
|
def unignoreNode(self, nodeAddress):
|
|
175
175
|
self.ignoredNodes.remove(nodeAddress)
|
|
176
176
|
|
|
177
|
-
def issueBatchJob(
|
|
177
|
+
def issueBatchJob(
|
|
178
|
+
self,
|
|
179
|
+
command: str,
|
|
180
|
+
jobNode: JobDescription,
|
|
181
|
+
job_environment: Optional[dict[str, str]] = None,
|
|
182
|
+
):
|
|
178
183
|
"""
|
|
179
184
|
Issues the following command returning a unique jobID. Command is the string to run, memory
|
|
180
185
|
is an int giving the number of bytes the job needs to run in and cores is the number of cpus
|
|
181
186
|
needed for the job and error-file is the path of the file to place any std-err/std-out in.
|
|
182
187
|
"""
|
|
183
|
-
localID = self.handleLocalJob(jobNode)
|
|
188
|
+
localID = self.handleLocalJob(command, jobNode)
|
|
184
189
|
if localID is not None:
|
|
185
190
|
return localID
|
|
186
191
|
|
|
@@ -189,7 +194,7 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
189
194
|
"memory": jobNode.memory,
|
|
190
195
|
"cores": jobNode.cores,
|
|
191
196
|
"disk": jobNode.disk,
|
|
192
|
-
"preemptible": jobNode.preemptible
|
|
197
|
+
"preemptible": jobNode.preemptible,
|
|
193
198
|
}
|
|
194
199
|
|
|
195
200
|
jobID = self.getNextJobID()
|
|
@@ -197,15 +202,17 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
197
202
|
if job_environment:
|
|
198
203
|
environment.update(job_environment)
|
|
199
204
|
|
|
200
|
-
job = ToilJob(
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
205
|
+
job = ToilJob(
|
|
206
|
+
jobID=jobID,
|
|
207
|
+
name=str(jobNode),
|
|
208
|
+
resources=MesosShape(wallTime=0, **mesos_resources),
|
|
209
|
+
command=command,
|
|
210
|
+
userScript=self.userScript,
|
|
211
|
+
environment=environment,
|
|
212
|
+
workerCleanupInfo=self.workerCleanupInfo,
|
|
213
|
+
)
|
|
207
214
|
jobType = job.resources
|
|
208
|
-
log.debug("Queueing the job
|
|
215
|
+
log.debug("Queueing the job %s with job id: %s ...", jobNode, str(jobID))
|
|
209
216
|
|
|
210
217
|
# TODO: round all elements of resources
|
|
211
218
|
|
|
@@ -285,11 +292,17 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
285
292
|
try:
|
|
286
293
|
self.intendedKill.remove(item.jobID)
|
|
287
294
|
except KeyError:
|
|
288
|
-
log.debug(
|
|
289
|
-
|
|
295
|
+
log.debug(
|
|
296
|
+
"Job %s ended with status %i, took %s seconds.",
|
|
297
|
+
item.jobID,
|
|
298
|
+
item.exitStatus,
|
|
299
|
+
"???" if item.wallTime is None else str(item.wallTime),
|
|
300
|
+
)
|
|
290
301
|
return item
|
|
291
302
|
else:
|
|
292
|
-
log.debug(
|
|
303
|
+
log.debug(
|
|
304
|
+
"Job %s ended naturally before it could be killed.", item.jobID
|
|
305
|
+
)
|
|
293
306
|
|
|
294
307
|
def nodeInUse(self, nodeIP: str) -> bool:
|
|
295
308
|
return nodeIP in self.hostToJobIDs
|
|
@@ -308,7 +321,7 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
308
321
|
# The executor program is installed as a setuptools entry point by setup.py
|
|
309
322
|
info = addict.Dict()
|
|
310
323
|
info.name = "toil"
|
|
311
|
-
info.command.value = resolveEntryPoint(
|
|
324
|
+
info.command.value = resolveEntryPoint("_toil_mesos_executor")
|
|
312
325
|
info.executor_id.value = "toil-%i" % os.getpid()
|
|
313
326
|
info.source = pwd.getpwuid(os.getuid()).pw_name
|
|
314
327
|
return info
|
|
@@ -318,18 +331,24 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
318
331
|
The Mesos driver thread which handles the scheduler's communication with the Mesos master
|
|
319
332
|
"""
|
|
320
333
|
framework = addict.Dict()
|
|
321
|
-
framework.user =
|
|
334
|
+
framework.user = (
|
|
335
|
+
get_user_name()
|
|
336
|
+
) # We must determine the user name ourselves with pymesos
|
|
322
337
|
framework.name = config.mesos_name
|
|
323
338
|
framework.principal = framework.name
|
|
324
339
|
if config.mesos_role is not None:
|
|
325
340
|
framework.roles = config.mesos_role
|
|
326
|
-
framework.capabilities = [dict(type=
|
|
341
|
+
framework.capabilities = [dict(type="MULTI_ROLE")]
|
|
327
342
|
|
|
328
343
|
# Make the driver which implements most of the scheduler logic and calls back to us for the user-defined parts.
|
|
329
344
|
# Make sure it will call us with nice namespace-y addicts
|
|
330
|
-
self.driver = MesosSchedulerDriver(
|
|
331
|
-
|
|
332
|
-
|
|
345
|
+
self.driver = MesosSchedulerDriver(
|
|
346
|
+
self,
|
|
347
|
+
framework,
|
|
348
|
+
self._resolveAddress(self.mesos_endpoint),
|
|
349
|
+
use_addict=True,
|
|
350
|
+
implicit_acknowledgements=True,
|
|
351
|
+
)
|
|
333
352
|
self.driver.start()
|
|
334
353
|
|
|
335
354
|
@staticmethod
|
|
@@ -349,10 +368,10 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
349
368
|
>>> f('127.0.0.1:123')
|
|
350
369
|
'127.0.0.1:123'
|
|
351
370
|
"""
|
|
352
|
-
address = address.split(
|
|
371
|
+
address = address.split(":")
|
|
353
372
|
assert len(address) in (1, 2)
|
|
354
373
|
address[0] = socket.gethostbyname(address[0])
|
|
355
|
-
return
|
|
374
|
+
return ":".join(address)
|
|
356
375
|
|
|
357
376
|
def shutdown(self) -> None:
|
|
358
377
|
self.shutdownLocal()
|
|
@@ -361,7 +380,7 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
361
380
|
log.debug("Joining Mesos driver")
|
|
362
381
|
driver_result = self.driver.join()
|
|
363
382
|
log.debug("Joined Mesos driver")
|
|
364
|
-
if driver_result is not None and driver_result !=
|
|
383
|
+
if driver_result is not None and driver_result != "DRIVER_STOPPED":
|
|
365
384
|
# TODO: The docs say join should return a code, but it keeps returning
|
|
366
385
|
# None when apparently successful. So tolerate that here too.
|
|
367
386
|
raise RuntimeError("Mesos driver failed with %s" % driver_result)
|
|
@@ -384,11 +403,15 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
384
403
|
disk = 0
|
|
385
404
|
preemptible = None
|
|
386
405
|
for attribute in offer.attributes:
|
|
387
|
-
if attribute.name ==
|
|
388
|
-
assert
|
|
406
|
+
if attribute.name == "preemptible":
|
|
407
|
+
assert (
|
|
408
|
+
preemptible is None
|
|
409
|
+
), "Attribute 'preemptible' occurs more than once."
|
|
389
410
|
preemptible = strict_bool(attribute.text.value)
|
|
390
411
|
if preemptible is None:
|
|
391
|
-
log.debug(
|
|
412
|
+
log.debug(
|
|
413
|
+
"Agent not marked as either preemptible or not. Assuming non-preemptible."
|
|
414
|
+
)
|
|
392
415
|
preemptible = False
|
|
393
416
|
for resource in offer.resources:
|
|
394
417
|
if resource.name == "cpus":
|
|
@@ -415,14 +438,16 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
415
438
|
except KeyError:
|
|
416
439
|
self.hostToJobIDs[agentIP] = [resourceKey]
|
|
417
440
|
|
|
418
|
-
self.runningJobMap[int(task.task_id.value)] = TaskData(
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
441
|
+
self.runningJobMap[int(task.task_id.value)] = TaskData(
|
|
442
|
+
startTime=time.time(),
|
|
443
|
+
agentID=offer.agent_id.value,
|
|
444
|
+
agentIP=agentIP,
|
|
445
|
+
executorID=task.executor.executor_id.value,
|
|
446
|
+
cores=resources.cores,
|
|
447
|
+
memory=resources.memory,
|
|
448
|
+
)
|
|
424
449
|
del self.taskResources[resourceKey]
|
|
425
|
-
log.debug(
|
|
450
|
+
log.debug("Launched Mesos task %s.", task.task_id.value)
|
|
426
451
|
|
|
427
452
|
def resourceOffers(self, driver, offers):
|
|
428
453
|
"""
|
|
@@ -445,10 +470,18 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
445
470
|
continue
|
|
446
471
|
runnableTasks = []
|
|
447
472
|
# TODO: In an offer, can there ever be more than one resource with the same name?
|
|
448
|
-
offerCores, offerMemory, offerDisk, offerPreemptible = self._parseOffer(
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
473
|
+
offerCores, offerMemory, offerDisk, offerPreemptible = self._parseOffer(
|
|
474
|
+
offer
|
|
475
|
+
)
|
|
476
|
+
log.debug(
|
|
477
|
+
"Got offer %s for a %spreemptible agent with %.2f MiB memory, %.2f core(s) "
|
|
478
|
+
"and %.2f MiB of disk.",
|
|
479
|
+
offer.id.value,
|
|
480
|
+
"" if offerPreemptible else "non-",
|
|
481
|
+
offerMemory,
|
|
482
|
+
offerCores,
|
|
483
|
+
offerDisk,
|
|
484
|
+
)
|
|
452
485
|
remainingCores = offerCores
|
|
453
486
|
remainingMemory = offerMemory
|
|
454
487
|
remainingDisk = offerDisk
|
|
@@ -460,35 +493,47 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
460
493
|
# loop.
|
|
461
494
|
nextToLaunchIndex = 0
|
|
462
495
|
# Toil specifies disk and memory in bytes but Mesos uses MiB
|
|
463
|
-
while (
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
496
|
+
while (
|
|
497
|
+
not self.jobQueues.typeEmpty(jobType)
|
|
498
|
+
# On a non-preemptible node we can run any job, on a preemptible node we
|
|
499
|
+
# can only run preemptible jobs:
|
|
500
|
+
and (not offerPreemptible or jobType.preemptible)
|
|
501
|
+
and remainingCores >= jobType.cores
|
|
502
|
+
and remainingDisk >= b_to_mib(jobType.disk)
|
|
503
|
+
and remainingMemory >= b_to_mib(jobType.memory)
|
|
504
|
+
):
|
|
470
505
|
task = self._prepareToRun(jobType, offer)
|
|
471
506
|
# TODO: this used to be a conditional but Hannes wanted it changed to an assert
|
|
472
507
|
# TODO: ... so we can understand why it exists.
|
|
473
508
|
assert int(task.task_id.value) not in self.runningJobMap
|
|
474
509
|
runnableTasksOfType.append(task)
|
|
475
|
-
log.debug(
|
|
476
|
-
|
|
510
|
+
log.debug(
|
|
511
|
+
"Preparing to launch Mesos task %s with %.2f cores, %.2f MiB memory, and %.2f MiB disk using offer %s ...",
|
|
512
|
+
task.task_id.value,
|
|
513
|
+
jobType.cores,
|
|
514
|
+
b_to_mib(jobType.memory),
|
|
515
|
+
b_to_mib(jobType.disk),
|
|
516
|
+
offer.id.value,
|
|
517
|
+
)
|
|
477
518
|
remainingCores -= jobType.cores
|
|
478
519
|
remainingMemory -= b_to_mib(jobType.memory)
|
|
479
520
|
remainingDisk -= b_to_mib(jobType.disk)
|
|
480
521
|
nextToLaunchIndex += 1
|
|
481
522
|
if not self.jobQueues.typeEmpty(jobType):
|
|
482
523
|
# report that remaining jobs cannot be run with the current resourcesq:
|
|
483
|
-
log.debug(
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
524
|
+
log.debug(
|
|
525
|
+
"Offer %(offer)s not suitable to run the tasks with requirements "
|
|
526
|
+
"%(requirements)r. Mesos offered %(memory)s memory, %(cores)s cores "
|
|
527
|
+
"and %(disk)s of disk on a %(non)spreemptible agent.",
|
|
528
|
+
dict(
|
|
529
|
+
offer=offer.id.value,
|
|
530
|
+
requirements=jobType.__dict__,
|
|
531
|
+
non="" if offerPreemptible else "non-",
|
|
532
|
+
memory=mib_to_b(offerMemory),
|
|
533
|
+
cores=offerCores,
|
|
534
|
+
disk=mib_to_b(offerDisk),
|
|
535
|
+
),
|
|
536
|
+
)
|
|
492
537
|
runnableTasks.extend(runnableTasksOfType)
|
|
493
538
|
# Launch all runnable tasks together so we only call launchTasks once per offer
|
|
494
539
|
if runnableTasks:
|
|
@@ -496,21 +541,27 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
496
541
|
driver.launchTasks(offer.id, runnableTasks)
|
|
497
542
|
self._updateStateToRunning(offer, runnableTasks)
|
|
498
543
|
else:
|
|
499
|
-
log.debug(
|
|
500
|
-
|
|
544
|
+
log.debug(
|
|
545
|
+
"Although there are queued jobs, none of them could be run with offer %s "
|
|
546
|
+
"extended to the framework.",
|
|
547
|
+
offer.id,
|
|
548
|
+
)
|
|
501
549
|
driver.declineOffer(offer.id)
|
|
502
550
|
|
|
503
551
|
if unableToRun and time.time() > (self.lastTimeOfferLogged + self.logPeriod):
|
|
504
552
|
self.lastTimeOfferLogged = time.time()
|
|
505
|
-
log.debug(
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
553
|
+
log.debug(
|
|
554
|
+
"Although there are queued jobs, none of them were able to run in "
|
|
555
|
+
"any of the offers extended to the framework. There are currently "
|
|
556
|
+
"%i jobs running. Enable debug level logging to see more details about "
|
|
557
|
+
"job types and offers received.",
|
|
558
|
+
len(self.runningJobMap),
|
|
559
|
+
)
|
|
509
560
|
|
|
510
561
|
def _trackOfferedNodes(self, offers):
|
|
511
562
|
for offer in offers:
|
|
512
563
|
# All AgentID messages are required to have a value according to the Mesos Protobuf file.
|
|
513
|
-
assert
|
|
564
|
+
assert "value" in offer.agent_id
|
|
514
565
|
try:
|
|
515
566
|
nodeAddress = socket.gethostbyname(offer.hostname)
|
|
516
567
|
except:
|
|
@@ -519,7 +570,7 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
519
570
|
self._registerNode(nodeAddress, offer.agent_id.value)
|
|
520
571
|
preemptible = False
|
|
521
572
|
for attribute in offer.attributes:
|
|
522
|
-
if attribute.name ==
|
|
573
|
+
if attribute.name == "preemptible":
|
|
523
574
|
preemptible = strict_bool(attribute.text.value)
|
|
524
575
|
if preemptible:
|
|
525
576
|
try:
|
|
@@ -532,11 +583,17 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
532
583
|
def _filterOfferedNodes(self, offers):
|
|
533
584
|
if not self.nodeFilter:
|
|
534
585
|
return offers
|
|
535
|
-
executorInfoOrNone = [
|
|
586
|
+
executorInfoOrNone = [
|
|
587
|
+
self.executors.get(socket.gethostbyname(offer.hostname)) for offer in offers
|
|
588
|
+
]
|
|
536
589
|
executorInfos = [_f for _f in executorInfoOrNone if _f]
|
|
537
590
|
executorsToConsider = list(filter(self.nodeFilter[0], executorInfos))
|
|
538
591
|
ipsToConsider = {ex.nodeAddress for ex in executorsToConsider}
|
|
539
|
-
return [
|
|
592
|
+
return [
|
|
593
|
+
offer
|
|
594
|
+
for offer in offers
|
|
595
|
+
if socket.gethostbyname(offer.hostname) in ipsToConsider
|
|
596
|
+
]
|
|
540
597
|
|
|
541
598
|
def _newMesosTask(self, job, offer):
|
|
542
599
|
"""
|
|
@@ -553,30 +610,36 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
553
610
|
|
|
554
611
|
task.resources.append(addict.Dict())
|
|
555
612
|
cpus = task.resources[-1]
|
|
556
|
-
cpus.name =
|
|
557
|
-
cpus.type =
|
|
613
|
+
cpus.name = "cpus"
|
|
614
|
+
cpus.type = "SCALAR"
|
|
558
615
|
cpus.scalar.value = job.resources.cores
|
|
559
616
|
|
|
560
617
|
task.resources.append(addict.Dict())
|
|
561
618
|
disk = task.resources[-1]
|
|
562
|
-
disk.name =
|
|
563
|
-
disk.type =
|
|
619
|
+
disk.name = "disk"
|
|
620
|
+
disk.type = "SCALAR"
|
|
564
621
|
if b_to_mib(job.resources.disk) > 1:
|
|
565
622
|
disk.scalar.value = b_to_mib(job.resources.disk)
|
|
566
623
|
else:
|
|
567
|
-
log.warning(
|
|
568
|
-
|
|
624
|
+
log.warning(
|
|
625
|
+
"Job %s uses less disk than Mesos requires. Rounding %s up to 1 MiB.",
|
|
626
|
+
job.jobID,
|
|
627
|
+
job.resources.disk,
|
|
628
|
+
)
|
|
569
629
|
disk.scalar.value = 1
|
|
570
630
|
|
|
571
631
|
task.resources.append(addict.Dict())
|
|
572
632
|
mem = task.resources[-1]
|
|
573
|
-
mem.name =
|
|
574
|
-
mem.type =
|
|
633
|
+
mem.name = "mem"
|
|
634
|
+
mem.type = "SCALAR"
|
|
575
635
|
if b_to_mib(job.resources.memory) > 1:
|
|
576
636
|
mem.scalar.value = b_to_mib(job.resources.memory)
|
|
577
637
|
else:
|
|
578
|
-
log.warning(
|
|
579
|
-
|
|
638
|
+
log.warning(
|
|
639
|
+
"Job %s uses less memory than Mesos requires. Rounding %s up to 1 MiB.",
|
|
640
|
+
job.jobID,
|
|
641
|
+
job.resources.memory,
|
|
642
|
+
)
|
|
580
643
|
mem.scalar.value = 1
|
|
581
644
|
return task
|
|
582
645
|
|
|
@@ -590,19 +653,34 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
590
653
|
agent sending the status update is lost/fails during that time).
|
|
591
654
|
"""
|
|
592
655
|
jobID = int(update.task_id.value)
|
|
593
|
-
log.debug(
|
|
656
|
+
log.debug(
|
|
657
|
+
"Job %i is in state '%s' due to reason '%s'.",
|
|
658
|
+
jobID,
|
|
659
|
+
update.state,
|
|
660
|
+
update.reason,
|
|
661
|
+
)
|
|
594
662
|
|
|
595
663
|
def jobEnded(_exitStatus, wallTime=None, exitReason=None):
|
|
596
664
|
"""
|
|
597
665
|
Notify external observers of the job ending.
|
|
598
666
|
"""
|
|
599
|
-
self.updatedJobsQueue.put(
|
|
667
|
+
self.updatedJobsQueue.put(
|
|
668
|
+
UpdatedBatchJobInfo(
|
|
669
|
+
jobID=jobID,
|
|
670
|
+
exitStatus=_exitStatus,
|
|
671
|
+
wallTime=wallTime,
|
|
672
|
+
exitReason=exitReason,
|
|
673
|
+
)
|
|
674
|
+
)
|
|
600
675
|
agentIP = None
|
|
601
676
|
try:
|
|
602
677
|
agentIP = self.runningJobMap[jobID].agentIP
|
|
603
678
|
except KeyError:
|
|
604
|
-
log.warning(
|
|
605
|
-
|
|
679
|
+
log.warning(
|
|
680
|
+
"Job %i returned exit code %i but isn't tracked as running.",
|
|
681
|
+
jobID,
|
|
682
|
+
_exitStatus,
|
|
683
|
+
)
|
|
606
684
|
else:
|
|
607
685
|
# Mark the job as no longer running. We MUST do this BEFORE
|
|
608
686
|
# saying we killed the job, or it will be possible for another
|
|
@@ -612,8 +690,11 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
612
690
|
try:
|
|
613
691
|
self.hostToJobIDs[agentIP].remove(jobID)
|
|
614
692
|
except KeyError:
|
|
615
|
-
log.warning(
|
|
616
|
-
|
|
693
|
+
log.warning(
|
|
694
|
+
"Job %i returned exit code %i from unknown host.",
|
|
695
|
+
jobID,
|
|
696
|
+
_exitStatus,
|
|
697
|
+
)
|
|
617
698
|
|
|
618
699
|
try:
|
|
619
700
|
self.killJobIds.remove(jobID)
|
|
@@ -626,41 +707,62 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
626
707
|
# state from other threads.
|
|
627
708
|
self.killedJobIds.add(jobID)
|
|
628
709
|
|
|
629
|
-
if update.state ==
|
|
710
|
+
if update.state == "TASK_FINISHED":
|
|
630
711
|
# We get the running time of the job via the timestamp, which is in job-local time in seconds
|
|
631
712
|
labels = update.labels.labels
|
|
632
713
|
wallTime = None
|
|
633
714
|
for label in labels:
|
|
634
|
-
if label[
|
|
635
|
-
wallTime = float(label[
|
|
715
|
+
if label["key"] == "wallTime":
|
|
716
|
+
wallTime = float(label["value"])
|
|
636
717
|
break
|
|
637
|
-
assert
|
|
718
|
+
assert wallTime is not None
|
|
638
719
|
jobEnded(0, wallTime=wallTime, exitReason=BatchJobExitReason.FINISHED)
|
|
639
|
-
elif update.state ==
|
|
720
|
+
elif update.state == "TASK_FAILED":
|
|
640
721
|
try:
|
|
641
722
|
exitStatus = int(update.message)
|
|
642
723
|
except ValueError:
|
|
643
724
|
exitStatus = EXIT_STATUS_UNAVAILABLE_VALUE
|
|
644
|
-
log.warning(
|
|
645
|
-
|
|
646
|
-
|
|
725
|
+
log.warning(
|
|
726
|
+
"Job %i failed with message '%s' due to reason '%s' on executor '%s' on agent '%s'.",
|
|
727
|
+
jobID,
|
|
728
|
+
update.message,
|
|
729
|
+
update.reason,
|
|
730
|
+
update.executor_id,
|
|
731
|
+
update.agent_id,
|
|
732
|
+
)
|
|
647
733
|
else:
|
|
648
|
-
log.warning(
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
734
|
+
log.warning(
|
|
735
|
+
"Job %i failed with exit status %i and message '%s' due to reason '%s' on executor '%s' on agent '%s'.",
|
|
736
|
+
jobID,
|
|
737
|
+
exitStatus,
|
|
738
|
+
update.message,
|
|
739
|
+
update.reason,
|
|
740
|
+
update.executor_id,
|
|
741
|
+
update.agent_id,
|
|
742
|
+
)
|
|
652
743
|
|
|
653
744
|
jobEnded(exitStatus, exitReason=BatchJobExitReason.FAILED)
|
|
654
|
-
elif update.state ==
|
|
745
|
+
elif update.state == "TASK_LOST":
|
|
655
746
|
log.warning("Job %i is lost.", jobID)
|
|
656
747
|
jobEnded(EXIT_STATUS_UNAVAILABLE_VALUE, exitReason=BatchJobExitReason.LOST)
|
|
657
|
-
elif update.state in (
|
|
658
|
-
log.warning(
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
748
|
+
elif update.state in ("TASK_KILLED", "TASK_ERROR"):
|
|
749
|
+
log.warning(
|
|
750
|
+
"Job %i is in unexpected state %s with message '%s' due to reason '%s'.",
|
|
751
|
+
jobID,
|
|
752
|
+
update.state,
|
|
753
|
+
update.message,
|
|
754
|
+
update.reason,
|
|
755
|
+
)
|
|
756
|
+
jobEnded(
|
|
757
|
+
EXIT_STATUS_UNAVAILABLE_VALUE,
|
|
758
|
+
exitReason=(
|
|
759
|
+
BatchJobExitReason.KILLED
|
|
760
|
+
if update.state == "TASK_KILLED"
|
|
761
|
+
else BatchJobExitReason.ERROR
|
|
762
|
+
),
|
|
763
|
+
)
|
|
764
|
+
|
|
765
|
+
if "limitation" in update:
|
|
664
766
|
log.warning("Job limit info: %s" % update.limitation)
|
|
665
767
|
|
|
666
768
|
def frameworkMessage(self, driver, executorId, agentId, message):
|
|
@@ -671,22 +773,31 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
671
773
|
# Take it out of base 64 encoding from Protobuf
|
|
672
774
|
message = decode_data(message).decode()
|
|
673
775
|
|
|
674
|
-
log.debug(
|
|
675
|
-
|
|
776
|
+
log.debug(
|
|
777
|
+
"Got framework message from executor %s running on agent %s: %s",
|
|
778
|
+
executorId.value,
|
|
779
|
+
agentId.value,
|
|
780
|
+
message,
|
|
781
|
+
)
|
|
676
782
|
message = ast.literal_eval(message)
|
|
677
783
|
assert isinstance(message, dict)
|
|
678
784
|
# Handle the mandatory fields of a message
|
|
679
|
-
nodeAddress = message.pop(
|
|
785
|
+
nodeAddress = message.pop("address")
|
|
680
786
|
executor = self._registerNode(nodeAddress, agentId.value)
|
|
681
787
|
# Handle optional message fields
|
|
682
788
|
for k, v in message.items():
|
|
683
|
-
if k ==
|
|
789
|
+
if k == "nodeInfo":
|
|
684
790
|
assert isinstance(v, dict)
|
|
685
|
-
resources = [
|
|
686
|
-
|
|
791
|
+
resources = [
|
|
792
|
+
taskData
|
|
793
|
+
for taskData in self.runningJobMap.values()
|
|
794
|
+
if taskData.executorID == executorId.value
|
|
795
|
+
]
|
|
687
796
|
requestedCores = sum(taskData.cores for taskData in resources)
|
|
688
797
|
requestedMemory = sum(taskData.memory for taskData in resources)
|
|
689
|
-
executor.nodeInfo = NodeInfo(
|
|
798
|
+
executor.nodeInfo = NodeInfo(
|
|
799
|
+
requestedCores=requestedCores, requestedMemory=requestedMemory, **v
|
|
800
|
+
)
|
|
690
801
|
self.executors[nodeAddress] = executor
|
|
691
802
|
else:
|
|
692
803
|
raise RuntimeError("Unknown message field '%s'." % k)
|
|
@@ -699,10 +810,12 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
699
810
|
"""
|
|
700
811
|
executor = self.executors.get(nodeAddress)
|
|
701
812
|
if executor is None or executor.agentId != agentId:
|
|
702
|
-
executor = self.ExecutorInfo(
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
813
|
+
executor = self.ExecutorInfo(
|
|
814
|
+
nodeAddress=nodeAddress,
|
|
815
|
+
agentId=agentId,
|
|
816
|
+
nodeInfo=None,
|
|
817
|
+
lastSeen=time.time(),
|
|
818
|
+
)
|
|
706
819
|
self.executors[nodeAddress] = executor
|
|
707
820
|
else:
|
|
708
821
|
executor.lastSeen = time.time()
|
|
@@ -712,9 +825,9 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
712
825
|
|
|
713
826
|
return executor
|
|
714
827
|
|
|
715
|
-
def getNodes(
|
|
716
|
-
|
|
717
|
-
|
|
828
|
+
def getNodes(
|
|
829
|
+
self, preemptible: Optional[bool] = None, timeout: Optional[int] = None
|
|
830
|
+
) -> dict[str, NodeInfo]:
|
|
718
831
|
"""
|
|
719
832
|
Return all nodes that match:
|
|
720
833
|
- preemptible status (None includes all)
|
|
@@ -722,7 +835,9 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
722
835
|
"""
|
|
723
836
|
nodes = dict()
|
|
724
837
|
for node_ip, executor in self.executors.items():
|
|
725
|
-
if preemptible is None or (
|
|
838
|
+
if preemptible is None or (
|
|
839
|
+
preemptible == (executor.agentId not in self.nonPreemptibleNodes)
|
|
840
|
+
):
|
|
726
841
|
if timeout is None or (time.time() - executor.lastSeen < timeout):
|
|
727
842
|
nodes[node_ip] = executor.nodeInfo
|
|
728
843
|
return nodes
|
|
@@ -731,7 +846,7 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
731
846
|
"""
|
|
732
847
|
Invoked when the scheduler re-registers with a newly elected Mesos master.
|
|
733
848
|
"""
|
|
734
|
-
log.debug(
|
|
849
|
+
log.debug("Registered with new master")
|
|
735
850
|
|
|
736
851
|
def _handleFailedExecutor(self, agentID, executorID=None):
|
|
737
852
|
"""
|
|
@@ -746,8 +861,9 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
746
861
|
Useful for debugging failing executor code.
|
|
747
862
|
"""
|
|
748
863
|
|
|
749
|
-
log.warning(
|
|
750
|
-
|
|
864
|
+
log.warning(
|
|
865
|
+
"Handling failure of executor '%s' on agent '%s'.", executorID, agentID
|
|
866
|
+
)
|
|
751
867
|
|
|
752
868
|
try:
|
|
753
869
|
# Look up the IP. We should always know it unless we get answers
|
|
@@ -763,22 +879,27 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
763
879
|
# it, and I can't find a good way to list it, because the API only
|
|
764
880
|
# seems to report running containers. So we dump all the available
|
|
765
881
|
# files with /files/debug and look for one that looks right.
|
|
766
|
-
filesQueryURL = errorLogURL = "http://%s:%d/files/debug" %
|
|
767
|
-
|
|
882
|
+
filesQueryURL = errorLogURL = "http://%s:%d/files/debug" % (
|
|
883
|
+
agentAddress,
|
|
884
|
+
agentPort,
|
|
885
|
+
)
|
|
768
886
|
|
|
769
887
|
# Download all the root mount points, which are in an object from
|
|
770
888
|
# mounted name to real name
|
|
771
889
|
filesDict = json.loads(urlopen(filesQueryURL).read())
|
|
772
890
|
|
|
773
|
-
log.debug(
|
|
891
|
+
log.debug("Available files: %s", repr(filesDict.keys()))
|
|
774
892
|
|
|
775
893
|
# Generate filenames for each container pointing to where stderr should be
|
|
776
894
|
stderrFilenames = []
|
|
777
895
|
# And look for the actual agent logs.
|
|
778
896
|
agentLogFilenames = []
|
|
779
897
|
for filename in filesDict:
|
|
780
|
-
if (
|
|
781
|
-
|
|
898
|
+
if (
|
|
899
|
+
self.frameworkId in filename
|
|
900
|
+
and agentID in filename
|
|
901
|
+
and (executorID is None or executorID in filename)
|
|
902
|
+
):
|
|
782
903
|
|
|
783
904
|
stderrFilenames.append("%s/stderr" % filename)
|
|
784
905
|
elif filename.endswith("log"):
|
|
@@ -793,10 +914,15 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
793
914
|
# According to
|
|
794
915
|
# http://mesos.apache.org/documentation/latest/sandbox/ we can use
|
|
795
916
|
# the web API to fetch the error log.
|
|
796
|
-
errorLogURL = "http://%s:%d/files/download?path=%s" %
|
|
797
|
-
|
|
917
|
+
errorLogURL = "http://%s:%d/files/download?path=%s" % (
|
|
918
|
+
agentAddress,
|
|
919
|
+
agentPort,
|
|
920
|
+
quote_plus(stderrFilename),
|
|
921
|
+
)
|
|
798
922
|
|
|
799
|
-
log.warning(
|
|
923
|
+
log.warning(
|
|
924
|
+
"Attempting to retrieve executor error log: %s", errorLogURL
|
|
925
|
+
)
|
|
800
926
|
|
|
801
927
|
for line in urlopen(errorLogURL):
|
|
802
928
|
# Warn all the lines of the executor's error log
|
|
@@ -808,8 +934,11 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
808
934
|
|
|
809
935
|
for agentLogFilename in agentLogFilenames:
|
|
810
936
|
try:
|
|
811
|
-
agentLogURL = "http://%s:%d/files/download?path=%s" %
|
|
812
|
-
|
|
937
|
+
agentLogURL = "http://%s:%d/files/download?path=%s" % (
|
|
938
|
+
agentAddress,
|
|
939
|
+
agentPort,
|
|
940
|
+
quote_plus(agentLogFilename),
|
|
941
|
+
)
|
|
813
942
|
|
|
814
943
|
log.warning("Attempting to retrieve agent log: %s", agentLogURL)
|
|
815
944
|
|
|
@@ -829,7 +958,7 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
829
958
|
Invoked when an executor has exited/terminated abnormally.
|
|
830
959
|
"""
|
|
831
960
|
|
|
832
|
-
failedId = executorId.get(
|
|
961
|
+
failedId = executorId.get("value", None)
|
|
833
962
|
|
|
834
963
|
log.warning("Executor '%s' reported lost with status '%s'.", failedId, status)
|
|
835
964
|
|
|
@@ -840,20 +969,31 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
840
969
|
"""
|
|
841
970
|
Get the default IP/hostname and port that we will look for Mesos at.
|
|
842
971
|
"""
|
|
843
|
-
return f
|
|
972
|
+
return f"{get_public_ip()}:5050"
|
|
844
973
|
|
|
845
974
|
@classmethod
|
|
846
975
|
def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
|
|
847
|
-
parser.add_argument(
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
976
|
+
parser.add_argument(
|
|
977
|
+
"--mesosEndpoint",
|
|
978
|
+
"--mesosMaster",
|
|
979
|
+
dest="mesos_endpoint",
|
|
980
|
+
default=None,
|
|
981
|
+
help=f"The host and port of the Mesos master separated by colon. If the provided value "
|
|
982
|
+
f"is None, the value will be generated at runtime. "
|
|
983
|
+
f"(Generated default: {cls.get_default_mesos_endpoint})",
|
|
984
|
+
)
|
|
985
|
+
parser.add_argument(
|
|
986
|
+
"--mesosFrameworkId",
|
|
987
|
+
dest="mesos_framework_id",
|
|
988
|
+
help="Use a specific Mesos framework ID.",
|
|
989
|
+
)
|
|
990
|
+
parser.add_argument("--mesosRole", dest="mesos_role", help="Use a Mesos role.")
|
|
991
|
+
parser.add_argument(
|
|
992
|
+
"--mesosName",
|
|
993
|
+
dest="mesos_name",
|
|
994
|
+
default="toil",
|
|
995
|
+
help="The Mesos name to use. (default: %(default)s)",
|
|
996
|
+
)
|
|
857
997
|
|
|
858
998
|
@classmethod
|
|
859
999
|
def setOptions(cls, setOption: OptionSetter):
|
|
@@ -861,4 +1001,3 @@ class MesosBatchSystem(BatchSystemLocalSupport,
|
|
|
861
1001
|
setOption("mesos_name")
|
|
862
1002
|
setOption("mesos_role")
|
|
863
1003
|
setOption("mesos_framework_id")
|
|
864
|
-
|