toil 6.1.0__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +22 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +59 -45
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +2 -2
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +64 -22
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +7 -3
- toil/common.py +36 -13
- toil/cwl/cwltoil.py +365 -312
- toil/deferred.py +1 -1
- toil/fileStores/abstractFileStore.py +17 -17
- toil/fileStores/cachingFileStore.py +2 -2
- toil/fileStores/nonCachingFileStore.py +1 -1
- toil/job.py +228 -60
- toil/jobStores/abstractJobStore.py +18 -10
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +57 -29
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +2 -2
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +72 -24
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +5 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/io.py +14 -2
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +55 -21
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +2 -2
- toil/lib/throttle.py +1 -1
- toil/options/common.py +27 -24
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +9 -7
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +58 -16
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +1 -1
- toil/test/cwl/cwlTest.py +8 -91
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +10 -13
- toil/test/jobStores/jobStoreTest.py +33 -49
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +90 -8
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +61 -3
- toil/test/utils/utilsTest.py +20 -18
- toil/test/wdl/wdltoil_test.py +24 -71
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +2 -1
- toil/utils/toilStatus.py +97 -51
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +318 -51
- toil/worker.py +96 -69
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/METADATA +55 -21
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/RECORD +93 -90
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/test/utils/utilsTest.py
CHANGED
|
@@ -56,6 +56,7 @@ class UtilsTest(ToilTest):
|
|
|
56
56
|
super().setUp()
|
|
57
57
|
self.tempDir = self._createTempDir()
|
|
58
58
|
self.tempFile = get_temp_file(rootDir=self.tempDir)
|
|
59
|
+
self.outputFile = get_temp_file(rootDir=self.tempDir)
|
|
59
60
|
self.outputFile = 'someSortedStuff.txt'
|
|
60
61
|
self.toilDir = os.path.join(self.tempDir, "jobstore")
|
|
61
62
|
self.assertFalse(os.path.exists(self.toilDir))
|
|
@@ -73,9 +74,9 @@ class UtilsTest(ToilTest):
|
|
|
73
74
|
'-m',
|
|
74
75
|
'toil.test.sort.sort',
|
|
75
76
|
f'file:{self.toilDir}',
|
|
77
|
+
f'--fileToSort={self.tempFile}',
|
|
78
|
+
f'--outputFile={self.outputFile}',
|
|
76
79
|
'--clean=never',
|
|
77
|
-
'--numLines=1',
|
|
78
|
-
'--lineLength=1'
|
|
79
80
|
]
|
|
80
81
|
|
|
81
82
|
self.restart_sort_workflow_cmd = [
|
|
@@ -91,7 +92,7 @@ class UtilsTest(ToilTest):
|
|
|
91
92
|
if os.path.exists(self.toilDir):
|
|
92
93
|
shutil.rmtree(self.toilDir)
|
|
93
94
|
|
|
94
|
-
for f in [
|
|
95
|
+
for f in [self.tempFile, self.outputFile, os.path.join(self.tempDir, "output.txt")]:
|
|
95
96
|
if os.path.exists(f):
|
|
96
97
|
os.remove(f)
|
|
97
98
|
|
|
@@ -314,14 +315,14 @@ class UtilsTest(ToilTest):
|
|
|
314
315
|
def testGetPIDStatus(self):
|
|
315
316
|
"""Test that ToilStatus.getPIDStatus() behaves as expected."""
|
|
316
317
|
wf = subprocess.Popen(self.sort_workflow_cmd)
|
|
317
|
-
self.check_status('RUNNING', status_fn=ToilStatus.getPIDStatus, seconds=
|
|
318
|
+
self.check_status('RUNNING', status_fn=ToilStatus.getPIDStatus, seconds=60)
|
|
318
319
|
wf.wait()
|
|
319
|
-
self.check_status('COMPLETED', status_fn=ToilStatus.getPIDStatus)
|
|
320
|
+
self.check_status('COMPLETED', status_fn=ToilStatus.getPIDStatus, seconds=60)
|
|
320
321
|
|
|
321
322
|
# TODO: we need to reach into the FileJobStore's files and delete this
|
|
322
323
|
# shared file. We assume we know its internal layout.
|
|
323
324
|
os.remove(os.path.join(self.toilDir, 'files/shared/pid.log'))
|
|
324
|
-
self.check_status('QUEUED', status_fn=ToilStatus.getPIDStatus)
|
|
325
|
+
self.check_status('QUEUED', status_fn=ToilStatus.getPIDStatus, seconds=60)
|
|
325
326
|
|
|
326
327
|
def testGetStatusFailedToilWF(self):
|
|
327
328
|
"""
|
|
@@ -331,9 +332,9 @@ class UtilsTest(ToilTest):
|
|
|
331
332
|
"""
|
|
332
333
|
# --badWorker is set to force failure.
|
|
333
334
|
wf = subprocess.Popen(self.sort_workflow_cmd + ['--badWorker=1'])
|
|
334
|
-
self.check_status('RUNNING', status_fn=ToilStatus.getStatus)
|
|
335
|
+
self.check_status('RUNNING', status_fn=ToilStatus.getStatus, seconds=60)
|
|
335
336
|
wf.wait()
|
|
336
|
-
self.check_status('ERROR', status_fn=ToilStatus.getStatus)
|
|
337
|
+
self.check_status('ERROR', status_fn=ToilStatus.getStatus, seconds=60)
|
|
337
338
|
|
|
338
339
|
@needs_cwl
|
|
339
340
|
@needs_docker
|
|
@@ -341,22 +342,22 @@ class UtilsTest(ToilTest):
|
|
|
341
342
|
"""Test that ToilStatus.getStatus() behaves as expected with a failing CWL workflow."""
|
|
342
343
|
# --badWorker is set to force failure.
|
|
343
344
|
cmd = ['toil-cwl-runner', '--jobStore', self.toilDir, '--clean=never', '--badWorker=1',
|
|
344
|
-
'src/toil/test/cwl/sorttool.cwl', '--reverse', '--input', 'src/toil/test/cwl/whale.txt']
|
|
345
|
+
'src/toil/test/cwl/sorttool.cwl', '--reverse', '--input', 'src/toil/test/cwl/whale.txt', f'--outdir={self.tempDir}']
|
|
345
346
|
wf = subprocess.Popen(cmd)
|
|
346
|
-
self.check_status('RUNNING', status_fn=ToilStatus.getStatus)
|
|
347
|
+
self.check_status('RUNNING', status_fn=ToilStatus.getStatus, seconds=60)
|
|
347
348
|
wf.wait()
|
|
348
|
-
self.check_status('ERROR', status_fn=ToilStatus.getStatus)
|
|
349
|
+
self.check_status('ERROR', status_fn=ToilStatus.getStatus, seconds=60)
|
|
349
350
|
|
|
350
351
|
@needs_cwl
|
|
351
352
|
@needs_docker
|
|
352
353
|
def testGetStatusSuccessfulCWLWF(self):
|
|
353
354
|
"""Test that ToilStatus.getStatus() behaves as expected with a successful CWL workflow."""
|
|
354
355
|
cmd = ['toil-cwl-runner', '--jobStore', self.toilDir, '--clean=never',
|
|
355
|
-
'src/toil/test/cwl/sorttool.cwl', '--reverse', '--input', 'src/toil/test/cwl/whale.txt']
|
|
356
|
+
'src/toil/test/cwl/sorttool.cwl', '--reverse', '--input', 'src/toil/test/cwl/whale.txt', f'--outdir={self.tempDir}']
|
|
356
357
|
wf = subprocess.Popen(cmd)
|
|
357
|
-
self.check_status('RUNNING', status_fn=ToilStatus.getStatus, seconds=
|
|
358
|
+
self.check_status('RUNNING', status_fn=ToilStatus.getStatus, seconds=60)
|
|
358
359
|
wf.wait()
|
|
359
|
-
self.check_status('COMPLETED', status_fn=ToilStatus.getStatus)
|
|
360
|
+
self.check_status('COMPLETED', status_fn=ToilStatus.getStatus, seconds=60)
|
|
360
361
|
|
|
361
362
|
@needs_cwl
|
|
362
363
|
@patch('builtins.print')
|
|
@@ -375,23 +376,24 @@ class UtilsTest(ToilTest):
|
|
|
375
376
|
args, kwargs = mock_print.call_args
|
|
376
377
|
self.assertIn('invalidcommand', args[0])
|
|
377
378
|
|
|
379
|
+
@pytest.mark.timeout(1200)
|
|
378
380
|
def testRestartAttribute(self):
|
|
379
381
|
"""
|
|
380
|
-
Test that the job store is only destroyed when we observe a
|
|
382
|
+
Test that the job store is only destroyed when we observe a successful workflow run.
|
|
381
383
|
The following simulates a failing workflow that attempts to resume without restart().
|
|
382
384
|
In this case, the job store should not be destroyed until restart() is called.
|
|
383
385
|
"""
|
|
384
386
|
# Run a workflow that will always fail
|
|
385
|
-
cmd = self.restart_sort_workflow_cmd + ['--badWorker=1']
|
|
387
|
+
cmd = self.restart_sort_workflow_cmd + ['--badWorker=1', '--logDebug']
|
|
386
388
|
subprocess.run(cmd)
|
|
387
389
|
|
|
388
|
-
restart_cmd = self.restart_sort_workflow_cmd + ['--badWorker=0', '--restart']
|
|
390
|
+
restart_cmd = self.restart_sort_workflow_cmd + ['--badWorker=0', '--logDebug', '--restart']
|
|
389
391
|
subprocess.run(restart_cmd)
|
|
390
392
|
|
|
391
393
|
# Check the job store exists after restart attempt
|
|
392
394
|
self.assertTrue(os.path.exists(self.toilDir))
|
|
393
395
|
|
|
394
|
-
successful_cmd = [python, '-m', 'toil.test.sort.sort', 'file:' + self.toilDir,
|
|
396
|
+
successful_cmd = [python, '-m', 'toil.test.sort.sort', '--logDebug', 'file:' + self.toilDir,
|
|
395
397
|
'--restart']
|
|
396
398
|
subprocess.run(successful_cmd)
|
|
397
399
|
|
toil/test/wdl/wdltoil_test.py
CHANGED
|
@@ -16,12 +16,12 @@ from toil.test import (ToilTest,
|
|
|
16
16
|
needs_docker_cuda,
|
|
17
17
|
needs_google_storage,
|
|
18
18
|
needs_singularity_or_docker,
|
|
19
|
+
needs_wdl,
|
|
19
20
|
slow, integrative)
|
|
20
|
-
from toil.test.provisioners.clusterTest import AbstractClusterTest
|
|
21
21
|
from toil.version import exactPython
|
|
22
22
|
from toil.wdl.wdltoil import WDLSectionJob, WDLWorkflowGraph
|
|
23
23
|
|
|
24
|
-
|
|
24
|
+
@needs_wdl
|
|
25
25
|
class BaseWDLTest(ToilTest):
|
|
26
26
|
"""Base test class for WDL tests."""
|
|
27
27
|
|
|
@@ -45,7 +45,7 @@ class WDLConformanceTests(BaseWDLTest):
|
|
|
45
45
|
def setUpClass(cls) -> None:
|
|
46
46
|
|
|
47
47
|
url = "https://github.com/DataBiosphere/wdl-conformance-tests.git"
|
|
48
|
-
commit = "
|
|
48
|
+
commit = "c87b62b4f460e009fd42edec13669c4db14cf90c"
|
|
49
49
|
|
|
50
50
|
p = subprocess.Popen(
|
|
51
51
|
f"git clone {url} {cls.wdl_dir} && cd {cls.wdl_dir} && git checkout {commit}",
|
|
@@ -64,7 +64,7 @@ class WDLConformanceTests(BaseWDLTest):
|
|
|
64
64
|
# estimated running time: 2 minutes
|
|
65
65
|
@slow
|
|
66
66
|
def test_conformance_tests_v10(self):
|
|
67
|
-
tests_to_run = "0
|
|
67
|
+
tests_to_run = "0-15,17-20,22-71,73-77"
|
|
68
68
|
p = subprocess.run(self.base_command + ["-v", "1.0", "-n", tests_to_run], capture_output=True)
|
|
69
69
|
|
|
70
70
|
if p.returncode != 0:
|
|
@@ -75,7 +75,7 @@ class WDLConformanceTests(BaseWDLTest):
|
|
|
75
75
|
# estimated running time: 2 minutes
|
|
76
76
|
@slow
|
|
77
77
|
def test_conformance_tests_v11(self):
|
|
78
|
-
tests_to_run = "
|
|
78
|
+
tests_to_run = "1-63,65-71,73-75,77"
|
|
79
79
|
p = subprocess.run(self.base_command + ["-v", "1.1", "-n", tests_to_run], capture_output=True)
|
|
80
80
|
|
|
81
81
|
if p.returncode != 0:
|
|
@@ -83,6 +83,16 @@ class WDLConformanceTests(BaseWDLTest):
|
|
|
83
83
|
|
|
84
84
|
p.check_returncode()
|
|
85
85
|
|
|
86
|
+
@slow
|
|
87
|
+
def test_conformance_tests_integration(self):
|
|
88
|
+
ids_to_run = "encode,tut01,tut02,tut03,tut04"
|
|
89
|
+
p = subprocess.run(self.base_command + ["-v", "1.0", "--id", ids_to_run], capture_output=True)
|
|
90
|
+
|
|
91
|
+
if p.returncode != 0:
|
|
92
|
+
print(p.stdout.decode('utf-8', errors='replace'))
|
|
93
|
+
|
|
94
|
+
p.check_returncode()
|
|
95
|
+
|
|
86
96
|
@classmethod
|
|
87
97
|
def tearDownClass(cls) -> None:
|
|
88
98
|
upper_dir = os.path.dirname(os.getcwd())
|
|
@@ -116,6 +126,14 @@ class WDLTests(BaseWDLTest):
|
|
|
116
126
|
assert os.path.exists(result['ga4ghMd5.value'])
|
|
117
127
|
assert os.path.basename(result['ga4ghMd5.value']) == 'md5sum.txt'
|
|
118
128
|
|
|
129
|
+
def test_missing_output_directory(self):
|
|
130
|
+
"""
|
|
131
|
+
Test if Toil can run a WDL workflow into a new directory.
|
|
132
|
+
"""
|
|
133
|
+
wdl = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.1.0.wdl')
|
|
134
|
+
json_file = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.json')
|
|
135
|
+
subprocess.check_call(self.base_command + [wdl, json_file, '-o', os.path.join(self.output_dir, "does", "not", "exist"), '--logDebug', '--retryCount=0'])
|
|
136
|
+
|
|
119
137
|
@needs_singularity_or_docker
|
|
120
138
|
def test_miniwdl_self_test(self, extra_args: Optional[List[str]] = None) -> None:
|
|
121
139
|
"""Test if the MiniWDL self test runs and produces the expected output."""
|
|
@@ -141,7 +159,7 @@ class WDLTests(BaseWDLTest):
|
|
|
141
159
|
assert isinstance(outputs['hello_caller.message_files'], list)
|
|
142
160
|
assert len(outputs['hello_caller.message_files']) == 2
|
|
143
161
|
for item in outputs['hello_caller.message_files']:
|
|
144
|
-
# All the files should be strings in the "out"
|
|
162
|
+
# All the files should be strings in the "out" directory
|
|
145
163
|
assert isinstance(item, str)
|
|
146
164
|
assert item.startswith(out_dir)
|
|
147
165
|
|
|
@@ -349,70 +367,5 @@ class WDLTests(BaseWDLTest):
|
|
|
349
367
|
assert "successor" in result[1]
|
|
350
368
|
|
|
351
369
|
|
|
352
|
-
@integrative
|
|
353
|
-
@slow
|
|
354
|
-
@pytest.mark.timeout(600)
|
|
355
|
-
class WDLKubernetesClusterTest(AbstractClusterTest):
|
|
356
|
-
"""
|
|
357
|
-
Ensure WDL works on the Kubernetes batchsystem.
|
|
358
|
-
"""
|
|
359
|
-
|
|
360
|
-
def __init__(self, name):
|
|
361
|
-
super().__init__(name)
|
|
362
|
-
self.clusterName = 'wdl-integration-test-' + str(uuid4())
|
|
363
|
-
# t2.medium is the minimum t2 instance that permits Kubernetes
|
|
364
|
-
self.leaderNodeType = "t2.medium"
|
|
365
|
-
self.instanceTypes = ["t2.medium"]
|
|
366
|
-
self.clusterType = "kubernetes"
|
|
367
|
-
|
|
368
|
-
def setUp(self) -> None:
|
|
369
|
-
super().setUp()
|
|
370
|
-
self.jobStore = f'aws:{self.awsRegion()}:wdl-test-{uuid4()}'
|
|
371
|
-
|
|
372
|
-
def launchCluster(self) -> None:
|
|
373
|
-
self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage),
|
|
374
|
-
'--nodeTypes', ",".join(self.instanceTypes),
|
|
375
|
-
'-w', ",".join(self.numWorkers),
|
|
376
|
-
'--nodeStorage', str(self.requestedLeaderStorage)])
|
|
377
|
-
|
|
378
|
-
def test_wdl_kubernetes_cluster(self):
|
|
379
|
-
"""
|
|
380
|
-
Test that a wdl workflow works on a kubernetes cluster. Launches a cluster with 1 worker. This runs a wdl
|
|
381
|
-
workflow that performs an image pull on the worker.
|
|
382
|
-
:return:
|
|
383
|
-
"""
|
|
384
|
-
self.numWorkers = "1"
|
|
385
|
-
self.requestedLeaderStorage = 30
|
|
386
|
-
# create the cluster
|
|
387
|
-
self.launchCluster()
|
|
388
|
-
# get leader
|
|
389
|
-
self.cluster = cluster_factory(
|
|
390
|
-
provisioner="aws", zone=self.zone, clusterName=self.clusterName
|
|
391
|
-
)
|
|
392
|
-
self.leader = self.cluster.getLeader()
|
|
393
|
-
|
|
394
|
-
url = "https://github.com/DataBiosphere/wdl-conformance-tests.git"
|
|
395
|
-
commit = "09b9659cd01473e836738a2e0dd205df0adb49c5"
|
|
396
|
-
wdl_dir = "wdl_conformance_tests"
|
|
397
|
-
|
|
398
|
-
# get the wdl-conformance-tests repo to get WDL tasks to run
|
|
399
|
-
self.sshUtil([
|
|
400
|
-
"bash",
|
|
401
|
-
"-c",
|
|
402
|
-
f"git clone {url} {wdl_dir} && cd {wdl_dir} && git checkout {commit}"
|
|
403
|
-
])
|
|
404
|
-
|
|
405
|
-
# run on kubernetes batchsystem
|
|
406
|
-
toil_options = ['--batchSystem=kubernetes',
|
|
407
|
-
f"--jobstore={self.jobStore}"]
|
|
408
|
-
|
|
409
|
-
# run WDL workflow that will run singularity
|
|
410
|
-
test_options = [f"tests/md5sum/md5sum.wdl", f"tests/md5sum/md5sum.json"]
|
|
411
|
-
self.sshUtil([
|
|
412
|
-
"bash",
|
|
413
|
-
"-c",
|
|
414
|
-
f"cd {wdl_dir} && toil-wdl-runner {' '.join(test_options)} {' '.join(toil_options)}"])
|
|
415
|
-
|
|
416
|
-
|
|
417
370
|
if __name__ == "__main__":
|
|
418
371
|
unittest.main() # run all tests
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
|
|
3
|
+
from toil.test.provisioners.clusterTest import AbstractClusterTest
|
|
4
|
+
from uuid import uuid4
|
|
5
|
+
|
|
6
|
+
import pytest
|
|
7
|
+
|
|
8
|
+
from toil.provisioners import cluster_factory
|
|
9
|
+
from toil.test import (slow, integrative)
|
|
10
|
+
|
|
11
|
+
@integrative
|
|
12
|
+
@slow
|
|
13
|
+
@pytest.mark.timeout(600)
|
|
14
|
+
class WDLKubernetesClusterTest(AbstractClusterTest):
|
|
15
|
+
"""
|
|
16
|
+
Ensure WDL works on the Kubernetes batchsystem.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
def __init__(self, name):
|
|
20
|
+
super().__init__(name)
|
|
21
|
+
self.clusterName = 'wdl-integration-test-' + str(uuid4())
|
|
22
|
+
# t2.medium is the minimum t2 instance that permits Kubernetes
|
|
23
|
+
self.leaderNodeType = "t2.medium"
|
|
24
|
+
self.instanceTypes = ["t2.medium"]
|
|
25
|
+
self.clusterType = "kubernetes"
|
|
26
|
+
|
|
27
|
+
def setUp(self) -> None:
|
|
28
|
+
super().setUp()
|
|
29
|
+
self.jobStore = f'aws:{self.awsRegion()}:wdl-test-{uuid4()}'
|
|
30
|
+
|
|
31
|
+
def launchCluster(self) -> None:
|
|
32
|
+
self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage),
|
|
33
|
+
'--nodeTypes', ",".join(self.instanceTypes),
|
|
34
|
+
'-w', ",".join(self.numWorkers),
|
|
35
|
+
'--nodeStorage', str(self.requestedLeaderStorage)])
|
|
36
|
+
|
|
37
|
+
def test_wdl_kubernetes_cluster(self):
|
|
38
|
+
"""
|
|
39
|
+
Test that a wdl workflow works on a kubernetes cluster. Launches a cluster with 1 worker. This runs a wdl
|
|
40
|
+
workflow that performs an image pull on the worker.
|
|
41
|
+
:return:
|
|
42
|
+
"""
|
|
43
|
+
self.numWorkers = "1"
|
|
44
|
+
self.requestedLeaderStorage = 30
|
|
45
|
+
# create the cluster
|
|
46
|
+
self.launchCluster()
|
|
47
|
+
# get leader
|
|
48
|
+
self.cluster = cluster_factory(
|
|
49
|
+
provisioner="aws", zone=self.zone, clusterName=self.clusterName
|
|
50
|
+
)
|
|
51
|
+
self.leader = self.cluster.getLeader()
|
|
52
|
+
|
|
53
|
+
url = "https://github.com/DataBiosphere/wdl-conformance-tests.git"
|
|
54
|
+
commit = "09b9659cd01473e836738a2e0dd205df0adb49c5"
|
|
55
|
+
wdl_dir = "wdl_conformance_tests"
|
|
56
|
+
|
|
57
|
+
# get the wdl-conformance-tests repo to get WDL tasks to run
|
|
58
|
+
self.sshUtil([
|
|
59
|
+
"bash",
|
|
60
|
+
"-c",
|
|
61
|
+
f"git clone {url} {wdl_dir} && cd {wdl_dir} && git checkout {commit}"
|
|
62
|
+
])
|
|
63
|
+
|
|
64
|
+
# run on kubernetes batchsystem
|
|
65
|
+
toil_options = ['--batchSystem=kubernetes',
|
|
66
|
+
f"--jobstore={self.jobStore}"]
|
|
67
|
+
|
|
68
|
+
# run WDL workflow that will run singularity
|
|
69
|
+
test_options = [f"tests/md5sum/md5sum.wdl", f"tests/md5sum/md5sum.json"]
|
|
70
|
+
self.sshUtil([
|
|
71
|
+
"bash",
|
|
72
|
+
"-c",
|
|
73
|
+
f"cd {wdl_dir} && toil-wdl-runner {' '.join(test_options)} {' '.join(toil_options)}"])
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
if __name__ == "__main__":
|
|
77
|
+
unittest.main() # run all tests
|
toil/toilState.py
CHANGED
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import logging
|
|
15
|
+
import time
|
|
15
16
|
from typing import Dict, Optional, Set
|
|
16
17
|
|
|
17
18
|
from toil.bus import JobUpdatedMessage, MessageBus
|
|
@@ -183,12 +184,70 @@ class ToilState:
|
|
|
183
184
|
if job_id in self.__job_database:
|
|
184
185
|
# Update the one true copy in place
|
|
185
186
|
old_truth = self.__job_database[job_id]
|
|
186
|
-
old_truth.
|
|
187
|
+
old_truth.assert_is_not_newer_than(new_truth)
|
|
187
188
|
old_truth.__dict__.update(new_truth.__dict__)
|
|
188
189
|
else:
|
|
189
190
|
# Just keep the new one
|
|
190
191
|
self.__job_database[job_id] = new_truth
|
|
191
192
|
|
|
193
|
+
def reset_job_expecting_change(self, job_id: str, timeout: float) -> bool:
|
|
194
|
+
"""
|
|
195
|
+
Discard any local modifications to a JobDescription.
|
|
196
|
+
|
|
197
|
+
Will make modifications from other hosts visible.
|
|
198
|
+
|
|
199
|
+
Will wait for up to timeout seconds for a modification (or deletion)
|
|
200
|
+
from another host to actually be visible.
|
|
201
|
+
|
|
202
|
+
Always replaces the JobDescription with what is stored in the job
|
|
203
|
+
store, even if no modification ends up being visible.
|
|
204
|
+
|
|
205
|
+
Returns True if an update was detected in time, and False otherwise.
|
|
206
|
+
"""
|
|
207
|
+
|
|
208
|
+
start_time = time.time()
|
|
209
|
+
wait_time = 0.1
|
|
210
|
+
initially_known = job_id in self.__job_database
|
|
211
|
+
new_truth: Optional[JobDescription] = None
|
|
212
|
+
while True:
|
|
213
|
+
try:
|
|
214
|
+
new_truth = self.__job_store.load_job(job_id)
|
|
215
|
+
except NoSuchJobException:
|
|
216
|
+
# The job is gone now.
|
|
217
|
+
if job_id in self.__job_database:
|
|
218
|
+
# So forget about it
|
|
219
|
+
del self.__job_database[job_id]
|
|
220
|
+
# TODO: Other collections may still reference it.
|
|
221
|
+
if initially_known:
|
|
222
|
+
# Job was deleted, that's an update
|
|
223
|
+
return True
|
|
224
|
+
else:
|
|
225
|
+
if job_id in self.__job_database:
|
|
226
|
+
# We have an old version to compare against
|
|
227
|
+
old_truth = self.__job_database[job_id]
|
|
228
|
+
old_truth.assert_is_not_newer_than(new_truth)
|
|
229
|
+
if old_truth.is_updated_by(new_truth):
|
|
230
|
+
# Do the update
|
|
231
|
+
old_truth.__dict__.update(new_truth.__dict__)
|
|
232
|
+
return True
|
|
233
|
+
else:
|
|
234
|
+
# Just keep the new one. That's an update.
|
|
235
|
+
self.__job_database[job_id] = new_truth
|
|
236
|
+
return True
|
|
237
|
+
# We looked but didn't get a good update
|
|
238
|
+
time_elapsed = time.time() - start_time
|
|
239
|
+
if time_elapsed >= timeout:
|
|
240
|
+
# We're out of time to check.
|
|
241
|
+
if new_truth is not None:
|
|
242
|
+
# Commit whatever we managed to load to accomplish a real
|
|
243
|
+
# reset.
|
|
244
|
+
old_truth.__dict__.update(new_truth.__dict__)
|
|
245
|
+
return False
|
|
246
|
+
# Wait a little and poll again
|
|
247
|
+
time.sleep(min(timeout - time_elapsed, wait_time))
|
|
248
|
+
# Using exponential backoff
|
|
249
|
+
wait_time *= 2
|
|
250
|
+
|
|
192
251
|
# The next 3 functions provide tracking of how many successor jobs a given job
|
|
193
252
|
# is waiting on, exposing only legit operations.
|
|
194
253
|
# TODO: turn these into messages?
|
|
@@ -247,10 +306,10 @@ class ToilState:
|
|
|
247
306
|
|
|
248
307
|
:param jobDesc: The description for the root job of the workflow being run.
|
|
249
308
|
"""
|
|
250
|
-
# If the job description has a
|
|
309
|
+
# If the job description has a body, is a checkpoint, has services
|
|
251
310
|
# or is ready to be deleted it is ready to be processed (i.e. it is updated)
|
|
252
311
|
if (
|
|
253
|
-
jobDesc.
|
|
312
|
+
jobDesc.has_body()
|
|
254
313
|
or (
|
|
255
314
|
isinstance(jobDesc, CheckpointJobDescription)
|
|
256
315
|
and jobDesc.checkpoint is not None
|
|
@@ -259,10 +318,10 @@ class ToilState:
|
|
|
259
318
|
or jobDesc.nextSuccessors() is None
|
|
260
319
|
):
|
|
261
320
|
logger.debug(
|
|
262
|
-
"Found job to run: %s, with
|
|
321
|
+
"Found job to run: %s, with body: %s, with checkpoint: %s, with "
|
|
263
322
|
"services: %s, with no next successors: %s",
|
|
264
323
|
jobDesc.jobStoreID,
|
|
265
|
-
jobDesc.
|
|
324
|
+
jobDesc.has_body(),
|
|
266
325
|
isinstance(jobDesc, CheckpointJobDescription)
|
|
267
326
|
and jobDesc.checkpoint is not None,
|
|
268
327
|
len(jobDesc.services) > 0,
|
|
@@ -272,18 +331,18 @@ class ToilState:
|
|
|
272
331
|
self.bus.publish(JobUpdatedMessage(str(jobDesc.jobStoreID), 0))
|
|
273
332
|
|
|
274
333
|
if isinstance(jobDesc, CheckpointJobDescription) and jobDesc.checkpoint is not None:
|
|
275
|
-
jobDesc.
|
|
334
|
+
jobDesc.restore_checkpoint()
|
|
276
335
|
|
|
277
336
|
else: # There exist successors
|
|
278
337
|
logger.debug(
|
|
279
338
|
"Adding job: %s to the state with %s successors",
|
|
280
339
|
jobDesc.jobStoreID,
|
|
281
|
-
len(jobDesc.nextSuccessors()),
|
|
340
|
+
len(jobDesc.nextSuccessors() or set()),
|
|
282
341
|
)
|
|
283
342
|
|
|
284
343
|
# Record the number of successors
|
|
285
344
|
self.successorCounts[str(jobDesc.jobStoreID)] = len(
|
|
286
|
-
jobDesc.nextSuccessors()
|
|
345
|
+
jobDesc.nextSuccessors() or set()
|
|
287
346
|
)
|
|
288
347
|
|
|
289
348
|
def processSuccessorWithMultiplePredecessors(successor: JobDescription) -> None:
|
|
@@ -305,7 +364,7 @@ class ToilState:
|
|
|
305
364
|
self._buildToilState(successor)
|
|
306
365
|
|
|
307
366
|
# For each successor
|
|
308
|
-
for successorJobStoreID in jobDesc.nextSuccessors():
|
|
367
|
+
for successorJobStoreID in jobDesc.nextSuccessors() or set():
|
|
309
368
|
|
|
310
369
|
# If the successor does not yet point back at a
|
|
311
370
|
# predecessor we have not yet considered it
|