toil 5.12.0__py3-none-any.whl → 6.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. toil/__init__.py +18 -13
  2. toil/batchSystems/abstractBatchSystem.py +21 -10
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +2 -2
  4. toil/batchSystems/awsBatch.py +14 -14
  5. toil/batchSystems/contained_executor.py +3 -3
  6. toil/batchSystems/htcondor.py +0 -1
  7. toil/batchSystems/kubernetes.py +34 -31
  8. toil/batchSystems/local_support.py +3 -1
  9. toil/batchSystems/mesos/batchSystem.py +7 -7
  10. toil/batchSystems/options.py +32 -83
  11. toil/batchSystems/registry.py +104 -23
  12. toil/batchSystems/singleMachine.py +16 -13
  13. toil/batchSystems/slurm.py +3 -3
  14. toil/batchSystems/torque.py +0 -1
  15. toil/bus.py +6 -8
  16. toil/common.py +532 -743
  17. toil/cwl/__init__.py +28 -32
  18. toil/cwl/cwltoil.py +523 -520
  19. toil/cwl/utils.py +55 -10
  20. toil/fileStores/__init__.py +2 -2
  21. toil/fileStores/abstractFileStore.py +36 -11
  22. toil/fileStores/cachingFileStore.py +607 -530
  23. toil/fileStores/nonCachingFileStore.py +43 -10
  24. toil/job.py +140 -75
  25. toil/jobStores/abstractJobStore.py +147 -79
  26. toil/jobStores/aws/jobStore.py +23 -9
  27. toil/jobStores/aws/utils.py +1 -2
  28. toil/jobStores/fileJobStore.py +117 -19
  29. toil/jobStores/googleJobStore.py +16 -7
  30. toil/jobStores/utils.py +5 -6
  31. toil/leader.py +71 -43
  32. toil/lib/accelerators.py +10 -5
  33. toil/lib/aws/__init__.py +3 -14
  34. toil/lib/aws/ami.py +22 -9
  35. toil/lib/aws/iam.py +21 -13
  36. toil/lib/aws/session.py +2 -16
  37. toil/lib/aws/utils.py +4 -5
  38. toil/lib/compatibility.py +1 -1
  39. toil/lib/conversions.py +7 -3
  40. toil/lib/docker.py +22 -23
  41. toil/lib/ec2.py +10 -6
  42. toil/lib/ec2nodes.py +106 -100
  43. toil/lib/encryption/_nacl.py +2 -1
  44. toil/lib/generatedEC2Lists.py +325 -18
  45. toil/lib/io.py +21 -0
  46. toil/lib/misc.py +1 -1
  47. toil/lib/resources.py +1 -1
  48. toil/lib/threading.py +74 -26
  49. toil/options/common.py +738 -0
  50. toil/options/cwl.py +336 -0
  51. toil/options/wdl.py +32 -0
  52. toil/provisioners/abstractProvisioner.py +1 -4
  53. toil/provisioners/aws/__init__.py +3 -6
  54. toil/provisioners/aws/awsProvisioner.py +6 -0
  55. toil/provisioners/clusterScaler.py +3 -2
  56. toil/provisioners/gceProvisioner.py +2 -2
  57. toil/realtimeLogger.py +2 -1
  58. toil/resource.py +24 -18
  59. toil/server/app.py +2 -3
  60. toil/server/cli/wes_cwl_runner.py +4 -4
  61. toil/server/utils.py +1 -1
  62. toil/server/wes/abstract_backend.py +3 -2
  63. toil/server/wes/amazon_wes_utils.py +5 -4
  64. toil/server/wes/tasks.py +2 -3
  65. toil/server/wes/toil_backend.py +2 -10
  66. toil/server/wsgi_app.py +2 -0
  67. toil/serviceManager.py +12 -10
  68. toil/statsAndLogging.py +5 -1
  69. toil/test/__init__.py +29 -54
  70. toil/test/batchSystems/batchSystemTest.py +11 -111
  71. toil/test/batchSystems/test_slurm.py +3 -2
  72. toil/test/cwl/cwlTest.py +213 -90
  73. toil/test/cwl/glob_dir.cwl +15 -0
  74. toil/test/cwl/preemptible.cwl +21 -0
  75. toil/test/cwl/preemptible_expression.cwl +28 -0
  76. toil/test/cwl/revsort.cwl +1 -1
  77. toil/test/cwl/revsort2.cwl +1 -1
  78. toil/test/docs/scriptsTest.py +0 -1
  79. toil/test/jobStores/jobStoreTest.py +27 -16
  80. toil/test/lib/aws/test_iam.py +4 -14
  81. toil/test/lib/aws/test_utils.py +0 -3
  82. toil/test/lib/dockerTest.py +4 -4
  83. toil/test/lib/test_ec2.py +11 -16
  84. toil/test/mesos/helloWorld.py +4 -5
  85. toil/test/mesos/stress.py +1 -1
  86. toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
  87. toil/test/provisioners/clusterScalerTest.py +6 -4
  88. toil/test/provisioners/clusterTest.py +14 -3
  89. toil/test/provisioners/gceProvisionerTest.py +0 -6
  90. toil/test/provisioners/restartScript.py +3 -2
  91. toil/test/server/serverTest.py +1 -1
  92. toil/test/sort/restart_sort.py +2 -1
  93. toil/test/sort/sort.py +2 -1
  94. toil/test/sort/sortTest.py +2 -13
  95. toil/test/src/autoDeploymentTest.py +45 -45
  96. toil/test/src/busTest.py +5 -5
  97. toil/test/src/checkpointTest.py +2 -2
  98. toil/test/src/deferredFunctionTest.py +1 -1
  99. toil/test/src/fileStoreTest.py +32 -16
  100. toil/test/src/helloWorldTest.py +1 -1
  101. toil/test/src/importExportFileTest.py +1 -1
  102. toil/test/src/jobDescriptionTest.py +2 -1
  103. toil/test/src/jobServiceTest.py +1 -1
  104. toil/test/src/jobTest.py +18 -18
  105. toil/test/src/miscTests.py +5 -3
  106. toil/test/src/promisedRequirementTest.py +3 -3
  107. toil/test/src/realtimeLoggerTest.py +1 -1
  108. toil/test/src/resourceTest.py +2 -2
  109. toil/test/src/restartDAGTest.py +1 -1
  110. toil/test/src/resumabilityTest.py +36 -2
  111. toil/test/src/retainTempDirTest.py +1 -1
  112. toil/test/src/systemTest.py +2 -2
  113. toil/test/src/toilContextManagerTest.py +2 -2
  114. toil/test/src/userDefinedJobArgTypeTest.py +1 -1
  115. toil/test/utils/toilDebugTest.py +98 -32
  116. toil/test/utils/toilKillTest.py +2 -2
  117. toil/test/utils/utilsTest.py +20 -0
  118. toil/test/wdl/wdltoil_test.py +148 -45
  119. toil/toilState.py +7 -6
  120. toil/utils/toilClean.py +1 -1
  121. toil/utils/toilConfig.py +36 -0
  122. toil/utils/toilDebugFile.py +60 -33
  123. toil/utils/toilDebugJob.py +39 -12
  124. toil/utils/toilDestroyCluster.py +1 -1
  125. toil/utils/toilKill.py +1 -1
  126. toil/utils/toilLaunchCluster.py +13 -2
  127. toil/utils/toilMain.py +3 -2
  128. toil/utils/toilRsyncCluster.py +1 -1
  129. toil/utils/toilSshCluster.py +1 -1
  130. toil/utils/toilStats.py +240 -143
  131. toil/utils/toilStatus.py +1 -4
  132. toil/version.py +11 -11
  133. toil/wdl/utils.py +2 -122
  134. toil/wdl/wdltoil.py +999 -386
  135. toil/worker.py +25 -31
  136. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/METADATA +60 -53
  137. toil-6.1.0a1.dist-info/RECORD +237 -0
  138. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/WHEEL +1 -1
  139. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/entry_points.txt +0 -1
  140. toil/batchSystems/parasol.py +0 -379
  141. toil/batchSystems/tes.py +0 -459
  142. toil/test/batchSystems/parasolTestSupport.py +0 -117
  143. toil/test/wdl/builtinTest.py +0 -506
  144. toil/test/wdl/conftest.py +0 -23
  145. toil/test/wdl/toilwdlTest.py +0 -522
  146. toil/wdl/toilwdl.py +0 -141
  147. toil/wdl/versions/dev.py +0 -107
  148. toil/wdl/versions/draft2.py +0 -980
  149. toil/wdl/versions/v1.py +0 -794
  150. toil/wdl/wdl_analysis.py +0 -116
  151. toil/wdl/wdl_functions.py +0 -997
  152. toil/wdl/wdl_synthesis.py +0 -1011
  153. toil/wdl/wdl_types.py +0 -243
  154. toil-5.12.0.dist-info/RECORD +0 -244
  155. /toil/{wdl/versions → options}/__init__.py +0 -0
  156. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/LICENSE +0 -0
  157. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/top_level.txt +0 -0
@@ -1,522 +0,0 @@
1
- import os
2
- import shutil
3
- import subprocess
4
- import tempfile
5
- from typing import List
6
- import unittest
7
- import uuid
8
- import zipfile
9
- from urllib.request import urlretrieve
10
-
11
- from toil.test import ToilTest, needs_docker, needs_java, slow
12
- from toil.version import exactPython
13
- from toil.wdl.utils import get_analyzer
14
- from toil.wdl.wdl_functions import (basename,
15
- glob,
16
- parse_cores,
17
- parse_disk,
18
- parse_memory,
19
- process_infile,
20
- read_csv,
21
- read_tsv,
22
- select_first,
23
- size)
24
-
25
-
26
- class BaseToilWdlTest(ToilTest):
27
- """Base test class for WDL tests"""
28
-
29
- def setUp(self) -> None:
30
- """Runs anew before each test to create farm fresh temp dirs."""
31
- self.output_dir = os.path.join('/tmp/', 'toil-wdl-test-' + str(uuid.uuid4()))
32
- os.makedirs(self.output_dir)
33
-
34
- def tearDown(self) -> None:
35
- if os.path.exists(self.output_dir):
36
- shutil.rmtree(self.output_dir)
37
-
38
- @classmethod
39
- def setUpClass(cls) -> None:
40
- """Runs once for all tests."""
41
- super(BaseToilWdlTest, cls).setUpClass()
42
- cls.base_command = [exactPython, os.path.abspath("src/toil/wdl/toilwdl.py")]
43
-
44
- class ToilWdlTest(BaseToilWdlTest):
45
- """
46
- General tests for Toil WDL
47
- """
48
-
49
- @needs_docker
50
- def testMD5sum(self):
51
- """Test if toilwdl produces the same outputs as known good outputs for WDL's
52
- GATK tutorial #1."""
53
- wdl = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.wdl')
54
- inputfile = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.input')
55
- json = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.json')
56
-
57
- subprocess.check_call(self.base_command + [wdl, json, '-o', self.output_dir, '--logDebug'])
58
- md5sum_output = os.path.join(self.output_dir, 'md5sum.txt')
59
- assert os.path.exists(md5sum_output)
60
- os.unlink(md5sum_output)
61
-
62
- class ToilWDLLibraryTest(BaseToilWdlTest):
63
- """
64
- Test class for WDL standard functions.
65
- """
66
-
67
- # estimated run time <1 sec
68
- def testFn_SelectFirst(self):
69
- """Test the wdl built-in functional equivalent of 'select_first()',
70
- which returns the first value in a list that is not None."""
71
- assert select_first(['somestring', 'anotherstring', None, '', 1]) == 'somestring'
72
- assert select_first([None, '', 1, 'somestring']) == 1
73
- assert select_first([2, 1, '', 'somestring', None, '']) == 2
74
- assert select_first(['', 2, 1, 'somestring', None, '']) == 2
75
-
76
- # estimated run time <1 sec
77
- def testFn_Size(self) -> None:
78
- """Test the wdl built-in functional equivalent of 'size()',
79
- which returns a file's size based on the path."""
80
- from toil.common import Toil
81
- from toil.job import Job
82
- from toil.wdl.wdl_types import WDLFile
83
- options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
84
- options.clean = 'always'
85
- with Toil(options) as toil:
86
- small = process_infile(WDLFile(file_path=os.path.abspath('src/toil/test/wdl/testfiles/vocab.wdl')), toil)
87
- small_file = size(small)
88
- assert small_file >= 1800, small_file
89
-
90
- # estimated run time <1 sec
91
- def testFn_Basename(self):
92
- assert basename('/home/quokka/git/delete/toil/src/toil/wdl/toilwdl.py', '.py') == 'toilwdl'
93
- assert basename('/home/quokka/git/delete/toil/src/toil/wdl/toilwdl.py') == 'toilwdl.py'
94
- assert basename('toilwdl.py', '.py') == 'toilwdl'
95
- assert basename('toilwdl.py') == 'toilwdl.py'
96
-
97
- # estimated run time <1 sec
98
- def testFn_Glob(self):
99
- """Test the wdl built-in functional equivalent of 'glob()',
100
- which finds all files with a pattern in a directory."""
101
- vocab_location = glob('vocab.wdl', os.path.abspath('src/toil'))
102
- assert vocab_location == [os.path.abspath('src/toil/test/wdl/testfiles/vocab.wdl')], str(vocab_location)
103
- wdl_locations = glob('wdl_*.py', os.path.abspath('src/toil'))
104
- wdl_that_should_exist = [os.path.abspath('src/toil/wdl/wdl_analysis.py'),
105
- os.path.abspath('src/toil/wdl/wdl_synthesis.py'),
106
- os.path.abspath('src/toil/wdl/wdl_types.py'),
107
- os.path.abspath('src/toil/wdl/wdl_functions.py')]
108
- # make sure the files match the expected files
109
- for location in wdl_that_should_exist:
110
- assert location in wdl_locations, f'{str(location)} not in {str(wdl_locations)}!'
111
- # make sure the same number of files were found as expected
112
- assert len(wdl_that_should_exist) == len(wdl_locations), f'{str(len(wdl_locations))} != {str(len(wdl_that_should_exist))}'
113
-
114
- # estimated run time <1 sec
115
- def testFn_ParseMemory(self):
116
- """Test the wdl built-in functional equivalent of 'parse_memory()',
117
- which parses a specified memory input to an int output.
118
-
119
- The input can be a string or an int or a float and may include units
120
- such as 'Gb' or 'mib' as a separate argument."""
121
- assert parse_memory(2147483648) == 2147483648, str(parse_memory(2147483648))
122
- assert parse_memory('2147483648') == 2147483648, str(parse_memory(2147483648))
123
- assert parse_memory('2GB') == 2000000000, str(parse_memory('2GB'))
124
- assert parse_memory('2GiB') == 2147483648, str(parse_memory('2GiB'))
125
- assert parse_memory('1 GB') == 1000000000, str(parse_memory('1 GB'))
126
- assert parse_memory('1 GiB') == 1073741824, str(parse_memory('1 GiB'))
127
-
128
- # estimated run time <1 sec
129
- def testFn_ParseCores(self):
130
- """Test the wdl built-in functional equivalent of 'parse_cores()',
131
- which parses a specified disk input to an int output.
132
-
133
- The input can be a string or an int."""
134
- assert parse_cores(1) == 1
135
- assert parse_cores('1') == 1
136
-
137
- # estimated run time <1 sec
138
- def testFn_ParseDisk(self):
139
- """Test the wdl built-in functional equivalent of 'parse_disk()',
140
- which parses a specified disk input to an int output.
141
-
142
- The input can be a string or an int or a float and may include units
143
- such as 'Gb' or 'mib' as a separate argument.
144
-
145
- The minimum returned value is 2147483648 bytes."""
146
- # check minimum returned value
147
- assert parse_disk('1') == 2147483648, str(parse_disk('1'))
148
- assert parse_disk(1) == 2147483648, str(parse_disk(1))
149
-
150
- assert parse_disk(2200000001) == 2200000001, str(parse_disk(2200000001))
151
- assert parse_disk('2200000001') == 2200000001, str(parse_disk('2200000001'))
152
- assert parse_disk('/mnt/my_mnt 3 SSD, /mnt/my_mnt2 500 HDD') == 503000000000, str(parse_disk('/mnt/my_mnt 3 SSD, /mnt/my_mnt2 500 HDD'))
153
- assert parse_disk('local-disk 10 SSD') == 10000000000, str(parse_disk('local-disk 10 SSD'))
154
- assert parse_disk('/mnt/ 10 HDD') == 10000000000, str(parse_disk('/mnt/ 10 HDD'))
155
- assert parse_disk('/mnt/ 1000 HDD') == 1000000000000, str(parse_disk('/mnt/ 1000 HDD'))
156
-
157
- # estimated run time <1 sec
158
- def testPrimitives(self):
159
- """Test if toilwdl correctly interprets some basic declarations."""
160
- wdl = os.path.abspath('src/toil/test/wdl/testfiles/vocab.wdl')
161
-
162
- # TODO: test for all version.
163
- aWDL = get_analyzer(wdl)
164
- aWDL.analyze()
165
-
166
- no_declaration = ['bool1', 'int1', 'float1', 'file1', 'string1']
167
- collection_counter = []
168
- for key, declaration in aWDL.workflows_dictionary['vocabulary'].items():
169
- if not key.startswith('declaration'):
170
- continue
171
-
172
- name, var_type, var_expr = declaration
173
-
174
- if name in no_declaration:
175
- collection_counter.append(name)
176
- assert not var_expr
177
-
178
- if name == 'bool2':
179
- collection_counter.append(name)
180
- assert var_expr == 'True', var_expr
181
- assert var_type == 'Boolean', var_type
182
- if name == 'int2':
183
- collection_counter.append(name)
184
- assert var_expr == '1', var_expr
185
- assert var_type == 'Int', var_type
186
- if name == 'float2':
187
- collection_counter.append(name)
188
- assert var_expr == '1.1', var_expr
189
- assert var_type == 'Float', var_type
190
- if name == 'file2':
191
- collection_counter.append(name)
192
- assert var_expr == "'src/toil/test/wdl/test.tsv'", var_expr
193
- assert var_type == 'File', var_type
194
- if name == 'string2':
195
- collection_counter.append(name)
196
- assert var_expr == "'x'", var_expr
197
- assert var_type == 'String', var_type
198
- assert collection_counter == ['bool1', 'int1', 'float1', 'file1', 'string1',
199
- 'bool2', 'int2', 'float2', 'file2', 'string2']
200
-
201
- # estimated run time <1 sec
202
- def testCSV(self):
203
- default_csv_output = [['1', '2', '3'],
204
- ['4', '5', '6'],
205
- ['7', '8', '9']]
206
- csv_array = read_csv(os.path.abspath('src/toil/test/wdl/test.csv'))
207
- assert csv_array == default_csv_output
208
-
209
- # estimated run time <1 sec
210
- def testTSV(self):
211
- default_tsv_output = [['1', '2', '3'],
212
- ['4', '5', '6'],
213
- ['7', '8', '9']]
214
- tsv_array = read_tsv(os.path.abspath('src/toil/test/wdl/test.tsv'))
215
- assert tsv_array == default_tsv_output
216
-
217
- class ToilWdlIntegrationTest(BaseToilWdlTest):
218
- """Test class for WDL tests that need extra workflows and data downloaded"""
219
-
220
- gatk_data: str
221
- gatk_data_dir: str
222
- encode_data: str
223
- encode_data_dir: str
224
- wdl_data: str
225
- wdl_data_dir: str
226
-
227
- @classmethod
228
- def setUpClass(cls) -> None:
229
- """Runs once for all tests."""
230
- super(ToilWdlIntegrationTest, cls).setUpClass()
231
-
232
- cls.test_directory = os.path.abspath("src/toil/test/wdl/")
233
-
234
- cls.encode_data = os.path.join(cls.test_directory, "ENCODE_data.zip")
235
- cls.encode_data_dir = os.path.join(cls.test_directory, "ENCODE_data")
236
-
237
- cls.wdl_data = os.path.join(cls.test_directory, "wdl_templates.zip")
238
- cls.wdl_data_dir = os.path.join(cls.test_directory, "wdl_templates")
239
-
240
- cls.gatk_data = os.path.join(cls.test_directory, "GATK_data.zip")
241
- cls.gatk_data_dir = os.path.join(cls.test_directory, "GATK_data")
242
-
243
- cls.fetch_and_unzip_from_s3(filename='ENCODE_data.zip',
244
- data=cls.encode_data,
245
- data_dir=cls.encode_data_dir)
246
-
247
- cls.fetch_and_unzip_from_s3(filename='wdl_templates.zip',
248
- data=cls.wdl_data,
249
- data_dir=cls.wdl_data_dir)
250
-
251
- cls.fetch_and_unzip_from_s3(filename='GATK_data.zip',
252
- data=cls.gatk_data,
253
- data_dir=cls.gatk_data_dir)
254
-
255
- @classmethod
256
- def tearDownClass(cls) -> None:
257
- """We generate a lot of cruft."""
258
- data_dirs = [cls.gatk_data_dir, cls.wdl_data_dir, cls.encode_data_dir]
259
- data_zips = [cls.gatk_data, cls.wdl_data, cls.encode_data]
260
- encode_outputs = ['ENCFF000VOL_chr21.fq.gz',
261
- 'ENCFF000VOL_chr21.raw.srt.bam',
262
- 'ENCFF000VOL_chr21.raw.srt.bam.flagstat.qc',
263
- 'ENCFF000VOL_chr21.raw.srt.dup.qc',
264
- 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.bam',
265
- 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.bam.bai',
266
- 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.filt.nodup.sample.15.SE.tagAlign.gz',
267
- 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.filt.nodup.sample.15.SE.tagAlign.gz.cc.plot.pdf',
268
- 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.filt.nodup.sample.15.SE.tagAlign.gz.cc.qc',
269
- 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.flagstat.qc',
270
- 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.pbc.qc',
271
- 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.SE.tagAlign.gz',
272
- 'ENCFF000VOL_chr21.sai',
273
- 'test.txt',
274
- 'filter_qc.json',
275
- 'filter_qc.log',
276
- 'GRCh38_chr21_bwa.tar.gz',
277
- 'mapping.json',
278
- 'mapping.log',
279
- 'post_mapping.json',
280
- 'post_mapping.log',
281
- 'wdl-stats.log',
282
- 'xcor.json',
283
- 'xcor.log',
284
- 'toilwdl_compiled.pyc',
285
- 'toilwdl_compiled.py',
286
- 'post_processing.log',
287
- 'md5.log']
288
- for cleanup in data_dirs + data_zips + encode_outputs:
289
- if os.path.isdir(cleanup):
290
- shutil.rmtree(cleanup)
291
- elif os.path.exists(cleanup):
292
- os.remove(cleanup)
293
- super(ToilWdlIntegrationTest, cls).tearDownClass()
294
-
295
- # estimated run time 27 sec
296
- @slow
297
- @needs_java
298
- def testTut01(self):
299
- """Test if toilwdl produces the same outputs as known good outputs for WDL's
300
- GATK tutorial #1."""
301
- wdl = os.path.abspath("src/toil/test/wdl/wdl_templates/t01/helloHaplotypeCaller.wdl")
302
- json = os.path.abspath("src/toil/test/wdl/wdl_templates/t01/helloHaplotypeCaller_inputs.json")
303
- ref_dir = os.path.abspath("src/toil/test/wdl/wdl_templates/t01/output/")
304
-
305
- subprocess.check_call(self.base_command + [wdl, json, '-o', self.output_dir])
306
-
307
- compare_runs(self.output_dir, ref_dir)
308
-
309
- # estimated run time 28 sec
310
- @slow
311
- @needs_java
312
- def testTut02(self):
313
- """Test if toilwdl produces the same outputs as known good outputs for WDL's
314
- GATK tutorial #2."""
315
- wdl = os.path.abspath("src/toil/test/wdl/wdl_templates/t02/simpleVariantSelection.wdl")
316
- json = os.path.abspath("src/toil/test/wdl/wdl_templates/t02/simpleVariantSelection_inputs.json")
317
- ref_dir = os.path.abspath("src/toil/test/wdl/wdl_templates/t02/output/")
318
-
319
- subprocess.check_call(self.base_command + [wdl, json, '-o', self.output_dir])
320
-
321
- compare_runs(self.output_dir, ref_dir)
322
-
323
- # estimated run time 60 sec
324
- @slow
325
- @needs_java
326
- def testTut03(self):
327
- """Test if toilwdl produces the same outputs as known good outputs for WDL's
328
- GATK tutorial #3."""
329
- wdl = os.path.abspath("src/toil/test/wdl/wdl_templates/t03/simpleVariantDiscovery.wdl")
330
- json = os.path.abspath("src/toil/test/wdl/wdl_templates/t03/simpleVariantDiscovery_inputs.json")
331
- ref_dir = os.path.abspath("src/toil/test/wdl/wdl_templates/t03/output/")
332
-
333
- subprocess.check_call(self.base_command + [wdl, json, '-o', self.output_dir])
334
-
335
- compare_runs(self.output_dir, ref_dir)
336
-
337
- # estimated run time 175 sec
338
- @slow
339
- @needs_java
340
- @unittest.skip('broken; see: https://github.com/DataBiosphere/toil/issues/3339')
341
- def testTut04(self):
342
- """Test if toilwdl produces the same outputs as known good outputs for WDL's
343
- GATK tutorial #4."""
344
- wdl = os.path.abspath("src/toil/test/wdl/wdl_templates/t04/jointCallingGenotypes.wdl")
345
- json = os.path.abspath("src/toil/test/wdl/wdl_templates/t04/jointCallingGenotypes_inputs.json")
346
- ref_dir = os.path.abspath("src/toil/test/wdl/wdl_templates/t04/output/")
347
-
348
- subprocess.check_call(self.base_command + [wdl, json, '-o', self.output_dir])
349
-
350
- compare_runs(self.output_dir, ref_dir)
351
-
352
- # estimated run time 80 sec
353
- @slow
354
- @needs_docker
355
- def testENCODE(self):
356
- """Test if toilwdl produces the same outputs as known good outputs for
357
- a short ENCODE run."""
358
- wdl = os.path.abspath(
359
- "src/toil/test/wdl/wdl_templates/testENCODE/encode_mapping_workflow.wdl")
360
- json = os.path.abspath(
361
- "src/toil/test/wdl/wdl_templates/testENCODE/encode_mapping_workflow.wdl.json")
362
- ref_dir = os.path.abspath(
363
- "src/toil/test/wdl/wdl_templates/testENCODE/output/")
364
-
365
- subprocess.check_call(
366
- self.base_command + [wdl, json, '--docker_user=None', '--out_dir', self.output_dir])
367
-
368
- compare_runs(self.output_dir, ref_dir)
369
-
370
- # estimated run time 2 sec
371
- def testPipe(self):
372
- """Test basic bash input functionality with a pipe."""
373
- wdl = os.path.abspath(
374
- "src/toil/test/wdl/wdl_templates/testPipe/call.wdl")
375
- json = os.path.abspath(
376
- "src/toil/test/wdl/wdl_templates/testPipe/call.json")
377
- ref_dir = os.path.abspath(
378
- "src/toil/test/wdl/wdl_templates/testPipe/output/")
379
-
380
- subprocess.check_call(
381
- self.base_command + [wdl, json, '--out_dir', self.output_dir])
382
-
383
- compare_runs(self.output_dir, ref_dir)
384
-
385
- # estimated run time <1 sec
386
- def testJSON(self):
387
- default_json_dict_output = {
388
- 'helloHaplotypeCaller.haplotypeCaller.RefIndex': '"src/toil/test/wdl/GATK_data/ref/human_g1k_b37_20.fasta.fai"',
389
- 'helloHaplotypeCaller.haplotypeCaller.sampleName': '"WDL_tut1_output"',
390
- 'helloHaplotypeCaller.haplotypeCaller.inputBAM': '"src/toil/test/wdl/GATK_data/inputs/NA12878_wgs_20.bam"',
391
- 'helloHaplotypeCaller.haplotypeCaller.bamIndex': '"src/toil/test/wdl/GATK_data/inputs/NA12878_wgs_20.bai"',
392
- 'helloHaplotypeCaller.haplotypeCaller.GATK': '"src/toil/test/wdl/GATK_data/gatk-package-4.1.9.0-local.jar"',
393
- 'helloHaplotypeCaller.haplotypeCaller.RefDict': '"src/toil/test/wdl/GATK_data/ref/human_g1k_b37_20.dict"',
394
- 'helloHaplotypeCaller.haplotypeCaller.RefFasta': '"src/toil/test/wdl/GATK_data/ref/human_g1k_b37_20.fasta"'}
395
-
396
- from toil.wdl.utils import dict_from_JSON
397
- json_dict = dict_from_JSON("src/toil/test/wdl/wdl_templates/t01/helloHaplotypeCaller_inputs.json")
398
- assert json_dict == default_json_dict_output, (
399
- str(json_dict) + '\nAssertionError: ' + str(default_json_dict_output))
400
-
401
- # estimated run time <1 sec
402
- def test_size_large(self) -> None:
403
- """Test the wdl built-in functional equivalent of 'size()',
404
- which returns a file's size based on the path, on a large file."""
405
- from toil.common import Toil
406
- from toil.job import Job
407
- from toil.wdl.wdl_types import WDLFile
408
- options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
409
- options.clean = 'always'
410
- with Toil(options) as toil:
411
- large = process_infile(WDLFile(file_path=self.encode_data), toil)
412
- larger_file = size(large)
413
- larger_file_in_mb = size(large, 'mb')
414
- assert larger_file >= 70000000, larger_file
415
- assert larger_file_in_mb >= 70, larger_file_in_mb
416
-
417
- @classmethod
418
- def fetch_and_unzip_from_s3(cls, filename, data, data_dir):
419
- if not os.path.exists(data):
420
- s3_loc = os.path.join('http://toil-datasets.s3.amazonaws.com/', filename)
421
- urlretrieve(s3_loc, data)
422
- # extract the compressed data if not already extracted
423
- if not os.path.exists(data_dir):
424
- with zipfile.ZipFile(data, 'r') as zip_ref:
425
- zip_ref.extractall(cls.test_directory)
426
-
427
-
428
- def compare_runs(output_dir, ref_dir):
429
- """
430
- Takes two directories and compares all of the files between those two
431
- directories, asserting that they match.
432
-
433
- - Ignores outputs.txt, which contains a list of the outputs in the folder.
434
- - Compares line by line, unless the file is a .vcf file.
435
- - Ignores potentially date-stamped comments (lines starting with '#').
436
- - Ignores quality scores in .vcf files and only checks that they found
437
- the same variants. This is due to assumed small observed rounding
438
- differences between systems.
439
-
440
- :param ref_dir: The first directory to compare (with output_dir).
441
- :param output_dir: The second directory to compare (with ref_dir).
442
- """
443
- reference_output_files = os.listdir(ref_dir)
444
- for file in reference_output_files:
445
- if file not in ('outputs.txt', '__pycache__'):
446
- test_output_files = os.listdir(output_dir)
447
- filepath = os.path.join(ref_dir, file)
448
- with open(filepath) as default_file:
449
- good_data = []
450
- for line in default_file:
451
- if not line.startswith('#'):
452
- good_data.append(line)
453
- for test_file in test_output_files:
454
- if file == test_file:
455
- test_filepath = os.path.join(output_dir, file)
456
- if file.endswith(".vcf"):
457
- compare_vcf_files(filepath1=filepath,
458
- filepath2=test_filepath)
459
- else:
460
- with open(test_filepath) as test_file:
461
- test_data = []
462
- for line in test_file:
463
- if not line.startswith('#'):
464
- test_data.append(line)
465
- assert good_data == test_data, "File does not match: %r" % file
466
-
467
-
468
- def compare_vcf_files(filepath1, filepath2):
469
- """
470
- Asserts that two .vcf files contain the same variant findings.
471
-
472
- - Ignores potentially date-stamped comments (lines starting with '#').
473
- - Ignores quality scores in .vcf files and only checks that they found
474
- the same variants. This is due to assumed small observed rounding
475
- differences between systems.
476
-
477
- VCF File Column Contents:
478
- 1: #CHROM
479
- 2: POS
480
- 3: ID
481
- 4: REF
482
- 5: ALT
483
- 6: QUAL
484
- 7: FILTER
485
- 8: INFO
486
-
487
- :param filepath1: First .vcf file to compare.
488
- :param filepath2: Second .vcf file to compare.
489
- """
490
- with open(filepath1) as default_file:
491
- good_data = []
492
- for line in default_file:
493
- line = line.strip()
494
- if not line.startswith('#'):
495
- good_data.append(line.split('\t'))
496
-
497
- with open(filepath2) as test_file:
498
- test_data = []
499
- for line in test_file:
500
- line = line.strip()
501
- if not line.startswith('#'):
502
- test_data.append(line.split('\t'))
503
-
504
- for i in range(len(test_data)):
505
- if test_data[i] != good_data[i]:
506
- for j in range(len(test_data[i])):
507
- # Only compare chromosome, position, ID, reference, and alts.
508
- # Quality score may vary (<1%) between systems because of
509
- # (assumed) rounding differences. Same for the "info" sect.
510
- if j < 5:
511
- if j == 4:
512
- if test_data[i][j].startswith('*,'):
513
- test_data[i][j] = test_data[i][j][2:]
514
- if good_data[i][j].startswith('*,'):
515
- good_data[i][j] = good_data[i][j][2:]
516
- assert test_data[i][j] == good_data[i][j], f"\nInconsistent VCFs: {filepath1} != {filepath2}\n" \
517
- f" - {test_data[i][j]} != {good_data[i][j]}\n" \
518
- f" - Line: {i} Column: {j}"
519
-
520
-
521
- if __name__ == "__main__":
522
- unittest.main() # run all tests
toil/wdl/toilwdl.py DELETED
@@ -1,141 +0,0 @@
1
- # Copyright (C) 2018-2021 UCSC Computational Genomics Lab
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- import argparse
15
- import logging
16
- import os
17
- import subprocess
18
- import sys
19
-
20
- from toil.wdl.utils import dict_from_JSON, get_analyzer, write_mappings
21
- from toil.wdl.wdl_synthesis import SynthesizeWDL
22
-
23
- logger = logging.getLogger(__name__)
24
-
25
-
26
- def main():
27
- """
28
- A program to run WDL input files using native Toil scripts.
29
-
30
- Calls two files, described below, wdl_analysis.py and wdl_synthesis.py:
31
-
32
- wdl_analysis reads the wdl and restructures them into 2 intermediate data
33
- structures before writing (python dictionaries):
34
- "wf_dictionary": containing the parsed workflow information.
35
- "tasks_dictionary": containing the parsed task information.
36
-
37
- wdl_synthesis takes the "wf_dictionary", "tasks_dictionary", and the JSON file
38
- and uses them to write a native python script for use with Toil.
39
-
40
- Requires a WDL file, and a JSON file. The WDL file contains ordered commands,
41
- and the JSON file contains input values for those commands. To run in Toil,
42
- these two files must be parsed, restructured into python dictionaries, and
43
- then compiled into a Toil formatted python script. This compiled Toil script
44
- is deleted unless the user specifies: "--dev_mode" as an option.
45
-
46
- The WDL parser was auto-generated from the Broad's current WDL grammar file:
47
- https://github.com/openwdl/wdl/blob/master/parsers/grammar.hgr
48
- using Scott Frazer's Hermes: https://github.com/scottfrazer/hermes
49
- Thank you Scott Frazer!
50
-
51
- Currently in alpha testing, and known to work with the Broad's GATK tutorial
52
- set for WDL on their main wdl site:
53
- software.broadinstitute.org/wdl/documentation/topic?name=wdl-tutorials
54
-
55
- And ENCODE's WDL workflow:
56
- github.com/ENCODE-DCC/pipeline-container/blob/master/local-workflows/encode_mapping_workflow.wdl
57
-
58
- Additional support to be broadened to include more features soon.
59
- """
60
- parser = argparse.ArgumentParser(description='Runs WDL files with toil.')
61
- parser.add_argument('wdl_file', help='A WDL workflow file.')
62
- parser.add_argument('secondary_file', help='A secondary data file (json).')
63
- parser.add_argument("--jobStore", type=str, required=False, default=None)
64
- parser.add_argument('-o',
65
- '--outdir',
66
- required=False,
67
- default=os.getcwd(),
68
- help='Optionally specify the directory that outputs '
69
- 'are written to. Default is the current working dir.')
70
- parser.add_argument('--dev_mode', required=False, default=False,
71
- help='1. Creates "AST.out", which holds the printed AST and '
72
- '"mappings.out", which holds the parsed task, workflow '
73
- 'dictionaries that were generated. '
74
- '2. Saves the compiled toil script generated from the '
75
- 'wdl/json files from deletion. '
76
- '3. Skips autorunning the compiled python file.')
77
- parser.add_argument('--docker_user', required=False, default='root',
78
- help='The user permissions that the docker containers will be run '
79
- 'with (and the permissions set on any output files produced). '
80
- 'Default is "root". Setting this to None will set this to '
81
- 'the current user.')
82
- parser.add_argument("--destBucket", type=str, required=False, default=False,
83
- help="Specify a cloud bucket endpoint for output files.")
84
-
85
- # wdl_run_args is an array containing all of the unknown arguments not
86
- # specified by the parser in this main. All of these will be passed down in
87
- # check_call later to run the compiled toil file.
88
- args, wdl_run_args = parser.parse_known_args()
89
-
90
- wdl_file = os.path.abspath(args.wdl_file)
91
- args.secondary_file = os.path.abspath(args.secondary_file)
92
- args.outdir = os.path.abspath(args.outdir)
93
-
94
- aWDL = get_analyzer(wdl_file=wdl_file)
95
-
96
- if args.dev_mode:
97
- aWDL.write_AST(out_dir=args.outdir)
98
-
99
- # read secondary file; create dictionary to hold variables
100
- if args.secondary_file.endswith('.json'):
101
- json_dict = dict_from_JSON(args.secondary_file)
102
- else:
103
- raise RuntimeError('Unsupported Secondary File Type. Use json.')
104
-
105
- aWDL.analyze()
106
-
107
- sWDL = SynthesizeWDL(aWDL.version,
108
- aWDL.tasks_dictionary,
109
- aWDL.workflows_dictionary,
110
- args.outdir,
111
- json_dict,
112
- args.docker_user,
113
- args.jobStore,
114
- args.destBucket)
115
-
116
- # use the AST dictionaries to write 4 strings
117
- # these are the future 4 sections of the compiled toil python file
118
- module_section = sWDL.write_modules()
119
- fn_section = sWDL.write_functions()
120
- main_section = sWDL.write_main()
121
-
122
- # write 3 strings to a python output file
123
- sWDL.write_python_file(module_section,
124
- fn_section,
125
- main_section,
126
- sWDL.output_file)
127
-
128
- if args.dev_mode:
129
- logger.debug('WDL file compiled to toil script.')
130
- write_mappings(aWDL)
131
- else:
132
- logger.debug('WDL file compiled to toil script. Running now.')
133
- exe = sys.executable if sys.executable else 'python'
134
- cmd = [exe, sWDL.output_file]
135
- cmd.extend(wdl_run_args)
136
- subprocess.check_call(cmd)
137
- os.remove(sWDL.output_file)
138
-
139
-
140
- if __name__ == '__main__':
141
- main()