toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/test/sort/restart_sort.py
CHANGED
|
@@ -29,7 +29,7 @@ from toil.realtimeLogger import RealtimeLogger
|
|
|
29
29
|
|
|
30
30
|
defaultLines = 1000
|
|
31
31
|
defaultLineLen = 50
|
|
32
|
-
sortMemory =
|
|
32
|
+
sortMemory = "600M"
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
def setup(job, inputFile, N, downCheckpoints, options):
|
|
@@ -38,12 +38,16 @@ def setup(job, inputFile, N, downCheckpoints, options):
|
|
|
38
38
|
Returns the FileID of the sorted file
|
|
39
39
|
"""
|
|
40
40
|
RealtimeLogger.info("Starting the merge sort")
|
|
41
|
-
return job.addChildJobFn(
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
41
|
+
return job.addChildJobFn(
|
|
42
|
+
down,
|
|
43
|
+
inputFile,
|
|
44
|
+
N,
|
|
45
|
+
"root",
|
|
46
|
+
downCheckpoints,
|
|
47
|
+
options=options,
|
|
48
|
+
preemptible=True,
|
|
49
|
+
memory=sortMemory,
|
|
50
|
+
).rv()
|
|
47
51
|
|
|
48
52
|
|
|
49
53
|
def down(job, inputFileStoreID, N, path, downCheckpoints, options, memory=sortMemory):
|
|
@@ -61,34 +65,57 @@ def down(job, inputFileStoreID, N, path, downCheckpoints, options, memory=sortMe
|
|
|
61
65
|
length = os.path.getsize(inputFile)
|
|
62
66
|
if length > N:
|
|
63
67
|
# We will subdivide the file
|
|
64
|
-
RealtimeLogger.critical(
|
|
65
|
-
|
|
68
|
+
RealtimeLogger.critical(
|
|
69
|
+
"Splitting file: %s of size: %s" % (inputFileStoreID, length)
|
|
70
|
+
)
|
|
66
71
|
# Split the file into two copies
|
|
67
72
|
midPoint = getMidPoint(inputFile, 0, length)
|
|
68
73
|
t1 = job.fileStore.getLocalTempFile()
|
|
69
|
-
with open(t1,
|
|
70
|
-
fH.write(copySubRangeOfFile(inputFile, 0, midPoint+1))
|
|
74
|
+
with open(t1, "w") as fH:
|
|
75
|
+
fH.write(copySubRangeOfFile(inputFile, 0, midPoint + 1))
|
|
71
76
|
t2 = job.fileStore.getLocalTempFile()
|
|
72
|
-
with open(t2,
|
|
73
|
-
fH.write(copySubRangeOfFile(inputFile, midPoint+1, length))
|
|
77
|
+
with open(t2, "w") as fH:
|
|
78
|
+
fH.write(copySubRangeOfFile(inputFile, midPoint + 1, length))
|
|
74
79
|
# Call down recursively. By giving the rv() of the two jobs as inputs to the follow-on job, up,
|
|
75
80
|
# we communicate the dependency without hindering concurrency.
|
|
76
|
-
result = job.addFollowOnJobFn(
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
81
|
+
result = job.addFollowOnJobFn(
|
|
82
|
+
up,
|
|
83
|
+
job.addChildJobFn(
|
|
84
|
+
down,
|
|
85
|
+
job.fileStore.writeGlobalFile(t1),
|
|
86
|
+
N,
|
|
87
|
+
path + "/0",
|
|
88
|
+
downCheckpoints,
|
|
89
|
+
checkpoint=downCheckpoints,
|
|
90
|
+
options=options,
|
|
91
|
+
preemptible=True,
|
|
92
|
+
memory=options.sortMemory,
|
|
93
|
+
).rv(),
|
|
94
|
+
job.addChildJobFn(
|
|
95
|
+
down,
|
|
96
|
+
job.fileStore.writeGlobalFile(t2),
|
|
97
|
+
N,
|
|
98
|
+
path + "/1",
|
|
99
|
+
downCheckpoints,
|
|
100
|
+
checkpoint=downCheckpoints,
|
|
101
|
+
options=options,
|
|
102
|
+
preemptible=True,
|
|
103
|
+
memory=options.mergeMemory,
|
|
104
|
+
).rv(),
|
|
105
|
+
path + "/up",
|
|
106
|
+
preemptible=True,
|
|
107
|
+
options=options,
|
|
108
|
+
memory=options.sortMemory,
|
|
109
|
+
).rv()
|
|
84
110
|
else:
|
|
85
111
|
# We can sort this bit of the file
|
|
86
|
-
RealtimeLogger.critical(
|
|
87
|
-
|
|
112
|
+
RealtimeLogger.critical(
|
|
113
|
+
"Sorting file: %s of size: %s" % (inputFileStoreID, length)
|
|
114
|
+
)
|
|
88
115
|
# Sort the copy and write back to the fileStore
|
|
89
|
-
shutil.copyfile(inputFile, inputFile +
|
|
90
|
-
sort(inputFile +
|
|
91
|
-
result = job.fileStore.writeGlobalFile(inputFile +
|
|
116
|
+
shutil.copyfile(inputFile, inputFile + ".sort")
|
|
117
|
+
sort(inputFile + ".sort")
|
|
118
|
+
result = job.fileStore.writeGlobalFile(inputFile + ".sort")
|
|
92
119
|
|
|
93
120
|
RealtimeLogger.info("Down job finished: %s" % path)
|
|
94
121
|
return result
|
|
@@ -102,13 +129,15 @@ def up(job, inputFileID1, inputFileID2, path, options, memory=sortMemory):
|
|
|
102
129
|
RealtimeLogger.info("Up job starting: %s" % path)
|
|
103
130
|
|
|
104
131
|
with job.fileStore.writeGlobalFileStream() as (fileHandle, outputFileStoreID):
|
|
105
|
-
fileHandle = codecs.getwriter(
|
|
132
|
+
fileHandle = codecs.getwriter("utf-8")(fileHandle)
|
|
106
133
|
with job.fileStore.readGlobalFileStream(inputFileID1) as inputFileHandle1:
|
|
107
|
-
inputFileHandle1 = codecs.getreader(
|
|
134
|
+
inputFileHandle1 = codecs.getreader("utf-8")(inputFileHandle1)
|
|
108
135
|
with job.fileStore.readGlobalFileStream(inputFileID2) as inputFileHandle2:
|
|
109
|
-
inputFileHandle2 = codecs.getreader(
|
|
110
|
-
RealtimeLogger.info(
|
|
111
|
-
%
|
|
136
|
+
inputFileHandle2 = codecs.getreader("utf-8")(inputFileHandle2)
|
|
137
|
+
RealtimeLogger.info(
|
|
138
|
+
"Merging %s and %s to %s"
|
|
139
|
+
% (inputFileID1, inputFileID2, outputFileStoreID)
|
|
140
|
+
)
|
|
112
141
|
merge(inputFileHandle1, inputFileHandle2, fileHandle)
|
|
113
142
|
# Cleanup up the input files - these deletes will occur after the completion is successful.
|
|
114
143
|
job.fileStore.deleteGlobalFile(inputFileID1)
|
|
@@ -126,7 +155,7 @@ def sort(file):
|
|
|
126
155
|
|
|
127
156
|
lines.sort()
|
|
128
157
|
|
|
129
|
-
with open(file,
|
|
158
|
+
with open(file, "w") as f:
|
|
130
159
|
for line in lines:
|
|
131
160
|
f.write(line)
|
|
132
161
|
|
|
@@ -181,9 +210,12 @@ def getMidPoint(file, fileStart, fileEnd):
|
|
|
181
210
|
|
|
182
211
|
|
|
183
212
|
def makeFileToSort(fileName, lines=defaultLines, lineLen=defaultLineLen):
|
|
184
|
-
with open(fileName,
|
|
213
|
+
with open(fileName, "w") as f:
|
|
185
214
|
for _ in range(lines):
|
|
186
|
-
line =
|
|
215
|
+
line = (
|
|
216
|
+
"".join(random.choice("actgACTGNXYZ") for _ in range(lineLen - 1))
|
|
217
|
+
+ "\n"
|
|
218
|
+
)
|
|
187
219
|
f.write(line)
|
|
188
220
|
|
|
189
221
|
|
|
@@ -192,25 +224,51 @@ def main(options=None):
|
|
|
192
224
|
# deal with command line arguments
|
|
193
225
|
parser = ArgumentParser()
|
|
194
226
|
Job.Runner.addToilOptions(parser)
|
|
195
|
-
parser.add_argument(
|
|
196
|
-
|
|
227
|
+
parser.add_argument(
|
|
228
|
+
"--numLines",
|
|
229
|
+
default=defaultLines,
|
|
230
|
+
help="Number of lines in file to sort.",
|
|
231
|
+
type=int,
|
|
232
|
+
)
|
|
233
|
+
parser.add_argument(
|
|
234
|
+
"--lineLength",
|
|
235
|
+
default=defaultLineLen,
|
|
236
|
+
help="Length of lines in file to sort.",
|
|
237
|
+
type=int,
|
|
238
|
+
)
|
|
197
239
|
parser.add_argument("--fileToSort", help="The file you wish to sort")
|
|
198
240
|
parser.add_argument("--outputFile", help="Where the sorted output will go")
|
|
199
|
-
parser.add_argument(
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
parser.add_argument(
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
parser.add_argument(
|
|
212
|
-
|
|
213
|
-
|
|
241
|
+
parser.add_argument(
|
|
242
|
+
"--overwriteOutput",
|
|
243
|
+
help="Write over the output file if it already exists.",
|
|
244
|
+
default=True,
|
|
245
|
+
)
|
|
246
|
+
parser.add_argument(
|
|
247
|
+
"--N",
|
|
248
|
+
dest="N",
|
|
249
|
+
help="The threshold below which a serial sort function is used to sort file. "
|
|
250
|
+
"All lines must of length less than or equal to N or program will fail",
|
|
251
|
+
default=10000,
|
|
252
|
+
)
|
|
253
|
+
parser.add_argument(
|
|
254
|
+
"--downCheckpoints",
|
|
255
|
+
action="store_true",
|
|
256
|
+
help="If this option is set, the workflow will make checkpoints on its way through"
|
|
257
|
+
'the recursive "down" part of the sort',
|
|
258
|
+
)
|
|
259
|
+
parser.add_argument(
|
|
260
|
+
"--sortMemory",
|
|
261
|
+
dest="sortMemory",
|
|
262
|
+
help="Memory for jobs that sort chunks of the file.",
|
|
263
|
+
default=None,
|
|
264
|
+
)
|
|
265
|
+
|
|
266
|
+
parser.add_argument(
|
|
267
|
+
"--mergeMemory",
|
|
268
|
+
dest="mergeMemory",
|
|
269
|
+
help="Memory for jobs that collate results.",
|
|
270
|
+
default=None,
|
|
271
|
+
)
|
|
214
272
|
|
|
215
273
|
options = parser.parse_args()
|
|
216
274
|
if not hasattr(options, "sortMemory") or not options.sortMemory:
|
|
@@ -221,19 +279,25 @@ def main(options=None):
|
|
|
221
279
|
# do some input verification
|
|
222
280
|
sortedFileName = options.outputFile or "sortedFile.txt"
|
|
223
281
|
if not options.overwriteOutput and os.path.exists(sortedFileName):
|
|
224
|
-
print(
|
|
225
|
-
|
|
282
|
+
print(
|
|
283
|
+
f"Output file {sortedFileName} already exists. "
|
|
284
|
+
f"Delete it to run the sort example again or use --overwriteOutput=True"
|
|
285
|
+
)
|
|
226
286
|
exit()
|
|
227
287
|
|
|
228
288
|
fileName = options.fileToSort
|
|
229
289
|
if options.fileToSort is None:
|
|
230
290
|
# make the file ourselves
|
|
231
|
-
fileName =
|
|
291
|
+
fileName = "fileToSort.txt"
|
|
232
292
|
if os.path.exists(fileName):
|
|
233
|
-
print(f
|
|
293
|
+
print(f"Sorting existing file: {fileName}")
|
|
234
294
|
else:
|
|
235
|
-
print(
|
|
236
|
-
|
|
295
|
+
print(
|
|
296
|
+
f"No sort file specified. Generating one automatically called: {fileName}."
|
|
297
|
+
)
|
|
298
|
+
makeFileToSort(
|
|
299
|
+
fileName=fileName, lines=options.numLines, lineLen=options.lineLength
|
|
300
|
+
)
|
|
237
301
|
else:
|
|
238
302
|
if not os.path.exists(options.fileToSort):
|
|
239
303
|
raise RuntimeError("File to sort does not exist: %s" % options.fileToSort)
|
|
@@ -241,24 +305,29 @@ def main(options=None):
|
|
|
241
305
|
if int(options.N) <= 0:
|
|
242
306
|
raise RuntimeError("Invalid value of N: %s" % options.N)
|
|
243
307
|
|
|
244
|
-
|
|
245
308
|
# Now we are ready to run
|
|
246
309
|
with Toil(options) as workflow:
|
|
247
|
-
sortedFileURL =
|
|
248
|
-
#raise Exception('test')
|
|
310
|
+
sortedFileURL = "file://" + os.path.abspath(sortedFileName)
|
|
311
|
+
# raise Exception('test')
|
|
249
312
|
|
|
250
313
|
if not workflow.options.restart:
|
|
251
|
-
sortFileURL =
|
|
314
|
+
sortFileURL = "file://" + os.path.abspath(fileName)
|
|
252
315
|
sortFileID = workflow.importFile(sortFileURL)
|
|
253
|
-
sortedFileID = workflow.start(
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
316
|
+
sortedFileID = workflow.start(
|
|
317
|
+
Job.wrapJobFn(
|
|
318
|
+
setup,
|
|
319
|
+
sortFileID,
|
|
320
|
+
int(options.N),
|
|
321
|
+
options.downCheckpoints,
|
|
322
|
+
options=options,
|
|
323
|
+
memory=sortMemory,
|
|
324
|
+
)
|
|
325
|
+
)
|
|
259
326
|
"""
|
|
260
327
|
The else block is removed here to test that the job store is not
|
|
261
328
|
destroyed when attempting to resume without restart().
|
|
262
329
|
"""
|
|
263
|
-
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
if __name__ == "__main__":
|
|
264
333
|
main()
|
toil/test/sort/sort.py
CHANGED
|
@@ -27,7 +27,7 @@ from toil.realtimeLogger import RealtimeLogger
|
|
|
27
27
|
|
|
28
28
|
defaultLines = 1000
|
|
29
29
|
defaultLineLen = 50
|
|
30
|
-
sortMemory =
|
|
30
|
+
sortMemory = "600M"
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
def setup(job, inputFile, N, downCheckpoints, options):
|
|
@@ -36,12 +36,16 @@ def setup(job, inputFile, N, downCheckpoints, options):
|
|
|
36
36
|
Returns the FileID of the sorted file
|
|
37
37
|
"""
|
|
38
38
|
RealtimeLogger.info("Starting the merge sort")
|
|
39
|
-
return job.addChildJobFn(
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
39
|
+
return job.addChildJobFn(
|
|
40
|
+
down,
|
|
41
|
+
inputFile,
|
|
42
|
+
N,
|
|
43
|
+
"root",
|
|
44
|
+
downCheckpoints,
|
|
45
|
+
options=options,
|
|
46
|
+
preemptible=True,
|
|
47
|
+
memory=sortMemory,
|
|
48
|
+
).rv()
|
|
45
49
|
|
|
46
50
|
|
|
47
51
|
def down(job, inputFileStoreID, N, path, downCheckpoints, options, memory=sortMemory):
|
|
@@ -59,34 +63,57 @@ def down(job, inputFileStoreID, N, path, downCheckpoints, options, memory=sortMe
|
|
|
59
63
|
length = os.path.getsize(inputFile)
|
|
60
64
|
if length > N:
|
|
61
65
|
# We will subdivide the file
|
|
62
|
-
RealtimeLogger.critical(
|
|
63
|
-
|
|
66
|
+
RealtimeLogger.critical(
|
|
67
|
+
"Splitting file: %s of size: %s" % (inputFileStoreID, length)
|
|
68
|
+
)
|
|
64
69
|
# Split the file into two copies
|
|
65
70
|
midPoint = getMidPoint(inputFile, 0, length)
|
|
66
71
|
t1 = job.fileStore.getLocalTempFile()
|
|
67
|
-
with open(t1,
|
|
68
|
-
fH.write(copySubRangeOfFile(inputFile, 0, midPoint+1))
|
|
72
|
+
with open(t1, "w") as fH:
|
|
73
|
+
fH.write(copySubRangeOfFile(inputFile, 0, midPoint + 1))
|
|
69
74
|
t2 = job.fileStore.getLocalTempFile()
|
|
70
|
-
with open(t2,
|
|
71
|
-
fH.write(copySubRangeOfFile(inputFile, midPoint+1, length))
|
|
75
|
+
with open(t2, "w") as fH:
|
|
76
|
+
fH.write(copySubRangeOfFile(inputFile, midPoint + 1, length))
|
|
72
77
|
# Call down recursively. By giving the rv() of the two jobs as inputs to the follow-on job, up,
|
|
73
78
|
# we communicate the dependency without hindering concurrency.
|
|
74
|
-
result = job.addFollowOnJobFn(
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
79
|
+
result = job.addFollowOnJobFn(
|
|
80
|
+
up,
|
|
81
|
+
job.addChildJobFn(
|
|
82
|
+
down,
|
|
83
|
+
job.fileStore.writeGlobalFile(t1),
|
|
84
|
+
N,
|
|
85
|
+
path + "/0",
|
|
86
|
+
downCheckpoints,
|
|
87
|
+
checkpoint=downCheckpoints,
|
|
88
|
+
options=options,
|
|
89
|
+
preemptible=True,
|
|
90
|
+
memory=options.sortMemory,
|
|
91
|
+
).rv(),
|
|
92
|
+
job.addChildJobFn(
|
|
93
|
+
down,
|
|
94
|
+
job.fileStore.writeGlobalFile(t2),
|
|
95
|
+
N,
|
|
96
|
+
path + "/1",
|
|
97
|
+
downCheckpoints,
|
|
98
|
+
checkpoint=downCheckpoints,
|
|
99
|
+
options=options,
|
|
100
|
+
preemptible=True,
|
|
101
|
+
memory=options.mergeMemory,
|
|
102
|
+
).rv(),
|
|
103
|
+
path + "/up",
|
|
104
|
+
preemptible=True,
|
|
105
|
+
options=options,
|
|
106
|
+
memory=options.sortMemory,
|
|
107
|
+
).rv()
|
|
82
108
|
else:
|
|
83
109
|
# We can sort this bit of the file
|
|
84
|
-
RealtimeLogger.critical(
|
|
85
|
-
|
|
110
|
+
RealtimeLogger.critical(
|
|
111
|
+
"Sorting file: %s of size: %s" % (inputFileStoreID, length)
|
|
112
|
+
)
|
|
86
113
|
# Sort the copy and write back to the fileStore
|
|
87
|
-
shutil.copyfile(inputFile, inputFile +
|
|
88
|
-
sort(inputFile +
|
|
89
|
-
result = job.fileStore.writeGlobalFile(inputFile +
|
|
114
|
+
shutil.copyfile(inputFile, inputFile + ".sort")
|
|
115
|
+
sort(inputFile + ".sort")
|
|
116
|
+
result = job.fileStore.writeGlobalFile(inputFile + ".sort")
|
|
90
117
|
|
|
91
118
|
RealtimeLogger.info("Down job finished: %s" % path)
|
|
92
119
|
return result
|
|
@@ -100,13 +127,15 @@ def up(job, inputFileID1, inputFileID2, path, options, memory=sortMemory):
|
|
|
100
127
|
RealtimeLogger.info("Up job starting: %s" % path)
|
|
101
128
|
|
|
102
129
|
with job.fileStore.writeGlobalFileStream() as (fileHandle, outputFileStoreID):
|
|
103
|
-
fileHandle = codecs.getwriter(
|
|
130
|
+
fileHandle = codecs.getwriter("utf-8")(fileHandle)
|
|
104
131
|
with job.fileStore.readGlobalFileStream(inputFileID1) as inputFileHandle1:
|
|
105
|
-
inputFileHandle1 = codecs.getreader(
|
|
132
|
+
inputFileHandle1 = codecs.getreader("utf-8")(inputFileHandle1)
|
|
106
133
|
with job.fileStore.readGlobalFileStream(inputFileID2) as inputFileHandle2:
|
|
107
|
-
inputFileHandle2 = codecs.getreader(
|
|
108
|
-
RealtimeLogger.info(
|
|
109
|
-
%
|
|
134
|
+
inputFileHandle2 = codecs.getreader("utf-8")(inputFileHandle2)
|
|
135
|
+
RealtimeLogger.info(
|
|
136
|
+
"Merging %s and %s to %s"
|
|
137
|
+
% (inputFileID1, inputFileID2, outputFileStoreID)
|
|
138
|
+
)
|
|
110
139
|
merge(inputFileHandle1, inputFileHandle2, fileHandle)
|
|
111
140
|
# Cleanup up the input files - these deletes will occur after the completion is successful.
|
|
112
141
|
job.fileStore.deleteGlobalFile(inputFileID1)
|
|
@@ -124,7 +153,7 @@ def sort(file):
|
|
|
124
153
|
|
|
125
154
|
lines.sort()
|
|
126
155
|
|
|
127
|
-
with open(file,
|
|
156
|
+
with open(file, "w") as f:
|
|
128
157
|
for line in lines:
|
|
129
158
|
f.write(line)
|
|
130
159
|
|
|
@@ -179,9 +208,12 @@ def getMidPoint(file, fileStart, fileEnd):
|
|
|
179
208
|
|
|
180
209
|
|
|
181
210
|
def makeFileToSort(fileName, lines=defaultLines, lineLen=defaultLineLen):
|
|
182
|
-
with open(fileName,
|
|
211
|
+
with open(fileName, "w") as f:
|
|
183
212
|
for _ in range(lines):
|
|
184
|
-
line =
|
|
213
|
+
line = (
|
|
214
|
+
"".join(random.choice("actgACTGNXYZ") for _ in range(lineLen - 1))
|
|
215
|
+
+ "\n"
|
|
216
|
+
)
|
|
185
217
|
f.write(line)
|
|
186
218
|
|
|
187
219
|
|
|
@@ -190,25 +222,51 @@ def main(options=None):
|
|
|
190
222
|
# deal with command line arguments
|
|
191
223
|
parser = ArgumentParser()
|
|
192
224
|
Job.Runner.addToilOptions(parser)
|
|
193
|
-
parser.add_argument(
|
|
194
|
-
|
|
225
|
+
parser.add_argument(
|
|
226
|
+
"--numLines",
|
|
227
|
+
default=defaultLines,
|
|
228
|
+
help="Number of lines in file to sort.",
|
|
229
|
+
type=int,
|
|
230
|
+
)
|
|
231
|
+
parser.add_argument(
|
|
232
|
+
"--lineLength",
|
|
233
|
+
default=defaultLineLen,
|
|
234
|
+
help="Length of lines in file to sort.",
|
|
235
|
+
type=int,
|
|
236
|
+
)
|
|
195
237
|
parser.add_argument("--fileToSort", help="The file you wish to sort")
|
|
196
238
|
parser.add_argument("--outputFile", help="Where the sorted output will go")
|
|
197
|
-
parser.add_argument(
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
parser.add_argument(
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
parser.add_argument(
|
|
210
|
-
|
|
211
|
-
|
|
239
|
+
parser.add_argument(
|
|
240
|
+
"--overwriteOutput",
|
|
241
|
+
help="Write over the output file if it already exists.",
|
|
242
|
+
default=True,
|
|
243
|
+
)
|
|
244
|
+
parser.add_argument(
|
|
245
|
+
"--N",
|
|
246
|
+
dest="N",
|
|
247
|
+
help="The threshold below which a serial sort function is used to sort file. "
|
|
248
|
+
"All lines must of length less than or equal to N or program will fail",
|
|
249
|
+
default=10000,
|
|
250
|
+
)
|
|
251
|
+
parser.add_argument(
|
|
252
|
+
"--downCheckpoints",
|
|
253
|
+
action="store_true",
|
|
254
|
+
help="If this option is set, the workflow will make checkpoints on its way through"
|
|
255
|
+
'the recursive "down" part of the sort',
|
|
256
|
+
)
|
|
257
|
+
parser.add_argument(
|
|
258
|
+
"--sortMemory",
|
|
259
|
+
dest="sortMemory",
|
|
260
|
+
help="Memory for jobs that sort chunks of the file.",
|
|
261
|
+
default=None,
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
parser.add_argument(
|
|
265
|
+
"--mergeMemory",
|
|
266
|
+
dest="mergeMemory",
|
|
267
|
+
help="Memory for jobs that collate results.",
|
|
268
|
+
default=None,
|
|
269
|
+
)
|
|
212
270
|
|
|
213
271
|
options = parser.parse_args()
|
|
214
272
|
if not hasattr(options, "sortMemory") or not options.sortMemory:
|
|
@@ -219,19 +277,25 @@ def main(options=None):
|
|
|
219
277
|
# do some input verification
|
|
220
278
|
sortedFileName = options.outputFile or "sortedFile.txt"
|
|
221
279
|
if not options.overwriteOutput and os.path.exists(sortedFileName):
|
|
222
|
-
print(
|
|
223
|
-
|
|
280
|
+
print(
|
|
281
|
+
f"Output file {sortedFileName} already exists. "
|
|
282
|
+
f"Delete it to run the sort example again or use --overwriteOutput=True"
|
|
283
|
+
)
|
|
224
284
|
exit()
|
|
225
285
|
|
|
226
286
|
fileName = options.fileToSort
|
|
227
287
|
if options.fileToSort is None:
|
|
228
288
|
# make the file ourselves
|
|
229
|
-
fileName =
|
|
289
|
+
fileName = "fileToSort.txt"
|
|
230
290
|
if os.path.exists(fileName):
|
|
231
|
-
print(f
|
|
291
|
+
print(f"Sorting existing file: {fileName}")
|
|
232
292
|
else:
|
|
233
|
-
print(
|
|
234
|
-
|
|
293
|
+
print(
|
|
294
|
+
f"No sort file specified. Generating one automatically called: {fileName}."
|
|
295
|
+
)
|
|
296
|
+
makeFileToSort(
|
|
297
|
+
fileName=fileName, lines=options.numLines, lineLen=options.lineLength
|
|
298
|
+
)
|
|
235
299
|
else:
|
|
236
300
|
if not os.path.exists(options.fileToSort):
|
|
237
301
|
raise RuntimeError("File to sort does not exist: %s" % options.fileToSort)
|
|
@@ -241,20 +305,24 @@ def main(options=None):
|
|
|
241
305
|
|
|
242
306
|
# Now we are ready to run
|
|
243
307
|
with Toil(options) as workflow:
|
|
244
|
-
sortedFileURL =
|
|
308
|
+
sortedFileURL = "file://" + os.path.abspath(sortedFileName)
|
|
245
309
|
if not workflow.options.restart:
|
|
246
|
-
sortFileURL =
|
|
310
|
+
sortFileURL = "file://" + os.path.abspath(fileName)
|
|
247
311
|
sortFileID = workflow.importFile(sortFileURL)
|
|
248
|
-
sortedFileID = workflow.start(
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
312
|
+
sortedFileID = workflow.start(
|
|
313
|
+
Job.wrapJobFn(
|
|
314
|
+
setup,
|
|
315
|
+
sortFileID,
|
|
316
|
+
int(options.N),
|
|
317
|
+
options.downCheckpoints,
|
|
318
|
+
options=options,
|
|
319
|
+
memory=sortMemory,
|
|
320
|
+
)
|
|
321
|
+
)
|
|
254
322
|
else:
|
|
255
323
|
sortedFileID = workflow.restart()
|
|
256
324
|
workflow.exportFile(sortedFileID, sortedFileURL)
|
|
257
325
|
|
|
258
326
|
|
|
259
|
-
if __name__ ==
|
|
327
|
+
if __name__ == "__main__":
|
|
260
328
|
main()
|