toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
|
@@ -51,14 +51,14 @@ class MesosExecutor(Executor):
|
|
|
51
51
|
self.popenLock = threading.Lock()
|
|
52
52
|
self.runningTasks = {}
|
|
53
53
|
self.workerCleanupInfo = None
|
|
54
|
-
log.debug(
|
|
54
|
+
log.debug("Preparing system for resource download")
|
|
55
55
|
Resource.prepareSystem()
|
|
56
56
|
self.address = None
|
|
57
57
|
self.id = None
|
|
58
58
|
# Setting this value at this point will ensure that the toil workflow directory will go to
|
|
59
59
|
# the mesos sandbox if the user hasn't specified --workDir on the command line.
|
|
60
|
-
if not os.getenv(
|
|
61
|
-
os.environ[
|
|
60
|
+
if not os.getenv("TOIL_WORKDIR"):
|
|
61
|
+
os.environ["TOIL_WORKDIR"] = os.getcwd()
|
|
62
62
|
|
|
63
63
|
def registered(self, driver, executorInfo, frameworkInfo, agentInfo):
|
|
64
64
|
"""
|
|
@@ -66,11 +66,13 @@ class MesosExecutor(Executor):
|
|
|
66
66
|
"""
|
|
67
67
|
|
|
68
68
|
# Get the ID we have been assigned, if we have it
|
|
69
|
-
self.id = executorInfo.executor_id.get(
|
|
69
|
+
self.id = executorInfo.executor_id.get("value", None)
|
|
70
70
|
|
|
71
71
|
log.debug("Registered executor %s with framework", self.id)
|
|
72
72
|
self.address = socket.gethostbyname(agentInfo.hostname)
|
|
73
|
-
nodeInfoThread = threading.Thread(
|
|
73
|
+
nodeInfoThread = threading.Thread(
|
|
74
|
+
target=self._sendFrameworkMessage, args=[driver], daemon=True
|
|
75
|
+
)
|
|
74
76
|
nodeInfoThread.start()
|
|
75
77
|
|
|
76
78
|
def reregistered(self, driver, agentInfo):
|
|
@@ -99,12 +101,12 @@ class MesosExecutor(Executor):
|
|
|
99
101
|
os.killpg(pgid, signal.SIGKILL)
|
|
100
102
|
|
|
101
103
|
def shutdown(self, driver):
|
|
102
|
-
log.critical(
|
|
104
|
+
log.critical("Shutting down executor ...")
|
|
103
105
|
for taskId in list(self.runningTasks.keys()):
|
|
104
106
|
self.killTask(driver, taskId)
|
|
105
107
|
Resource.cleanSystem()
|
|
106
108
|
BatchSystemSupport.workerCleanup(self.workerCleanupInfo)
|
|
107
|
-
log.critical(
|
|
109
|
+
log.critical("... executor shut down.")
|
|
108
110
|
|
|
109
111
|
def error(self, driver, message):
|
|
110
112
|
"""
|
|
@@ -123,13 +125,15 @@ class MesosExecutor(Executor):
|
|
|
123
125
|
message = Expando(address=self.address)
|
|
124
126
|
psutil.cpu_percent()
|
|
125
127
|
else:
|
|
126
|
-
message.nodeInfo = dict(
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
128
|
+
message.nodeInfo = dict(
|
|
129
|
+
coresUsed=float(psutil.cpu_percent()) * 0.01,
|
|
130
|
+
memoryUsed=float(psutil.virtual_memory().percent) * 0.01,
|
|
131
|
+
coresTotal=cpu_count(),
|
|
132
|
+
memoryTotal=psutil.virtual_memory().total,
|
|
133
|
+
workers=len(self.runningTasks),
|
|
134
|
+
)
|
|
131
135
|
log.debug("Send framework message: %s", message)
|
|
132
|
-
driver.sendFrameworkMessage(encode_data(repr(message).encode(
|
|
136
|
+
driver.sendFrameworkMessage(encode_data(repr(message).encode("utf-8")))
|
|
133
137
|
# Prevent workers launched together from repeatedly hitting the leader at the same time
|
|
134
138
|
time.sleep(random.randint(45, 75))
|
|
135
139
|
|
|
@@ -144,16 +148,21 @@ class MesosExecutor(Executor):
|
|
|
144
148
|
|
|
145
149
|
log.debug("Running task %s", task.task_id.value)
|
|
146
150
|
startTime = time.time()
|
|
147
|
-
sendUpdate(task,
|
|
151
|
+
sendUpdate(task, "TASK_RUNNING", wallTime=0)
|
|
148
152
|
|
|
149
153
|
# try to unpickle the task
|
|
150
154
|
try:
|
|
151
155
|
taskData = pickle.loads(decode_data(task.data))
|
|
152
156
|
except:
|
|
153
157
|
exc_info = sys.exc_info()
|
|
154
|
-
log.error(
|
|
158
|
+
log.error("Exception while unpickling task: ", exc_info=exc_info)
|
|
155
159
|
exc_type, exc_value, exc_trace = exc_info
|
|
156
|
-
sendUpdate(
|
|
160
|
+
sendUpdate(
|
|
161
|
+
task,
|
|
162
|
+
"TASK_FAILED",
|
|
163
|
+
wallTime=0,
|
|
164
|
+
msg="".join(traceback.format_exception_only(exc_type, exc_value)),
|
|
165
|
+
)
|
|
157
166
|
return
|
|
158
167
|
|
|
159
168
|
# This is where task.data is first invoked. Using this position to setup cleanupInfo
|
|
@@ -170,23 +179,27 @@ class MesosExecutor(Executor):
|
|
|
170
179
|
exitStatus = process.wait()
|
|
171
180
|
wallTime = time.time() - startTime
|
|
172
181
|
if 0 == exitStatus:
|
|
173
|
-
sendUpdate(task,
|
|
182
|
+
sendUpdate(task, "TASK_FINISHED", wallTime)
|
|
174
183
|
elif -9 == exitStatus:
|
|
175
|
-
sendUpdate(task,
|
|
184
|
+
sendUpdate(task, "TASK_KILLED", wallTime)
|
|
176
185
|
else:
|
|
177
|
-
sendUpdate(task,
|
|
186
|
+
sendUpdate(task, "TASK_FAILED", wallTime, msg=str(exitStatus))
|
|
178
187
|
finally:
|
|
179
188
|
del self.runningTasks[task.task_id.value]
|
|
180
189
|
except:
|
|
181
190
|
wallTime = time.time() - startTime
|
|
182
191
|
exc_info = sys.exc_info()
|
|
183
|
-
log.error(
|
|
192
|
+
log.error("Exception while running task:", exc_info=exc_info)
|
|
184
193
|
exc_type, exc_value, exc_trace = exc_info
|
|
185
|
-
sendUpdate(
|
|
194
|
+
sendUpdate(
|
|
195
|
+
task,
|
|
196
|
+
"TASK_FAILED",
|
|
197
|
+
wallTime=wallTime,
|
|
198
|
+
msg="".join(traceback.format_exception_only(exc_type, exc_value)),
|
|
199
|
+
)
|
|
186
200
|
|
|
187
201
|
wallTime = time.time() - startTime
|
|
188
|
-
sendUpdate(task,
|
|
189
|
-
|
|
202
|
+
sendUpdate(task, "TASK_FINISHED", wallTime)
|
|
190
203
|
|
|
191
204
|
def runJob(job):
|
|
192
205
|
"""
|
|
@@ -196,16 +209,17 @@ class MesosExecutor(Executor):
|
|
|
196
209
|
"""
|
|
197
210
|
if job.userScript:
|
|
198
211
|
job.userScript.register()
|
|
199
|
-
|
|
212
|
+
command = job.command
|
|
213
|
+
log.debug("Invoking command: '%s'", command)
|
|
200
214
|
# Construct the job's environment
|
|
201
215
|
jobEnv = dict(os.environ, **job.environment)
|
|
202
|
-
log.debug(
|
|
216
|
+
log.debug("Using environment variables: %s", jobEnv.keys())
|
|
203
217
|
with self.popenLock:
|
|
204
|
-
return subprocess.Popen(
|
|
205
|
-
|
|
206
|
-
|
|
218
|
+
return subprocess.Popen(
|
|
219
|
+
command, preexec_fn=lambda: os.setpgrp(), shell=True, env=jobEnv
|
|
220
|
+
)
|
|
207
221
|
|
|
208
|
-
def sendUpdate(task, taskState, wallTime, msg=
|
|
222
|
+
def sendUpdate(task, taskState, wallTime, msg=""):
|
|
209
223
|
update = addict.Dict()
|
|
210
224
|
update.task_id.value = task.task_id.value
|
|
211
225
|
if self.id is not None:
|
|
@@ -216,7 +230,7 @@ class MesosExecutor(Executor):
|
|
|
216
230
|
|
|
217
231
|
# Add wallTime as a label.
|
|
218
232
|
labels = addict.Dict()
|
|
219
|
-
labels.labels = [{
|
|
233
|
+
labels.labels = [{"key": "wallTime", "value": str(wallTime)}]
|
|
220
234
|
update.labels = labels
|
|
221
235
|
|
|
222
236
|
driver.sendStatusUpdate(update)
|
|
@@ -238,34 +252,48 @@ def main():
|
|
|
238
252
|
if not os.environ.get("MESOS_AGENT_ENDPOINT"):
|
|
239
253
|
# Some Mesos setups in our tests somehow lack this variable. Provide a
|
|
240
254
|
# fake one to maybe convince the executor driver to work.
|
|
241
|
-
os.environ["MESOS_AGENT_ENDPOINT"] = os.environ.get(
|
|
242
|
-
|
|
255
|
+
os.environ["MESOS_AGENT_ENDPOINT"] = os.environ.get(
|
|
256
|
+
"MESOS_SLAVE_ENDPOINT", "127.0.0.1:5051"
|
|
257
|
+
)
|
|
258
|
+
log.warning(
|
|
259
|
+
"Had to fake MESOS_AGENT_ENDPOINT as %s"
|
|
260
|
+
% os.environ["MESOS_AGENT_ENDPOINT"]
|
|
261
|
+
)
|
|
243
262
|
|
|
244
263
|
# must be set manually to enable toggling of the mesos log level for debugging jenkins
|
|
245
264
|
# may be useful: https://github.com/DataBiosphere/toil/pull/2338#discussion_r223854931
|
|
246
265
|
if False:
|
|
247
266
|
try:
|
|
248
|
-
urlopen(
|
|
267
|
+
urlopen(
|
|
268
|
+
"http://%s/logging/toggle?level=1&duration=15mins"
|
|
269
|
+
% os.environ["MESOS_AGENT_ENDPOINT"]
|
|
270
|
+
).read()
|
|
249
271
|
log.debug("Toggled agent log level")
|
|
250
272
|
except Exception:
|
|
251
273
|
log.debug("Failed to toggle agent log level")
|
|
252
274
|
|
|
253
275
|
# Parse the agent state
|
|
254
|
-
agent_state = json.loads(
|
|
255
|
-
|
|
276
|
+
agent_state = json.loads(
|
|
277
|
+
urlopen("http://%s/state" % os.environ["MESOS_AGENT_ENDPOINT"]).read()
|
|
278
|
+
)
|
|
279
|
+
if "completed_frameworks" in agent_state:
|
|
256
280
|
# Drop the completed frameworks which grow over time
|
|
257
|
-
del agent_state[
|
|
281
|
+
del agent_state["completed_frameworks"]
|
|
258
282
|
log.debug("Agent state: %s", str(agent_state))
|
|
259
283
|
|
|
260
284
|
log.debug("Virtual memory info in executor: %s" % repr(psutil.virtual_memory()))
|
|
261
285
|
|
|
262
|
-
if os.path.exists(
|
|
286
|
+
if os.path.exists("/sys/fs/cgroup/memory"):
|
|
263
287
|
# Mesos can limit memory with a cgroup, so we should report on that.
|
|
264
|
-
for
|
|
288
|
+
for dirpath, dirnames, filenames in os.walk(
|
|
289
|
+
"/sys/fs/cgroup/memory", followlinks=True
|
|
290
|
+
):
|
|
265
291
|
for filename in filenames:
|
|
266
|
-
if
|
|
292
|
+
if "limit_in_bytes" not in filename:
|
|
267
293
|
continue
|
|
268
|
-
log.debug(
|
|
294
|
+
log.debug(
|
|
295
|
+
"cgroup memory info from %s:" % os.path.join(dirpath, filename)
|
|
296
|
+
)
|
|
269
297
|
try:
|
|
270
298
|
for line in open(os.path.join(dirpath, filename)):
|
|
271
299
|
log.debug(line.rstrip())
|
|
@@ -274,14 +302,13 @@ def main():
|
|
|
274
302
|
|
|
275
303
|
# Mesos can also impose rlimit limits, including on things that really
|
|
276
304
|
# ought to not be limited, like virtual address space size.
|
|
277
|
-
log.debug(
|
|
278
|
-
log.debug(
|
|
279
|
-
log.debug(
|
|
280
|
-
log.debug(
|
|
281
|
-
|
|
305
|
+
log.debug("DATA rlimit: %s", str(resource.getrlimit(resource.RLIMIT_DATA)))
|
|
306
|
+
log.debug("STACK rlimit: %s", str(resource.getrlimit(resource.RLIMIT_STACK)))
|
|
307
|
+
log.debug("RSS rlimit: %s", str(resource.getrlimit(resource.RLIMIT_RSS)))
|
|
308
|
+
log.debug("AS rlimit: %s", str(resource.getrlimit(resource.RLIMIT_AS)))
|
|
282
309
|
|
|
283
310
|
executor = MesosExecutor()
|
|
284
|
-
log.debug(
|
|
311
|
+
log.debug("Made executor")
|
|
285
312
|
driver = MesosExecutorDriver(executor, use_addict=True)
|
|
286
313
|
|
|
287
314
|
old_on_event = driver.on_event
|
|
@@ -295,13 +322,15 @@ def main():
|
|
|
295
322
|
|
|
296
323
|
driver.on_event = patched_on_event
|
|
297
324
|
|
|
298
|
-
log.debug(
|
|
325
|
+
log.debug("Made driver")
|
|
299
326
|
driver.start()
|
|
300
|
-
log.debug(
|
|
327
|
+
log.debug("Started driver")
|
|
301
328
|
driver_result = driver.join()
|
|
302
|
-
log.debug(
|
|
329
|
+
log.debug("Joined driver")
|
|
303
330
|
|
|
304
331
|
# Tolerate a None in addition to the code the docs suggest we should receive from join()
|
|
305
|
-
exit_value =
|
|
332
|
+
exit_value = (
|
|
333
|
+
0 if (driver_result is None or driver_result == "DRIVER_STOPPED") else 1
|
|
334
|
+
)
|
|
306
335
|
assert len(executor.runningTasks) == 0
|
|
307
336
|
sys.exit(exit_value)
|
|
@@ -17,16 +17,18 @@ log = logging.getLogger(__name__)
|
|
|
17
17
|
class MesosTestSupport:
|
|
18
18
|
"""Mixin for test cases that need a running Mesos master and agent on the local host."""
|
|
19
19
|
|
|
20
|
-
@retry(
|
|
21
|
-
|
|
20
|
+
@retry(
|
|
21
|
+
intervals=[1, 1, 2, 4, 8, 16, 32, 64, 128],
|
|
22
|
+
log_message=(log.info, "Checking if Mesos is ready..."),
|
|
23
|
+
)
|
|
22
24
|
def wait_for_master(self):
|
|
23
|
-
with closing(urlopen(
|
|
25
|
+
with closing(urlopen("http://127.0.0.1:5050/version")) as content:
|
|
24
26
|
content.read()
|
|
25
27
|
|
|
26
28
|
def _startMesos(self, numCores=None):
|
|
27
29
|
if numCores is None:
|
|
28
30
|
numCores = cpu_count()
|
|
29
|
-
shutil.rmtree(
|
|
31
|
+
shutil.rmtree("/tmp/mesos", ignore_errors=True)
|
|
30
32
|
self.master = self.MesosMasterThread(numCores)
|
|
31
33
|
self.master.start()
|
|
32
34
|
self.agent = self.MesosAgentThread(numCores)
|
|
@@ -35,7 +37,7 @@ class MesosTestSupport:
|
|
|
35
37
|
# Bad Things will happen if the master is not yet ready when Toil tries to use it.
|
|
36
38
|
self.wait_for_master()
|
|
37
39
|
|
|
38
|
-
log.info(
|
|
40
|
+
log.info("Mesos is ready! Running test.")
|
|
39
41
|
|
|
40
42
|
def _stopProcess(self, process, timeout=10) -> None:
|
|
41
43
|
"""Gracefully stop a process on a timeout, given the Popen object for the process."""
|
|
@@ -47,7 +49,7 @@ class MesosTestSupport:
|
|
|
47
49
|
waited += 1
|
|
48
50
|
if process.poll() is None:
|
|
49
51
|
# It didn't shut down gracefully
|
|
50
|
-
log.warning(
|
|
52
|
+
log.warning("Forcibly killing child which ignored SIGTERM")
|
|
51
53
|
process.kill()
|
|
52
54
|
|
|
53
55
|
def _stopMesos(self):
|
|
@@ -71,7 +73,7 @@ class MesosTestSupport:
|
|
|
71
73
|
|
|
72
74
|
def tryRun(self):
|
|
73
75
|
self.popen.wait()
|
|
74
|
-
log.info(
|
|
76
|
+
log.info("Exiting %s", self.__class__.__name__)
|
|
75
77
|
|
|
76
78
|
def findMesosBinary(self, names):
|
|
77
79
|
if isinstance(names, str):
|
|
@@ -86,7 +88,7 @@ class MesosTestSupport:
|
|
|
86
88
|
# Special case for users of PyCharm on OS X. This is where Homebrew installs
|
|
87
89
|
# it. It's hard to set PATH for PyCharm (or any GUI app) on OS X so let's
|
|
88
90
|
# make it easy for those poor souls.
|
|
89
|
-
return which(name, path=
|
|
91
|
+
return which(name, path="/usr/local/sbin")
|
|
90
92
|
except StopIteration:
|
|
91
93
|
pass
|
|
92
94
|
|
|
@@ -94,18 +96,22 @@ class MesosTestSupport:
|
|
|
94
96
|
if len(names) == 1:
|
|
95
97
|
sought = "binary '%s'" % names[0]
|
|
96
98
|
else:
|
|
97
|
-
sought =
|
|
99
|
+
sought = "any binary in %s" % str(names)
|
|
98
100
|
|
|
99
|
-
raise RuntimeError(
|
|
100
|
-
|
|
101
|
+
raise RuntimeError(
|
|
102
|
+
"Cannot find %s. Make sure Mesos is installed "
|
|
103
|
+
"and it's 'bin' directory is present on the PATH." % sought
|
|
104
|
+
)
|
|
101
105
|
|
|
102
106
|
class MesosMasterThread(MesosThread):
|
|
103
107
|
def mesosCommand(self):
|
|
104
|
-
return [
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
108
|
+
return [
|
|
109
|
+
self.findMesosBinary("mesos-master"),
|
|
110
|
+
"--registry=in_memory",
|
|
111
|
+
"--ip=127.0.0.1",
|
|
112
|
+
"--port=5050",
|
|
113
|
+
"--allocation_interval=500ms",
|
|
114
|
+
]
|
|
109
115
|
|
|
110
116
|
class MesosAgentThread(MesosThread):
|
|
111
117
|
def mesosCommand(self):
|
|
@@ -114,10 +120,12 @@ class MesosTestSupport:
|
|
|
114
120
|
# We also make sure to point it explicitly at the right temp work directory, and
|
|
115
121
|
# to disable systemd support because we have to be root to make systemd make us
|
|
116
122
|
# things and we probably aren't when testing.
|
|
117
|
-
return [
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
123
|
+
return [
|
|
124
|
+
self.findMesosBinary(["mesos-agent"]),
|
|
125
|
+
"--ip=127.0.0.1",
|
|
126
|
+
"--master=127.0.0.1:5050",
|
|
127
|
+
"--attributes=preemptible:False",
|
|
128
|
+
"--resources=cpus(*):%i" % self.numCores,
|
|
129
|
+
"--work_dir=/tmp/mesos",
|
|
130
|
+
"--no-systemd_enable_support",
|
|
131
|
+
]
|
toil/batchSystems/options.py
CHANGED
|
@@ -12,22 +12,19 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
|
|
14
14
|
import logging
|
|
15
|
-
import sys
|
|
16
15
|
from argparse import ArgumentParser, _ArgumentGroup
|
|
17
|
-
from typing import Any, Callable,
|
|
16
|
+
from typing import Any, Callable, Optional, Protocol, TypeVar, Union
|
|
18
17
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
from toil.batchSystems.registry import (DEFAULT_BATCH_SYSTEM,
|
|
25
|
-
get_batch_system,
|
|
26
|
-
get_batch_systems)
|
|
18
|
+
from toil.batchSystems.registry import (
|
|
19
|
+
DEFAULT_BATCH_SYSTEM,
|
|
20
|
+
get_batch_system,
|
|
21
|
+
get_batch_systems,
|
|
22
|
+
)
|
|
27
23
|
from toil.lib.threading import cpu_count
|
|
28
24
|
|
|
29
25
|
logger = logging.getLogger(__name__)
|
|
30
26
|
|
|
27
|
+
|
|
31
28
|
class OptionSetter(Protocol):
|
|
32
29
|
"""
|
|
33
30
|
Protocol for the setOption function we get to let us set up CLI options for
|
|
@@ -36,19 +33,22 @@ class OptionSetter(Protocol):
|
|
|
36
33
|
Actual functionality is defined in the Config class.
|
|
37
34
|
"""
|
|
38
35
|
|
|
39
|
-
OptionType = TypeVar(
|
|
36
|
+
OptionType = TypeVar("OptionType")
|
|
37
|
+
|
|
40
38
|
def __call__(
|
|
41
39
|
self,
|
|
42
40
|
option_name: str,
|
|
43
41
|
parsing_function: Optional[Callable[[Any], OptionType]] = None,
|
|
44
42
|
check_function: Optional[Callable[[OptionType], Union[None, bool]]] = None,
|
|
45
43
|
default: Optional[OptionType] = None,
|
|
46
|
-
env: Optional[
|
|
47
|
-
old_names: Optional[
|
|
48
|
-
) -> bool:
|
|
49
|
-
...
|
|
44
|
+
env: Optional[list[str]] = None,
|
|
45
|
+
old_names: Optional[list[str]] = None,
|
|
46
|
+
) -> bool: ...
|
|
50
47
|
|
|
51
|
-
|
|
48
|
+
|
|
49
|
+
def set_batchsystem_options(
|
|
50
|
+
batch_system: Optional[str], set_option: OptionSetter
|
|
51
|
+
) -> None:
|
|
52
52
|
"""
|
|
53
53
|
Call set_option for all the options for the given named batch system, or
|
|
54
54
|
all batch systems if no name is provided.
|
|
@@ -76,6 +76,7 @@ def set_batchsystem_options(batch_system: Optional[str], set_option: OptionSette
|
|
|
76
76
|
set_option("manualMemArgs")
|
|
77
77
|
set_option("run_local_jobs_on_workers")
|
|
78
78
|
set_option("statePollingWait")
|
|
79
|
+
set_option("state_polling_timeout")
|
|
79
80
|
set_option("batch_logs_dir")
|
|
80
81
|
|
|
81
82
|
|
|
@@ -109,11 +110,11 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
|
|
|
109
110
|
parser.add_argument(
|
|
110
111
|
"--maxJobs",
|
|
111
112
|
dest="max_jobs",
|
|
112
|
-
default=SYS_MAX_SIZE,
|
|
113
|
+
default=SYS_MAX_SIZE, # This is *basically* unlimited and saves a lot of Optional[]
|
|
113
114
|
type=lambda x: int(x) or SYS_MAX_SIZE,
|
|
114
115
|
help="Specifies the maximum number of jobs to submit to the "
|
|
115
|
-
|
|
116
|
-
|
|
116
|
+
"backing scheduler at once. Not supported on Mesos or "
|
|
117
|
+
"AWS Batch. Use 0 for unlimited. Defaults to unlimited.",
|
|
117
118
|
)
|
|
118
119
|
parser.add_argument(
|
|
119
120
|
"--maxLocalJobs",
|
|
@@ -121,8 +122,8 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
|
|
|
121
122
|
default=None,
|
|
122
123
|
type=lambda x: int(x) or 0,
|
|
123
124
|
help=f"Specifies the maximum number of housekeeping jobs to "
|
|
124
|
-
|
|
125
|
-
|
|
125
|
+
f"run sumultaneously on the local system. Use 0 for "
|
|
126
|
+
f"unlimited. Defaults to the number of local cores ({cpu_count()}).",
|
|
126
127
|
)
|
|
127
128
|
parser.add_argument(
|
|
128
129
|
"--manualMemArgs",
|
|
@@ -161,8 +162,16 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
|
|
|
161
162
|
type=int,
|
|
162
163
|
default=None,
|
|
163
164
|
help="Time, in seconds, to wait before doing a scheduler query for job state. "
|
|
164
|
-
|
|
165
|
-
|
|
165
|
+
"Return cached results if within the waiting period. Only works for grid "
|
|
166
|
+
"engine batch systems such as gridengine, htcondor, torque, slurm, and lsf.",
|
|
167
|
+
)
|
|
168
|
+
parser.add_argument(
|
|
169
|
+
"--statePollingTimeout",
|
|
170
|
+
dest="state_polling_timeout",
|
|
171
|
+
type=int,
|
|
172
|
+
default=1200,
|
|
173
|
+
help="Time, in seconds, to retry against a broken scheduler. Only works for grid "
|
|
174
|
+
"engine batch systems such as gridengine, htcondor, torque, slurm, and lsf.",
|
|
166
175
|
)
|
|
167
176
|
parser.add_argument(
|
|
168
177
|
"--batchLogsDir",
|
|
@@ -170,10 +179,19 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
|
|
|
170
179
|
default=None,
|
|
171
180
|
env_var="TOIL_BATCH_LOGS_DIR",
|
|
172
181
|
help="Directory to tell the backing batch system to log into. Should be available "
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
182
|
+
"on both the leader and the workers, if the backing batch system writes logs "
|
|
183
|
+
"to the worker machines' filesystems, as many HPC schedulers do. If unset, "
|
|
184
|
+
"the Toil work directory will be used. Only works for grid engine batch "
|
|
185
|
+
"systems such as gridengine, htcondor, torque, slurm, and lsf.",
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
parser.add_argument(
|
|
189
|
+
"--memoryIsProduct",
|
|
190
|
+
dest="memory_is_product",
|
|
191
|
+
default=False,
|
|
192
|
+
action="store_true",
|
|
193
|
+
help="If the batch system understands requested memory as a product of the requested memory and the number"
|
|
194
|
+
"of cores, set this flag to properly allocate memory.",
|
|
177
195
|
)
|
|
178
196
|
|
|
179
197
|
for name in get_batch_systems():
|
|
@@ -185,5 +203,5 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
|
|
|
185
203
|
# Skip anything we can't import
|
|
186
204
|
continue
|
|
187
205
|
# Ask the batch system to create its options in the parser
|
|
188
|
-
logger.debug(
|
|
206
|
+
logger.debug("Add options for %s batch system", name)
|
|
189
207
|
batch_system_type.add_options(parser)
|