toil 7.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in that registry.
- toil/__init__.py +124 -86
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +39 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +651 -155
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +784 -397
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1137 -534
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +1031 -349
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +772 -412
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +204 -58
- toil/lib/aws/utils.py +290 -213
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/dockstore.py +379 -0
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -105
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/history.py +1271 -0
- toil/lib/history_submission.py +681 -0
- toil/lib/humanize.py +6 -2
- toil/lib/io.py +121 -12
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +83 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +125 -87
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/lib/trs.py +390 -0
- toil/lib/web.py +38 -0
- toil/options/common.py +850 -402
- toil/options/cwl.py +185 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +283 -180
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +84 -55
- toil/server/utils.py +56 -31
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +183 -65
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +265 -49
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/conftest.py +39 -0
- toil/test/cwl/cwlTest.py +375 -72
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/optional-file.cwl +18 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_history.py +212 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/lib/test_trs.py +161 -0
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +6 -6
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3528 -1053
- toil/worker.py +370 -149
- toil-8.1.0b1.dist-info/METADATA +178 -0
- toil-8.1.0b1.dist-info/RECORD +259 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
toil/batchSystems/mesos/executor.py
CHANGED

@@ -51,14 +51,14 @@ class MesosExecutor(Executor):
         self.popenLock = threading.Lock()
         self.runningTasks = {}
         self.workerCleanupInfo = None
-        log.debug(
+        log.debug("Preparing system for resource download")
         Resource.prepareSystem()
         self.address = None
         self.id = None
         # Setting this value at this point will ensure that the toil workflow directory will go to
         # the mesos sandbox if the user hasn't specified --workDir on the command line.
-        if not os.getenv(
-            os.environ[
+        if not os.getenv("TOIL_WORKDIR"):
+            os.environ["TOIL_WORKDIR"] = os.getcwd()

     def registered(self, driver, executorInfo, frameworkInfo, agentInfo):
         """
@@ -66,11 +66,13 @@ class MesosExecutor(Executor):
         """

         # Get the ID we have been assigned, if we have it
-        self.id = executorInfo.executor_id.get(
+        self.id = executorInfo.executor_id.get("value", None)

         log.debug("Registered executor %s with framework", self.id)
         self.address = socket.gethostbyname(agentInfo.hostname)
-        nodeInfoThread = threading.Thread(
+        nodeInfoThread = threading.Thread(
+            target=self._sendFrameworkMessage, args=[driver], daemon=True
+        )
         nodeInfoThread.start()

     def reregistered(self, driver, agentInfo):
@@ -99,12 +101,12 @@ class MesosExecutor(Executor):
             os.killpg(pgid, signal.SIGKILL)

     def shutdown(self, driver):
-        log.critical(
+        log.critical("Shutting down executor ...")
         for taskId in list(self.runningTasks.keys()):
             self.killTask(driver, taskId)
         Resource.cleanSystem()
         BatchSystemSupport.workerCleanup(self.workerCleanupInfo)
-        log.critical(
+        log.critical("... executor shut down.")

     def error(self, driver, message):
         """
@@ -123,13 +125,15 @@ class MesosExecutor(Executor):
                 message = Expando(address=self.address)
                 psutil.cpu_percent()
             else:
-                message.nodeInfo = dict(
-
-
-
-
+                message.nodeInfo = dict(
+                    coresUsed=float(psutil.cpu_percent()) * 0.01,
+                    memoryUsed=float(psutil.virtual_memory().percent) * 0.01,
+                    coresTotal=cpu_count(),
+                    memoryTotal=psutil.virtual_memory().total,
+                    workers=len(self.runningTasks),
+                )
             log.debug("Send framework message: %s", message)
-            driver.sendFrameworkMessage(encode_data(repr(message).encode(
+            driver.sendFrameworkMessage(encode_data(repr(message).encode("utf-8")))
             # Prevent workers launched together from repeatedly hitting the leader at the same time
             time.sleep(random.randint(45, 75))

@@ -144,16 +148,21 @@ class MesosExecutor(Executor):

             log.debug("Running task %s", task.task_id.value)
             startTime = time.time()
-            sendUpdate(task,
+            sendUpdate(task, "TASK_RUNNING", wallTime=0)

             # try to unpickle the task
             try:
                 taskData = pickle.loads(decode_data(task.data))
             except:
                 exc_info = sys.exc_info()
-                log.error(
+                log.error("Exception while unpickling task: ", exc_info=exc_info)
                 exc_type, exc_value, exc_trace = exc_info
-                sendUpdate(
+                sendUpdate(
+                    task,
+                    "TASK_FAILED",
+                    wallTime=0,
+                    msg="".join(traceback.format_exception_only(exc_type, exc_value)),
+                )
                 return

             # This is where task.data is first invoked. Using this position to setup cleanupInfo
@@ -170,23 +179,27 @@ class MesosExecutor(Executor):
                     exitStatus = process.wait()
                     wallTime = time.time() - startTime
                     if 0 == exitStatus:
-                        sendUpdate(task,
+                        sendUpdate(task, "TASK_FINISHED", wallTime)
                     elif -9 == exitStatus:
-                        sendUpdate(task,
+                        sendUpdate(task, "TASK_KILLED", wallTime)
                     else:
-                        sendUpdate(task,
+                        sendUpdate(task, "TASK_FAILED", wallTime, msg=str(exitStatus))
                 finally:
                     del self.runningTasks[task.task_id.value]
             except:
                 wallTime = time.time() - startTime
                 exc_info = sys.exc_info()
-                log.error(
+                log.error("Exception while running task:", exc_info=exc_info)
                 exc_type, exc_value, exc_trace = exc_info
-                sendUpdate(
+                sendUpdate(
+                    task,
+                    "TASK_FAILED",
+                    wallTime=wallTime,
+                    msg="".join(traceback.format_exception_only(exc_type, exc_value)),
+                )

             wallTime = time.time() - startTime
-            sendUpdate(task,
-
+            sendUpdate(task, "TASK_FINISHED", wallTime)

         def runJob(job):
             """
@@ -200,13 +213,13 @@ class MesosExecutor(Executor):
             log.debug("Invoking command: '%s'", command)
             # Construct the job's environment
             jobEnv = dict(os.environ, **job.environment)
-            log.debug(
+            log.debug("Using environment variables: %s", jobEnv.keys())
             with self.popenLock:
-                return subprocess.Popen(
-
-
+                return subprocess.Popen(
+                    command, preexec_fn=lambda: os.setpgrp(), shell=True, env=jobEnv
+                )

-        def sendUpdate(task, taskState, wallTime, msg=
+        def sendUpdate(task, taskState, wallTime, msg=""):
             update = addict.Dict()
             update.task_id.value = task.task_id.value
             if self.id is not None:
@@ -217,7 +230,7 @@ class MesosExecutor(Executor):

             # Add wallTime as a label.
             labels = addict.Dict()
-            labels.labels = [{
+            labels.labels = [{"key": "wallTime", "value": str(wallTime)}]
             update.labels = labels

             driver.sendStatusUpdate(update)
@@ -239,34 +252,48 @@ def main():
     if not os.environ.get("MESOS_AGENT_ENDPOINT"):
         # Some Mesos setups in our tests somehow lack this variable. Provide a
         # fake one to maybe convince the executor driver to work.
-        os.environ["MESOS_AGENT_ENDPOINT"] = os.environ.get(
-
+        os.environ["MESOS_AGENT_ENDPOINT"] = os.environ.get(
+            "MESOS_SLAVE_ENDPOINT", "127.0.0.1:5051"
+        )
+        log.warning(
+            "Had to fake MESOS_AGENT_ENDPOINT as %s"
+            % os.environ["MESOS_AGENT_ENDPOINT"]
+        )

     # must be set manually to enable toggling of the mesos log level for debugging jenkins
     # may be useful: https://github.com/DataBiosphere/toil/pull/2338#discussion_r223854931
     if False:
         try:
-            urlopen(
+            urlopen(
+                "http://%s/logging/toggle?level=1&duration=15mins"
+                % os.environ["MESOS_AGENT_ENDPOINT"]
+            ).read()
             log.debug("Toggled agent log level")
         except Exception:
             log.debug("Failed to toggle agent log level")

     # Parse the agent state
-    agent_state = json.loads(
-
+    agent_state = json.loads(
+        urlopen("http://%s/state" % os.environ["MESOS_AGENT_ENDPOINT"]).read()
+    )
+    if "completed_frameworks" in agent_state:
         # Drop the completed frameworks which grow over time
-        del agent_state[
+        del agent_state["completed_frameworks"]
     log.debug("Agent state: %s", str(agent_state))

     log.debug("Virtual memory info in executor: %s" % repr(psutil.virtual_memory()))

-    if os.path.exists(
+    if os.path.exists("/sys/fs/cgroup/memory"):
         # Mesos can limit memory with a cgroup, so we should report on that.
-        for
+        for dirpath, dirnames, filenames in os.walk(
+            "/sys/fs/cgroup/memory", followlinks=True
+        ):
             for filename in filenames:
-                if
+                if "limit_in_bytes" not in filename:
                     continue
-                log.debug(
+                log.debug(
+                    "cgroup memory info from %s:" % os.path.join(dirpath, filename)
+                )
                 try:
                     for line in open(os.path.join(dirpath, filename)):
                         log.debug(line.rstrip())
@@ -275,14 +302,13 @@ def main():

     # Mesos can also impose rlimit limits, including on things that really
     # ought to not be limited, like virtual address space size.
-    log.debug(
-    log.debug(
-    log.debug(
-    log.debug(
-
+    log.debug("DATA rlimit: %s", str(resource.getrlimit(resource.RLIMIT_DATA)))
+    log.debug("STACK rlimit: %s", str(resource.getrlimit(resource.RLIMIT_STACK)))
+    log.debug("RSS rlimit: %s", str(resource.getrlimit(resource.RLIMIT_RSS)))
+    log.debug("AS rlimit: %s", str(resource.getrlimit(resource.RLIMIT_AS)))

     executor = MesosExecutor()
-    log.debug(
+    log.debug("Made executor")
     driver = MesosExecutorDriver(executor, use_addict=True)

     old_on_event = driver.on_event
@@ -296,13 +322,15 @@ def main():

     driver.on_event = patched_on_event

-    log.debug(
+    log.debug("Made driver")
     driver.start()
-    log.debug(
+    log.debug("Started driver")
     driver_result = driver.join()
-    log.debug(
+    log.debug("Joined driver")

     # Tolerate a None in addition to the code the docs suggest we should receive from join()
-    exit_value =
+    exit_value = (
+        0 if (driver_result is None or driver_result == "DRIVER_STOPPED") else 1
+    )
     assert len(executor.runningTasks) == 0
     sys.exit(exit_value)
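As an aside, the node-utilization payload that _sendFrameworkMessage assembles (the nodeInfo dict added in the @@ -123,13 +125,15 @@ hunk above) can be reproduced outside the executor with psutil alone. The sketch below is illustrative only: os.cpu_count() stands in for toil's own cpu_count() helper, and the workers value is a placeholder.

import os

import psutil

# Fractional core/memory utilization plus absolute totals, mirroring the
# nodeInfo dict built in MesosExecutor._sendFrameworkMessage above.
node_info = dict(
    coresUsed=float(psutil.cpu_percent()) * 0.01,
    memoryUsed=float(psutil.virtual_memory().percent) * 0.01,
    coresTotal=os.cpu_count(),  # the executor uses toil.lib.threading.cpu_count()
    memoryTotal=psutil.virtual_memory().total,
    workers=0,  # placeholder; the executor reports len(self.runningTasks)
)
print(node_info)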
toil/batchSystems/mesos/test/__init__.py
CHANGED

@@ -17,16 +17,18 @@ log = logging.getLogger(__name__)
 class MesosTestSupport:
     """Mixin for test cases that need a running Mesos master and agent on the local host."""

-    @retry(
-
+    @retry(
+        intervals=[1, 1, 2, 4, 8, 16, 32, 64, 128],
+        log_message=(log.info, "Checking if Mesos is ready..."),
+    )
     def wait_for_master(self):
-        with closing(urlopen(
+        with closing(urlopen("http://127.0.0.1:5050/version")) as content:
             content.read()

     def _startMesos(self, numCores=None):
         if numCores is None:
             numCores = cpu_count()
-        shutil.rmtree(
+        shutil.rmtree("/tmp/mesos", ignore_errors=True)
         self.master = self.MesosMasterThread(numCores)
         self.master.start()
         self.agent = self.MesosAgentThread(numCores)
@@ -35,7 +37,7 @@ class MesosTestSupport:
         # Bad Things will happen if the master is not yet ready when Toil tries to use it.
         self.wait_for_master()

-        log.info(
+        log.info("Mesos is ready! Running test.")

     def _stopProcess(self, process, timeout=10) -> None:
         """Gracefully stop a process on a timeout, given the Popen object for the process."""
@@ -47,7 +49,7 @@ class MesosTestSupport:
             waited += 1
         if process.poll() is None:
             # It didn't shut down gracefully
-            log.warning(
+            log.warning("Forcibly killing child which ignored SIGTERM")
             process.kill()

     def _stopMesos(self):
@@ -71,7 +73,7 @@ class MesosTestSupport:

         def tryRun(self):
             self.popen.wait()
-            log.info(
+            log.info("Exiting %s", self.__class__.__name__)

         def findMesosBinary(self, names):
             if isinstance(names, str):
@@ -86,7 +88,7 @@ class MesosTestSupport:
                    # Special case for users of PyCharm on OS X. This is where Homebrew installs
                    # it. It's hard to set PATH for PyCharm (or any GUI app) on OS X so let's
                    # make it easy for those poor souls.
-                    return which(name, path=
+                    return which(name, path="/usr/local/sbin")
                 except StopIteration:
                     pass

@@ -94,18 +96,22 @@ class MesosTestSupport:
             if len(names) == 1:
                 sought = "binary '%s'" % names[0]
             else:
-                sought =
+                sought = "any binary in %s" % str(names)

-            raise RuntimeError(
-
+            raise RuntimeError(
+                "Cannot find %s. Make sure Mesos is installed "
+                "and it's 'bin' directory is present on the PATH." % sought
+            )

     class MesosMasterThread(MesosThread):
         def mesosCommand(self):
-            return [
-
-
-
-
+            return [
+                self.findMesosBinary("mesos-master"),
+                "--registry=in_memory",
+                "--ip=127.0.0.1",
+                "--port=5050",
+                "--allocation_interval=500ms",
+            ]

     class MesosAgentThread(MesosThread):
         def mesosCommand(self):
@@ -114,10 +120,12 @@ class MesosTestSupport:
             # We also make sure to point it explicitly at the right temp work directory, and
             # to disable systemd support because we have to be root to make systemd make us
             # things and we probably aren't when testing.
-            return [
-
-
-
-
-
-
+            return [
+                self.findMesosBinary(["mesos-agent"]),
+                "--ip=127.0.0.1",
+                "--master=127.0.0.1:5050",
+                "--attributes=preemptible:False",
+                "--resources=cpus(*):%i" % self.numCores,
+                "--work_dir=/tmp/mesos",
+                "--no-systemd_enable_support",
+            ]
toil/batchSystems/options.py
CHANGED
@@ -12,22 +12,19 @@
 # See the License for the specific language governing permissions and

 import logging
-import sys
 from argparse import ArgumentParser, _ArgumentGroup
-from typing import Any, Callable,
+from typing import Any, Callable, Optional, Protocol, TypeVar, Union

-
-
-
-
-
-from toil.batchSystems.registry import (DEFAULT_BATCH_SYSTEM,
-                                        get_batch_system,
-                                        get_batch_systems)
+from toil.batchSystems.registry import (
+    DEFAULT_BATCH_SYSTEM,
+    get_batch_system,
+    get_batch_systems,
+)
 from toil.lib.threading import cpu_count

 logger = logging.getLogger(__name__)

+
 class OptionSetter(Protocol):
     """
     Protocol for the setOption function we get to let us set up CLI options for
@@ -36,19 +33,22 @@ class OptionSetter(Protocol):
     Actual functionality is defined in the Config class.
     """

-    OptionType = TypeVar(
+    OptionType = TypeVar("OptionType")
+
     def __call__(
         self,
         option_name: str,
         parsing_function: Optional[Callable[[Any], OptionType]] = None,
         check_function: Optional[Callable[[OptionType], Union[None, bool]]] = None,
         default: Optional[OptionType] = None,
-        env: Optional[
-        old_names: Optional[
-    ) -> bool:
-        ...
+        env: Optional[list[str]] = None,
+        old_names: Optional[list[str]] = None,
+    ) -> bool: ...

-
+
+def set_batchsystem_options(
+    batch_system: Optional[str], set_option: OptionSetter
+) -> None:
     """
     Call set_option for all the options for the given named batch system, or
     all batch systems if no name is provided.
@@ -110,11 +110,11 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
     parser.add_argument(
         "--maxJobs",
         dest="max_jobs",
-        default=SYS_MAX_SIZE,
+        default=SYS_MAX_SIZE,  # This is *basically* unlimited and saves a lot of Optional[]
         type=lambda x: int(x) or SYS_MAX_SIZE,
         help="Specifies the maximum number of jobs to submit to the "
-
-
+        "backing scheduler at once. Not supported on Mesos or "
+        "AWS Batch. Use 0 for unlimited. Defaults to unlimited.",
     )
     parser.add_argument(
         "--maxLocalJobs",
@@ -122,8 +122,8 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
         default=None,
         type=lambda x: int(x) or 0,
         help=f"Specifies the maximum number of housekeeping jobs to "
-
-
+        f"run sumultaneously on the local system. Use 0 for "
+        f"unlimited. Defaults to the number of local cores ({cpu_count()}).",
     )
     parser.add_argument(
         "--manualMemArgs",
@@ -162,8 +162,8 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
         type=int,
         default=None,
         help="Time, in seconds, to wait before doing a scheduler query for job state. "
-
-
+        "Return cached results if within the waiting period. Only works for grid "
+        "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf.",
     )
     parser.add_argument(
         "--statePollingTimeout",
@@ -171,7 +171,7 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
         type=int,
         default=1200,
         help="Time, in seconds, to retry against a broken scheduler. Only works for grid "
-
+        "engine batch systems such as gridengine, htcondor, torque, slurm, and lsf.",
     )
     parser.add_argument(
         "--batchLogsDir",
@@ -179,10 +179,20 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
         default=None,
         env_var="TOIL_BATCH_LOGS_DIR",
         help="Directory to tell the backing batch system to log into. Should be available "
-
-
-
-
+        "on both the leader and the workers, if the backing batch system writes logs "
+        "to the worker machines' filesystems, as many HPC schedulers do. If unset, "
+        "the Toil work directory will be used. Only works for grid engine batch "
+        "systems such as gridengine, htcondor, torque, slurm, and lsf.",
+    )
+
+    # TODO: Move this to Slurm specifically.
+    parser.add_argument(
+        "--memoryIsProduct",
+        dest="memory_is_product",
+        default=False,
+        action="store_true",
+        help="If the batch system understands requested memory as a product of the requested memory and the number"
+        "of cores, set this flag to properly allocate memory.",
     )

     for name in get_batch_systems():
@@ -194,5 +204,5 @@ def add_all_batchsystem_options(parser: Union[ArgumentParser, _ArgumentGroup]) -
             # Skip anything we can't import
             continue
         # Ask the batch system to create its options in the parser
-        logger.debug(
+        logger.debug("Add options for %s batch system", name)
         batch_system_type.add_options(parser)