toil 7.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +124 -86
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +39 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +651 -155
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +784 -397
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1137 -534
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +1031 -349
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +772 -412
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +204 -58
- toil/lib/aws/utils.py +290 -213
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/dockstore.py +379 -0
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -105
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/history.py +1271 -0
- toil/lib/history_submission.py +681 -0
- toil/lib/humanize.py +6 -2
- toil/lib/io.py +121 -12
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +83 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +125 -87
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/lib/trs.py +390 -0
- toil/lib/web.py +38 -0
- toil/options/common.py +850 -402
- toil/options/cwl.py +185 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +283 -180
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +84 -55
- toil/server/utils.py +56 -31
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +183 -65
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +265 -49
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/conftest.py +39 -0
- toil/test/cwl/cwlTest.py +375 -72
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/optional-file.cwl +18 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_history.py +212 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/lib/test_trs.py +161 -0
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +6 -6
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3528 -1053
- toil/worker.py +370 -149
- toil-8.1.0b1.dist-info/METADATA +178 -0
- toil-8.1.0b1.dist-info/RECORD +259 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
toil/serviceManager.py
CHANGED
@@ -15,9 +15,10 @@
 
 import logging
 import time
+from collections.abc import Iterable
 from queue import Empty, Queue
 from threading import Event, Thread
-from typing import …
+from typing import Optional
 
 from toil.job import ServiceJobDescription
 from toil.jobStores.abstractJobStore import AbstractJobStore
@@ -40,7 +41,7 @@ class ServiceManager:
 
         # These are all the client jobs that are waiting for their services to
         # start.
-        self.__waiting_clients: …
+        self.__waiting_clients: set[str] = set()
 
         # This is used to terminate the thread associated with the service
         # manager
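
A note on the two hunks above: they reflect the typing modernization applied across this release, where abstract container types are imported from collections.abc instead of typing, and annotations use the built-in generics (set[str]) available since Python 3.9. A minimal sketch of the old and new styles, using illustrative names that are not from Toil:

# Old style (pre-Python-3.9 typing aliases):
#     from typing import Iterable, Set
#     waiting: Set[str] = set()

# New style, as in the hunks above:
from collections.abc import Iterable

waiting: set[str] = set()

def drain(items: Iterable[str]) -> None:
    # Accept any iterable of strings; Iterable is now the abc, not the typing alias.
    for item in items:
        waiting.add(item)

drain(["client-1", "client-2"])
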
@@ -123,7 +124,9 @@
             client_id = self.__clients_out.get(timeout=maxWait)
             self.__waiting_clients.remove(client_id)
             if self.__service_manager_jobs < 0:
-                raise RuntimeError(…
+                raise RuntimeError(
+                    "The number of jobs scheduled by the service manager cannot be negative."
+                )
             self.__service_manager_jobs -= 1
             return client_id
         except Empty:
@@ -141,7 +144,9 @@
             client_id = self.__failed_clients_out.get(timeout=maxWait)
             self.__waiting_clients.remove(client_id)
             if self.__service_manager_jobs < 0:
-                raise RuntimeError(…
+                raise RuntimeError(
+                    "The number of jobs scheduled by the service manager cannot be negative."
+                )
             self.__service_manager_jobs -= 1
             return client_id
         except Empty:
@@ -157,7 +162,9 @@
         try:
             service_id = self.__services_out.get(timeout=maxWait)
             if self.__service_manager_jobs < 0:
-                raise RuntimeError(…
+                raise RuntimeError(
+                    "The number of jobs scheduled by the service manager cannot be negative."
+                )
             self.__service_manager_jobs -= 1
             return service_id
         except Empty:
@@ -226,7 +233,7 @@
 
         Will block until all services are started and blocked.
         """
-        logger.debug(…
+        logger.debug("Waiting for service manager thread to finish ...")
         start_time = time.time()
         self.__terminate.set()
         self.__service_starter.join()
@@ -251,13 +258,17 @@
         while True:
             with throttle(1.0):
                 if self.__terminate.is_set():
-                    logger.debug(…
+                    logger.debug("Received signal to quit starting services.")
                     break
                 try:
                     client_id = self.__clients_in.get_nowait()
                     client = self.__toil_state.get_job(client_id)
                     host_id_batches = list(client.serviceHostIDsInBatches())
-                    logger.debug(…
+                    logger.debug(
+                        "Service manager processing client %s with %d batches of services",
+                        client,
+                        len(host_id_batches),
+                    )
                     if len(host_id_batches) > 1:
                         # Have to fall back to the old blocking behavior to
                         # ensure entire service "groups" are issued as a whole.
@@ -288,7 +299,7 @@
 
             pending_service_count = len(starting_services)
             if pending_service_count > 0 and log_limiter.throttle(False):
-                logger.debug(…
+                logger.debug("%d services are starting...", pending_service_count)
 
             for service_id in list(starting_services):
                 service_job_desc = self._get_service_job(service_id)
@@ -297,7 +308,9 @@
                     or service_job_desc.errorJobStoreID is None
                 ):
                     raise Exception("Must be a registered ServiceJobDescription")
-                if not self.__job_store.file_exists(…
+                if not self.__job_store.file_exists(
+                    service_job_desc.startJobStoreID
+                ):
                     # Service has started (or failed)
                     logger.debug(
                         "Service %s has removed %s and is therefore started",
@@ -308,9 +321,13 @@
                     client_id = service_to_client[service_id]
                     remaining_services_by_client[client_id] -= 1
                     if remaining_services_by_client[client_id] < 0:
-                        raise RuntimeError(…
+                        raise RuntimeError(
+                            "The number of remaining services cannot be negative."
+                        )
                     del service_to_client[service_id]
-                    if not self.__job_store.file_exists(…
+                    if not self.__job_store.file_exists(
+                        service_job_desc.errorJobStoreID
+                    ):
                         logger.error(
                             "Service %s has immediately failed before it could be used",
                             service_job_desc,
@@ -321,13 +338,22 @@
 
             # Find if any clients have had *all* their services started.
             ready_clients = set()
-            for …
+            for (
+                client_id,
+                remainingServices,
+            ) in remaining_services_by_client.items():
                 if remainingServices == 0:
                     if client_id in clients_with_failed_services:
-                        logger.error(…
+                        logger.error(
+                            "Job %s has had all its services try to start, but at least one failed",
+                            self.__toil_state.get_job(client_id),
+                        )
                         self.__failed_clients_out.put(client_id)
                     else:
-                        logger.debug(…
+                        logger.debug(
+                            "Job %s has all its services started",
+                            self.__toil_state.get_job(client_id),
+                        )
                         self.__clients_out.put(client_id)
                         ready_clients.add(client_id)
             for client_id in ready_clients:
@@ -344,7 +370,9 @@
 
         # Start the service jobs in batches, waiting for each batch
         # to become established before starting the next batch
-        for service_job_list in self.__toil_state.get_job(…
+        for service_job_list in self.__toil_state.get_job(
+            client_id
+        ).serviceHostIDsInBatches():
             # When we get the job descriptions we store them here to go over them again.
             wait_on = []
             for service_id in service_job_list:
@@ -361,9 +389,13 @@
                     service_job_desc.startJobStoreID,
                 )
                 if not self.__job_store.file_exists(service_job_desc.startJobStoreID):
-                    raise RuntimeError(…
+                    raise RuntimeError(
+                        f"Service manager attempted to start service {service_job_desc} that has already started"
+                    )
                 if not self.__toil_state.job_exists(str(service_job_desc.jobStoreID)):
-                    raise RuntimeError(…
+                    raise RuntimeError(
+                        f"Service manager attempted to start service {service_job_desc} that is not in the job store"
+                    )
                 # At this point the terminateJobStoreID and errorJobStoreID
                 # could have been deleted, since the service can be killed at
                 # any time! So we can't assert their presence here.
@@ -382,7 +414,7 @@
                 time.sleep(1.0)
 
                 if log_limiter.throttle(False):
-                    logger.info(…
+                    logger.info("Service %s is starting...", service_job_desc)
 
                 # Check if the thread should quit
                 if self.__terminate.is_set():
@@ -395,9 +427,14 @@
             ):
                 # The service job has gone away but the service never flipped its start flag.
                 # That's not what the worker is supposed to do when running a service at all.
-                logger.error(…
+                logger.error(
+                    "Service %s has completed and been removed without ever starting",
+                    service_job_desc,
+                )
                 # Stop everything.
-                raise RuntimeError(…
+                raise RuntimeError(
+                    f"Service {service_job_desc} is in an inconsistent state"
+                )
 
         # We don't bail out early here.
 
@@ -409,6 +446,5 @@
         # though, so they should stop immediately when we run them. TODO:
         # this is a bad design!
 
-
         # Add the JobDescription to the output queue of jobs whose services have been started
         self.__clients_out.put(client_id)
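
The hunks at @@ -297 and @@ -361 both poll the same handshake: a service signals that it has started by deleting its startJobStoreID flag file from the job store, so the manager checks file_exists() on that ID and treats a missing flag as "started (or failed)". A minimal sketch of that protocol, with a plain dict standing in for the job store (these helper names are illustrative, not Toil's API):

# Toy "job store": the presence of a key acts as a flag file.
job_store: dict[str, bytes] = {"service-1.start": b""}

def service_reports_started(service_id: str) -> None:
    # The running service deletes its start flag to signal readiness.
    job_store.pop(f"{service_id}.start", None)

def manager_sees_started(service_id: str) -> bool:
    # The manager treats a missing start flag as started (or failed).
    return f"{service_id}.start" not in job_store

assert not manager_sees_started("service-1")
service_reports_started("service-1")
assert manager_sees_started("service-1")
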
toil/statsAndLogging.py
CHANGED
@@ -20,10 +20,11 @@ import time
 from argparse import ArgumentParser, Namespace
 from logging.handlers import RotatingFileHandler
 from threading import Event, Thread
-from typing import IO, TYPE_CHECKING, Any, Callable, …
+from typing import IO, TYPE_CHECKING, Any, Callable, Optional, Union
 
 from toil.lib.conversions import strtobool
 from toil.lib.expando import Expando
+from toil.lib.history import HistoryManager
 from toil.lib.resources import ResourceMonitor
 
 if TYPE_CHECKING:
@@ -32,27 +33,36 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 root_logger = logging.getLogger()
-toil_logger = logging.getLogger(…
+toil_logger = logging.getLogger("toil")
 
 DEFAULT_LOGLEVEL = logging.INFO
 __loggingFiles = []
 
+# We have some logging that belongs at a TRACE level, below DEBUG
+TRACE = logging.DEBUG - 5
+
+logging.addLevelName(TRACE, "TRACE")
+
 
 class StatsAndLogging:
     """A thread to aggregate statistics and logging."""
 
-    def __init__(self, jobStore: …
+    def __init__(self, jobStore: "AbstractJobStore", config: "Config") -> None:
         self._stop = Event()
-        self._worker = Thread(…
-            …
-            …
+        self._worker = Thread(
+            target=self.statsAndLoggingAggregator,
+            args=(jobStore, self._stop, config),
+            daemon=True,
+        )
 
     def start(self) -> None:
         """Start the stats and logging thread."""
         self._worker.start()
 
     @classmethod
-    def formatLogStream(…
+    def formatLogStream(
+        cls, stream: Union[IO[str], IO[bytes]], stream_name: str
+    ) -> str:
         """
         Given a stream of text or bytes, and the job name, job itself, or some
         other optional stringifyable identity info for the job, return a big
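
The TRACE level introduced above is plain standard-library machinery: it sits five below logging.DEBUG (numeric value 5) and is registered with logging.addLevelName, so any logger can emit at it via Logger.log. A standalone sketch:

import logging

TRACE = logging.DEBUG - 5  # numeric value 5, below DEBUG's 10
logging.addLevelName(TRACE, "TRACE")

logging.basicConfig(level=TRACE)
log = logging.getLogger("demo")

log.log(TRACE, "emitted only when the effective level is TRACE or lower")
log.debug("still visible at DEBUG")
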
@@ -65,21 +75,25 @@
 
         :param stream: The stream of text or bytes to print for the user.
         """
-        lines = [f…
+        lines = [f"{stream_name} follows:", "=========>"]
 
         for line in stream:
             if isinstance(line, bytes):
-                line = line.decode(…
-            lines.append(…
+                line = line.decode("utf-8", errors="replace")
+            lines.append("\t" + line.rstrip("\n"))
 
-        lines.append(…
+        lines.append("<=========")
 
-        return …
+        return "\n".join(lines)
 
     @classmethod
-    def logWithFormatting(…
-        …
-        …
+    def logWithFormatting(
+        cls,
+        stream_name: str,
+        jobLogs: Union[IO[str], IO[bytes]],
+        method: Callable[[str], None] = logger.debug,
+        message: Optional[str] = None,
+    ) -> None:
         if message is not None:
             method(message)
 
@@ -87,28 +101,36 @@
             method(cls.formatLogStream(jobLogs, stream_name))
 
     @classmethod
-    def writeLogFiles(…
-        …
-        …
-        …
-        …
-        …
-        …
+    def writeLogFiles(
+        cls,
+        jobNames: list[str],
+        jobLogList: list[str],
+        config: "Config",
+        failed: bool = False,
+    ) -> None:
+        def createName(
+            logPath: str, jobName: str, logExtension: str, failed: bool = False
+        ) -> str:
+            logName = jobName.replace("-", "--")
+            logName = logName.replace("/", "-")
+            logName = logName.replace(" ", "_")
+            logName = logName.replace("'", "")
+            logName = logName.replace('"', "")
             # Add a "failed_" prefix to logs from failed jobs.
-            logName = (…
+            logName = ("failed_" if failed else "") + logName
             counter = 0
             while True:
-                suffix = …
+                suffix = "_" + str(counter).zfill(3) + logExtension
                 fullName = os.path.join(logPath, logName + suffix)
                 # The maximum file name size in the default HFS+ file system is 255 UTF-16 encoding units, so basically 255 characters
                 if len(fullName) >= 255:
-                    return fullName[:(255 - len(suffix))] + suffix
+                    return fullName[: (255 - len(suffix))] + suffix
                 if not os.path.exists(fullName):
                     return fullName
                 counter += 1
 
         mainFileName = jobNames[0]
-        extension = …
+        extension = ".log"
         writeFn: Callable[..., Any]
         if config.writeLogs:
             path = config.writeLogs
@@ -116,7 +138,7 @@
         elif config.writeLogsGzip:
             path = config.writeLogsGzip
             writeFn = gzip.open
-            extension += …
+            extension += ".gz"
         else:
             # we don't have anywhere to write the logs, return now
             return
@@ -125,13 +147,13 @@
         os.makedirs(path, exist_ok=True)
 
         fullName = createName(path, mainFileName, extension, failed)
-        with writeFn(fullName, …
+        with writeFn(fullName, "wb") as f:
             for l in jobLogList:
                 if isinstance(l, bytes):
-                    l = l.decode(…
-                if not l.endswith(…
-                    l += …
-                f.write(l.encode(…
+                    l = l.decode("utf-8")
+                if not l.endswith("\n"):
+                    l += "\n"
+                f.write(l.encode("utf-8"))
         for alternateName in jobNames[1:]:
             # There are chained jobs in this output - indicate this with a symlink
             # of the job's name to this file
@@ -140,11 +162,14 @@
             os.symlink(os.path.relpath(fullName, path), name)
 
     @classmethod
-    def statsAndLoggingAggregator(…
+    def statsAndLoggingAggregator(
+        cls, jobStore: "AbstractJobStore", stop: Event, config: "Config"
+    ) -> None:
         """
         The following function is used for collating stats/reporting log messages from the workers.
         Works inside of a thread, collates as long as the stop flag is not True.
         """
+
         # Overall timing
         startTime = time.time()
         startClock = ResourceMonitor.get_total_cpu_time()
@@ -165,9 +190,12 @@
                 pass
             else:
                 for message in logs:
-                    logger.log(…
-                        …
-                        …
+                    logger.log(
+                        int(message.level),
+                        "Got message from job at time %s: %s",
+                        time.strftime("%m-%d-%Y %H:%M:%S"),
+                        message.text,
+                    )
 
             try:
                 # Handle all the user-level text streams reported back (command output, etc.)
@@ -198,12 +226,47 @@
                 # we may have multiple jobs per worker
                 jobNames = logs.names
                 messages = logs.messages
-                cls.logWithFormatting(…
-                    …
+                cls.logWithFormatting(
+                    f'Log from job "{jobNames[0]}"',
+                    messages,
+                    message="Received Toil worker log. Disable debug level logging to hide this output",
+                )
                 cls.writeLogFiles(jobNames, messages, config=config)
 
+            try:
+                jobs = stats.jobs
+            except AttributeError:
+                pass
+            else:
+                for job in jobs:
+                    try:
+                        # Here we're talking to job._executor which fills in these stats.
+
+                        # Convince MyPy we won't be sent any job stats without
+                        # a workflow ID. You can't set up the job store without
+                        # one, but if we're somehow missing one, keep the stats
+                        # and logging thread up.
+                        assert config.workflowID is not None
+
+                        # TODO: Use better job names!
+                        HistoryManager.record_job_attempt(
+                            config.workflowID,
+                            config.workflowAttemptNumber,
+                            job.class_name,
+                            job.succeeded == "True",
+                            float(job.start),
+                            float(job.time),
+                            cores=float(job.requested_cores),
+                            cpu_seconds=float(job.clock),
+                            memory_bytes=int(job.memory) * 1024,
+                            disk_bytes=int(job.disk)
+                        )
+                    except:
+                        logger.exception("Could not record job attempt in history!")
+                        # Keep going. Don't fail the workflow for history-related issues.
+
         while True:
-            # This is …
+            # This is an indirect way of getting a message to the thread to exit
             if stop.is_set():
                 jobStore.read_logs(callback)
                 break
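
The new block above reads optional fields off the worker's stats message with a try/except AttributeError/else ladder, and it swallows any failure from HistoryManager.record_job_attempt so history bookkeeping can never take down the stats thread. A reduced sketch of that defensive shape (the stats object and recorder below are stand-ins, not Toil's real message format):

import logging
from types import SimpleNamespace

logger = logging.getLogger(__name__)

def record(name: str, succeeded: bool) -> None:
    # Hypothetical recorder standing in for HistoryManager.record_job_attempt().
    print(f"recorded {name}: {'ok' if succeeded else 'failed'}")

# Stand-in for a parsed worker stats message; "jobs" may be absent.
stats = SimpleNamespace(jobs=[SimpleNamespace(class_name="HelloJob", succeeded="True")])

try:
    jobs = stats.jobs
except AttributeError:
    pass  # this message carried no per-job stats
else:
    for job in jobs:
        try:
            record(job.class_name, job.succeeded == "True")
        except Exception:
            # Log and keep going; history problems must not fail the workflow.
            logger.exception("Could not record job attempt in history!")
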
@@ -211,8 +274,13 @@
             time.sleep(0.5)  # Avoid cycling too fast
 
         # Finish the stats file
-        text = json.dumps(…
-            …
+        text = json.dumps(
+            dict(
+                total_time=str(time.time() - startTime),
+                total_clock=str(ResourceMonitor.get_total_cpu_time() - startClock),
+            ),
+            ensure_ascii=True,
+        )
         jobStore.write_logs(text)
 
     def check(self) -> None:
@@ -225,11 +293,14 @@
 
     def shutdown(self) -> None:
         """Finish up the stats/logging aggregation thread."""
-        logger.debug(…
+        logger.debug("Waiting for stats and logging collator thread to finish ...")
         startTime = time.time()
         self._stop.set()
         self._worker.join()
-        logger.debug(…
+        logger.debug(
+            "... finished collating stats and logs. Took %s seconds",
+            time.time() - startTime,
+        )
         # in addition to cleaning on exceptions, onError should clean if there are any failed jobs
 
 
@@ -250,6 +321,11 @@ def install_log_color(set_logger: Optional[logging.Logger] = None) -> None:
     import coloredlogs  # type: ignore[import-untyped]
 
     level_styles = dict(coloredlogs.DEFAULT_LEVEL_STYLES)
+    level_styles["trace"] = dict(level_styles["debug"])
+
+    # TODO: What if these fixed colors aren't right for the terminal background?
+    # It might be light or dark or even grey.
+    level_styles["trace"]["color"] = 242
     level_styles["debug"]["color"] = 242
     level_styles["notice"] = {"color": "green", "bold": True}
     level_styles["error"]["bold"] = True
@@ -272,7 +348,9 @@ def install_log_color(set_logger: Optional[logging.Logger] = None) -> None:
     )
 
 
-def add_logging_options(…
+def add_logging_options(
+    parser: ArgumentParser, default_level: Optional[int] = None
+) -> None:
     """
     Add logging options to set the global log level.
 
@@ -285,23 +363,51 @@ def add_logging_options(parser: ArgumentParser, default_level: Optional[int] = N
 
     group = parser.add_argument_group("Logging Options")
 
-    levels = […
+    levels = ["Critical", "Error", "Warning", "Info", "Debug", "Trace"]
     for level in levels:
-        group.add_argument(…
-            …
+        group.add_argument(
+            f"--log{level}",
+            dest="logLevel",
+            default=default_level_name,
+            action="store_const",
+            const=level,
+            help=f"Set logging level to {level}. Default: {default_level_name}.",
+        )
 
     levels += [l.lower() for l in levels] + [l.upper() for l in levels]
-    group.add_argument(…
-        …
+    group.add_argument(
+        "--logOff",
+        dest="logLevel",
+        default=default_level_name,
+        action="store_const",
+        const="CRITICAL",
+        help="Same as --logCritical.",
+    )
     # Maybe deprecate the above in favor of --logLevel?
 
-    group.add_argument(…
-        …
+    group.add_argument(
+        "--logLevel",
+        dest="logLevel",
+        default=default_level_name,
+        choices=levels,
+        help=f"Set the log level. Default: {default_level_name}. Options: {levels}.",
+    )
     group.add_argument("--logFile", dest="logFile", help="File to log in.")
-    group.add_argument(…
-        …
-        …
-        …
+    group.add_argument(
+        "--rotatingLogging",
+        dest="logRotating",
+        action="store_true",
+        default=False,
+        help="Turn on rotating logging, which prevents log files from getting too big.",
+    )
+    group.add_argument(
+        "--logColors",
+        dest="colored_logs",
+        default=True,
+        type=strtobool,
+        metavar="BOOL",
+        help="Enable or disable colored logging. Default: %(default)s",
+    )
 
 
 def configure_root_logger() -> None:
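
All of the --logCritical through --logTrace flags generated above write to the same dest="logLevel" via action="store_const", so each flag is a no-argument switch and the last one given wins, while --logLevel stays as the general value-taking option. A self-contained sketch of the pattern:

from argparse import ArgumentParser

parser = ArgumentParser()
group = parser.add_argument_group("Logging Options")
for level in ["Critical", "Error", "Warning", "Info", "Debug", "Trace"]:
    group.add_argument(
        f"--log{level}",
        dest="logLevel",
        default="INFO",
        action="store_const",  # the flag takes no value; it stores const
        const=level,
    )

print(parser.parse_args(["--logDebug"]).logLevel)  # prints: Debug
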
@@ -311,8 +417,10 @@ def configure_root_logger() -> None:
     Should be called before any entry point tries to log anything,
     to ensure consistent formatting.
     """
-    logging.basicConfig(…
-        …
+    logging.basicConfig(
+        format="[%(asctime)s] [%(threadName)-10s] [%(levelname).1s] [%(name)s] %(message)s",
+        datefmt="%Y-%m-%dT%H:%M:%S%z",
+    )
     root_logger.setLevel(DEFAULT_LOGLEVEL)
 
 
@@ -330,12 +438,16 @@ def log_to_file(log_file: Optional[str], log_rotation: bool) -> None:
 
 def set_logging_from_options(options: Union["Config", Namespace]) -> None:
     configure_root_logger()
-    options.logLevel = options.logLevel or logging.getLevelName(…
+    options.logLevel = options.logLevel or logging.getLevelName(
+        root_logger.getEffectiveLevel()
+    )
     set_log_level(options.logLevel)
     if options.colored_logs:
         install_log_color()
-    logger.debug(…
-        …
+    logger.debug(
+        f"Root logger is at level '{logging.getLevelName(root_logger.getEffectiveLevel())}', "
+        f"'toil' logger at level '{logging.getLevelName(toil_logger.getEffectiveLevel())}'."
+    )
 
     # start logging to log file if specified
     log_to_file(options.logFile, options.logRotating)
@@ -353,18 +465,24 @@ def suppress_exotic_logging(local_logger: str) -> None:
     This is important because some packages, particularly boto3, are not always instantiated yet in the
     environment when this is run, and so we create the logger and set the level preemptively.
     """
-    never_suppress = […
-    always_suppress = […
+    never_suppress = ["toil", "__init__", "__main__", "toil-rt", "cwltool"]
+    always_suppress = [
+        "boto3",
+        "boto",
+        "botocore",
+    ]  # ensure we suppress even before instantiated
 
-    top_level_loggers: …
+    top_level_loggers: list[str] = []
 
     # Due to https://stackoverflow.com/questions/61683713
     for pkg_logger in list(logging.Logger.manager.loggerDict.keys()) + always_suppress:
         if pkg_logger != local_logger:
             # many sub-loggers may exist, like "boto.a", "boto.b", "boto.c"; we only want the top_level: "boto"
-            top_level_logger = …
+            top_level_logger = (
+                pkg_logger.split(".")[0] if "." in pkg_logger else pkg_logger
+            )
 
             if top_level_logger not in top_level_loggers + never_suppress:
                 top_level_loggers.append(top_level_logger)
                 logging.getLogger(top_level_logger).setLevel(logging.CRITICAL)
-    logger.debug(f…
+    logger.debug(f"Suppressing the following loggers: {set(top_level_loggers)}")