toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. toil/__init__.py +1 -232
  2. toil/batchSystems/abstractBatchSystem.py +41 -17
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +4 -5
  7. toil/batchSystems/gridengine.py +1 -1
  8. toil/batchSystems/htcondor.py +5 -5
  9. toil/batchSystems/kubernetes.py +25 -11
  10. toil/batchSystems/local_support.py +3 -3
  11. toil/batchSystems/lsf.py +9 -9
  12. toil/batchSystems/mesos/batchSystem.py +4 -4
  13. toil/batchSystems/mesos/executor.py +3 -2
  14. toil/batchSystems/options.py +9 -0
  15. toil/batchSystems/singleMachine.py +11 -10
  16. toil/batchSystems/slurm.py +129 -16
  17. toil/batchSystems/torque.py +1 -1
  18. toil/bus.py +45 -3
  19. toil/common.py +56 -31
  20. toil/cwl/cwltoil.py +442 -371
  21. toil/deferred.py +1 -1
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/abstractFileStore.py +69 -20
  24. toil/fileStores/cachingFileStore.py +6 -22
  25. toil/fileStores/nonCachingFileStore.py +6 -15
  26. toil/job.py +270 -86
  27. toil/jobStores/abstractJobStore.py +37 -31
  28. toil/jobStores/aws/jobStore.py +280 -218
  29. toil/jobStores/aws/utils.py +60 -31
  30. toil/jobStores/conftest.py +2 -2
  31. toil/jobStores/fileJobStore.py +3 -3
  32. toil/jobStores/googleJobStore.py +3 -4
  33. toil/leader.py +89 -38
  34. toil/lib/aws/__init__.py +26 -10
  35. toil/lib/aws/iam.py +2 -2
  36. toil/lib/aws/session.py +62 -22
  37. toil/lib/aws/utils.py +73 -37
  38. toil/lib/conversions.py +24 -1
  39. toil/lib/ec2.py +118 -69
  40. toil/lib/expando.py +1 -1
  41. toil/lib/generatedEC2Lists.py +8 -8
  42. toil/lib/io.py +42 -4
  43. toil/lib/misc.py +1 -3
  44. toil/lib/resources.py +57 -16
  45. toil/lib/retry.py +12 -5
  46. toil/lib/threading.py +29 -14
  47. toil/lib/throttle.py +1 -1
  48. toil/options/common.py +31 -30
  49. toil/options/wdl.py +5 -0
  50. toil/provisioners/__init__.py +9 -3
  51. toil/provisioners/abstractProvisioner.py +12 -2
  52. toil/provisioners/aws/__init__.py +20 -15
  53. toil/provisioners/aws/awsProvisioner.py +406 -329
  54. toil/provisioners/gceProvisioner.py +2 -2
  55. toil/provisioners/node.py +13 -5
  56. toil/server/app.py +1 -1
  57. toil/statsAndLogging.py +93 -23
  58. toil/test/__init__.py +27 -12
  59. toil/test/batchSystems/batchSystemTest.py +40 -33
  60. toil/test/batchSystems/batch_system_plugin_test.py +79 -0
  61. toil/test/batchSystems/test_slurm.py +22 -7
  62. toil/test/cactus/__init__.py +0 -0
  63. toil/test/cactus/test_cactus_integration.py +58 -0
  64. toil/test/cwl/cwlTest.py +245 -236
  65. toil/test/cwl/seqtk_seq.cwl +1 -1
  66. toil/test/docs/scriptsTest.py +11 -14
  67. toil/test/jobStores/jobStoreTest.py +40 -54
  68. toil/test/lib/aws/test_iam.py +2 -2
  69. toil/test/lib/test_ec2.py +1 -1
  70. toil/test/options/__init__.py +13 -0
  71. toil/test/options/options.py +37 -0
  72. toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
  73. toil/test/provisioners/clusterTest.py +99 -16
  74. toil/test/server/serverTest.py +2 -2
  75. toil/test/src/autoDeploymentTest.py +1 -1
  76. toil/test/src/dockerCheckTest.py +2 -1
  77. toil/test/src/environmentTest.py +125 -0
  78. toil/test/src/fileStoreTest.py +1 -1
  79. toil/test/src/jobDescriptionTest.py +18 -8
  80. toil/test/src/jobTest.py +1 -1
  81. toil/test/src/realtimeLoggerTest.py +4 -0
  82. toil/test/src/workerTest.py +52 -19
  83. toil/test/utils/toilDebugTest.py +62 -4
  84. toil/test/utils/utilsTest.py +23 -21
  85. toil/test/wdl/wdltoil_test.py +49 -21
  86. toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
  87. toil/toilState.py +68 -9
  88. toil/utils/toilDebugFile.py +1 -1
  89. toil/utils/toilDebugJob.py +153 -26
  90. toil/utils/toilLaunchCluster.py +12 -2
  91. toil/utils/toilRsyncCluster.py +7 -2
  92. toil/utils/toilSshCluster.py +7 -3
  93. toil/utils/toilStats.py +310 -266
  94. toil/utils/toilStatus.py +98 -52
  95. toil/version.py +11 -11
  96. toil/wdl/wdltoil.py +644 -225
  97. toil/worker.py +125 -83
  98. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
  99. toil-7.0.0.dist-info/METADATA +158 -0
  100. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
  101. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
  102. toil-6.1.0a1.dist-info/METADATA +0 -125
  103. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
  104. {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
toil/worker.py CHANGED
@@ -26,7 +26,7 @@ import sys
26
26
  import time
27
27
  import traceback
28
28
  from contextlib import contextmanager
29
- from typing import Any, Callable, Iterator, List, Optional
29
+ from typing import Any, Callable, Iterator, List, Set, Optional
30
30
 
31
31
  from configargparse import ArgParser
32
32
 
@@ -36,13 +36,12 @@ from toil.cwl.utils import (CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION,
36
36
  CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE)
37
37
  from toil.deferred import DeferredFunctionManager
38
38
  from toil.fileStores.abstractFileStore import AbstractFileStore
39
- from toil.job import CheckpointJobDescription, Job, JobDescription
39
+ from toil.job import CheckpointJobDescription, Job, JobDescription, DebugStoppingPointReached
40
40
  from toil.jobStores.abstractJobStore import AbstractJobStore
41
41
  from toil.lib.expando import MagicExpando
42
42
  from toil.lib.io import make_public_dir
43
- from toil.lib.resources import (get_total_cpu_time,
44
- get_total_cpu_time_and_memory_usage)
45
- from toil.statsAndLogging import configure_root_logger, set_log_level
43
+ from toil.lib.resources import ResourceMonitor
44
+ from toil.statsAndLogging import configure_root_logger, set_log_level, install_log_color
46
45
 
47
46
  logger = logging.getLogger(__name__)
48
47
 
@@ -50,27 +49,27 @@ logger = logging.getLogger(__name__)
50
49
  class StatsDict(MagicExpando):
51
50
  """Subclass of MagicExpando for type-checking purposes."""
52
51
 
53
- jobs: List[str]
52
+ jobs: List[MagicExpando]
54
53
 
55
54
 
56
- def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, config: Config) -> Optional[JobDescription]:
55
+ def nextChainable(predecessor: JobDescription, job_store: AbstractJobStore, config: Config) -> Optional[JobDescription]:
57
56
  """
58
57
  Returns the next chainable job's JobDescription after the given predecessor
59
58
  JobDescription, if one exists, or None if the chain must terminate.
60
59
 
61
60
  :param predecessor: The job to chain from
62
- :param jobStore: The JobStore to fetch JobDescriptions from.
61
+ :param job_store: The JobStore to fetch JobDescriptions from.
63
62
  :param config: The configuration for the current run.
64
63
  """
65
64
  #If no more jobs to run or services not finished, quit
66
- if predecessor.nextSuccessors() is None or len(predecessor.services) > 0 or (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint != None):
65
+ if predecessor.nextSuccessors() is None or len(predecessor.services) > 0 or (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint is not None):
67
66
  logger.debug("Stopping running chain of jobs: no successors: %s, services: %s, checkpoint: %s",
68
- predecessor.nextSuccessors() is None, len(predecessor.services), (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint != None))
67
+ predecessor.nextSuccessors() is None, len(predecessor.services), (isinstance(predecessor, CheckpointJobDescription) and predecessor.checkpoint is not None))
69
68
  return None
70
69
 
71
70
 
72
71
  #Get the next set of jobs to run
73
- jobs = list(predecessor.nextSuccessors())
72
+ jobs = list(predecessor.nextSuccessors() or set())
74
73
  if len(jobs) == 0:
75
74
  # If there are no jobs, we might just not have any children.
76
75
  logger.debug("Stopping running chain of jobs because job has no ready children or follow-ons")
@@ -89,7 +88,7 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
89
88
  logger.debug("%s would chain to ID %s", predecessor, successorID)
90
89
 
91
90
  # Load the successor JobDescription
92
- successor = jobStore.load_job(successorID)
91
+ successor = job_store.load_job(successorID)
93
92
 
94
93
  #We check the requirements of the successor to see if we can run it
95
94
  #within the current worker
@@ -118,17 +117,38 @@ def nextChainable(predecessor: JobDescription, jobStore: AbstractJobStore, confi
118
117
  logger.debug("Next job is checkpoint, so finishing")
119
118
  return None
120
119
 
120
+ if not config.run_local_jobs_on_workers and predecessor.local and not successor.local:
121
+ # This job might be running on the leader, but the next job may not.
122
+ #
123
+ # TODO: Optimize by detecting whether we actually are on the leader,
124
+ # somehow.
125
+ logger.debug("Next job is not allowed to run on the leader, so finishing")
126
+ return None
127
+
121
128
  # Made it through! This job is chainable.
122
129
  return successor
123
130
 
124
- def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobStoreID: str, redirectOutputToLogFile: bool = True) -> int:
131
+ def workerScript(
132
+ job_store: AbstractJobStore,
133
+ config: Config,
134
+ job_name: str,
135
+ job_store_id: str,
136
+ redirect_output_to_log_file: bool = True,
137
+ local_worker_temp_dir: Optional[str] = None,
138
+ debug_flags: Optional[Set[str]] = None
139
+ ) -> int:
125
140
  """
126
141
  Worker process script, runs a job.
127
142
 
128
- :param jobStore: The JobStore to fetch JobDescriptions from.
143
+ :param job_store: The JobStore to fetch JobDescriptions from.
129
144
  :param config: The configuration for the current run.
130
- :param jobName: The "job name" (a user friendly name) of the job to be run
131
- :param jobStoreID: The job store ID of the job to be run
145
+ :param job_name: The "job name" (a user friendly name) of the job to be run
146
+ :param job_store_id: The job store ID of the job to be run
147
+ :param redirect_output_to_log_file: If False, log directly to the console
148
+ instead of capturing job output.
149
+ :param local_worker_temp_dir: The directory for the worker to work in. May
150
+ be recursively removed after the job runs.
151
+ :param debug_flags: Flags to set on each job before running it.
132
152
 
133
153
  :return int: 1 if a job failed, or 0 if all jobs succeeded
134
154
  """
@@ -136,6 +156,11 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
136
156
  configure_root_logger()
137
157
  set_log_level(config.logLevel)
138
158
 
159
+ if config.colored_logs:
160
+ install_log_color()
161
+
162
+ logger.debug("Worker started for job %s...", job_name)
163
+
139
164
  ##########################################
140
165
  #Create the worker killer, if requested
141
166
  ##########################################
@@ -182,7 +207,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
182
207
  ##########################################
183
208
 
184
209
  #First load the environment for the job.
185
- with jobStore.read_shared_file_stream("environment.pickle") as fileHandle:
210
+ with job_store.read_shared_file_stream("environment.pickle") as fileHandle:
186
211
  environment = safeUnpickleFromStream(fileHandle)
187
212
  env_reject = {
188
213
  "TMPDIR",
@@ -228,8 +253,10 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
228
253
  toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID, config.workDir)
229
254
  # Dir to put lock files in, ideally not on NFS.
230
255
  toil_coordination_dir = Toil.get_local_workflow_coordination_dir(config.workflowID, config.workDir, config.coordination_dir)
231
- localWorkerTempDir = make_public_dir(in_directory=toilWorkflowDir)
232
- os.chmod(localWorkerTempDir, 0o755)
256
+ if local_worker_temp_dir is None:
257
+ # Invent a temp directory to work in
258
+ local_worker_temp_dir = make_public_dir(toilWorkflowDir)
259
+ os.chmod(local_worker_temp_dir, 0o755)
233
260
 
234
261
  ##########################################
235
262
  #Setup the logging
@@ -245,12 +272,12 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
245
272
  #file descriptor 1, and standard error is file descriptor 2.
246
273
 
247
274
  # Do we even want to redirect output? Let the config make us not do it.
248
- redirectOutputToLogFile = redirectOutputToLogFile and not config.disableWorkerOutputCapture
275
+ redirect_output_to_log_file = redirect_output_to_log_file and not config.disableWorkerOutputCapture
249
276
 
250
277
  #What file do we want to point FDs 1 and 2 to?
251
- tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")
278
+ tempWorkerLogPath = os.path.join(local_worker_temp_dir, "worker_log.txt")
252
279
 
253
- if redirectOutputToLogFile:
280
+ if redirect_output_to_log_file:
254
281
  # Announce that we are redirecting logging, and where it will now go.
255
282
  # This is only important if we are trying to manually trace a faulty worker invocation.
256
283
  logger.debug("Redirecting logging to %s", tempWorkerLogPath)
@@ -287,13 +314,14 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
287
314
 
288
315
  jobAttemptFailed = False
289
316
  failure_exit_code = 1
317
+ first_job_cores = None
290
318
  statsDict = StatsDict() # type: ignore[no-untyped-call]
291
319
  statsDict.jobs = []
292
- statsDict.workers.logsToMaster = []
320
+ statsDict.workers.logs_to_leader = []
321
+ statsDict.workers.logging_user_streams = []
293
322
 
294
323
  def blockFn() -> bool:
295
324
  return True
296
- listOfJobs = [jobName]
297
325
  job = None
298
326
  try:
299
327
 
@@ -312,18 +340,17 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
312
340
  # Load the JobDescription
313
341
  ##########################################
314
342
 
315
- jobDesc = jobStore.load_job(jobStoreID)
316
- listOfJobs[0] = str(jobDesc)
343
+ jobDesc = job_store.load_job(job_store_id)
317
344
  logger.debug("Parsed job description")
318
345
 
319
346
  ##########################################
320
347
  # Cleanup from any earlier invocation of the job
321
348
  ##########################################
322
349
 
323
- if jobDesc.command is None:
350
+ if not jobDesc.has_body():
324
351
  logger.debug("Job description has no body to run.")
325
352
  # Cleanup jobs already finished
326
- jobDesc.clear_nonexistent_dependents(jobStore)
353
+ jobDesc.clear_nonexistent_dependents(job_store)
327
354
  logger.debug("Cleaned up any references to completed successor jobs")
328
355
 
329
356
  # This cleans the old log file which may
@@ -331,8 +358,8 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
331
358
  oldLogFile = jobDesc.logJobStoreFileID
332
359
  if oldLogFile is not None:
333
360
  jobDesc.logJobStoreFileID = None
334
- jobStore.update_job(jobDesc) # Update first, before deleting any files
335
- jobStore.delete_file(oldLogFile)
361
+ job_store.update_job(jobDesc) # Update first, before deleting any files
362
+ job_store.delete_file(oldLogFile)
336
363
 
337
364
  ##########################################
338
365
  # If a checkpoint exists, restart from the checkpoint
@@ -350,20 +377,22 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
350
377
  if jobDesc.remainingTryCount < 0:
351
378
  raise RuntimeError("The try count of the job cannot be negative.")
352
379
  jobDesc.remainingTryCount = max(0, jobDesc.remainingTryCount - 1)
353
- jobDesc.restartCheckpoint(jobStore)
380
+ jobDesc.restartCheckpoint(job_store)
354
381
  # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
355
382
  # because of the job being a checkpoint
356
383
  else:
357
384
  logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.")
358
385
  #Delete any remnant files
359
- list(map(jobStore.delete_file, list(filter(jobStore.file_exists, jobDesc.checkpointFilesToDelete))))
386
+ list(map(job_store.delete_file, list(filter(job_store.file_exists, jobDesc.checkpointFilesToDelete))))
360
387
 
361
388
  ##########################################
362
389
  #Setup the stats, if requested
363
390
  ##########################################
364
391
 
365
392
  if config.stats:
366
- startClock = get_total_cpu_time()
393
+ # Remember the cores from the first job, which is how many we have reserved for us.
394
+ statsDict.workers.requested_cores = jobDesc.cores
395
+ startClock = ResourceMonitor.get_total_cpu_time()
367
396
 
368
397
  startTime = time.time()
369
398
  while True:
@@ -373,20 +402,22 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
373
402
 
374
403
  logger.info("Working on job %s", jobDesc)
375
404
 
376
- if jobDesc.command is not None:
377
- if not jobDesc.command.startswith("_toil "):
378
- raise RuntimeError("Job command must start with '_toil' before being converted to an executable command.")
379
- logger.debug("Got a command to run: %s" % jobDesc.command)
405
+ if jobDesc.has_body():
380
406
  # Load the job. It will use the same JobDescription we have been using.
381
- job = Job.loadJob(jobStore, jobDesc)
407
+ job = Job.loadJob(job_store, jobDesc)
382
408
  if isinstance(jobDesc, CheckpointJobDescription):
383
- # If it is a checkpoint job, save the command
384
- jobDesc.checkpoint = jobDesc.command
409
+ # If it is a checkpoint job, set the checkpoint
410
+ jobDesc.set_checkpoint()
385
411
 
386
412
  logger.info("Loaded body %s from description %s", job, jobDesc)
387
413
 
414
+ if debug_flags:
415
+ for flag in debug_flags:
416
+ logger.debug("Turning on debug flag %s on job", flag)
417
+ job.set_debug_flag(flag)
418
+
388
419
  # Create a fileStore object for the job
389
- fileStore = AbstractFileStore.createFileStore(jobStore, jobDesc, localWorkerTempDir, blockFn,
420
+ fileStore = AbstractFileStore.createFileStore(job_store, jobDesc, local_worker_temp_dir, blockFn,
390
421
  caching=config.caching)
391
422
  with job._executor(stats=statsDict if config.stats else None,
392
423
  fileStore=fileStore):
@@ -404,21 +435,22 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
404
435
  # wants across multiple Toil versions. We also
405
436
  # still pass a jobGraph argument to placate old
406
437
  # versions of Cactus.
407
- job._runner(jobGraph=None, jobStore=jobStore, fileStore=fileStore, defer=defer)
438
+ job._runner(jobGraph=None, jobStore=job_store, fileStore=fileStore, defer=defer)
408
439
 
409
440
  # When the executor for the job finishes it will
410
- # kick off a commit with the command link to the
411
- # job body cut.
441
+ # kick off a commit with the link to the job body
442
+ # cut.
412
443
 
413
444
  # Accumulate messages from this job & any subsequent chained jobs
414
- statsDict.workers.logsToMaster += fileStore.loggingMessages
445
+ statsDict.workers.logs_to_leader += fileStore.logging_messages
446
+ statsDict.workers.logging_user_streams += fileStore.logging_user_streams
415
447
 
416
448
  logger.info("Completed body for %s", jobDesc)
417
449
 
418
450
  else:
419
- #The command may be none, in which case
420
- #the JobDescription is either a shell ready to be deleted or has
421
- #been scheduled after a failure to cleanup
451
+ # The body may not be attached, in which case the
452
+ # JobDescription is either a shell ready to be deleted or has
453
+ # been scheduled after a failure to cleanup
422
454
  logger.debug("No user job to run, so finishing")
423
455
  break
424
456
 
@@ -428,7 +460,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
428
460
  ##########################################
429
461
  #Establish if we can run another job within the worker
430
462
  ##########################################
431
- successor = nextChainable(jobDesc, jobStore, config)
463
+ successor = nextChainable(jobDesc, job_store, config)
432
464
  if successor is None or config.disableChaining:
433
465
  # Can't chain any more jobs. We are going to stop.
434
466
 
@@ -457,9 +489,6 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
457
489
  # body) up after we finish executing it.
458
490
  successorID = successor.jobStoreID
459
491
 
460
- # add the successor to the list of jobs run
461
- listOfJobs.append(str(successor))
462
-
463
492
  # Now we need to become that successor, under the original ID.
464
493
  successor.replace(jobDesc)
465
494
  jobDesc = successor
@@ -470,7 +499,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
470
499
 
471
500
  # Build a fileStore to update the job and commit the replacement.
472
501
  # TODO: can we have a commit operation without an entire FileStore???
473
- fileStore = AbstractFileStore.createFileStore(jobStore, jobDesc, localWorkerTempDir, blockFn,
502
+ fileStore = AbstractFileStore.createFileStore(job_store, jobDesc, local_worker_temp_dir, blockFn,
474
503
  caching=config.caching)
475
504
 
476
505
  # Update blockFn to wait for that commit operation.
@@ -485,27 +514,44 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
485
514
  #Finish up the stats
486
515
  ##########################################
487
516
  if config.stats:
488
- totalCPUTime, totalMemoryUsage = get_total_cpu_time_and_memory_usage()
517
+ totalCPUTime, totalMemoryUsage = ResourceMonitor.get_total_cpu_time_and_memory_usage()
489
518
  statsDict.workers.time = str(time.time() - startTime)
490
519
  statsDict.workers.clock = str(totalCPUTime - startClock)
491
520
  statsDict.workers.memory = str(totalMemoryUsage)
521
+ # Say the worker used the max disk we saw from any job
522
+ max_bytes = 0
523
+ for job_stats in statsDict.jobs:
524
+ if "disk" in job_stats:
525
+ max_bytes = max(max_bytes, int(job_stats.disk))
526
+ statsDict.workers.disk = str(max_bytes)
527
+ # Count the jobs executed.
528
+ # TODO: toil stats could compute this but its parser is too general to hook into simply.
529
+ statsDict.workers.jobs_run = len(statsDict.jobs)
530
+
492
531
 
493
532
  # log the worker log path here so that if the file is truncated the path can still be found
494
- if redirectOutputToLogFile:
495
- logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", localWorkerTempDir)
533
+ if redirect_output_to_log_file:
534
+ logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", local_worker_temp_dir)
496
535
 
497
536
  logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
498
537
 
499
538
  ##########################################
500
539
  #Trapping where worker goes wrong
501
540
  ##########################################
502
- except Exception as e: #Case that something goes wrong in worker
503
- traceback.print_exc()
541
+ except DebugStoppingPointReached:
542
+ # Job wants the worker to stop for debugging
543
+ raise
544
+ except BaseException as e: #Case that something goes wrong in worker, or we are asked to stop
545
+ if not isinstance(e, SystemExit):
546
+ logger.critical("Worker crashed with traceback:\n%s", traceback.format_exc())
504
547
  logger.error("Exiting the worker because of a failed job on host %s", socket.gethostname())
505
548
  if isinstance(e, CWL_UNSUPPORTED_REQUIREMENT_EXCEPTION):
506
549
  # We need to inform the leader that this is a CWL workflow problem
507
550
  # and it needs to inform its caller.
508
551
  failure_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
552
+ elif isinstance(e, SystemExit) and isinstance(e.code, int) and e.code != 0:
553
+ # We're meant to be exiting with a particular code.
554
+ failure_exit_code = e.code
509
555
  AbstractFileStore._terminateEvent.set()
510
556
  finally:
511
557
  # Get rid of our deferred function manager now so we can't mistake it
@@ -538,7 +584,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
538
584
 
539
585
  # Clobber any garbage state we have for this job from failing with
540
586
  # whatever good state is still stored in the JobStore
541
- jobDesc = jobStore.load_job(jobStoreID)
587
+ jobDesc = job_store.load_job(job_store_id)
542
588
  # Remember that we failed
543
589
  jobAttemptFailed = True
544
590
 
@@ -550,7 +596,7 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
550
596
  # Flush at the Python level
551
597
  sys.stdout.flush()
552
598
  sys.stderr.flush()
553
- if redirectOutputToLogFile:
599
+ if redirect_output_to_log_file:
554
600
  # Flush at the OS level
555
601
  os.fsync(1)
556
602
  os.fsync(2)
@@ -577,12 +623,11 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
577
623
  # relative to the end (since Python won't decode Unicode backward, or even
578
624
  # interpret seek offsets in characters for us). TODO: We may get invalid or
579
625
  # just different Unicode by breaking up a character at the boundary!
580
- if jobAttemptFailed and redirectOutputToLogFile:
581
- jobDesc.logJobStoreFileID = logJobStoreFileID = jobStore.getEmptyFileStoreID(
626
+ if jobAttemptFailed and redirect_output_to_log_file:
627
+ jobDesc.logJobStoreFileID = logJobStoreFileID = job_store.getEmptyFileStoreID(
582
628
  jobDesc.jobStoreID, cleanup=True
583
629
  )
584
- jobDesc.chainedJobs = listOfJobs
585
- with jobStore.update_file_stream(logJobStoreFileID) as w:
630
+ with job_store.update_file_stream(logJobStoreFileID) as w:
586
631
  with open(tempWorkerLogPath, 'rb') as f:
587
632
  if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit !=0:
588
633
  if logFileByteReportLimit > 0:
@@ -592,10 +637,10 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
592
637
  # Dump the possibly-invalid-Unicode bytes into the log file
593
638
  w.write(f.read()) # TODO load file using a buffer
594
639
  # Commit log file reference back to JobStore
595
- jobStore.update_job(jobDesc)
640
+ job_store.update_job(jobDesc)
596
641
 
597
642
  elif ((debugging or (config.writeLogsFromAllJobs and not jobDesc.local))
598
- and redirectOutputToLogFile): # write log messages
643
+ and redirect_output_to_log_file): # write log messages
599
644
  with open(tempWorkerLogPath, 'rb') as logFile:
600
645
  if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
601
646
  if logFileByteReportLimit > 0:
@@ -605,11 +650,14 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
605
650
  # Make sure lines are Unicode so they can be JSON serialized as part of the dict.
606
651
  # We may have damaged the Unicode text by cutting it at an arbitrary byte so we drop bad characters.
607
652
  logMessages = [line.decode('utf-8', 'skip') for line in logFile.read().splitlines()]
608
- statsDict.logs.names = listOfJobs
653
+ statsDict.logs.names = [names.stats_name for names in jobDesc.get_chain()]
609
654
  statsDict.logs.messages = logMessages
610
655
 
611
- if (debugging or config.stats or statsDict.workers.logsToMaster) and not jobAttemptFailed: # We have stats/logging to report back
612
- jobStore.write_logs(json.dumps(statsDict, ensure_ascii=True))
656
+ if debugging or config.stats or statsDict.workers.logs_to_leader or statsDict.workers.logging_user_streams:
657
+ # We have stats/logging to report back.
658
+ # We report even if the job attempt failed.
659
+ # TODO: Will that upset analysis of the stats?
660
+ job_store.write_logs(json.dumps(statsDict, ensure_ascii=True))
613
661
 
614
662
  # Remove the temp dir
615
663
  cleanUp = config.cleanWorkDir
@@ -627,14 +675,14 @@ def workerScript(jobStore: AbstractJobStore, config: Config, jobName: str, jobSt
627
675
  os.chmod(os.path.dirname(path), stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
628
676
  except PermissionError as e:
629
677
  logger.error('Could not set permissions on %s to allow cleanup of %s: %s', os.path.dirname(path), path, e)
630
- shutil.rmtree(localWorkerTempDir, onerror=make_parent_writable)
678
+ shutil.rmtree(local_worker_temp_dir, onerror=make_parent_writable)
631
679
 
632
680
  # This must happen after the log file is done with, else there is no place to put the log
633
681
  if (not jobAttemptFailed) and jobDesc.is_subtree_done():
634
- # We can now safely get rid of the JobDescription, and all jobs it chained up
635
- for otherID in jobDesc.merged_jobs:
636
- jobStore.delete_job(otherID)
637
- jobStore.delete_job(str(jobDesc.jobStoreID))
682
+ for merged_in in jobDesc.get_chain():
683
+ # We can now safely get rid of the JobDescription, and all jobs it chained up
684
+ job_store.delete_job(merged_in.job_store_id)
685
+
638
686
 
639
687
  if jobAttemptFailed:
640
688
  return failure_exit_code
@@ -704,25 +752,19 @@ def in_contexts(contexts: List[str]) -> Iterator[None]:
704
752
  def main(argv: Optional[List[str]] = None) -> None:
705
753
  if argv is None:
706
754
  argv = sys.argv
707
-
708
755
  # Parse our command line
709
756
  options = parse_args(argv)
710
757
 
711
- # Parse input args
712
- jobName = argv[1]
713
- jobStoreLocator = argv[2]
714
- jobStoreID = argv[3]
715
-
716
758
  ##########################################
717
759
  #Load the jobStore/config file
718
760
  ##########################################
719
761
 
720
- jobStore = Toil.resumeJobStore(options.jobStoreLocator)
721
- config = jobStore.config
762
+ job_store = Toil.resumeJobStore(options.jobStoreLocator)
763
+ config = job_store.config
722
764
 
723
765
  with in_contexts(options.context):
724
766
  # Call the worker
725
- exit_code = workerScript(jobStore, config, options.jobName, options.jobStoreID)
767
+ exit_code = workerScript(job_store, config, options.jobName, options.jobStoreID)
726
768
 
727
769
  # Exit with its return value
728
770
  sys.exit(exit_code)
@@ -202,3 +202,28 @@
202
202
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
203
203
  See the License for the specific language governing permissions and
204
204
  limitations under the License.
205
+
206
+ All code in this repository excluding src/toil/statsAndLogging.py::install_log_color is under the Apache License as outlined directly above.
207
+ Some code in src/toil/statsAndLogging.py::install_log_color is under the MiniWDL MIT License as outlined directly below.
208
+
209
+ MIT License
210
+
211
+ Copyright (c) 2018 Chan Zuckerberg Initiative
212
+
213
+ Permission is hereby granted, free of charge, to any person obtaining a copy
214
+ of this software and associated documentation files (the "Software"), to deal
215
+ in the Software without restriction, including without limitation the rights
216
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
217
+ copies of the Software, and to permit persons to whom the Software is
218
+ furnished to do so, subject to the following conditions:
219
+
220
+ The above copyright notice and this permission notice shall be included in all
221
+ copies or substantial portions of the Software.
222
+
223
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
224
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
225
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
226
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
227
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
228
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
229
+ SOFTWARE.