toil 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. toil/__init__.py +18 -13
  2. toil/batchSystems/abstractBatchSystem.py +39 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +24 -24
  4. toil/batchSystems/awsBatch.py +14 -14
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +3 -3
  7. toil/batchSystems/htcondor.py +0 -1
  8. toil/batchSystems/kubernetes.py +34 -31
  9. toil/batchSystems/local_support.py +3 -1
  10. toil/batchSystems/lsf.py +7 -7
  11. toil/batchSystems/mesos/batchSystem.py +7 -7
  12. toil/batchSystems/options.py +32 -83
  13. toil/batchSystems/registry.py +104 -23
  14. toil/batchSystems/singleMachine.py +16 -13
  15. toil/batchSystems/slurm.py +87 -16
  16. toil/batchSystems/torque.py +0 -1
  17. toil/bus.py +44 -8
  18. toil/common.py +544 -753
  19. toil/cwl/__init__.py +28 -32
  20. toil/cwl/cwltoil.py +595 -574
  21. toil/cwl/utils.py +55 -10
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/__init__.py +2 -2
  24. toil/fileStores/abstractFileStore.py +88 -14
  25. toil/fileStores/cachingFileStore.py +610 -549
  26. toil/fileStores/nonCachingFileStore.py +46 -22
  27. toil/job.py +182 -101
  28. toil/jobStores/abstractJobStore.py +161 -95
  29. toil/jobStores/aws/jobStore.py +23 -9
  30. toil/jobStores/aws/utils.py +6 -6
  31. toil/jobStores/fileJobStore.py +116 -18
  32. toil/jobStores/googleJobStore.py +16 -7
  33. toil/jobStores/utils.py +5 -6
  34. toil/leader.py +87 -56
  35. toil/lib/accelerators.py +10 -5
  36. toil/lib/aws/__init__.py +3 -14
  37. toil/lib/aws/ami.py +22 -9
  38. toil/lib/aws/iam.py +21 -13
  39. toil/lib/aws/session.py +2 -16
  40. toil/lib/aws/utils.py +4 -5
  41. toil/lib/compatibility.py +1 -1
  42. toil/lib/conversions.py +26 -3
  43. toil/lib/docker.py +22 -23
  44. toil/lib/ec2.py +10 -6
  45. toil/lib/ec2nodes.py +106 -100
  46. toil/lib/encryption/_nacl.py +2 -1
  47. toil/lib/generatedEC2Lists.py +325 -18
  48. toil/lib/io.py +49 -2
  49. toil/lib/misc.py +1 -1
  50. toil/lib/resources.py +9 -2
  51. toil/lib/threading.py +101 -38
  52. toil/options/common.py +736 -0
  53. toil/options/cwl.py +336 -0
  54. toil/options/wdl.py +37 -0
  55. toil/provisioners/abstractProvisioner.py +9 -4
  56. toil/provisioners/aws/__init__.py +3 -6
  57. toil/provisioners/aws/awsProvisioner.py +6 -0
  58. toil/provisioners/clusterScaler.py +3 -2
  59. toil/provisioners/gceProvisioner.py +2 -2
  60. toil/realtimeLogger.py +2 -1
  61. toil/resource.py +24 -18
  62. toil/server/app.py +2 -3
  63. toil/server/cli/wes_cwl_runner.py +4 -4
  64. toil/server/utils.py +1 -1
  65. toil/server/wes/abstract_backend.py +3 -2
  66. toil/server/wes/amazon_wes_utils.py +5 -4
  67. toil/server/wes/tasks.py +2 -3
  68. toil/server/wes/toil_backend.py +2 -10
  69. toil/server/wsgi_app.py +2 -0
  70. toil/serviceManager.py +12 -10
  71. toil/statsAndLogging.py +41 -9
  72. toil/test/__init__.py +29 -54
  73. toil/test/batchSystems/batchSystemTest.py +11 -111
  74. toil/test/batchSystems/test_slurm.py +24 -8
  75. toil/test/cactus/__init__.py +0 -0
  76. toil/test/cactus/test_cactus_integration.py +58 -0
  77. toil/test/cwl/cwlTest.py +438 -223
  78. toil/test/cwl/glob_dir.cwl +15 -0
  79. toil/test/cwl/preemptible.cwl +21 -0
  80. toil/test/cwl/preemptible_expression.cwl +28 -0
  81. toil/test/cwl/revsort.cwl +1 -1
  82. toil/test/cwl/revsort2.cwl +1 -1
  83. toil/test/docs/scriptsTest.py +2 -3
  84. toil/test/jobStores/jobStoreTest.py +34 -21
  85. toil/test/lib/aws/test_iam.py +4 -14
  86. toil/test/lib/aws/test_utils.py +0 -3
  87. toil/test/lib/dockerTest.py +4 -4
  88. toil/test/lib/test_ec2.py +12 -17
  89. toil/test/mesos/helloWorld.py +4 -5
  90. toil/test/mesos/stress.py +1 -1
  91. toil/test/{wdl/conftest.py → options/__init__.py} +0 -10
  92. toil/test/options/options.py +37 -0
  93. toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
  94. toil/test/provisioners/clusterScalerTest.py +6 -4
  95. toil/test/provisioners/clusterTest.py +23 -11
  96. toil/test/provisioners/gceProvisionerTest.py +0 -6
  97. toil/test/provisioners/restartScript.py +3 -2
  98. toil/test/server/serverTest.py +1 -1
  99. toil/test/sort/restart_sort.py +2 -1
  100. toil/test/sort/sort.py +2 -1
  101. toil/test/sort/sortTest.py +2 -13
  102. toil/test/src/autoDeploymentTest.py +45 -45
  103. toil/test/src/busTest.py +5 -5
  104. toil/test/src/checkpointTest.py +2 -2
  105. toil/test/src/deferredFunctionTest.py +1 -1
  106. toil/test/src/fileStoreTest.py +32 -16
  107. toil/test/src/helloWorldTest.py +1 -1
  108. toil/test/src/importExportFileTest.py +1 -1
  109. toil/test/src/jobDescriptionTest.py +2 -1
  110. toil/test/src/jobServiceTest.py +1 -1
  111. toil/test/src/jobTest.py +18 -18
  112. toil/test/src/miscTests.py +5 -3
  113. toil/test/src/promisedRequirementTest.py +3 -3
  114. toil/test/src/realtimeLoggerTest.py +1 -1
  115. toil/test/src/resourceTest.py +2 -2
  116. toil/test/src/restartDAGTest.py +1 -1
  117. toil/test/src/resumabilityTest.py +36 -2
  118. toil/test/src/retainTempDirTest.py +1 -1
  119. toil/test/src/systemTest.py +2 -2
  120. toil/test/src/toilContextManagerTest.py +2 -2
  121. toil/test/src/userDefinedJobArgTypeTest.py +1 -1
  122. toil/test/utils/toilDebugTest.py +98 -32
  123. toil/test/utils/toilKillTest.py +2 -2
  124. toil/test/utils/utilsTest.py +23 -3
  125. toil/test/wdl/wdltoil_test.py +223 -45
  126. toil/toilState.py +7 -6
  127. toil/utils/toilClean.py +1 -1
  128. toil/utils/toilConfig.py +36 -0
  129. toil/utils/toilDebugFile.py +60 -33
  130. toil/utils/toilDebugJob.py +39 -12
  131. toil/utils/toilDestroyCluster.py +1 -1
  132. toil/utils/toilKill.py +1 -1
  133. toil/utils/toilLaunchCluster.py +13 -2
  134. toil/utils/toilMain.py +3 -2
  135. toil/utils/toilRsyncCluster.py +1 -1
  136. toil/utils/toilSshCluster.py +1 -1
  137. toil/utils/toilStats.py +445 -305
  138. toil/utils/toilStatus.py +2 -5
  139. toil/version.py +10 -10
  140. toil/wdl/utils.py +2 -122
  141. toil/wdl/wdltoil.py +1257 -492
  142. toil/worker.py +55 -46
  143. toil-6.1.0.dist-info/METADATA +124 -0
  144. toil-6.1.0.dist-info/RECORD +241 -0
  145. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/WHEEL +1 -1
  146. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -1
  147. toil/batchSystems/parasol.py +0 -379
  148. toil/batchSystems/tes.py +0 -459
  149. toil/test/batchSystems/parasolTestSupport.py +0 -117
  150. toil/test/wdl/builtinTest.py +0 -506
  151. toil/test/wdl/toilwdlTest.py +0 -522
  152. toil/wdl/toilwdl.py +0 -141
  153. toil/wdl/versions/dev.py +0 -107
  154. toil/wdl/versions/draft2.py +0 -980
  155. toil/wdl/versions/v1.py +0 -794
  156. toil/wdl/wdl_analysis.py +0 -116
  157. toil/wdl/wdl_functions.py +0 -997
  158. toil/wdl/wdl_synthesis.py +0 -1011
  159. toil/wdl/wdl_types.py +0 -243
  160. toil-5.12.0.dist-info/METADATA +0 -118
  161. toil-5.12.0.dist-info/RECORD +0 -244
  162. /toil/{wdl/versions → options}/__init__.py +0 -0
  163. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
  164. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
toil/leader.py CHANGED
@@ -28,22 +28,24 @@ import enlighten
28
28
  from toil import resolveEntryPoint
29
29
  from toil.batchSystems import DeadlockException
30
30
  from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem,
31
- BatchJobExitReason)
32
- from toil.bus import (JobAnnotationMessage,
33
- JobCompletedMessage,
31
+ BatchJobExitReason,
32
+ EXIT_STATUS_UNAVAILABLE_VALUE)
33
+ from toil.bus import (JobCompletedMessage,
34
34
  JobFailedMessage,
35
35
  JobIssuedMessage,
36
36
  JobMissingMessage,
37
37
  JobUpdatedMessage,
38
- QueueSizeMessage)
39
- from toil.common import Config, Toil, ToilMetrics
38
+ QueueSizeMessage,
39
+ gen_message_bus_path,
40
+ get_job_kind)
41
+ from toil.common import Config, ToilMetrics
40
42
  from toil.cwl.utils import CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
43
+ from toil.exceptions import FailedJobsException
41
44
  from toil.job import (CheckpointJobDescription,
42
45
  JobDescription,
43
46
  ServiceJobDescription,
44
47
  TemporaryID)
45
48
  from toil.jobStores.abstractJobStore import (AbstractJobStore,
46
- NoSuchFileException,
47
49
  NoSuchJobException)
48
50
  from toil.lib.throttle import LocalThrottle
49
51
  from toil.provisioners.abstractProvisioner import AbstractProvisioner
@@ -51,7 +53,6 @@ from toil.provisioners.clusterScaler import ScalerThread
51
53
  from toil.serviceManager import ServiceManager
52
54
  from toil.statsAndLogging import StatsAndLogging
53
55
  from toil.toilState import ToilState
54
- from toil.exceptions import FailedJobsException
55
56
 
56
57
  logger = logging.getLogger(__name__)
57
58
 
@@ -115,10 +116,14 @@ class Leader:
115
116
  # state change information about jobs.
116
117
  self.toilState = ToilState(self.jobStore)
117
118
 
118
- if self.config.write_messages is not None:
119
- # Message bus messages need to go to the given file.
120
- # Keep a reference to the return value so the listener stays alive.
121
- self._message_subscription = self.toilState.bus.connect_output_file(self.config.write_messages)
119
+ if self.config.write_messages is None:
120
+ # The user hasn't specified a place for the message bus so we
121
+ # should make one.
122
+ self.config.write_messages = gen_message_bus_path()
123
+
124
+ # Message bus messages need to go to the given file.
125
+ # Keep a reference to the return value so the listener stays alive.
126
+ self._message_subscription = self.toilState.bus.connect_output_file(self.config.write_messages)
122
127
 
123
128
  # Connect to the message bus, so we will get all the messages of these
124
129
  # types in an inbox.
@@ -138,7 +143,8 @@ class Leader:
138
143
 
139
144
  # Batch system
140
145
  self.batchSystem = batchSystem
141
- assert len(self.batchSystem.getIssuedBatchJobIDs()) == 0 # Batch system must start with no active jobs!
146
+ if len(self.batchSystem.getIssuedBatchJobIDs()) != 0:
147
+ raise RuntimeError("The initialized batchsystem did not start with 0 active jobs.")
142
148
  logger.debug("Checked batch system has no running jobs and no updated jobs")
143
149
 
144
150
  # Map of batch system IDs to job store IDs
@@ -370,7 +376,8 @@ class Leader:
370
376
 
371
377
  # If the successor job's predecessors have all not all completed then
372
378
  # ignore the successor as is not yet ready to run
373
- assert len(successor.predecessorsFinished) <= successor.predecessorNumber
379
+ if len(successor.predecessorsFinished) > successor.predecessorNumber:
380
+ raise RuntimeError("There are more finished predecessors than possible.")
374
381
  if len(successor.predecessorsFinished) == successor.predecessorNumber:
375
382
  # All the successor's predecessors are done now.
376
383
  # Remove the successor job from the set of waiting multi-predecessor jobs.
@@ -391,8 +398,10 @@ class Leader:
391
398
  #Build map from successor to predecessors.
392
399
  if successor_id not in self.toilState.successor_to_predecessors:
393
400
  self.toilState.successor_to_predecessors[successor_id] = set()
394
- assert isinstance(successor_id, str)
395
- assert isinstance(predecessor_id, str)
401
+ if not isinstance(successor_id, str):
402
+ raise RuntimeError("The given successor ID is invalid.")
403
+ if not isinstance(predecessor_id, str):
404
+ raise RuntimeError("The given predecessor ID is invalid.")
396
405
  self.toilState.successor_to_predecessors[successor_id].add(predecessor_id)
397
406
 
398
407
  # Grab the successor
@@ -423,7 +432,8 @@ class Leader:
423
432
  predecessor_id, len(next_successors))
424
433
  #Record the number of successors that must be completed before
425
434
  #the job can be considered again
426
- assert self.toilState.count_pending_successors(predecessor_id) == 0, 'Attempted to schedule successors of the same job twice!'
435
+ if self.toilState.count_pending_successors(predecessor_id) != 0:
436
+ raise RuntimeError('Attempted to schedule successors of the same job twice!')
427
437
  self.toilState.successors_pending(predecessor_id, len(next_successors))
428
438
 
429
439
  # For each successor schedule if all predecessors have been completed
@@ -534,11 +544,13 @@ class Leader:
534
544
  # the job has services to run, which have not been started, start them
535
545
  # Build a map from the service jobs to the job and a map
536
546
  # of the services created for the job
537
- assert readyJob.jobStoreID not in self.toilState.servicesIssued
547
+ if readyJob.jobStoreID in self.toilState.servicesIssued:
548
+ raise RuntimeError(f"The ready job: {readyJob.jobStoreID} was already issued.")
538
549
  self.toilState.servicesIssued[readyJob.jobStoreID] = set()
539
550
  for serviceJobList in readyJob.serviceHostIDsInBatches():
540
551
  for serviceID in serviceJobList:
541
- assert serviceID not in self.toilState.service_to_client
552
+ if serviceID in self.toilState.service_to_client:
553
+ raise RuntimeError(f"The ready service ID: {serviceID} was already added.")
542
554
  self.toilState.reset_job(serviceID)
543
555
  serviceHost = self.toilState.get_job(serviceID)
544
556
  self.toilState.service_to_client[serviceID] = readyJob.jobStoreID
@@ -675,7 +687,8 @@ class Leader:
675
687
  client = self.toilState.get_job(client_id)
676
688
 
677
689
  # Make sure services still want to run
678
- assert next(client.serviceHostIDsInBatches(), None) is not None
690
+ if next(client.serviceHostIDsInBatches(), None) is None:
691
+ raise RuntimeError("No more services want to run.")
679
692
 
680
693
  # Mark the service job updated so we don't stop here.
681
694
  self._messages.publish(JobUpdatedMessage(client_id, 1))
@@ -694,8 +707,9 @@ class Leader:
694
707
  if exitStatus == 0:
695
708
  logger.debug('Job ended: %s', updatedJob)
696
709
  else:
697
- logger.warning(f'Job failed with exit value {exitStatus}: {updatedJob}\n'
698
- f'Exit reason: {exitReason}')
710
+ status_string = str(exitStatus) if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE else "<UNAVAILABLE>"
711
+ logger.warning(f'Job failed with exit value {status_string}: {updatedJob}\n'
712
+ f'Exit reason: {BatchJobExitReason.to_string(exitReason)}')
699
713
  if exitStatus == CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE:
700
714
  # This is a CWL job informing us that the workflow is
701
715
  # asking things of us that Toil can't do. When we raise an
@@ -704,7 +718,7 @@ class Leader:
704
718
  logger.warning("This indicates an unsupported CWL requirement!")
705
719
  self.recommended_fail_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
706
720
  # Tell everyone it stopped running.
707
- self._messages.publish(JobCompletedMessage(updatedJob.get_job_kind(), updatedJob.jobStoreID, exitStatus))
721
+ self._messages.publish(JobCompletedMessage(get_job_kind(updatedJob.get_names()), updatedJob.jobStoreID, exitStatus))
708
722
  self.process_finished_job(bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason)
709
723
 
710
724
  def _processLostJobs(self):
@@ -784,13 +798,16 @@ class Leader:
784
798
  logger.debug("Finished the main loop: no jobs left to run.")
785
799
 
786
800
  # Consistency check the toil state
787
- assert self._messages.empty(), f"Pending messages at shutdown: {self._messages}"
788
- assert self.toilState.successorCounts == {}, f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}"
789
- assert self.toilState.successor_to_predecessors == {}, f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}"
790
- assert self.toilState.service_to_client == {}, f"Services pending for their clients at shutdown: {self.toilState.service_to_client}"
791
- assert self.toilState.servicesIssued == {}, f"Services running at shutdown: {self.toilState.servicesIssued}"
792
- # assert self.toilState.jobsToBeScheduledWithMultiplePredecessors # These are not properly emptied yet
793
- # assert self.toilState.hasFailedSuccessors == set() # These are not properly emptied yet
801
+ if not self._messages.empty():
802
+ raise RuntimeError(f"Pending messages at shutdown: {self._messages}")
803
+ if self.toilState.successorCounts != {}:
804
+ raise RuntimeError(f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}")
805
+ if self.toilState.successor_to_predecessors != {}:
806
+ raise RuntimeError(f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}")
807
+ if self.toilState.service_to_client != {}:
808
+ raise RuntimeError(f"Services pending for their clients at shutdown: {self.toilState.service_to_client}")
809
+ if self.toilState.servicesIssued != {}:
810
+ raise RuntimeError(f"Services running at shutdown: {self.toilState.servicesIssued}")
794
811
 
795
812
  def checkForDeadlocks(self):
796
813
  """Check if the system is deadlocked running service jobs."""
@@ -865,8 +882,8 @@ class Leader:
865
882
  def issueJob(self, jobNode: JobDescription) -> None:
866
883
  """Add a job to the queue of jobs currently trying to run."""
867
884
  # Never issue the same job multiple times simultaneously
868
- assert jobNode.jobStoreID not in self.toilState.jobs_issued, \
869
- f"Attempted to issue {jobNode} multiple times simultaneously!"
885
+ if jobNode.jobStoreID in self.toilState.jobs_issued:
886
+ raise RuntimeError(f"Attempted to issue {jobNode} multiple times simultaneously!")
870
887
 
871
888
  workerCommand = [resolveEntryPoint('_toil_worker'),
872
889
  jobNode.jobName,
@@ -907,7 +924,7 @@ class Leader:
907
924
  "%s and %s",
908
925
  jobNode, str(jobBatchSystemID), jobNode.requirements_string())
909
926
  # Tell everyone it is issued and the queue size changed
910
- self._messages.publish(JobIssuedMessage(jobNode.get_job_kind(), jobNode.jobStoreID, jobBatchSystemID))
927
+ self._messages.publish(JobIssuedMessage(get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID))
911
928
  self._messages.publish(QueueSizeMessage(self.getNumberOfJobsIssued()))
912
929
  # Tell the user there's another job to do
913
930
  self.progress_overall.total += 1
@@ -926,7 +943,8 @@ class Leader:
926
943
  """
927
944
  # Grab the service job description
928
945
  service = self.toilState.get_job(service_id)
929
- assert isinstance(service, ServiceJobDescription)
946
+ if not isinstance(service, ServiceJobDescription):
947
+ raise RuntimeError("The grabbed service job description is not the right type.")
930
948
 
931
949
  if service.preemptible:
932
950
  self.preemptibleServiceJobsToBeIssued.append(service_id)
@@ -956,7 +974,8 @@ class Leader:
956
974
  elif preemptible:
957
975
  return self.preemptibleJobsIssued
958
976
  else:
959
- assert len(self.issued_jobs_by_batch_system_id) >= self.preemptibleJobsIssued
977
+ if len(self.issued_jobs_by_batch_system_id) < self.preemptibleJobsIssued:
978
+ raise RuntimeError("Number of jobs issued cannot be negative.")
960
979
  return len(self.issued_jobs_by_batch_system_id) - self.preemptibleJobsIssued
961
980
 
962
981
  def _getStatusHint(self) -> str:
@@ -990,16 +1009,19 @@ class Leader:
990
1009
 
991
1010
  :return: Job description as it was issued.
992
1011
  """
993
- assert jobBatchSystemID in self.issued_jobs_by_batch_system_id
1012
+ if jobBatchSystemID not in self.issued_jobs_by_batch_system_id:
1013
+ raise RuntimeError("Job was already removed or was never issued.")
994
1014
  issuedDesc = self.toilState.get_job(self.issued_jobs_by_batch_system_id[jobBatchSystemID])
995
1015
  if issuedDesc.preemptible:
996
1016
  # len(issued_jobs_by_batch_system_id) should always be greater than or equal to preemptibleJobsIssued,
997
1017
  # so decrement this value before removing the job from the issuedJob map
998
- assert self.preemptibleJobsIssued > 0
1018
+ if self.preemptibleJobsIssued <= 0:
1019
+ raise RuntimeError("The number of preemptive issued jobs cannot be negative.")
999
1020
  self.preemptibleJobsIssued -= 1
1000
1021
  # It's not issued anymore.
1001
1022
  del self.issued_jobs_by_batch_system_id[jobBatchSystemID]
1002
- assert issuedDesc.jobStoreID in self.toilState.jobs_issued, f"Job {issuedDesc} came back without being issued"
1023
+ if issuedDesc.jobStoreID not in self.toilState.jobs_issued:
1024
+ raise RuntimeError(f"Job {issuedDesc} came back without being issued")
1003
1025
  self.toilState.jobs_issued.remove(issuedDesc.jobStoreID)
1004
1026
  # If service job
1005
1027
  if issuedDesc.jobStoreID in self.toilState.service_to_client:
@@ -1090,8 +1112,9 @@ class Leader:
1090
1112
  for jobBatchSystemID in missingJobIDsSet.difference(jobBatchSystemIDsSet):
1091
1113
  self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
1092
1114
  logger.warning("Batch system id: %s is no longer missing", str(jobBatchSystemID))
1093
- assert issuedJobs.issubset(jobBatchSystemIDsSet) #Assert checks we have
1094
- #no unexpected jobs running
1115
+ # checks we have no unexpected jobs running
1116
+ if not issuedJobs.issubset(jobBatchSystemIDsSet):
1117
+ raise RuntimeError("An unexpected job is still running.")
1095
1118
  jobsToKill = []
1096
1119
  for jobBatchSystemID in set(jobBatchSystemIDsSet.difference(issuedJobs)):
1097
1120
  jobStoreID = self.issued_jobs_by_batch_system_id[jobBatchSystemID]
@@ -1137,7 +1160,7 @@ class Leader:
1137
1160
  self.progress_overall.update(incr=-1)
1138
1161
  self.progress_failed.update(incr=1)
1139
1162
 
1140
- # Delegate to the vers
1163
+ # Delegate to the version that uses a JobDescription
1141
1164
  return self.process_finished_job_description(issued_job, result_status, wall_time, exit_reason, batch_system_id)
1142
1165
 
1143
1166
  def process_finished_job_description(self, finished_job: JobDescription, result_status: int,
@@ -1188,11 +1211,12 @@ class Leader:
1188
1211
  # more memory efficient than read().striplines() while leaving off the
1189
1212
  # trailing \n left when using readlines()
1190
1213
  # http://stackoverflow.com/a/15233739
1191
- StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
1214
+ StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
1192
1215
  message='The job seems to have left a log file, indicating failure: %s' % replacement_job)
1193
1216
  if self.config.writeLogs or self.config.writeLogsGzip:
1194
1217
  with replacement_job.getLogFileHandle(self.jobStore) as log_stream:
1195
- StatsAndLogging.writeLogFiles(replacement_job.chainedJobs, log_stream, self.config, failed=True)
1218
+ # Send log data from the job store to each per-job log file involved.
1219
+ StatsAndLogging.writeLogFiles([names.stats_name for names in replacement_job.get_chain()], log_stream, self.config, failed=True)
1196
1220
  if result_status != 0:
1197
1221
  # If the batch system returned a non-zero exit code then the worker
1198
1222
  # is assumed not to have captured the failure of the job, so we
@@ -1216,13 +1240,12 @@ class Leader:
1216
1240
  else:
1217
1241
  with log_stream:
1218
1242
  if os.path.getsize(log_file) > 0:
1219
- StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
1243
+ StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
1220
1244
  message='The batch system left a non-empty file %s:' % log_file)
1221
1245
  if self.config.writeLogs or self.config.writeLogsGzip:
1222
1246
  file_root, _ = os.path.splitext(os.path.basename(log_file))
1223
- job_names = replacement_job.chainedJobs
1224
- if job_names is None: # For jobs that fail this way, replacement_job.chainedJobs is not guaranteed to be set
1225
- job_names = [str(replacement_job)]
1247
+ job_names = [names.stats_name for names in replacement_job.get_chain()]
1248
+ # Tack the batch system log file name onto each job's name
1226
1249
  job_names = [j + '_' + file_root for j in job_names]
1227
1250
  log_stream.seek(0)
1228
1251
  StatsAndLogging.writeLogFiles(job_names, log_stream, self.config, failed=True)
@@ -1289,18 +1312,21 @@ class Leader:
1289
1312
 
1290
1313
  # Tell everyone it failed
1291
1314
 
1292
- self._messages.publish(JobFailedMessage(job_desc.get_job_kind(), job_id))
1315
+ self._messages.publish(JobFailedMessage(get_job_kind(job_desc.get_names()), job_id))
1293
1316
 
1294
1317
  if job_id in self.toilState.service_to_client:
1295
1318
  # Is a service job
1296
1319
  logger.debug("Service job is being processed as a totally failed job: %s", job_desc)
1297
1320
 
1298
- assert isinstance(job_desc, ServiceJobDescription)
1321
+
1322
+ if not isinstance(job_desc, ServiceJobDescription):
1323
+ raise RuntimeError("The service job description type is incorrect.")
1299
1324
 
1300
1325
  # Grab the client, which is the predecessor.
1301
1326
  client_id = self.toilState.service_to_client[job_id]
1302
1327
 
1303
- assert client_id in self.toilState.servicesIssued
1328
+ if client_id not in self.toilState.servicesIssued:
1329
+ raise RuntimeError("The client was never issued.")
1304
1330
 
1305
1331
  # Leave the service job as a service of its predecessor, because it
1306
1332
  # didn't work.
@@ -1331,8 +1357,10 @@ class Leader:
1331
1357
  self.jobStore.delete_file(job_desc.startJobStoreID)
1332
1358
  else:
1333
1359
  # Is a non-service job
1334
- assert job_id not in self.toilState.servicesIssued
1335
- assert not isinstance(job_desc, ServiceJobDescription)
1360
+ if job_id in self.toilState.servicesIssued:
1361
+ raise RuntimeError("The non-service job should not have been issued.")
1362
+ if isinstance(job_desc, ServiceJobDescription):
1363
+ raise RuntimeError("The job description type is incorrect.")
1336
1364
 
1337
1365
  # Traverse failed job's successor graph and get the jobStoreID of new successors.
1338
1366
  # Any successor already in toilState.failedSuccessors will not be traversed
@@ -1401,11 +1429,13 @@ class Leader:
1401
1429
  len(self.toilState.servicesIssued[client_id]))
1402
1430
  elif jobStoreID not in self.toilState.successor_to_predecessors:
1403
1431
  #We have reach the root job
1404
- assert self._messages.count(JobUpdatedMessage) == 0, "Root job is done but other jobs are still updated"
1405
- assert len(self.toilState.successor_to_predecessors) == 0, \
1406
- ("Job {} is finished and had no predecessor, but we have other outstanding jobs "
1432
+ if self._messages.count(JobUpdatedMessage) != 0:
1433
+ raise RuntimeError("Root job is done but other jobs are still updated")
1434
+ if len(self.toilState.successor_to_predecessors) != 0:
1435
+ raise RuntimeError("Job {} is finished and had no predecessor, but we have other outstanding jobs "
1407
1436
  "with predecessors: {}".format(jobStoreID, self.toilState.successor_to_predecessors.keys()))
1408
- assert len(self.toilState.successorCounts) == 0, f"Root job is done but jobs waiting on successors: {self.toilState.successorCounts}"
1437
+ if len(self.toilState.successorCounts) != 0:
1438
+ raise RuntimeError(f"Root job is done but jobs waiting on successors: {self.toilState.successorCounts}")
1409
1439
  logger.debug("Reached root job %s so no predecessors to clean up" % jobStoreID)
1410
1440
 
1411
1441
  else:
@@ -1414,7 +1444,8 @@ class Leader:
1414
1444
 
1415
1445
  # For each predecessor
1416
1446
  for predecessor_id in self.toilState.successor_to_predecessors.pop(jobStoreID):
1417
- assert isinstance(predecessor_id, str), f"Predecessor ID should be str but is {type(predecessor_id)}"
1447
+ if not isinstance(predecessor_id, str):
1448
+ raise RuntimeError(f"Predecessor ID should be str but is {type(predecessor_id)}")
1418
1449
  predecessor = self.toilState.get_job(predecessor_id)
1419
1450
 
1420
1451
  # Tell the predecessor that this job is done (keep only other successor jobs)
toil/lib/accelerators.py CHANGED
@@ -16,7 +16,7 @@
16
16
 
17
17
  import os
18
18
  import subprocess
19
- from typing import Dict, List, Optional, Set, Union
19
+ from typing import Dict, List, Set, Union, cast
20
20
  from xml.dom import minidom
21
21
 
22
22
  from toil.job import AcceleratorRequirement
@@ -92,10 +92,15 @@ def count_nvidia_gpus() -> int:
92
92
  # <https://github.com/common-workflow-language/cwltool/blob/6f29c59fb1b5426ef6f2891605e8fa2d08f1a8da/cwltool/cuda.py>
93
93
  # Some example output is here: <https://gist.github.com/loretoparisi/2620b777562c2dfd50d6b618b5f20867>
94
94
  try:
95
- return int(minidom.parseString(
96
- subprocess.check_output(["nvidia-smi", "-q", "-x"])
97
- ).getElementsByTagName("attached_gpus")[0].firstChild.data)
98
- except (FileNotFoundError, subprocess.CalledProcessError, IndexError, ValueError, PermissionError):
95
+ return int(
96
+ cast(
97
+ minidom.Text,
98
+ minidom.parseString(subprocess.check_output(["nvidia-smi", "-q", "-x"]))
99
+ .getElementsByTagName("attached_gpus")[0]
100
+ .firstChild,
101
+ ).data
102
+ )
103
+ except:
99
104
  return 0
100
105
 
101
106
  # TODO: Parse each gpu > product_name > text content and convert to some
toil/lib/aws/__init__.py CHANGED
@@ -11,27 +11,15 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
- import collections
15
- import inspect
16
14
  import json
17
15
  import logging
18
16
  import os
19
17
  import re
20
18
  import socket
21
- import threading
22
- from functools import lru_cache
23
- from typing import (Any,
24
- Callable,
25
- Dict,
26
- Iterable,
27
- List,
28
- MutableMapping,
29
- Optional,
30
- TypeVar,
31
- Union)
19
+ from http.client import HTTPException
20
+ from typing import Dict, MutableMapping, Optional
32
21
  from urllib.error import URLError
33
22
  from urllib.request import urlopen
34
- from http.client import HTTPException
35
23
 
36
24
  logger = logging.getLogger(__name__)
37
25
 
@@ -80,6 +68,7 @@ def get_aws_zone_from_metadata() -> Optional[str]:
80
68
  try:
81
69
  # Use the EC2 metadata service
82
70
  import boto
71
+ str(boto) # to prevent removal of the import
83
72
  from boto.utils import get_instance_metadata
84
73
  logger.debug("Fetch AZ from EC2 metadata")
85
74
  return get_instance_metadata()['placement']['availability-zone']
toil/lib/aws/ami.py CHANGED
@@ -1,12 +1,12 @@
1
1
  import json
2
2
  import logging
3
3
  import os
4
- import time
5
4
  import urllib.request
6
- from urllib.error import HTTPError
7
- from typing import Dict, Optional, Iterator, cast
5
+ from typing import Dict, Iterator, Optional, cast
6
+ from urllib.error import HTTPError, URLError
8
7
 
9
8
  from botocore.client import BaseClient
9
+ from botocore.exceptions import ClientError
10
10
 
11
11
  from toil.lib.retry import retry
12
12
 
@@ -110,6 +110,12 @@ def flatcar_release_feed_amis(region: str, architecture: str = 'amd64', source:
110
110
  # Try again
111
111
  try_number += 1
112
112
  continue
113
+ except URLError:
114
+ # Could be a connection timeout
115
+ logger.exception(f'Failed to retrieve {source} Flatcar release feed JSON')
116
+ # Try again
117
+ try_number += 1
118
+ continue
113
119
  if try_number == MAX_TRIES:
114
120
  # We could not get the JSON
115
121
  logger.error(f'Could not get a readable {source} Flatcar release feed JSON')
@@ -150,11 +156,18 @@ def feed_flatcar_ami_release(ec2_client: BaseClient, architecture: str = 'amd64'
150
156
 
151
157
  for ami in flatcar_release_feed_amis(region, architecture, source):
152
158
  # verify it exists on AWS
153
- response = ec2_client.describe_images(Filters=[{'Name': 'image-id', 'Values': [ami]}]) # type: ignore
154
- if len(response['Images']) == 1 and response['Images'][0]['State'] == 'available':
155
- return ami
156
- else:
157
- logger.warning(f'Flatcar release feed suggests image {ami} which does not exist on AWS in {region}')
159
+ try:
160
+ response = ec2_client.describe_images(Filters=[{'Name': 'image-id', 'Values': [ami]}]) # type: ignore
161
+ if len(response['Images']) == 1 and response['Images'][0]['State'] == 'available':
162
+ return ami
163
+ else:
164
+ logger.warning(f'Flatcar release feed suggests image {ami} which does not exist on AWS in {region}')
165
+ except ClientError:
166
+ # Sometimes we get back nonsense like:
167
+ # botocore.exceptions.ClientError: An error occurred (AuthFailure) when calling the DescribeImages operation: AWS was not able to validate the provided access credentials
168
+ # Don't hold that against the AMI.
169
+ logger.exception(f'Unable to check if AMI {ami} exists on AWS in {region}; assuming it does')
170
+ return ami
158
171
  # We didn't find it
159
172
  logger.warning(f'Flatcar release feed does not have an image for region {region} that exists on AWS')
160
173
  return None
@@ -162,7 +175,7 @@ def feed_flatcar_ami_release(ec2_client: BaseClient, architecture: str = 'amd64'
162
175
 
163
176
  @retry() # TODO: What errors do we get for timeout, JSON parse failure, etc?
164
177
  def aws_marketplace_flatcar_ami_search(ec2_client: BaseClient, architecture: str = 'amd64') -> Optional[str]:
165
- """Query AWS for all AMI names matching 'Flatcar-stable-*' and return the most recent one."""
178
+ """Query AWS for all AMI names matching ``Flatcar-stable-*`` and return the most recent one."""
166
179
 
167
180
  # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.describe_images
168
181
  # Possible arch choices on AWS: 'i386'|'x86_64'|'arm64'|'x86_64_mac'
toil/lib/aws/iam.py CHANGED
@@ -3,16 +3,15 @@ import json
3
3
  import logging
4
4
  from collections import defaultdict
5
5
  from functools import lru_cache
6
- from typing import Any, Dict, List, Optional, Set, cast, Union, Sequence
6
+ from typing import Dict, List, Optional, Union, cast
7
7
 
8
8
  import boto3
9
9
  from mypy_boto3_iam import IAMClient
10
- from mypy_boto3_iam.type_defs import AttachedPolicyTypeDef
10
+ from mypy_boto3_iam.type_defs import (AttachedPolicyTypeDef,
11
+ PolicyDocumentDictTypeDef)
11
12
  from mypy_boto3_sts import STSClient
12
13
 
13
- from toil.lib.aws import zone_to_region
14
14
  from toil.lib.aws.session import client as get_client
15
- from toil.provisioners.aws import get_best_aws_zone
16
15
 
17
16
  logger = logging.getLogger(__name__)
18
17
 
@@ -121,7 +120,7 @@ def permission_matches_any(perm: str, list_perms: List[str]) -> bool:
121
120
  return True
122
121
  return False
123
122
 
124
- def get_actions_from_policy_document(policy_doc: Dict[str, Any]) -> AllowedActionCollection:
123
+ def get_actions_from_policy_document(policy_doc: PolicyDocumentDictTypeDef) -> AllowedActionCollection:
125
124
  '''
126
125
  Given a policy document, go through each statement and create an AllowedActionCollection representing the
127
126
  permissions granted in the policy document.
@@ -138,11 +137,16 @@ def get_actions_from_policy_document(policy_doc: Dict[str, Any]) -> AllowedActio
138
137
  for resource in statement["Resource"]:
139
138
  for key in ["Action", "NotAction"]:
140
139
  if key in statement.keys():
141
- if isinstance(statement[key], list):
142
- allowed_actions[resource][key] += statement[key]
140
+ # mypy_boto3_iam declares policy document as a TypedDict
141
+ # This type expects 4 string keys, of which NotAction is not an option
142
+ # Thus mypy complains. NotAction seems to be valid according to Amazon:
143
+ # https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_elements_notaction.html
144
+ # so type: ignore for now
145
+ if isinstance(statement[key], list): # type: ignore[literal-required]
146
+ allowed_actions[resource][key] += statement[key] # type: ignore[literal-required]
143
147
  else:
144
148
  #Assumes that if it isn't a list it's probably a string
145
- allowed_actions[resource][key].append(statement[key])
149
+ allowed_actions[resource][key].append(statement[key]) # type: ignore[literal-required]
146
150
 
147
151
  return allowed_actions
148
152
  def allowed_actions_attached(iam: IAMClient, attached_policies: List[AttachedPolicyTypeDef]) -> AllowedActionCollection:
@@ -181,24 +185,28 @@ def allowed_actions_roles(iam: IAMClient, policy_names: List[str], role_name: st
181
185
  PolicyName=policy_name
182
186
  )
183
187
  logger.debug("Checking role policy")
184
- policy_document = json.loads(role_policy["PolicyDocument"])
188
+ # PolicyDocument is now a TypedDict, but an instance of TypedDict is not an instance of dict?
189
+ if isinstance(role_policy["PolicyDocument"], str):
190
+ policy_document = json.loads(role_policy["PolicyDocument"])
191
+ else:
192
+ policy_document = role_policy["PolicyDocument"]
185
193
 
186
194
  allowed_actions = add_to_action_collection(allowed_actions, get_actions_from_policy_document(policy_document))
187
195
 
188
196
  return allowed_actions
189
197
 
190
198
 
191
- def collect_policy_actions(policy_documents: Sequence[Union[str, Dict[str, Any]]]) -> AllowedActionCollection:
199
+ def collect_policy_actions(policy_documents: List[Union[str, PolicyDocumentDictTypeDef]]) -> AllowedActionCollection:
192
200
  """
193
201
  Collect all of the actions allowed by the given policy documents into one AllowedActionCollection.
194
202
  """
195
203
  allowed_actions: AllowedActionCollection = init_action_collection()
196
204
  for policy_str in policy_documents:
197
205
  # sometimes a string is returned from the api, so convert to a dictionary
198
- if isinstance(policy_str, dict):
199
- policy_dict = policy_str
200
- else:
206
+ if isinstance(policy_str, str):
201
207
  policy_dict = json.loads(policy_str)
208
+ else:
209
+ policy_dict = policy_str
202
210
  allowed_actions = add_to_action_collection(allowed_actions, get_actions_from_policy_document(policy_dict))
203
211
  return allowed_actions
204
212
 
toil/lib/aws/session.py CHANGED
@@ -12,24 +12,10 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  import collections
15
- import inspect
16
15
  import logging
17
16
  import os
18
- import re
19
- import socket
20
17
  import threading
21
- from typing import (Any,
22
- Callable,
23
- Dict,
24
- Iterable,
25
- List,
26
- Optional,
27
- Tuple,
28
- TypeVar,
29
- Union,
30
- cast)
31
- from urllib.error import URLError
32
- from urllib.request import urlopen
18
+ from typing import Dict, Optional, Tuple, cast
33
19
 
34
20
  import boto3
35
21
  import boto3.resources.base
@@ -37,8 +23,8 @@ import boto.connection
37
23
  import botocore
38
24
  from boto3 import Session
39
25
  from botocore.client import Config
40
- from botocore.utils import JSONFileCache
41
26
  from botocore.session import get_session
27
+ from botocore.utils import JSONFileCache
42
28
 
43
29
  logger = logging.getLogger(__name__)
44
30
 
toil/lib/aws/utils.py CHANGED
@@ -12,7 +12,6 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  import errno
15
- import json
16
15
  import logging
17
16
  import os
18
17
  import socket
@@ -21,15 +20,13 @@ from typing import (Any,
21
20
  Callable,
22
21
  ContextManager,
23
22
  Dict,
24
- Hashable,
25
23
  Iterable,
26
24
  Iterator,
27
25
  List,
28
26
  Optional,
29
27
  Set,
30
28
  Union,
31
- cast,
32
- MutableMapping)
29
+ cast)
33
30
  from urllib.parse import ParseResult
34
31
 
35
32
  from toil.lib.aws import session
@@ -345,6 +342,8 @@ def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "Ob
345
342
  """
346
343
  Extracts a key (object) from a given parsed s3:// URL.
347
344
 
345
+ If existing is true and the object does not exist, raises FileNotFoundError.
346
+
348
347
  :param bool existing: If True, key is expected to exist. If False, key is expected not to
349
348
  exists and it will be created. If None, the key will be created if it doesn't exist.
350
349
  """
@@ -386,7 +385,7 @@ def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "Ob
386
385
  else:
387
386
  raise
388
387
  if existing is True and not objExists:
389
- raise RuntimeError(f"Key '{key_name}' does not exist in bucket '{bucket_name}'.")
388
+ raise FileNotFoundError(f"Key '{key_name}' does not exist in bucket '{bucket_name}'.")
390
389
  elif existing is False and objExists:
391
390
  raise RuntimeError(f"Key '{key_name}' exists in bucket '{bucket_name}'.")
392
391
 
toil/lib/compatibility.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import functools
2
2
  import warnings
3
- from typing import Any, Dict, Callable, Union, TypeVar, overload
3
+ from typing import Any, Callable, Union
4
4
 
5
5
 
6
6
  def deprecated(new_function_name: str) -> Callable[..., Any]: