toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193) hide show
  1. toil/__init__.py +122 -315
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +173 -89
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
  5. toil/batchSystems/awsBatch.py +244 -135
  6. toil/batchSystems/cleanup_support.py +26 -16
  7. toil/batchSystems/contained_executor.py +31 -28
  8. toil/batchSystems/gridengine.py +86 -50
  9. toil/batchSystems/htcondor.py +166 -89
  10. toil/batchSystems/kubernetes.py +632 -382
  11. toil/batchSystems/local_support.py +20 -15
  12. toil/batchSystems/lsf.py +134 -81
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +290 -151
  16. toil/batchSystems/mesos/executor.py +79 -50
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +46 -28
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +296 -125
  21. toil/batchSystems/slurm.py +603 -138
  22. toil/batchSystems/torque.py +47 -33
  23. toil/bus.py +186 -76
  24. toil/common.py +664 -368
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1136 -483
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +63 -42
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +140 -60
  32. toil/fileStores/cachingFileStore.py +717 -269
  33. toil/fileStores/nonCachingFileStore.py +116 -87
  34. toil/job.py +1225 -368
  35. toil/jobStores/abstractJobStore.py +416 -266
  36. toil/jobStores/aws/jobStore.py +863 -477
  37. toil/jobStores/aws/utils.py +201 -120
  38. toil/jobStores/conftest.py +3 -2
  39. toil/jobStores/fileJobStore.py +292 -154
  40. toil/jobStores/googleJobStore.py +140 -74
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +668 -272
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +74 -31
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +214 -39
  49. toil/lib/aws/utils.py +287 -231
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +104 -47
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +361 -199
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +5 -3
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +141 -15
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +66 -21
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +68 -15
  71. toil/lib/retry.py +126 -81
  72. toil/lib/threading.py +299 -82
  73. toil/lib/throttle.py +16 -15
  74. toil/options/common.py +843 -409
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +73 -17
  78. toil/provisioners/__init__.py +117 -46
  79. toil/provisioners/abstractProvisioner.py +332 -157
  80. toil/provisioners/aws/__init__.py +70 -33
  81. toil/provisioners/aws/awsProvisioner.py +1145 -715
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +155 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +128 -62
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +224 -70
  98. toil/test/__init__.py +282 -183
  99. toil/test/batchSystems/batchSystemTest.py +460 -210
  100. toil/test/batchSystems/batch_system_plugin_test.py +90 -0
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +110 -49
  104. toil/test/cactus/__init__.py +0 -0
  105. toil/test/cactus/test_cactus_integration.py +56 -0
  106. toil/test/cwl/cwlTest.py +496 -287
  107. toil/test/cwl/measure_default_memory.cwl +12 -0
  108. toil/test/cwl/not_run_required_input.cwl +29 -0
  109. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  110. toil/test/cwl/seqtk_seq.cwl +1 -1
  111. toil/test/docs/scriptsTest.py +69 -46
  112. toil/test/jobStores/jobStoreTest.py +427 -264
  113. toil/test/lib/aws/test_iam.py +118 -50
  114. toil/test/lib/aws/test_s3.py +16 -9
  115. toil/test/lib/aws/test_utils.py +5 -6
  116. toil/test/lib/dockerTest.py +118 -141
  117. toil/test/lib/test_conversions.py +113 -115
  118. toil/test/lib/test_ec2.py +58 -50
  119. toil/test/lib/test_integration.py +104 -0
  120. toil/test/lib/test_misc.py +12 -5
  121. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  122. toil/test/mesos/helloWorld.py +7 -6
  123. toil/test/mesos/stress.py +25 -20
  124. toil/test/options/__init__.py +13 -0
  125. toil/test/options/options.py +42 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
  127. toil/test/provisioners/clusterScalerTest.py +440 -250
  128. toil/test/provisioners/clusterTest.py +166 -44
  129. toil/test/provisioners/gceProvisionerTest.py +174 -100
  130. toil/test/provisioners/provisionerTest.py +25 -13
  131. toil/test/provisioners/restartScript.py +5 -4
  132. toil/test/server/serverTest.py +188 -141
  133. toil/test/sort/restart_sort.py +137 -68
  134. toil/test/sort/sort.py +134 -66
  135. toil/test/sort/sortTest.py +91 -49
  136. toil/test/src/autoDeploymentTest.py +141 -101
  137. toil/test/src/busTest.py +20 -18
  138. toil/test/src/checkpointTest.py +8 -2
  139. toil/test/src/deferredFunctionTest.py +49 -35
  140. toil/test/src/dockerCheckTest.py +32 -24
  141. toil/test/src/environmentTest.py +135 -0
  142. toil/test/src/fileStoreTest.py +539 -272
  143. toil/test/src/helloWorldTest.py +7 -4
  144. toil/test/src/importExportFileTest.py +61 -31
  145. toil/test/src/jobDescriptionTest.py +46 -21
  146. toil/test/src/jobEncapsulationTest.py +2 -0
  147. toil/test/src/jobFileStoreTest.py +74 -50
  148. toil/test/src/jobServiceTest.py +187 -73
  149. toil/test/src/jobTest.py +121 -71
  150. toil/test/src/miscTests.py +19 -18
  151. toil/test/src/promisedRequirementTest.py +82 -36
  152. toil/test/src/promisesTest.py +7 -6
  153. toil/test/src/realtimeLoggerTest.py +10 -6
  154. toil/test/src/regularLogTest.py +71 -37
  155. toil/test/src/resourceTest.py +80 -49
  156. toil/test/src/restartDAGTest.py +36 -22
  157. toil/test/src/resumabilityTest.py +9 -2
  158. toil/test/src/retainTempDirTest.py +45 -14
  159. toil/test/src/systemTest.py +12 -8
  160. toil/test/src/threadingTest.py +44 -25
  161. toil/test/src/toilContextManagerTest.py +10 -7
  162. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  163. toil/test/src/workerTest.py +73 -23
  164. toil/test/utils/toilDebugTest.py +103 -33
  165. toil/test/utils/toilKillTest.py +4 -5
  166. toil/test/utils/utilsTest.py +245 -106
  167. toil/test/wdl/wdltoil_test.py +818 -149
  168. toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
  169. toil/toilState.py +120 -35
  170. toil/utils/toilConfig.py +13 -4
  171. toil/utils/toilDebugFile.py +44 -27
  172. toil/utils/toilDebugJob.py +214 -27
  173. toil/utils/toilDestroyCluster.py +11 -6
  174. toil/utils/toilKill.py +8 -3
  175. toil/utils/toilLaunchCluster.py +256 -140
  176. toil/utils/toilMain.py +37 -16
  177. toil/utils/toilRsyncCluster.py +32 -14
  178. toil/utils/toilSshCluster.py +49 -22
  179. toil/utils/toilStats.py +356 -273
  180. toil/utils/toilStatus.py +292 -139
  181. toil/utils/toilUpdateEC2Instances.py +3 -1
  182. toil/version.py +12 -12
  183. toil/wdl/utils.py +5 -5
  184. toil/wdl/wdltoil.py +3913 -1033
  185. toil/worker.py +367 -184
  186. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
  187. toil-8.0.0.dist-info/METADATA +173 -0
  188. toil-8.0.0.dist-info/RECORD +253 -0
  189. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  190. toil-6.1.0a1.dist-info/METADATA +0 -125
  191. toil-6.1.0a1.dist-info/RECORD +0 -237
  192. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  193. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
@@ -12,10 +12,12 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  import logging
15
- from typing import Dict, List, Optional
15
+ from typing import Optional
16
16
 
17
- from toil.batchSystems.abstractBatchSystem import (BatchSystemSupport,
18
- UpdatedBatchJobInfo)
17
+ from toil.batchSystems.abstractBatchSystem import (
18
+ BatchSystemSupport,
19
+ UpdatedBatchJobInfo,
20
+ )
19
21
  from toil.batchSystems.singleMachine import SingleMachineBatchSystem
20
22
  from toil.common import Config
21
23
  from toil.job import JobDescription
@@ -27,22 +29,25 @@ logger = logging.getLogger(__name__)
27
29
  class BatchSystemLocalSupport(BatchSystemSupport):
28
30
  """Adds a local queue for helper jobs, useful for CWL & others."""
29
31
 
30
- def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int) -> None:
32
+ def __init__(
33
+ self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
34
+ ) -> None:
31
35
  super().__init__(config, maxCores, maxMemory, maxDisk)
32
- max_local_jobs = config.max_local_jobs if config.max_local_jobs is not None else cpu_count()
36
+ max_local_jobs = (
37
+ config.max_local_jobs if config.max_local_jobs is not None else cpu_count()
38
+ )
33
39
  self.localBatch: SingleMachineBatchSystem = SingleMachineBatchSystem(
34
40
  config, maxCores, maxMemory, maxDisk, max_jobs=max_local_jobs
35
41
  )
36
42
 
37
- def handleLocalJob(self, jobDesc: JobDescription) -> Optional[int]:
43
+ def handleLocalJob(self, command: str, jobDesc: JobDescription) -> Optional[int]:
38
44
  """
39
- To be called by issueBatchJobs.
45
+ To be called by issueBatchJob.
40
46
 
41
47
  Returns the jobID if the jobDesc has been submitted to the local queue,
42
48
  otherwise returns None
43
49
  """
44
- if (not self.config.run_local_jobs_on_workers
45
- and jobDesc.local):
50
+ if not self.config.run_local_jobs_on_workers and jobDesc.local:
46
51
  # Since singleMachine.py doesn't typecheck yet and MyPy is ignoring
47
52
  # it, it will raise errors here unless we add type annotations to
48
53
  # everything we get back from it. The easiest way to do that seems
@@ -50,12 +55,12 @@ class BatchSystemLocalSupport(BatchSystemSupport):
50
55
  # somehow doesn't error whereas just returning the value complains
51
56
  # we're returning an Any. TODO: When singleMachine.py typechecks,
52
57
  # remove all these extra variables.
53
- local_id: int = self.localBatch.issueBatchJob(jobDesc)
58
+ local_id: int = self.localBatch.issueBatchJob(command, jobDesc)
54
59
  return local_id
55
60
  else:
56
61
  return None
57
62
 
58
- def killLocalJobs(self, jobIDs: List[int]) -> None:
63
+ def killLocalJobs(self, jobIDs: list[int]) -> None:
59
64
  """
60
65
  Will kill all local jobs that match the provided jobIDs.
61
66
 
@@ -63,14 +68,14 @@ class BatchSystemLocalSupport(BatchSystemSupport):
63
68
  """
64
69
  self.localBatch.killBatchJobs(jobIDs)
65
70
 
66
- def getIssuedLocalJobIDs(self) -> List[int]:
71
+ def getIssuedLocalJobIDs(self) -> list[int]:
67
72
  """To be called by getIssuedBatchJobIDs."""
68
- local_ids: List[int] = self.localBatch.getIssuedBatchJobIDs()
73
+ local_ids: list[int] = self.localBatch.getIssuedBatchJobIDs()
69
74
  return local_ids
70
75
 
71
- def getRunningLocalJobIDs(self) -> Dict[int, float]:
76
+ def getRunningLocalJobIDs(self) -> dict[int, float]:
72
77
  """To be called by getRunningBatchJobIDs()."""
73
- local_running: Dict[int, float] = self.localBatch.getRunningBatchJobIDs()
78
+ local_running: dict[int, float] = self.localBatch.getRunningBatchJobIDs()
74
79
  return local_running
75
80
 
76
81
  def getUpdatedLocalJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]:
toil/batchSystems/lsf.py CHANGED
@@ -25,18 +25,24 @@ import re
25
25
  import subprocess
26
26
  from datetime import datetime
27
27
  from random import randint
28
- from typing import Dict, List, Optional, Union
28
+ from typing import Optional, Union
29
29
 
30
30
  from dateutil.parser import parse
31
31
  from dateutil.tz import tzlocal
32
32
 
33
- from toil.batchSystems.abstractBatchSystem import BatchJobExitReason
34
- from toil.batchSystems.abstractGridEngineBatchSystem import \
35
- AbstractGridEngineBatchSystem
36
- from toil.batchSystems.lsfHelper import (check_lsf_json_output_supported,
37
- parse_mem_and_cmd_from_output,
38
- parse_memory,
39
- per_core_reservation)
33
+ from toil.batchSystems.abstractBatchSystem import (
34
+ EXIT_STATUS_UNAVAILABLE_VALUE,
35
+ BatchJobExitReason,
36
+ )
37
+ from toil.batchSystems.abstractGridEngineBatchSystem import (
38
+ AbstractGridEngineBatchSystem,
39
+ )
40
+ from toil.batchSystems.lsfHelper import (
41
+ check_lsf_json_output_supported,
42
+ parse_mem_and_cmd_from_output,
43
+ parse_memory,
44
+ per_core_reservation,
45
+ )
40
46
  from toil.lib.misc import call_command
41
47
 
42
48
  logger = logging.getLogger(__name__)
@@ -44,53 +50,64 @@ logger = logging.getLogger(__name__)
44
50
 
45
51
  class LSFBatchSystem(AbstractGridEngineBatchSystem):
46
52
 
47
- class Worker(AbstractGridEngineBatchSystem.Worker):
48
- """LSF specific AbstractGridEngineWorker methods."""
53
+ class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
54
+ """LSF specific GridEngineThread methods."""
49
55
 
50
56
  def getRunningJobIDs(self):
51
57
  times = {}
52
58
  with self.runningJobsLock:
53
- currentjobs = {str(self.batchJobIDs[x][0]): x for x in
54
- self.runningJobs}
59
+ currentjobs = {str(self.batchJobIDs[x][0]): x for x in self.runningJobs}
55
60
 
56
61
  if check_lsf_json_output_supported:
57
- stdout = call_command(["bjobs","-json","-o", "jobid stat start_time"])
62
+ stdout = call_command(["bjobs", "-json", "-o", "jobid stat start_time"])
58
63
 
59
64
  bjobs_records = self.parseBjobs(stdout)
60
65
  if bjobs_records:
61
66
  for single_item in bjobs_records:
62
- if single_item['STAT'] == 'RUN' and single_item['JOBID'] in currentjobs:
63
- jobstart = parse(single_item['START_TIME'], default=datetime.now(tzlocal()))
64
- times[currentjobs[single_item['JOBID']]] = datetime.now(tzlocal()) \
65
- - jobstart
67
+ if (
68
+ single_item["STAT"] == "RUN"
69
+ and single_item["JOBID"] in currentjobs
70
+ ):
71
+ jobstart = parse(
72
+ single_item["START_TIME"],
73
+ default=datetime.now(tzlocal()),
74
+ )
75
+ times[currentjobs[single_item["JOBID"]]] = (
76
+ datetime.now(tzlocal()) - jobstart
77
+ )
66
78
  else:
67
79
  times = self.fallbackRunningJobIDs(currentjobs)
68
80
  return times
69
81
 
70
82
  def fallbackRunningJobIDs(self, currentjobs):
71
83
  times = {}
72
- stdout = call_command(["bjobs", "-o", "jobid stat start_time delimiter='|'"])
73
- for curline in stdout.split('\n'):
74
- items = curline.strip().split('|')
75
- if items[0] in currentjobs and items[1] == 'RUN':
84
+ stdout = call_command(
85
+ ["bjobs", "-o", "jobid stat start_time delimiter='|'"]
86
+ )
87
+ for curline in stdout.split("\n"):
88
+ items = curline.strip().split("|")
89
+ if items[0] in currentjobs and items[1] == "RUN":
76
90
  jobstart = parse(items[2], default=datetime.now(tzlocal()))
77
- times[currentjobs[items[0]]] = datetime.now(tzlocal()) \
78
- - jobstart
91
+ times[currentjobs[items[0]]] = datetime.now(tzlocal()) - jobstart
79
92
  return times
80
93
 
81
94
  def killJob(self, jobID):
82
- call_command(['bkill', self.getBatchSystemID(jobID)])
83
-
84
- def prepareSubmission(self,
85
- cpu: int,
86
- memory: int,
87
- jobID: int,
88
- command: str,
89
- jobName: str,
90
- job_environment: Optional[Dict[str, str]] = None,
91
- gpus: Optional[int] = None):
92
- return (self.prepareBsub(cpu, memory, jobID) + [command],
93
- job_environment) # pass job_environment to .submitJob()
95
+ call_command(["bkill", self.getBatchSystemID(jobID)])
96
+
97
+ def prepareSubmission(
98
+ self,
99
+ cpu: int,
100
+ memory: int,
101
+ jobID: int,
102
+ command: str,
103
+ jobName: str,
104
+ job_environment: Optional[dict[str, str]] = None,
105
+ gpus: Optional[int] = None,
106
+ ):
107
+ return (
108
+ self.prepareBsub(cpu, memory, jobID) + [command],
109
+ job_environment,
110
+ ) # pass job_environment to .submitJob()
94
111
 
95
112
  def submitJob(self, subLine):
96
113
  subLine, job_environment = subLine
@@ -102,7 +119,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
102
119
  stdout = call_command(subLine, env=combinedEnv)
103
120
  # Example success: Job <39605914> is submitted to default queue <general>.
104
121
  # Example fail: Service class does not exist. Job not submitted.
105
- result_search = re.search('Job <(.*)> is submitted', stdout)
122
+ result_search = re.search("Job <(.*)> is submitted", stdout)
106
123
 
107
124
  if result_search:
108
125
  result = int(result_search.group(1))
@@ -138,7 +155,11 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
138
155
  logger.debug("Getting coalesced job exit codes via bjobs")
139
156
  bjobs_records = self.parseBjobs(
140
157
  subprocess.run(
141
- args, check=False, stderr=subprocess.STDOUT, encoding="utf-8"
158
+ args,
159
+ check=False,
160
+ stdout=subprocess.PIPE,
161
+ stderr=subprocess.STDOUT,
162
+ encoding="utf-8",
142
163
  ).stdout
143
164
  )
144
165
  if bjobs_records:
@@ -161,23 +182,31 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
161
182
  status_resonse.append(None)
162
183
  return status_resonse
163
184
 
164
- def getJobExitCode(self, lsfJobID):
185
+ def getJobExitCode(
186
+ self, lsfJobID
187
+ ) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]:
165
188
  # the task is set as part of the job ID if using getBatchSystemID()
166
189
  if "NOT_SUBMITTED" in lsfJobID:
167
190
  logger.error("bjobs detected job failed to submit")
168
191
  return 1
169
192
 
170
193
  job, task = (lsfJobID, None)
171
- if '.' in lsfJobID:
172
- job, task = lsfJobID.split('.', 1)
194
+ if "." in lsfJobID:
195
+ job, task = lsfJobID.split(".", 1)
173
196
 
174
197
  self.parseMaxMem(job)
175
198
  # first try bjobs to find out job state
176
199
  if check_lsf_json_output_supported:
177
- args = ["bjobs", "-json", "-o",
178
- "user exit_code stat exit_reason pend_reason", str(job)]
179
- logger.debug("Checking job exit code for job via bjobs: "
180
- "{}".format(job))
200
+ args = [
201
+ "bjobs",
202
+ "-json",
203
+ "-o",
204
+ "user exit_code stat exit_reason pend_reason",
205
+ str(job),
206
+ ]
207
+ logger.debug(
208
+ "Checking job exit code for job via bjobs: " "{}".format(job)
209
+ )
181
210
  stdout = call_command(args)
182
211
  bjobs_records = self.parseBjobs(stdout)
183
212
  if bjobs_records:
@@ -186,7 +215,9 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
186
215
 
187
216
  return self.fallbackGetJobExitCode(job)
188
217
 
189
- def parse_bjobs_record(self, bjobs_record: dict, job: int) -> Union[int, None]:
218
+ def parse_bjobs_record(
219
+ self, bjobs_record: dict, job: int
220
+ ) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]:
190
221
  """
191
222
  Helper functions for getJobExitCode and to parse the bjobs status record
192
223
  """
@@ -202,7 +233,8 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
202
233
  pending_info = "\n" + bjobs_record["PEND_REASON"]
203
234
  logger.debug(
204
235
  "bjobs detected job pending with: %s\nfor job: %s",
205
- pending_info, job
236
+ pending_info,
237
+ job,
206
238
  )
207
239
  return None
208
240
  if process_status == "EXIT":
@@ -221,10 +253,18 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
221
253
  exit_info += f"\nexit reason: {exit_reason}"
222
254
  logger.error(
223
255
  "bjobs detected job failed with: %s\nfor job: %s",
224
- exit_info, job
256
+ exit_info,
257
+ job,
225
258
  )
226
259
  if "TERM_MEMLIMIT" in exit_reason:
227
- return BatchJobExitReason.MEMLIMIT
260
+ return (
261
+ (
262
+ exit_code
263
+ if exit_code != 0
264
+ else EXIT_STATUS_UNAVAILABLE_VALUE
265
+ ),
266
+ BatchJobExitReason.MEMLIMIT,
267
+ )
228
268
  return exit_code
229
269
  if process_status == "RUN":
230
270
  logger.debug(
@@ -237,46 +277,53 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
237
277
 
238
278
  return self.getJobExitCodeBACCT(job)
239
279
 
240
- def getJobExitCodeBACCT(self,job):
280
+ def getJobExitCodeBACCT(
281
+ self, job
282
+ ) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]:
241
283
  # if not found in bjobs, then try bacct (slower than bjobs)
242
- logger.debug("bjobs failed to detect job - trying bacct: "
243
- "{}".format(job))
284
+ logger.debug("bjobs failed to detect job - trying bacct: " "{}".format(job))
244
285
 
245
286
  args = ["bacct", "-l", str(job)]
246
287
  stdout = call_command(args)
247
- process_output = stdout.split('\n')
288
+ process_output = stdout.split("\n")
248
289
  for line in process_output:
249
290
  if line.find("Completed <done>") > -1 or line.find("<DONE>") > -1:
250
- logger.debug("Detected job completed for job: "
251
- "{}".format(job))
291
+ logger.debug("Detected job completed for job: " "{}".format(job))
252
292
  return 0
253
293
  elif line.find("Completed <exit>") > -1 or line.find("<EXIT>") > -1:
254
- logger.error("Detected job failed for job: "
255
- "{}".format(job))
294
+ logger.error("Detected job failed for job: " "{}".format(job))
256
295
  return 1
257
- logger.debug("Can't determine exit code for job or job still "
258
- "running: {}".format(job))
296
+ logger.debug(
297
+ "Can't determine exit code for job or job still "
298
+ "running: {}".format(job)
299
+ )
259
300
  return None
260
301
 
261
- def fallbackGetJobExitCode(self, job):
302
+ def fallbackGetJobExitCode(
303
+ self, job
304
+ ) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]:
262
305
  args = ["bjobs", "-l", str(job)]
263
306
  logger.debug(f"Checking job exit code for job via bjobs (fallback): {job}")
264
307
  stdout = call_command(args)
265
308
  output = stdout.replace("\n ", "")
266
- process_output = output.split('\n')
309
+ process_output = output.split("\n")
267
310
  started = 0
268
311
  for line in process_output:
269
312
  if "Done successfully" in line or "Status <DONE>" in line:
270
313
  logger.debug(f"bjobs detected job completed for job: {job}")
271
314
  return 0
272
315
  elif "New job is waiting for scheduling" in line:
273
- logger.debug(f"bjobs detected job pending scheduling for job: {job}")
316
+ logger.debug(
317
+ f"bjobs detected job pending scheduling for job: {job}"
318
+ )
274
319
  return None
275
320
  elif "PENDING REASONS" in line or "Status <PEND>" in line:
276
321
  logger.debug(f"bjobs detected job pending for job: {job}")
277
322
  return None
278
323
  elif "Exited with exit code" in line:
279
- exit = int(line[line.find("Exited with exit code ")+22:].split('.')[0])
324
+ exit = int(
325
+ line[line.find("Exited with exit code ") + 22 :].split(".")[0]
326
+ )
280
327
  logger.error(f"bjobs detected job exit code {exit} for job {job}")
281
328
  return exit
282
329
  elif "Completed <exit>" in line:
@@ -293,7 +340,8 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
293
340
  """
294
341
  Implementation-specific helper methods
295
342
  """
296
- def prepareBsub(self, cpu: int, mem: int, jobID: int) -> List[str]:
343
+
344
+ def prepareBsub(self, cpu: int, mem: int, jobID: int) -> list[str]:
297
345
  """
298
346
  Make a bsub commandline to execute.
299
347
 
@@ -308,18 +356,15 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
308
356
  if per_core_reservation() and cpu:
309
357
  mem = mem / math.ceil(cpu)
310
358
  mem = parse_memory(mem)
311
- bsubMem = ['-R',
312
- f'select[mem>{mem}] '
313
- f'rusage[mem={mem}]',
314
- '-M', mem]
315
- bsubCpu = [] if cpu is None else ['-n', str(math.ceil(cpu))]
359
+ bsubMem = ["-R", f"select[mem>{mem}] " f"rusage[mem={mem}]", "-M", mem]
360
+ bsubCpu = [] if cpu is None else ["-n", str(math.ceil(cpu))]
316
361
  bsubline = ["bsub", "-cwd", ".", "-J", f"toil_job_{jobID}"]
317
362
  bsubline.extend(bsubMem)
318
363
  bsubline.extend(bsubCpu)
319
- stdoutfile: str = self.boss.format_std_out_err_path(jobID, '%J', 'out')
320
- stderrfile: str = self.boss.format_std_out_err_path(jobID, '%J', 'err')
321
- bsubline.extend(['-o', stdoutfile, '-e', stderrfile])
322
- lsfArgs = os.getenv('TOIL_LSF_ARGS')
364
+ stdoutfile: str = self.boss.format_std_out_err_path(jobID, "%J", "out")
365
+ stderrfile: str = self.boss.format_std_out_err_path(jobID, "%J", "err")
366
+ bsubline.extend(["-o", stdoutfile, "-e", stderrfile])
367
+ lsfArgs = os.getenv("TOIL_LSF_ARGS")
323
368
  if lsfArgs:
324
369
  bsubline.extend(lsfArgs.split())
325
370
  return bsubline
@@ -333,16 +378,16 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
333
378
  bjobs_dict = None
334
379
  bjobs_records = None
335
380
  # Handle Cannot connect to LSF. Please wait ... type messages
336
- dict_start = bjobs_output_str.find('{')
337
- dict_end = bjobs_output_str.rfind('}')
381
+ dict_start = bjobs_output_str.find("{")
382
+ dict_end = bjobs_output_str.rfind("}")
338
383
  if dict_start != -1 and dict_end != -1:
339
- bjobs_output = bjobs_output_str[dict_start:(dict_end+1)]
384
+ bjobs_output = bjobs_output_str[dict_start : (dict_end + 1)]
340
385
  try:
341
386
  bjobs_dict = json.loads(bjobs_output)
342
387
  except json.decoder.JSONDecodeError:
343
388
  logger.error(f"Could not parse bjobs output: {bjobs_output_str}")
344
- if 'RECORDS' in bjobs_dict:
345
- bjobs_records = bjobs_dict['RECORDS']
389
+ if "RECORDS" in bjobs_dict:
390
+ bjobs_records = bjobs_dict["RECORDS"]
346
391
  if bjobs_records is None:
347
392
  logger.error(f"Could not find bjobs output json in: {bjobs_output_str}")
348
393
 
@@ -358,16 +403,24 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
358
403
  output = subprocess.check_output(["bjobs", "-l", str(jobID)], text=True)
359
404
  max_mem, command = parse_mem_and_cmd_from_output(output=output)
360
405
  if not max_mem:
361
- logger.warning(f"[job ID {jobID}] Unable to Collect Maximum Memory Usage: {output}")
406
+ logger.warning(
407
+ f"[job ID {jobID}] Unable to Collect Maximum Memory Usage: {output}"
408
+ )
362
409
  return
363
410
 
364
411
  if not command:
365
- logger.warning(f"[job ID {jobID}] Cannot Parse Max Memory Due to Missing Command String: {output}")
412
+ logger.warning(
413
+ f"[job ID {jobID}] Cannot Parse Max Memory Due to Missing Command String: {output}"
414
+ )
366
415
  else:
367
- logger.info(f"[job ID {jobID}, Command {command.group(1)}] Max Memory Used: {max_mem.group(1)}")
416
+ logger.info(
417
+ f"[job ID {jobID}, Command {command.group(1)}] Max Memory Used: {max_mem.group(1)}"
418
+ )
368
419
  return max_mem
369
420
  except subprocess.CalledProcessError as e:
370
- logger.warning(f"[job ID {jobID}] Unable to Collect Maximum Memory Usage: {e}")
421
+ logger.warning(
422
+ f"[job ID {jobID}] Unable to Collect Maximum Memory Usage: {e}"
423
+ )
371
424
 
372
425
  def getWaitDuration(self):
373
426
  """We give LSF a second to catch its breath (in seconds)"""
@@ -72,7 +72,7 @@ def apply_conf_file(fn, conf_filename):
72
72
  for env in LSF_CONF_ENV:
73
73
  conf_file = get_conf_file(conf_filename, env)
74
74
  if conf_file:
75
- with open(conf_file, encoding='utf-8') as conf_handle:
75
+ with open(conf_file, encoding="utf-8") as conf_handle:
76
76
  value = fn(conf_handle)
77
77
  if value:
78
78
  return value
@@ -112,9 +112,9 @@ def apply_bparams(fn):
112
112
  """
113
113
  cmd = ["bparams", "-a"]
114
114
  try:
115
- output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode('utf-8')
115
+ output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode("utf-8")
116
116
  except subprocess.CalledProcessError as exc:
117
- logger.debug(exc.output.decode('utf-8'))
117
+ logger.debug(exc.output.decode("utf-8"))
118
118
  return None
119
119
  return fn(output.split("\n"))
120
120
 
@@ -125,9 +125,9 @@ def apply_lsadmin(fn):
125
125
  """
126
126
  cmd = ["lsadmin", "showconf", "lim"]
127
127
  try:
128
- output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode('utf-8')
128
+ output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode("utf-8")
129
129
  except subprocess.CalledProcessError as exc:
130
- logger.debug(exc.output.decode('utf-8'))
130
+ logger.debug(exc.output.decode("utf-8"))
131
131
  return None
132
132
  return fn(output.split("\n"))
133
133
 
@@ -161,7 +161,7 @@ def parse_mem_and_cmd_from_output(output: str):
161
161
  # Handle hard wrapping in the middle of words and arbitrary
162
162
  # indents. May drop spaces at the starts of lines that aren't
163
163
  # meant to be part of the indent.
164
- cleaned_up_output = ' '.join(re.sub(r"\n\s*", "", output).split(','))
164
+ cleaned_up_output = " ".join(re.sub(r"\n\s*", "", output).split(","))
165
165
  max_mem = re.search(r"MAX ?MEM: ?(.*?);", cleaned_up_output)
166
166
  command = re.search(r"Command ?<(.*?)>", cleaned_up_output)
167
167
  return max_mem, command
@@ -173,10 +173,10 @@ def get_lsf_version():
173
173
  """
174
174
  cmd = ["lsid"]
175
175
  try:
176
- output = subprocess.check_output(cmd).decode('utf-8')
176
+ output = subprocess.check_output(cmd).decode("utf-8")
177
177
  except:
178
178
  return None
179
- bjobs_search = re.search('IBM Spectrum LSF Standard (.*),', output)
179
+ bjobs_search = re.search("IBM Spectrum LSF Standard (.*),", output)
180
180
  if bjobs_search:
181
181
  lsf_version = bjobs_search.group(1)
182
182
  return lsf_version
@@ -188,7 +188,9 @@ def check_lsf_json_output_supported():
188
188
  """Check if the current LSF system supports bjobs json output."""
189
189
  try:
190
190
  lsf_version = get_lsf_version()
191
- if lsf_version and (version.parse(lsf_version) >= version.parse(LSF_JSON_OUTPUT_MIN_VERSION)):
191
+ if lsf_version and (
192
+ version.parse(lsf_version) >= version.parse(LSF_JSON_OUTPUT_MIN_VERSION)
193
+ ):
192
194
  return True
193
195
  except:
194
196
  return False
@@ -197,11 +199,11 @@ def check_lsf_json_output_supported():
197
199
 
198
200
  def parse_memory(mem: float) -> str:
199
201
  """Parse memory parameter."""
200
- megabytes_of_mem = convert_units(float(mem), src_unit='B', dst_unit='MB')
202
+ megabytes_of_mem = convert_units(float(mem), src_unit="B", dst_unit="MB")
201
203
  if megabytes_of_mem < 1:
202
204
  megabytes_of_mem = 1.0
203
205
  # round as a string here to avoid returning something like 1.231e+12
204
- return f'{megabytes_of_mem:.0f}MB'
206
+ return f"{megabytes_of_mem:.0f}MB"
205
207
 
206
208
 
207
209
  def per_core_reservation():
@@ -19,19 +19,23 @@ from threading import Lock
19
19
 
20
20
  from toil.provisioners.abstractProvisioner import Shape
21
21
 
22
- TaskData = namedtuple('TaskData', (
23
- # Time when the task was started
24
- 'startTime',
25
- # Mesos' ID of the agent where task is being run
26
- 'agentID',
27
- # IP of agent where task is being run
28
- 'agentIP',
29
- # Mesos' ID of the executor running the task
30
- 'executorID',
31
- # Memory requirement of the task
32
- 'memory',
33
- # CPU requirement of the task
34
- 'cores'))
22
+ TaskData = namedtuple(
23
+ "TaskData",
24
+ (
25
+ # Time when the task was started
26
+ "startTime",
27
+ # Mesos' ID of the agent where task is being run
28
+ "agentID",
29
+ # IP of agent where task is being run
30
+ "agentIP",
31
+ # Mesos' ID of the executor running the task
32
+ "executorID",
33
+ # Memory requirement of the task
34
+ "memory",
35
+ # CPU requirement of the task
36
+ "cores",
37
+ ),
38
+ )
35
39
 
36
40
 
37
41
  class JobQueue:
@@ -52,7 +56,11 @@ class JobQueue:
52
56
 
53
57
  def jobIDs(self):
54
58
  with self.jobLock:
55
- return [job.jobID for queue in list(self.queues.values()) for job in list(queue.queue)]
59
+ return [
60
+ job.jobID
61
+ for queue in list(self.queues.values())
62
+ for job in list(queue.queue)
63
+ ]
56
64
 
57
65
  def nextJobOfType(self, jobType):
58
66
  with self.jobLock:
@@ -80,18 +88,22 @@ class MesosShape(Shape):
80
88
  return not self.greater_than(other)
81
89
 
82
90
 
83
- ToilJob = namedtuple('ToilJob', (
84
- # A job ID specific to this batch system implementation
85
- 'jobID',
86
- # What string to display in the mesos UI
87
- 'name',
88
- # A ResourceRequirement tuple describing the resources needed by this job
89
- 'resources',
90
- # The command to be run on the worker node
91
- 'command',
92
- # The resource object representing the user script
93
- 'userScript',
94
- # A dictionary with additional environment variables to be set on the worker process
95
- 'environment',
96
- # A named tuple containing all the required info for cleaning up the worker node
97
- 'workerCleanupInfo'))
91
+ ToilJob = namedtuple(
92
+ "ToilJob",
93
+ (
94
+ # A job ID specific to this batch system implementation
95
+ "jobID",
96
+ # What string to display in the mesos UI
97
+ "name",
98
+ # A ResourceRequirement tuple describing the resources needed by this job
99
+ "resources",
100
+ # The command to be run on the worker node
101
+ "command",
102
+ # The resource object representing the user script
103
+ "userScript",
104
+ # A dictionary with additional environment variables to be set on the worker process
105
+ "environment",
106
+ # A named tuple containing all the required info for cleaning up the worker node
107
+ "workerCleanupInfo",
108
+ ),
109
+ )