toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/htcondor.py
CHANGED
|
@@ -18,12 +18,13 @@ import os
|
|
|
18
18
|
import time
|
|
19
19
|
from contextlib import contextmanager
|
|
20
20
|
from threading import Lock
|
|
21
|
-
from typing import Any,
|
|
21
|
+
from typing import Any, Optional
|
|
22
22
|
|
|
23
23
|
import htcondor
|
|
24
24
|
|
|
25
|
-
from toil.batchSystems.abstractGridEngineBatchSystem import
|
|
26
|
-
AbstractGridEngineBatchSystem
|
|
25
|
+
from toil.batchSystems.abstractGridEngineBatchSystem import (
|
|
26
|
+
AbstractGridEngineBatchSystem,
|
|
27
|
+
)
|
|
27
28
|
from toil.job import AcceleratorRequirement
|
|
28
29
|
from toil.lib.retry import retry
|
|
29
30
|
|
|
@@ -40,15 +41,18 @@ logger = logging.getLogger(__name__)
|
|
|
40
41
|
# *Command to run* (swapped with unit name)
|
|
41
42
|
# Environment dict for the job
|
|
42
43
|
# Accelerator requirements for the job
|
|
43
|
-
JobTuple =
|
|
44
|
+
JobTuple = tuple[
|
|
45
|
+
int, int, int, int, str, str, dict[str, str], list[AcceleratorRequirement]
|
|
46
|
+
]
|
|
44
47
|
|
|
45
48
|
# We have one global lock to control access to the HTCondor scheduler
|
|
46
49
|
schedd_lock = Lock()
|
|
47
50
|
|
|
51
|
+
|
|
48
52
|
class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
49
53
|
# When using HTCondor, the Schedd handles scheduling
|
|
50
54
|
|
|
51
|
-
class
|
|
55
|
+
class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
|
|
52
56
|
|
|
53
57
|
# Override the createJobs method so that we can use htcondor.Submit objects
|
|
54
58
|
# and so that we can get disk allocation requests and ceil the CPU request.
|
|
@@ -59,15 +63,31 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
59
63
|
self.waitingJobs.append(newJob)
|
|
60
64
|
|
|
61
65
|
# Queue jobs as necessary:
|
|
62
|
-
while len(self.waitingJobs) > 0 and len(self.runningJobs) < int(
|
|
66
|
+
while len(self.waitingJobs) > 0 and len(self.runningJobs) < int(
|
|
67
|
+
self.boss.config.max_jobs
|
|
68
|
+
):
|
|
63
69
|
activity = True
|
|
64
|
-
|
|
70
|
+
(
|
|
71
|
+
jobID,
|
|
72
|
+
cpu,
|
|
73
|
+
memory,
|
|
74
|
+
disk,
|
|
75
|
+
jobName,
|
|
76
|
+
command,
|
|
77
|
+
environment,
|
|
78
|
+
accelerators,
|
|
79
|
+
) = self.waitingJobs.pop(0)
|
|
65
80
|
|
|
66
81
|
if accelerators:
|
|
67
|
-
logger.warning(
|
|
82
|
+
logger.warning(
|
|
83
|
+
"Scheduling job %s without enforcing accelerator requirement",
|
|
84
|
+
jobID,
|
|
85
|
+
)
|
|
68
86
|
|
|
69
87
|
# Prepare the htcondor.Submit object
|
|
70
|
-
submitObj: htcondor.Submit = self.prepareSubmission(
|
|
88
|
+
submitObj: htcondor.Submit = self.prepareSubmission(
|
|
89
|
+
cpu, memory, disk, jobID, jobName, command, environment
|
|
90
|
+
)
|
|
71
91
|
logger.debug("Submitting %r", submitObj)
|
|
72
92
|
|
|
73
93
|
# Submit job and get batch system ID (i.e. the ClusterId)
|
|
@@ -85,13 +105,22 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
85
105
|
|
|
86
106
|
return activity
|
|
87
107
|
|
|
88
|
-
def prepareSubmission(
|
|
108
|
+
def prepareSubmission(
|
|
109
|
+
self,
|
|
110
|
+
cpu: int,
|
|
111
|
+
memory: int,
|
|
112
|
+
disk: int,
|
|
113
|
+
jobID: int,
|
|
114
|
+
jobName: str,
|
|
115
|
+
command: str,
|
|
116
|
+
environment: dict[str, str],
|
|
117
|
+
) -> htcondor.Submit:
|
|
89
118
|
# Note that we don't yet take the accelerators here.
|
|
90
119
|
|
|
91
120
|
# Convert resource requests
|
|
92
|
-
cpu = int(math.ceil(cpu))
|
|
93
|
-
ht_memory = float(memory)/1024
|
|
94
|
-
ht_disk = float(disk)/1024
|
|
121
|
+
cpu = int(math.ceil(cpu)) # integer CPUs
|
|
122
|
+
ht_memory = float(memory) / 1024 # memory in KB
|
|
123
|
+
ht_disk = float(disk) / 1024 # disk in KB
|
|
95
124
|
|
|
96
125
|
# NOTE: format_std_out_err_path() by default puts files in the Toil
|
|
97
126
|
# work directory, which defaults to being in the system temporary
|
|
@@ -101,41 +130,54 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
101
130
|
# = Yes in the submit file, so that HTCondor will write the
|
|
102
131
|
# standard output/error files on the compute node, then transfer
|
|
103
132
|
# back once the job has completed.
|
|
104
|
-
stdoutfile: str = self.boss.format_std_out_err_path(
|
|
105
|
-
|
|
106
|
-
|
|
133
|
+
stdoutfile: str = self.boss.format_std_out_err_path(
|
|
134
|
+
jobID, "$(cluster)", "out"
|
|
135
|
+
)
|
|
136
|
+
stderrfile: str = self.boss.format_std_out_err_path(
|
|
137
|
+
jobID, "$(cluster)", "err"
|
|
138
|
+
)
|
|
139
|
+
condorlogfile: str = self.boss.format_std_out_err_path(
|
|
140
|
+
jobID, "$(cluster)", "events"
|
|
141
|
+
)
|
|
107
142
|
|
|
108
143
|
# Execute the entire command as /bin/sh -c "command"
|
|
109
144
|
# TODO: Transfer the jobStore directory if using a local file store with a relative path.
|
|
110
145
|
submit_parameters = {
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
146
|
+
"executable": "/bin/sh",
|
|
147
|
+
"transfer_executable": "False",
|
|
148
|
+
"arguments": f'''"-c '{self.duplicate_quotes(command)}'"'''.encode(), # Workaround for HTCondor Python bindings Unicode conversion bug
|
|
149
|
+
"environment": self.getEnvString(environment),
|
|
150
|
+
"getenv": "True",
|
|
151
|
+
"should_transfer_files": "Yes", # See note above for stdoutfile, stderrfile
|
|
152
|
+
"output": stdoutfile,
|
|
153
|
+
"error": stderrfile,
|
|
154
|
+
"log": condorlogfile,
|
|
155
|
+
"request_cpus": f"{cpu}",
|
|
156
|
+
"request_memory": f"{ht_memory:.3f}KB",
|
|
157
|
+
"request_disk": f"{ht_disk:.3f}KB",
|
|
158
|
+
"leave_in_queue": "(JobStatus == 4)",
|
|
159
|
+
"+IsToilJob": "True",
|
|
160
|
+
"+ToilJobID": f"{jobID}",
|
|
161
|
+
"+ToilJobName": f'"{jobName}"',
|
|
162
|
+
"+ToilJobKilled": "False",
|
|
128
163
|
}
|
|
129
164
|
|
|
130
165
|
# Extra parameters for HTCondor
|
|
131
|
-
extra_parameters = os.getenv(
|
|
166
|
+
extra_parameters = os.getenv("TOIL_HTCONDOR_PARAMS")
|
|
132
167
|
if extra_parameters is not None:
|
|
133
|
-
logger.debug(
|
|
134
|
-
|
|
168
|
+
logger.debug(
|
|
169
|
+
f"Extra HTCondor parameters added to submit file from TOIL_HTCONDOR_PARAMS env. variable: {extra_parameters}"
|
|
170
|
+
)
|
|
171
|
+
for parameter, value in [
|
|
172
|
+
parameter_value.split("=", 1)
|
|
173
|
+
for parameter_value in extra_parameters.split(";")
|
|
174
|
+
]:
|
|
135
175
|
parameter = parameter.strip()
|
|
136
176
|
value = value.strip()
|
|
137
177
|
if parameter in submit_parameters:
|
|
138
|
-
raise ValueError(
|
|
178
|
+
raise ValueError(
|
|
179
|
+
f"Some extra parameters are incompatible: {extra_parameters}"
|
|
180
|
+
)
|
|
139
181
|
|
|
140
182
|
submit_parameters[parameter] = value
|
|
141
183
|
|
|
@@ -156,23 +198,24 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
156
198
|
def getRunningJobIDs(self):
|
|
157
199
|
|
|
158
200
|
# Get all Toil jobs that are running
|
|
159
|
-
requirements =
|
|
160
|
-
projection = [
|
|
201
|
+
requirements = "(JobStatus == 2) && (IsToilJob)"
|
|
202
|
+
projection = ["ClusterId", "ToilJobID", "EnteredCurrentStatus"]
|
|
161
203
|
with self.connectSchedd() as schedd:
|
|
162
|
-
ads = schedd.xquery(requirements
|
|
163
|
-
projection = projection)
|
|
204
|
+
ads = schedd.xquery(requirements=requirements, projection=projection)
|
|
164
205
|
|
|
165
206
|
# Only consider the Toil jobs that are part of this workflow
|
|
166
|
-
batchJobIDs = [
|
|
207
|
+
batchJobIDs = [
|
|
208
|
+
batchJobID for (batchJobID, task) in self.batchJobIDs.values()
|
|
209
|
+
]
|
|
167
210
|
job_runtimes = {}
|
|
168
211
|
for ad in ads:
|
|
169
|
-
batchJobID = int(ad[
|
|
170
|
-
jobID = int(ad[
|
|
212
|
+
batchJobID = int(ad["ClusterId"])
|
|
213
|
+
jobID = int(ad["ToilJobID"])
|
|
171
214
|
if not (batchJobID in batchJobIDs):
|
|
172
215
|
continue
|
|
173
216
|
|
|
174
217
|
# HTCondor stores the start of the runtime as a Unix timestamp
|
|
175
|
-
runtime = time.time() - ad[
|
|
218
|
+
runtime = time.time() - ad["EnteredCurrentStatus"]
|
|
176
219
|
job_runtimes[jobID] = runtime
|
|
177
220
|
|
|
178
221
|
return job_runtimes
|
|
@@ -183,28 +226,33 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
183
226
|
|
|
184
227
|
# Set the job to be killed when its exit status is checked
|
|
185
228
|
with self.connectSchedd() as schedd:
|
|
186
|
-
job_spec = f
|
|
187
|
-
schedd.edit(job_spec,
|
|
229
|
+
job_spec = f"(ClusterId == {batchJobID})"
|
|
230
|
+
schedd.edit(job_spec, "ToilJobKilled", "True")
|
|
188
231
|
|
|
189
232
|
def getJobExitCode(self, batchJobID):
|
|
190
233
|
logger.debug(f"Getting exit code for HTCondor job {batchJobID}")
|
|
191
234
|
|
|
192
235
|
status = {
|
|
193
|
-
1:
|
|
194
|
-
2:
|
|
195
|
-
3:
|
|
196
|
-
4:
|
|
197
|
-
5:
|
|
198
|
-
6:
|
|
199
|
-
7:
|
|
236
|
+
1: "Idle",
|
|
237
|
+
2: "Running",
|
|
238
|
+
3: "Removed",
|
|
239
|
+
4: "Completed",
|
|
240
|
+
5: "Held",
|
|
241
|
+
6: "Transferring Output",
|
|
242
|
+
7: "Suspended",
|
|
200
243
|
}
|
|
201
244
|
|
|
202
|
-
requirements = f
|
|
203
|
-
projection = [
|
|
204
|
-
|
|
245
|
+
requirements = f"(ClusterId == {batchJobID})"
|
|
246
|
+
projection = [
|
|
247
|
+
"JobStatus",
|
|
248
|
+
"ToilJobKilled",
|
|
249
|
+
"ExitCode",
|
|
250
|
+
"HoldReason",
|
|
251
|
+
"HoldReasonSubCode",
|
|
252
|
+
]
|
|
205
253
|
|
|
206
254
|
with self.connectSchedd() as schedd:
|
|
207
|
-
ads = schedd.xquery(requirements
|
|
255
|
+
ads = schedd.xquery(requirements=requirements, projection=projection)
|
|
208
256
|
|
|
209
257
|
# Make sure a ClassAd was returned
|
|
210
258
|
try:
|
|
@@ -214,7 +262,8 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
214
262
|
ad = ads.next()
|
|
215
263
|
except StopIteration:
|
|
216
264
|
logger.error(
|
|
217
|
-
f"No HTCondor ads returned using constraint: {requirements}"
|
|
265
|
+
f"No HTCondor ads returned using constraint: {requirements}"
|
|
266
|
+
)
|
|
218
267
|
raise
|
|
219
268
|
|
|
220
269
|
# Make sure only one ClassAd was returned
|
|
@@ -227,40 +276,49 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
227
276
|
pass
|
|
228
277
|
else:
|
|
229
278
|
logger.warning(
|
|
230
|
-
f"Multiple HTCondor ads returned using constraint: {requirements}"
|
|
279
|
+
f"Multiple HTCondor ads returned using constraint: {requirements}"
|
|
280
|
+
)
|
|
231
281
|
|
|
232
|
-
if ad[
|
|
282
|
+
if ad["ToilJobKilled"]:
|
|
233
283
|
logger.debug(f"HTCondor job {batchJobID} was killed by Toil")
|
|
234
284
|
|
|
235
285
|
# Remove the job from the Schedd and return 1
|
|
236
|
-
job_spec = f
|
|
286
|
+
job_spec = f"ClusterId == {batchJobID}"
|
|
237
287
|
schedd.act(htcondor.JobAction.Remove, job_spec)
|
|
238
288
|
return 1
|
|
239
289
|
|
|
240
|
-
elif status[ad[
|
|
241
|
-
logger.debug(
|
|
242
|
-
|
|
290
|
+
elif status[ad["JobStatus"]] == "Completed":
|
|
291
|
+
logger.debug(
|
|
292
|
+
"HTCondor job {} completed with exit code {}".format(
|
|
293
|
+
batchJobID, ad["ExitCode"]
|
|
294
|
+
)
|
|
295
|
+
)
|
|
243
296
|
|
|
244
297
|
# Remove the job from the Schedd and return its exit code
|
|
245
|
-
job_spec = f
|
|
298
|
+
job_spec = f"ClusterId == {batchJobID}"
|
|
246
299
|
schedd.act(htcondor.JobAction.Remove, job_spec)
|
|
247
|
-
return int(ad[
|
|
300
|
+
return int(ad["ExitCode"])
|
|
248
301
|
|
|
249
|
-
elif status[ad[
|
|
250
|
-
logger.error(
|
|
251
|
-
|
|
302
|
+
elif status[ad["JobStatus"]] == "Held":
|
|
303
|
+
logger.error(
|
|
304
|
+
"HTCondor job {} was held: '{} (sub code {})'".format(
|
|
305
|
+
batchJobID, ad["HoldReason"], ad["HoldReasonSubCode"]
|
|
306
|
+
)
|
|
307
|
+
)
|
|
252
308
|
|
|
253
309
|
# Remove the job from the Schedd and return 1
|
|
254
|
-
job_spec = f
|
|
310
|
+
job_spec = f"ClusterId == {batchJobID}"
|
|
255
311
|
schedd.act(htcondor.JobAction.Remove, job_spec)
|
|
256
312
|
return 1
|
|
257
313
|
|
|
258
|
-
else:
|
|
259
|
-
logger.debug(
|
|
260
|
-
|
|
314
|
+
else: # Job still running or idle or doing something else
|
|
315
|
+
logger.debug(
|
|
316
|
+
"HTCondor job {} has not completed (Status: {})".format(
|
|
317
|
+
batchJobID, status[ad["JobStatus"]]
|
|
318
|
+
)
|
|
319
|
+
)
|
|
261
320
|
return None
|
|
262
321
|
|
|
263
|
-
|
|
264
322
|
"""
|
|
265
323
|
Implementation-specific helper methods
|
|
266
324
|
"""
|
|
@@ -294,7 +352,7 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
294
352
|
"""
|
|
295
353
|
Ping the scheduler, or fail if it persistently cannot be contacted.
|
|
296
354
|
"""
|
|
297
|
-
schedd.xquery(limit
|
|
355
|
+
schedd.xquery(limit=0)
|
|
298
356
|
|
|
299
357
|
@retry(errors=[htcondor.HTCondorIOError])
|
|
300
358
|
def _get_schedd_address(self) -> Optional[str]:
|
|
@@ -305,8 +363,8 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
305
363
|
"""
|
|
306
364
|
# TODO: Memoize? Or is the collector meant to field every request?
|
|
307
365
|
|
|
308
|
-
condor_host = os.getenv(
|
|
309
|
-
schedd_name = os.getenv(
|
|
366
|
+
condor_host = os.getenv("TOIL_HTCONDOR_COLLECTOR")
|
|
367
|
+
schedd_name = os.getenv("TOIL_HTCONDOR_SCHEDD")
|
|
310
368
|
|
|
311
369
|
# Get the scheduler's address, if not local
|
|
312
370
|
schedd_ad: Optional[str] = None
|
|
@@ -315,17 +373,22 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
315
373
|
if condor_host and schedd_name:
|
|
316
374
|
logger.debug(
|
|
317
375
|
"Connecting to HTCondor Schedd {} using Collector at {}".format(
|
|
318
|
-
schedd_name, condor_host
|
|
376
|
+
schedd_name, condor_host
|
|
377
|
+
)
|
|
378
|
+
)
|
|
319
379
|
try:
|
|
320
380
|
schedd_ad = htcondor.Collector(condor_host).locate(
|
|
321
|
-
htcondor.DaemonTypes.Schedd, schedd_name
|
|
381
|
+
htcondor.DaemonTypes.Schedd, schedd_name
|
|
382
|
+
)
|
|
322
383
|
except OSError:
|
|
323
384
|
logger.error(
|
|
324
|
-
f"Could not connect to HTCondor Collector at {condor_host}"
|
|
385
|
+
f"Could not connect to HTCondor Collector at {condor_host}"
|
|
386
|
+
)
|
|
325
387
|
raise
|
|
326
388
|
except ValueError:
|
|
327
389
|
logger.error(
|
|
328
|
-
f"Could not find HTCondor Schedd with name {schedd_name}"
|
|
390
|
+
f"Could not find HTCondor Schedd with name {schedd_name}"
|
|
391
|
+
)
|
|
329
392
|
raise
|
|
330
393
|
else:
|
|
331
394
|
# Otherwise assume the Schedd is on the local machine
|
|
@@ -359,7 +422,7 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
359
422
|
"""
|
|
360
423
|
return value.replace("'", "''").replace('"', '""')
|
|
361
424
|
|
|
362
|
-
def getEnvString(self, overrides:
|
|
425
|
+
def getEnvString(self, overrides: dict[str, str]) -> str:
|
|
363
426
|
"""
|
|
364
427
|
Build an environment string that a HTCondor Submit object can use.
|
|
365
428
|
|
|
@@ -384,12 +447,14 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
384
447
|
|
|
385
448
|
# The entire string should be encapsulated in double quotes
|
|
386
449
|
# Each variable should be separated by a single space
|
|
387
|
-
return '"' +
|
|
450
|
+
return '"' + " ".join(env_items) + '"'
|
|
388
451
|
|
|
389
452
|
# Override the issueBatchJob method so HTCondor can be given the disk request
|
|
390
|
-
def issueBatchJob(
|
|
453
|
+
def issueBatchJob(
|
|
454
|
+
self, command: str, jobNode, job_environment: Optional[dict[str, str]] = None
|
|
455
|
+
):
|
|
391
456
|
# Avoid submitting internal jobs to the batch queue, handle locally
|
|
392
|
-
localID = self.handleLocalJob(jobNode)
|
|
457
|
+
localID = self.handleLocalJob(command, jobNode)
|
|
393
458
|
if localID is not None:
|
|
394
459
|
return localID
|
|
395
460
|
else:
|
|
@@ -398,7 +463,19 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
398
463
|
self.currentJobs.add(jobID)
|
|
399
464
|
|
|
400
465
|
# Construct our style of job tuple
|
|
401
|
-
self.newJobsQueue.put(
|
|
402
|
-
|
|
403
|
-
|
|
466
|
+
self.newJobsQueue.put(
|
|
467
|
+
(
|
|
468
|
+
jobID,
|
|
469
|
+
jobNode.cores,
|
|
470
|
+
jobNode.memory,
|
|
471
|
+
jobNode.disk,
|
|
472
|
+
jobNode.jobName,
|
|
473
|
+
command,
|
|
474
|
+
job_environment or {},
|
|
475
|
+
jobNode.accelerators,
|
|
476
|
+
)
|
|
477
|
+
)
|
|
478
|
+
logger.debug(
|
|
479
|
+
"Issued the job command: %s with job id: %s ", command, str(jobID)
|
|
480
|
+
)
|
|
404
481
|
return jobID
|