lsst-ctrl-bps-htcondor 29.2025.3700__py3-none-any.whl → 29.2025.3900__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/ctrl/bps/htcondor/__init__.py +1 -0
- lsst/ctrl/bps/htcondor/common_utils.py +306 -0
- lsst/ctrl/bps/htcondor/htcondor_service.py +12 -2060
- lsst/ctrl/bps/htcondor/htcondor_workflow.py +89 -0
- lsst/ctrl/bps/htcondor/lssthtc.py +27 -1
- lsst/ctrl/bps/htcondor/prepare_utils.py +967 -0
- lsst/ctrl/bps/htcondor/provisioner.py +3 -2
- lsst/ctrl/bps/htcondor/report_utils.py +842 -0
- lsst/ctrl/bps/htcondor/version.py +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/METADATA +1 -1
- lsst_ctrl_bps_htcondor-29.2025.3900.dist-info/RECORD +23 -0
- lsst_ctrl_bps_htcondor-29.2025.3700.dist-info/RECORD +0 -19
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/WHEEL +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/licenses/LICENSE +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/top_level.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/zip-safe +0 -0
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
# This file is part of ctrl_bps_htcondor.
|
|
2
|
+
#
|
|
3
|
+
# Developed for the LSST Data Management System.
|
|
4
|
+
# This product includes software developed by the LSST Project
|
|
5
|
+
# (https://www.lsst.org).
|
|
6
|
+
# See the COPYRIGHT file at the top-level directory of this distribution
|
|
7
|
+
# for details of code ownership.
|
|
8
|
+
#
|
|
9
|
+
# This software is dual licensed under the GNU General Public License and also
|
|
10
|
+
# under a 3-clause BSD license. Recipients may choose which of these licenses
|
|
11
|
+
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
|
|
12
|
+
# respectively. If you choose the GPL option then the following text applies
|
|
13
|
+
# (but note that there is still no warranty even if you opt for BSD instead):
|
|
14
|
+
#
|
|
15
|
+
# This program is free software: you can redistribute it and/or modify
|
|
16
|
+
# it under the terms of the GNU General Public License as published by
|
|
17
|
+
# the Free Software Foundation, either version 3 of the License, or
|
|
18
|
+
# (at your option) any later version.
|
|
19
|
+
#
|
|
20
|
+
# This program is distributed in the hope that it will be useful,
|
|
21
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
22
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
23
|
+
# GNU General Public License for more details.
|
|
24
|
+
#
|
|
25
|
+
# You should have received a copy of the GNU General Public License
|
|
26
|
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
27
|
+
|
|
28
|
+
"""Utility functions used by multiple functions in ctrl_bps_htcondor."""
|
|
29
|
+
|
|
30
|
+
import logging
|
|
31
|
+
from enum import IntEnum, auto
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
|
|
34
|
+
import htcondor
|
|
35
|
+
|
|
36
|
+
from lsst.ctrl.bps import (
|
|
37
|
+
WmsStates,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
from .lssthtc import (
|
|
41
|
+
NodeStatus,
|
|
42
|
+
condor_history,
|
|
43
|
+
condor_q,
|
|
44
|
+
read_dag_info,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
_LOG = logging.getLogger(__name__)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class WmsIdType(IntEnum):
    """Kinds of identifiers that can be used to refer to a WMS job."""

    UNKNOWN = auto()
    """The type of id cannot be determined."""

    LOCAL = auto()
    """The id is HTCondor job's ClusterId (with optional '.ProcId')."""

    GLOBAL = auto()
    """Id is a HTCondor's global job id."""

    PATH = auto()
    """Id is a submission path."""
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _htc_status_to_wms_state(job):
    """Translate the status of an HTCondor job into a generic WMS state.

    The job status ("JobStatus") is consulted first. The DAG node status
    ("NodeStatus") is used only when the job status is absent or maps
    to ``WmsStates.MISFIT``.

    Parameters
    ----------
    job : `dict` [`str`, `~typing.Any`]
        HTCondor job information.

    Returns
    -------
    wms_state : `WmsStates`
        The equivalent WmsState to given job's status.
    """
    if "JobStatus" in job:
        from_job_status = _htc_job_status_to_wms_state(job)
        if from_job_status != WmsStates.MISFIT:
            return from_job_status

    # Job status was missing or inconclusive; try the node status.
    if "NodeStatus" in job:
        return _htc_node_status_to_wms_state(job)
    return WmsStates.MISFIT
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _htc_job_status_to_wms_state(job):
    """Convert HTCondor job status to generic wms state.

    Parameters
    ----------
    job : `dict` [`str`, `~typing.Any`]
        HTCondor job information. Only the "JobStatus" entry is required
        for a meaningful result; "ClusterId" is used for logging only.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given job's status.
        ``WmsStates.MISFIT`` is returned when "JobStatus" is missing,
        falsy, or not one of the statuses handled below.
    """
    # Use .get() so that a record lacking "ClusterId" or "JobStatus" is
    # reported as MISFIT below instead of raising KeyError while logging.
    raw_status = job.get("JobStatus")
    _LOG.debug(
        "htc_job_status_to_wms_state: %s=%s, %s", job.get("ClusterId"), raw_status, type(raw_status)
    )
    wms_state = WmsStates.MISFIT
    if raw_status:
        job_status = int(raw_status)

        _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
        if job_status == htcondor.JobStatus.IDLE:
            wms_state = WmsStates.PENDING
        elif job_status == htcondor.JobStatus.RUNNING:
            wms_state = WmsStates.RUNNING
        elif job_status == htcondor.JobStatus.REMOVED:
            wms_state = WmsStates.DELETED
        elif job_status == htcondor.JobStatus.COMPLETED:
            # A completed job counts as failed if it was killed by a
            # signal, exited with a non-zero code, or carries a non-zero
            # DAG_Status.
            if (
                (job.get("ExitBySignal", False) and job.get("ExitSignal", 0))
                or job.get("ExitCode", 0)
                or job.get("DAG_Status", 0)
            ):
                wms_state = WmsStates.FAILED
            else:
                wms_state = WmsStates.SUCCEEDED
        elif job_status == htcondor.JobStatus.HELD:
            wms_state = WmsStates.HELD

    return wms_state
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _htc_node_status_to_wms_state(job):
    """Translate a DAG node status into a generic WMS state.

    Parameters
    ----------
    job : `dict` [`str`, `~typing.Any`]
        HTCondor job information. Must contain "NodeStatus"; the
        "JobProcsHeld", "StatusDetails", and "JobProcsQueued" entries
        are read for submitted nodes and "StatusDetails" for nodes in
        error.

    Returns
    -------
    wms_state : `lsst.ctrl.bps.WmsStates`
        The equivalent WmsState to given node's status.
        ``WmsStates.MISFIT`` is returned for statuses without a direct
        equivalent.
    """
    node_status = job["NodeStatus"]

    if node_status == NodeStatus.NOT_READY:
        return WmsStates.UNREADY
    if node_status == NodeStatus.READY:
        return WmsStates.READY
    if node_status == NodeStatus.PRERUN:
        return WmsStates.MISFIT
    if node_status == NodeStatus.SUBMITTED:
        # Held takes precedence over running, which takes precedence
        # over merely being queued.
        if job["JobProcsHeld"]:
            return WmsStates.HELD
        if job["StatusDetails"] == "not_idle":
            return WmsStates.RUNNING
        if job["JobProcsQueued"]:
            return WmsStates.PENDING
        return WmsStates.MISFIT
    if node_status == NodeStatus.POSTRUN:
        return WmsStates.MISFIT
    if node_status == NodeStatus.DONE:
        return WmsStates.SUCCEEDED
    if node_status == NodeStatus.ERROR:
        # Use job exit status instead of post script exit status.
        details = job["StatusDetails"]
        if "DAGMAN error 0" in details:
            return WmsStates.SUCCEEDED
        if "ULOG_JOB_ABORTED" in details:
            return WmsStates.DELETED
        return WmsStates.FAILED
    if node_status == NodeStatus.FUTILE:
        return WmsStates.PRUNED
    return WmsStates.MISFIT
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _wms_id_type(wms_id):
    """Determine the type of the WMS id.

    Parameters
    ----------
    wms_id : `str`
        WMS id identifying a job.

    Returns
    -------
    id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
        Type of WMS id:

        - ``LOCAL`` if the id can be converted to a finite number,
        - ``PATH`` if it is not numeric but names an existing directory,
        - ``GLOBAL`` if it is not numeric and not an existing directory,
        - ``UNKNOWN`` if its type does not support numeric conversion.
    """
    try:
        int(float(wms_id))
    except (ValueError, OverflowError):
        # ValueError: not a number (including "nan").
        # OverflowError: "inf"/"infinity", which float() accepts but int()
        # rejects; treat it like any other non-numeric string.
        wms_path = Path(wms_id)
        if wms_path.is_dir():
            id_type = WmsIdType.PATH
        else:
            id_type = WmsIdType.GLOBAL
    except TypeError:
        id_type = WmsIdType.UNKNOWN
    else:
        id_type = WmsIdType.LOCAL
    return id_type
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _wms_id_to_cluster(wms_id):
    """Convert WMS id to the scheduler and cluster id of the job.

    Parameters
    ----------
    wms_id : `int` or `float` or `str`
        HTCondor job id or path.

    Returns
    -------
    schedd_ad : `classad.ClassAd`
        ClassAd describing the scheduler managing the job with the given id.
        `None` if the scheduler could not be determined.
    cluster_id : `int`
        HTCondor cluster id. `None` if the job could not be found.
    id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
        The type of the provided id.
    """
    coll = htcondor.Collector()

    schedd_ad = None
    cluster_id = None
    id_type = _wms_id_type(wms_id)
    if id_type == WmsIdType.LOCAL:
        # A local id is taken to refer to the default scheduler.
        schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
        cluster_id = int(float(wms_id))
    elif id_type == WmsIdType.GLOBAL:
        # The global id does not say which scheduler to ask, so query
        # every scheduler known to the collector.
        constraint = f'GlobalJobId == "{wms_id}"'
        schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)}
        schedds = {name: htcondor.Schedd(ad) for name, ad in schedd_ads.items()}
        job_info = condor_q(constraint=constraint, schedds=schedds)
        if job_info:
            schedd_name, job_rec = job_info.popitem()
            job_id, _ = job_rec.popitem()
            schedd_ad = schedd_ads[schedd_name]
            cluster_id = int(float(job_id))
    elif id_type == WmsIdType.PATH:
        # FileNotFoundError and PermissionError are subclasses of OSError.
        try:
            job_info = read_dag_info(wms_id)
        except OSError:
            job_info = None
        # Guard against an empty result the same way the GLOBAL branch
        # does; popitem() on an empty mapping would raise KeyError.
        if job_info:
            schedd_name, job_rec = job_info.popitem()
            job_id, _ = job_rec.popitem()
            schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name)
            cluster_id = int(float(job_id))
    return schedd_ad, cluster_id, id_type
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _wms_id_to_dir(wms_id):
    """Find a candidate submit directory for the given WMS id.

    No check is made that the directory exists or that it is a valid
    BPS submit directory.

    Parameters
    ----------
    wms_id : `int` or `float` or `str`
        HTCondor job id or path.

    Returns
    -------
    wms_path : `pathlib.Path` or None
        Submit directory candidate for the run with the given job id. If no
        directory can be associated with the provided WMS id, it will be set
        to None.
    id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
        The type of the provided id.

    Raises
    ------
    TypeError
        Raised if provided WMS id has invalid type.
    """
    collector = htcondor.Collector()
    id_type = _wms_id_type(wms_id)

    if id_type == WmsIdType.UNKNOWN:
        raise TypeError(f"Invalid job id type: {wms_id}")
    if id_type == WmsIdType.PATH:
        # The id already is a directory; no scheduler query is needed.
        return Path(wms_id).resolve(), id_type

    if id_type == WmsIdType.LOCAL:
        query = f"ClusterId == {int(float(wms_id))}"
        ads = [collector.locate(htcondor.DaemonTypes.Schedd)]
    elif id_type == WmsIdType.GLOBAL:
        query = f'GlobalJobId == "{wms_id}"'
        ads = list(collector.locateAll(htcondor.DaemonTypes.Schedd))
    else:
        return None, id_type

    # The job's initial working directory ("Iwd") serves as the submit
    # directory candidate.
    schedds = {ad["name"]: htcondor.Schedd(ad) for ad in ads}
    history = condor_history(constraint=query, schedds=schedds, projection=["Iwd"])
    if not history:
        return None, id_type
    _, records = history.popitem()
    _, job_ad = records.popitem()
    return Path(job_ad["Iwd"]), id_type
|