lsst-ctrl-bps-htcondor 29.2025.3800__py3-none-any.whl → 29.2025.4100__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/ctrl/bps/htcondor/__init__.py +1 -0
- lsst/ctrl/bps/htcondor/common_utils.py +306 -0
- lsst/ctrl/bps/htcondor/htcondor_service.py +12 -2060
- lsst/ctrl/bps/htcondor/htcondor_workflow.py +89 -0
- lsst/ctrl/bps/htcondor/lssthtc.py +26 -0
- lsst/ctrl/bps/htcondor/prepare_utils.py +967 -0
- lsst/ctrl/bps/htcondor/report_utils.py +842 -0
- lsst/ctrl/bps/htcondor/version.py +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.3800.dist-info → lsst_ctrl_bps_htcondor-29.2025.4100.dist-info}/METADATA +1 -1
- lsst_ctrl_bps_htcondor-29.2025.4100.dist-info/RECORD +23 -0
- lsst_ctrl_bps_htcondor-29.2025.3800.dist-info/RECORD +0 -19
- {lsst_ctrl_bps_htcondor-29.2025.3800.dist-info → lsst_ctrl_bps_htcondor-29.2025.4100.dist-info}/WHEEL +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3800.dist-info → lsst_ctrl_bps_htcondor-29.2025.4100.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3800.dist-info → lsst_ctrl_bps_htcondor-29.2025.4100.dist-info}/licenses/LICENSE +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3800.dist-info → lsst_ctrl_bps_htcondor-29.2025.4100.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3800.dist-info → lsst_ctrl_bps_htcondor-29.2025.4100.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3800.dist-info → lsst_ctrl_bps_htcondor-29.2025.4100.dist-info}/top_level.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3800.dist-info → lsst_ctrl_bps_htcondor-29.2025.4100.dist-info}/zip-safe +0 -0
@@ -0,0 +1,967 @@
+# This file is part of ctrl_bps_htcondor.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (https://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Utility functions for preparing the HTCondor workflow."""
+
+import logging
+import os
+import re
+from collections import defaultdict
+from copy import deepcopy
+from pathlib import Path
+from typing import Any, cast
+
+from lsst.ctrl.bps import (
+    BpsConfig,
+    GenericWorkflow,
+    GenericWorkflowGroup,
+    GenericWorkflowJob,
+    GenericWorkflowNodeType,
+    GenericWorkflowNoopJob,
+)
+from lsst.ctrl.bps.bps_utils import create_count_summary
+
+from .lssthtc import (
+    HTCDag,
+    HTCJob,
+    condor_status,
+    htc_escape,
+)
+
+_LOG = logging.getLogger(__name__)
+
+DEFAULT_HTC_EXEC_PATT = ".*worker.*"
+"""Default pattern for searching execute machines in an HTCondor pool.
+"""
+
+
+def _create_job(subdir_template, cached_values, generic_workflow, gwjob, out_prefix):
+    """Convert GenericWorkflow job nodes to DAG jobs.
+
+    Parameters
+    ----------
+    subdir_template : `str`
+        Template for making subdirs.
+    cached_values : `dict`
+        Site and label specific values.
+    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
+        Generic workflow that is being converted.
+    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
+        The generic job to convert to a HTCondor job.
+    out_prefix : `str`
+        Directory prefix for HTCondor files.
+
+    Returns
+    -------
+    htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
+        The HTCondor job equivalent to the given generic job.
+    """
+    htc_job = HTCJob(gwjob.name, label=gwjob.label)
+
+    curvals = defaultdict(str)
+    curvals["label"] = gwjob.label
+    if gwjob.tags:
+        curvals.update(gwjob.tags)
+
+    subdir = Path("jobs") / subdir_template.format_map(curvals)
+    htc_job.subdir = subdir
+    htc_job.subfile = f"{gwjob.name}.sub"
+    htc_job.add_dag_cmds({"dir": subdir})
+
+    htc_job_cmds = {
+        "universe": "vanilla",
+        "should_transfer_files": "YES",
+        "when_to_transfer_output": "ON_EXIT_OR_EVICT",
+        "transfer_output_files": '""',  # Set to empty string to disable
+        "transfer_executable": "False",
+        "getenv": "True",
+        # Exceeding memory sometimes triggering SIGBUS or SIGSEGV error. Tell
+        # htcondor to put on hold any jobs which exited by a signal.
+        "on_exit_hold": "ExitBySignal == true",
+        "on_exit_hold_reason": (
+            'strcat("Job raised a signal ", string(ExitSignal), ". ", '
+            '"Handling signal as if job has gone over memory limit.")'
+        ),
+        "on_exit_hold_subcode": "34",
+    }
+
+    htc_job_cmds.update(_translate_job_cmds(cached_values, generic_workflow, gwjob))
+
+    # Combine stdout and stderr to reduce the number of files.
+    for key in ("output", "error"):
+        if cached_values["overwriteJobFiles"]:
+            htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).out"
+        else:
+            htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).$$([NumJobStarts ?: 0]).out"
+        _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
+
+    key = "log"
+    htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key}"
+    _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
+
+    htc_job_cmds.update(
+        _handle_job_inputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
+    )
+
+    htc_job_cmds.update(
+        _handle_job_outputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
+    )
+
+    # Add the job cmds dict to the job object.
+    htc_job.add_job_cmds(htc_job_cmds)
+
+    htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))
+
+    # Add job attributes to job.
+    _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
+    htc_job.add_job_attrs(gwjob.attrs)
+    htc_job.add_job_attrs(cached_values["attrs"])
+    htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
+    htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
+
+    return htc_job
+
+
+def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
+    """Translate the job data that are one to one mapping
+
+    Parameters
+    ----------
+    cached_vals : `dict` [`str`, `~typing.Any`]
+        Config values common to jobs with same site or label.
+    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
+        Generic workflow that contains job to being converted.
+    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
+        Generic workflow job to be converted.
+
+    Returns
+    -------
+    htc_job_commands : `dict` [`str`, `~typing.Any`]
+        Contains commands which can appear in the HTCondor submit description
+        file.
+    """
+    # Values in the job script that just are name mappings.
+    job_translation = {
+        "mail_to": "notify_user",
+        "when_to_mail": "notification",
+        "request_cpus": "request_cpus",
+        "priority": "priority",
+        "category": "category",
+        "accounting_group": "accounting_group",
+        "accounting_user": "accounting_group_user",
+    }
+
+    jobcmds = {}
+    for gwkey, htckey in job_translation.items():
+        jobcmds[htckey] = getattr(gwjob, gwkey, None)
+
+    # If accounting info was not set explicitly, use site settings if any.
+    if not gwjob.accounting_group:
+        jobcmds["accounting_group"] = cached_vals.get("accountingGroup")
+    if not gwjob.accounting_user:
+        jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")
+
+    # job commands that need modification
+    if gwjob.retry_unless_exit:
+        if isinstance(gwjob.retry_unless_exit, int):
+            jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
+        elif isinstance(gwjob.retry_unless_exit, list):
+            jobcmds["retry_until"] = (
+                f"member(ExitCode, {{{','.join([str(x) for x in gwjob.retry_unless_exit])}}})"
+            )
+        else:
+            raise ValueError("retryUnlessExit must be an integer or a list of integers.")
+
+    if gwjob.request_disk:
+        jobcmds["request_disk"] = f"{gwjob.request_disk}MB"
+
+    if gwjob.request_memory:
+        jobcmds["request_memory"] = f"{gwjob.request_memory}"
+
+    memory_max = 0
+    if gwjob.memory_multiplier:
+        # Do not use try-except! At the moment, BpsConfig returns an empty
+        # string if it does not contain the key.
+        memory_limit = cached_vals["memoryLimit"]
+        if not memory_limit:
+            raise RuntimeError(
+                "Memory autoscaling enabled, but automatic detection of the memory limit "
+                "failed; setting it explicitly with 'memoryLimit' or changing worker node "
+                "search pattern 'executeMachinesPattern' might help."
+            )
+
+        # Set maximal amount of memory job can ask for.
+        #
+        # The check below assumes that 'memory_limit' was set to a value which
+        # realistically reflects actual physical limitations of a given compute
+        # resource.
+        memory_max = memory_limit
+        if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit:
+            memory_max = gwjob.request_memory_max
+
+        # Make job ask for more memory each time it failed due to insufficient
+        # memory requirements.
+        jobcmds["request_memory"] = _create_request_memory_expr(
+            gwjob.request_memory, gwjob.memory_multiplier, memory_max
+        )
+
+    user_release_expr = cached_vals.get("releaseExpr", "")
+    if gwjob.number_of_retries is not None and gwjob.number_of_retries >= 0:
+        jobcmds["max_retries"] = gwjob.number_of_retries
+
+        # No point in adding periodic_release if 0 retries
+        if gwjob.number_of_retries > 0:
+            periodic_release = _create_periodic_release_expr(
+                gwjob.request_memory, gwjob.memory_multiplier, memory_max, user_release_expr
+            )
+            if periodic_release:
+                jobcmds["periodic_release"] = periodic_release
+
+        jobcmds["periodic_remove"] = _create_periodic_remove_expr(
+            gwjob.request_memory, gwjob.memory_multiplier, memory_max
+        )
+
+    # Assume concurrency_limit implemented using HTCondor concurrency limits.
+    # May need to move to special site-specific implementation if sites use
+    # other mechanisms.
+    if gwjob.concurrency_limit:
+        jobcmds["concurrency_limit"] = gwjob.concurrency_limit
+
+    # Handle command line
+    if gwjob.executable.transfer_executable:
+        jobcmds["transfer_executable"] = "True"
+        jobcmds["executable"] = gwjob.executable.src_uri
+    else:
+        jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
+
+    if gwjob.arguments:
+        arguments = gwjob.arguments
+        arguments = _replace_cmd_vars(arguments, gwjob)
+        arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob)
+        arguments = _fix_env_var_syntax(arguments)
+        jobcmds["arguments"] = arguments
+
+    if gwjob.environment:
+        env_str = ""
+        for name, value in gwjob.environment.items():
+            if isinstance(value, str):
+                value2 = _replace_cmd_vars(value, gwjob)
+                value2 = _fix_env_var_syntax(value2)
+                value2 = htc_escape(value2)
+                env_str += f"{name}='{value2}' "  # Add single quotes to allow internal spaces
+            else:
+                env_str += f"{name}={value} "
+
+        # Process above added one trailing space
+        jobcmds["environment"] = env_str.rstrip()
+
+    # Add extra "pass-thru" job commands
+    if gwjob.profile:
+        for key, val in gwjob.profile.items():
+            jobcmds[key] = htc_escape(val)
+    for key, val in cached_vals["profile"].items():
+        jobcmds[key] = htc_escape(val)
+
+    return jobcmds
+
+
+def _translate_dag_cmds(gwjob):
+    """Translate job values into DAGMan commands.
+
+    Parameters
+    ----------
+    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
+        Job containing values to be translated.
+
+    Returns
+    -------
+    dagcmds : `dict` [`str`, `~typing.Any`]
+        DAGMan commands for the job.
+    """
+    # Values in the dag script that just are name mappings.
+    dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"}
+
+    dagcmds = {}
+    for gwkey, htckey in dag_translation.items():
+        dagcmds[htckey] = getattr(gwjob, gwkey, None)
+
+    # Still to be coded: vars "pre_cmdline", "post_cmdline"
+    return dagcmds
+
+
+def _fix_env_var_syntax(oldstr):
+    """Change ENV place holders to HTCondor Env var syntax.
+
+    Parameters
+    ----------
+    oldstr : `str`
+        String in which environment variable syntax is to be fixed.
+
+    Returns
+    -------
+    newstr : `str`
+        Given string with environment variable syntax fixed.
+    """
+    newstr = oldstr
+    for key in re.findall(r"<ENV:([^>]+)>", oldstr):
+        newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
+    return newstr
+
+
+def _replace_file_vars(use_shared, arguments, workflow, gwjob):
+    """Replace file placeholders in command line arguments with correct
+    physical file names.
+
+    Parameters
+    ----------
+    use_shared : `bool`
+        Whether HTCondor can assume shared filesystem.
+    arguments : `str`
+        Arguments string in which to replace file placeholders.
+    workflow : `lsst.ctrl.bps.GenericWorkflow`
+        Generic workflow that contains file information.
+    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
+        The job corresponding to the arguments.
+
+    Returns
+    -------
+    arguments : `str`
+        Given arguments string with file placeholders replaced.
+    """
+    # Replace input file placeholders with paths.
+    for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
+        if not gwfile.wms_transfer:
+            # Must assume full URI if in command line and told WMS is not
+            # responsible for transferring file.
+            uri = gwfile.src_uri
+        elif use_shared:
+            if gwfile.job_shared:
+                # Have shared filesystems and jobs can share file.
+                uri = gwfile.src_uri
+            else:
+                uri = os.path.basename(gwfile.src_uri)
+        else:  # Using push transfer
+            uri = os.path.basename(gwfile.src_uri)
+        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
+
+    # Replace output file placeholders with paths.
+    for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
+        if not gwfile.wms_transfer:
+            # Must assume full URI if in command line and told WMS is not
+            # responsible for transferring file.
+            uri = gwfile.src_uri
+        elif use_shared:
+            if gwfile.job_shared:
+                # Have shared filesystems and jobs can share file.
+                uri = gwfile.src_uri
+            else:
+                uri = os.path.basename(gwfile.src_uri)
+        else:  # Using push transfer
+            uri = os.path.basename(gwfile.src_uri)
+        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
+    return arguments
+
+
+def _replace_cmd_vars(arguments, gwjob):
+    """Replace format-style placeholders in arguments.
+
+    Parameters
+    ----------
+    arguments : `str`
+        Arguments string in which to replace placeholders.
+    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
+        Job containing values to be used to replace placeholders
+        (in particular gwjob.cmdvals).
+
+    Returns
+    -------
+    arguments : `str`
+        Given arguments string with placeholders replaced.
+    """
+    replacements = gwjob.cmdvals if gwjob.cmdvals is not None else {}
+    try:
+        arguments = arguments.format(**replacements)
+    except (KeyError, TypeError) as exc:  # TypeError in case None instead of {}
+        _LOG.error("Could not replace command variables: replacement for %s not provided", str(exc))
+        _LOG.debug("arguments: %s\ncmdvals: %s", arguments, replacements)
+        raise
+    return arguments
+
+
+def _handle_job_inputs(
+    generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str
+) -> dict[str, str]:
+    """Add job input files from generic workflow to job.
+
+    Parameters
+    ----------
+    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
+        The generic workflow (e.g., has executable name and arguments).
+    job_name : `str`
+        Unique name for the job.
+    use_shared : `bool`
+        Whether job has access to files via shared filesystem.
+    out_prefix : `str`
+        The root directory into which all WMS-specific files are written.
+
+    Returns
+    -------
+    htc_commands : `dict` [`str`, `str`]
+        HTCondor commands for the job submission script.
+    """
+    inputs = []
+    for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
+        _LOG.debug("src_uri=%s", gwf_file.src_uri)
+
+        uri = Path(gwf_file.src_uri)
+
+        # Note if use_shared and job_shared, don't need to transfer file.
+
+        if not use_shared:  # Copy file using push to job
+            inputs.append(str(uri))
+        elif not gwf_file.job_shared:  # Jobs require own copy
+            # if using shared filesystem, but still need copy in job. Use
+            # HTCondor's curl plugin for a local copy.
+            if uri.is_dir():
+                raise RuntimeError(
+                    f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}"
+                )
+            inputs.append(f"file://{uri}")
+
+    htc_commands = {}
+    if inputs:
+        htc_commands["transfer_input_files"] = ",".join(inputs)
+        _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
+    return htc_commands
+
+
+def _handle_job_outputs(
+    generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str
+) -> dict[str, str]:
+    """Add job output files from generic workflow to the job if any.
+
+    Parameters
+    ----------
+    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
+        The generic workflow (e.g., has executable name and arguments).
+    job_name : `str`
+        Unique name for the job.
+    use_shared : `bool`
+        Whether job has access to files via shared filesystem.
+    out_prefix : `str`
+        The root directory into which all WMS-specific files are written.
+
+    Returns
+    -------
+    htc_commands : `dict` [`str`, `str`]
+        HTCondor commands for the job submission script.
+    """
+    outputs = []
+    output_remaps = []
+    for gwf_file in generic_workflow.get_job_outputs(job_name, data=True, transfer_only=True):
+        _LOG.debug("src_uri=%s", gwf_file.src_uri)
+
+        uri = Path(gwf_file.src_uri)
+        if not use_shared:
+            outputs.append(uri.name)
+            output_remaps.append(f"{uri.name}={str(uri)}")
+
+    # Set to an empty string to disable and only update if there are output
+    # files to transfer. Otherwise, HTCondor will transfer back all files in
+    # the job's temporary working directory that have been modified or created
+    # by the job.
+    htc_commands = {"transfer_output_files": '""'}
+    if outputs:
+        htc_commands["transfer_output_files"] = ",".join(outputs)
+        _LOG.debug("transfer_output_files=%s", htc_commands["transfer_output_files"])
+
+        htc_commands["transfer_output_remaps"] = f'"{";".join(output_remaps)}"'
+        _LOG.debug("transfer_output_remaps=%s", htc_commands["transfer_output_remaps"])
+    return htc_commands
+
+
+def _create_periodic_release_expr(
+    memory: int, multiplier: float | None, limit: int, additional_expr: str = ""
+) -> str:
+    """Construct an HTCondorAd expression for releasing held jobs.
+
+    Parameters
+    ----------
+    memory : `int`
+        Requested memory in MB.
+    multiplier : `float` or None
+        Memory growth rate between retries.
+    limit : `int`
+        Memory limit.
+    additional_expr : `str`, optional
+        Expression to add to periodic_release. Defaults to empty string.
+
+    Returns
+    -------
+    expr : `str`
+        A string representing an HTCondor ClassAd expression for releasing job.
+    """
+    _LOG.debug(
+        "periodic_release: memory: %s, multiplier: %s, limit: %s, additional_expr: %s",
+        memory,
+        multiplier,
+        limit,
+        additional_expr,
+    )
+
+    # ctrl_bps sets multiplier to None in the GenericWorkflow if
+    # memoryMultiplier <= 1, but checking value just in case.
+    if (not multiplier or multiplier <= 1) and not additional_expr:
+        return ""
+
+    # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
+    # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
+    # The special comparison operators ensure that all comparisons below will
+    # evaluate to FALSE in this case.
+    #
+    # Note:
+    # May not be strictly necessary. Operators '&&' and '||' are not strict so
+    # the entire expression should evaluate to FALSE when the job is not HELD.
+    # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
+    # but better safe than sorry.
+    is_held = "JobStatus == 5"
+    is_retry_allowed = "NumJobStarts <= JobMaxRetries"
+
+    mem_expr = ""
+    if memory and multiplier and multiplier > 1 and limit:
+        was_mem_exceeded = (
+            "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+            "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
+        )
+        was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
+        mem_expr = f"{was_mem_exceeded} && {was_below_limit}"
+
+    user_expr = ""
+    if additional_expr:
+        # Never auto release a job held by user.
+        user_expr = f"HoldReasonCode =!= 1 && {additional_expr}"
+
+    expr = f"{is_held} && {is_retry_allowed}"
+    if user_expr and mem_expr:
+        expr += f" && ({mem_expr} || {user_expr})"
+    elif user_expr:
+        expr += f" && {user_expr}"
+    elif mem_expr:
+        expr += f" && {mem_expr}"
+
+    return expr
+
+
+def _create_periodic_remove_expr(memory, multiplier, limit):
+    """Construct an HTCondorAd expression for removing jobs from the queue.
+
+    Parameters
+    ----------
+    memory : `int`
+        Requested memory in MB.
+    multiplier : `float`
+        Memory growth rate between retries.
+    limit : `int`
+        Memory limit.
+
+    Returns
+    -------
+    expr : `str`
+        A string representing an HTCondor ClassAd expression for removing jobs.
+    """
+    # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode'
+    # are UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
+    # The special comparison operators ensure that all comparisons below
+    # will evaluate to FALSE in this case.
+    #
+    # Note:
+    # May not be strictly necessary. Operators '&&' and '||' are not
+    # strict so the entire expression should evaluate to FALSE when the
+    # job is not HELD. According to ClassAd evaluation semantics
+    # FALSE && UNDEFINED is FALSE, but better safe than sorry.
+    is_held = "JobStatus == 5"
+    is_retry_disallowed = "NumJobStarts > JobMaxRetries"
+
+    mem_expr = ""
+    if memory and multiplier and multiplier > 1 and limit:
+        mem_limit_expr = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
+
+        mem_expr = (  # Add || here so only added if adding memory expr
+            " || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+            f"|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && {mem_limit_expr})"
+        )
+
+    expr = f"{is_held} && ({is_retry_disallowed}{mem_expr})"
+    return expr
+
+
+def _create_request_memory_expr(memory, multiplier, limit):
+    """Construct an HTCondor ClassAd expression for safe memory scaling.
+
+    Parameters
+    ----------
+    memory : `int`
+        Requested memory in MB.
+    multiplier : `float`
+        Memory growth rate between retries.
+    limit : `int`
+        Memory limit.
+
+    Returns
+    -------
+    expr : `str`
+        A string representing an HTCondor ClassAd expression enabling safe
+        memory scaling between job retries.
+    """
+    # The check if the job was held due to exceeding memory requirements
+    # will be made *after* job was released back to the job queue (is in
+    # the IDLE state), hence the need to use `Last*` job ClassAds instead of
+    # the ones describing job's current state.
+    #
+    # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is
+    # initially put in the job queue. The special comparison operators ensure
+    # that all comparisons below will evaluate to FALSE in this case.
+    was_mem_exceeded = (
+        "LastJobStatus =?= 5 "
+        "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 "
+        "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"
+    )
+
+    # If job runs the first time or was held for reasons other than exceeding
+    # the memory, set the required memory to the requested value or use
+    # the memory value measured by HTCondor (MemoryUsage) depending on
+    # whichever is greater.
+    expr = (
+        f"({was_mem_exceeded}) "
+        f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) "
+        f": max({{{memory}, MemoryUsage ?: 0}})"
+    )
+    return expr
+
+
+def _gather_site_values(config, compute_site):
+    """Gather values specific to given site.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        BPS configuration that includes necessary submit/runtime
+        information.
+    compute_site : `str`
+        Compute site name.
+
+    Returns
+    -------
+    site_values : `dict` [`str`, `~typing.Any`]
+        Values specific to the given site.
+    """
+    site_values = {"attrs": {}, "profile": {}}
+    search_opts = {}
+    if compute_site:
+        search_opts["curvals"] = {"curr_site": compute_site}
+
+    # Determine the hard limit for the memory requirement.
+    found, limit = config.search("memoryLimit", opt=search_opts)
+    if not found:
+        search_opts["default"] = DEFAULT_HTC_EXEC_PATT
+        _, patt = config.search("executeMachinesPattern", opt=search_opts)
+        del search_opts["default"]
+
+        # To reduce the amount of data, ignore dynamic slots (if any) as,
+        # by definition, they cannot have more memory than
+        # the partitionable slot they are the part of.
+        constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
+        pool_info = condor_status(constraint=constraint)
+        try:
+            limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
+        except ValueError:
+            _LOG.debug("No execute machine in the pool matches %s", patt)
+    if limit:
+        config[".bps_defined.memory_limit"] = limit
+
+    _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False})
+    site_values["memoryLimit"] = limit
+
+    found, value = config.search("accountingGroup", opt=search_opts)
+    if found:
+        site_values["accountingGroup"] = value
+    found, value = config.search("accountingUser", opt=search_opts)
+    if found:
+        site_values["accountingUser"] = value
+
+    key = f".site.{compute_site}.profile.condor"
+    if key in config:
+        for subkey, val in config[key].items():
+            if subkey.startswith("+"):
+                site_values["attrs"][subkey[1:]] = val
+            else:
+                site_values["profile"][subkey] = val
+
+    return site_values
+
+
+def _gather_label_values(config: BpsConfig, label: str) -> dict[str, Any]:
+    """Gather values specific to given job label.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        BPS configuration that includes necessary submit/runtime
+        information.
+    label : `str`
+        GenericWorkflowJob label.
+
+    Returns
+    -------
+    values : `dict` [`str`, `~typing.Any`]
+        Values specific to the given job label.
+    """
+    values: dict[str, Any] = {"attrs": {}, "profile": {}}
+
+    search_opts = {}
+    profile_key = ""
+    if label == "finalJob":
+        search_opts["searchobj"] = config["finalJob"]
+        profile_key = ".finalJob.profile.condor"
+    elif label in config["cluster"]:
+        search_opts["curvals"] = {"curr_cluster": label}
+        profile_key = f".cluster.{label}.profile.condor"
+    elif label in config["pipetask"]:
+        search_opts["curvals"] = {"curr_pipetask": label}
+        profile_key = f".pipetask.{label}.profile.condor"
+
+    found, value = config.search("releaseExpr", opt=search_opts)
+    if found:
+        values["releaseExpr"] = value
+
+    found, value = config.search("overwriteJobFiles", opt=search_opts)
+    if found:
+        values["overwriteJobFiles"] = value
+    else:
+        values["overwriteJobFiles"] = True
+
+    if profile_key and profile_key in config:
+        for subkey, val in config[profile_key].items():
+            if subkey.startswith("+"):
+                values["attrs"][subkey[1:]] = val
+            else:
+                values["profile"][subkey] = val
+
+    return values
+
+
+def _group_to_subdag(
+    config: BpsConfig, generic_workflow_group: GenericWorkflowGroup, out_prefix: str
+) -> HTCJob:
+    """Convert a generic workflow group to an HTCondor dag.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        Workflow configuration.
+    generic_workflow_group : `lsst.ctrl.bps.GenericWorkflowGroup`
+        The generic workflow group to convert.
+    out_prefix : `str`
+        Location prefix to be used when creating jobs.
+
+    Returns
+    -------
+    htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
+        Job for running the HTCondor dag.
+    """
+    jobname = f"wms_{generic_workflow_group.name}"
+    htc_job = HTCJob(name=jobname, label=generic_workflow_group.label)
+    htc_job.add_dag_cmds({"dir": f"subdags/{jobname}"})
+    htc_job.subdag = _generic_workflow_to_htcondor_dag(config, generic_workflow_group, out_prefix)
+    if not generic_workflow_group.blocking:
+        htc_job.dagcmds["post"] = {
+            "defer": "",
+            "executable": f"{os.path.dirname(__file__)}/subdag_post.sh",
+            "arguments": f"{jobname} $RETURN",
+        }
+    return htc_job
+
+
+def _create_check_job(group_job_name: str, job_label: str) -> HTCJob:
+    """Create a job to check status of a group job.
+
+    Parameters
+    ----------
+    group_job_name : `str`
+        Name of the group job.
+    job_label : `str`
+        Label to use for the check status job.
+
+    Returns
+    -------
+    htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
+        Job description for the job to check group job status.
+    """
+    htc_job = HTCJob(name=f"wms_check_status_{group_job_name}", label=job_label)
+    htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/check_group_status.sub"
+    htc_job.add_dag_cmds({"dir": f"subdags/{group_job_name}", "vars": {"group_job_name": group_job_name}})
+
+    return htc_job
+
+
+def _generic_workflow_to_htcondor_dag(
+    config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str
+) -> HTCDag:
+    """Convert a GenericWorkflow to a HTCDag.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        Workflow configuration.
+    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
+        The GenericWorkflow to convert.
+    out_prefix : `str`
+        Location prefix where the HTCondor files will be written.
+
+    Returns
+    -------
+    dag : `lsst.ctrl.bps.htcondor.HTCDag`
+        The HTCDag representation of the given GenericWorkflow.
+    """
+    dag = HTCDag(name=generic_workflow.name)
+
+    _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
+    dag.add_attribs(generic_workflow.run_attrs)
+    dag.add_attribs(
+        {
+            "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
+            "bps_job_summary": create_count_summary(generic_workflow.job_counts),
+        }
+    )
+
+    _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
+    if isinstance(tmp_template, str):
+        subdir_template = defaultdict(lambda: tmp_template)
+    else:
+        subdir_template = tmp_template
+
+    # Create all DAG jobs
+    site_values = {}  # Cache compute site specific values to reduce config lookups.
+    cached_values = {}  # Cache label-specific values to reduce config lookups.
+    # Note: Can't use get_job_by_label because those only include payload jobs.
+    for job_name in generic_workflow:
+        gwjob = generic_workflow.get_job(job_name)
+        if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD:
+            gwjob = cast(GenericWorkflowJob, gwjob)
+            if gwjob.compute_site not in site_values:
+                site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
+            if gwjob.label not in cached_values:
+                cached_values[gwjob.label] = deepcopy(site_values[gwjob.compute_site])
+                cached_values[gwjob.label].update(_gather_label_values(config, gwjob.label))
+            _LOG.debug("cached: %s= %s", gwjob.label, cached_values[gwjob.label])
+            htc_job = _create_job(
+                subdir_template[gwjob.label],
+                cached_values[gwjob.label],
+                generic_workflow,
+                gwjob,
+                out_prefix,
+            )
+        elif gwjob.node_type == GenericWorkflowNodeType.NOOP:
+            gwjob = cast(GenericWorkflowNoopJob, gwjob)
+            htc_job = HTCJob(f"wms_{gwjob.name}", label=gwjob.label)
+            htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/noop.sub"
+            htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
+            htc_job.add_dag_cmds({"noop": True})
+        elif gwjob.node_type == GenericWorkflowNodeType.GROUP:
+            gwjob = cast(GenericWorkflowGroup, gwjob)
+            htc_job = _group_to_subdag(config, gwjob, out_prefix)
+            # In case DAGMAN_GENERATE_SUBDAG_SUBMITS is False,
+            dag.graph["submit_options"]["do_recurse"] = True
+        else:
+            raise RuntimeError(f"Unsupported generic workflow node type {gwjob.node_type} ({gwjob.name})")
+        _LOG.debug("Calling adding job %s %s", htc_job.name, htc_job.label)
+        dag.add_job(htc_job)
+
+    # Add job dependencies to the DAG (be careful with wms_ jobs)
+    for job_name in generic_workflow:
+        gwjob = generic_workflow.get_job(job_name)
+        parent_name = (
+            gwjob.name if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD else f"wms_{gwjob.name}"
+        )
+        successor_jobs = [generic_workflow.get_job(j) for j in generic_workflow.successors(job_name)]
+        children_names = []
+        if gwjob.node_type == GenericWorkflowNodeType.GROUP:
+            gwjob = cast(GenericWorkflowGroup, gwjob)
+            group_children = []  # Dependencies between same group jobs
+            for sjob in successor_jobs:
+                if sjob.node_type == GenericWorkflowNodeType.GROUP and sjob.label == gwjob.label:
+                    group_children.append(f"wms_{sjob.name}")
+                elif sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
+                    children_names.append(sjob.name)
+                else:
+                    children_names.append(f"wms_{sjob.name}")
+            if group_children:
+                dag.add_job_relationships([parent_name], group_children)
+            if not gwjob.blocking:
+                # Since subdag will always succeed, need to add a special
+                # job that fails if group failed to block payload children.
+                check_job = _create_check_job(f"wms_{gwjob.name}", gwjob.label)
+                dag.add_job(check_job)
+                dag.add_job_relationships([f"wms_{gwjob.name}"], [check_job.name])
+                parent_name = check_job.name
+        else:
+            for sjob in successor_jobs:
+                if sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
+                    children_names.append(sjob.name)
+                else:
+                    children_names.append(f"wms_{sjob.name}")
+
+        dag.add_job_relationships([parent_name], children_names)
+
+    # If final job exists in generic workflow, create DAG final job
+    final = generic_workflow.get_final()
+    if final and isinstance(final, GenericWorkflowJob):
+        if final.compute_site and final.compute_site not in site_values:
+            site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
+        if final.label not in cached_values:
+            cached_values[final.label] = deepcopy(site_values[final.compute_site])
+            cached_values[final.label].update(_gather_label_values(config, final.label))
+        final_htjob = _create_job(
+            subdir_template[final.label],
+            cached_values[final.label],
+            generic_workflow,
+            final,
+            out_prefix,
+        )
+        if "post" not in final_htjob.dagcmds:
+            final_htjob.dagcmds["post"] = {
+                "defer": "",
+                "executable": f"{os.path.dirname(__file__)}/final_post.sh",
+                "arguments": f"{final.name} $DAG_STATUS $RETURN",
+            }
+        dag.add_final_job(final_htjob)
+    elif final and isinstance(final, GenericWorkflow):
+        raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
+    elif final:
+        raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
+
+    return dag
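The memory-autoscaling helpers added above (_create_request_memory_expr, _create_periodic_release_expr, _create_periodic_remove_expr) all embed the same arithmetic in their ClassAd expressions: after a memory-related hold, the next request becomes min(int(memory * pow(multiplier, NumJobStarts)), limit), and periodic_remove drops the job once the scaled value (computed with NumJobStarts - 1) has already reached the limit. The short sketch below mirrors that arithmetic in plain Python only to show how the request grows across retries; the numbers (2048 MB request, 2.0 multiplier, 16384 MB limit) are illustrative assumptions, not package defaults, and the private helpers themselves are not called.

# Illustrative sketch only; values are hypothetical, not package defaults.
request_memory = 2048  # MB, initial request (gwjob.request_memory)
multiplier = 2.0       # growth rate between retries (memoryMultiplier)
limit = 16384          # pool memory ceiling (memoryLimit)

for num_job_starts in range(5):
    # Mirrors the ClassAd arm used after a memory-related hold:
    # min({int(memory * pow(multiplier, NumJobStarts)), limit})
    scaled = min(int(request_memory * multiplier**num_job_starts), limit)
    print(f"NumJobStarts={num_job_starts}: next request_memory -> {scaled} MB")

Once the scaled request can no longer grow (it has hit the limit), the periodic_remove expression removes the held job from the queue instead of releasing it for another retry.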