lsst-ctrl-bps-htcondor 29.2025.1300__tar.gz → 29.2025.1500__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lsst_ctrl_bps_htcondor-29.2025.1300/python/lsst_ctrl_bps_htcondor.egg-info → lsst_ctrl_bps_htcondor-29.2025.1500}/PKG-INFO +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/doc/lsst.ctrl.bps.htcondor/userguide.rst +21 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst/ctrl/bps/htcondor/htcondor_service.py +276 -124
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst/ctrl/bps/htcondor/lssthtc.py +848 -260
- lsst_ctrl_bps_htcondor-29.2025.1500/python/lsst/ctrl/bps/htcondor/version.py +2 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500/python/lsst_ctrl_bps_htcondor.egg-info}/PKG-INFO +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/tests/test_htcondor_service.py +257 -132
- lsst_ctrl_bps_htcondor-29.2025.1500/tests/test_lssthtc.py +1143 -0
- lsst_ctrl_bps_htcondor-29.2025.1300/python/lsst/ctrl/bps/htcondor/version.py +0 -2
- lsst_ctrl_bps_htcondor-29.2025.1300/tests/test_lssthtc.py +0 -320
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/COPYRIGHT +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/LICENSE +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/MANIFEST.in +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/README.rst +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/bsd_license.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/doc/lsst.ctrl.bps.htcondor/CHANGES.rst +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/doc/lsst.ctrl.bps.htcondor/index.rst +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/gpl-v3.0.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/pyproject.toml +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst/ctrl/bps/htcondor/__init__.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst/ctrl/bps/htcondor/etc/__init__.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst/ctrl/bps/htcondor/etc/htcondor_defaults.yaml +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst/ctrl/bps/htcondor/final_post.sh +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst/ctrl/bps/htcondor/handlers.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst/ctrl/bps/htcondor/htcondor_config.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst/ctrl/bps/htcondor/provisioner.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst_ctrl_bps_htcondor.egg-info/SOURCES.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst_ctrl_bps_htcondor.egg-info/dependency_links.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst_ctrl_bps_htcondor.egg-info/requires.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst_ctrl_bps_htcondor.egg-info/top_level.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/python/lsst_ctrl_bps_htcondor.egg-info/zip-safe +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/setup.cfg +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/tests/test_handlers.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300 → lsst_ctrl_bps_htcondor-29.2025.1500}/tests/test_provisioner.py +0 -0
doc/lsst.ctrl.bps.htcondor/userguide.rst

@@ -64,6 +64,27 @@ The plugin supports all settings described in `ctrl_bps documentation`__

 .. Describe any plugin specific aspects of defining a submission below if any.

+Job Ordering
+^^^^^^^^^^^^
+
+This plugin supports both ordering types, ``group`` and ``noop``.
+Job outputs are still underneath the ``jobs`` subdirectory.
+
+If one is looking at HTCondor information directly:
+
+* ``group`` ordering is implemented as subdags, so you will see more DAGMan
+  jobs in the queue as well as a new ``subdags`` subdirectory holding the
+  internal files for running a group.  To allow other subdags to run after
+  a failure while still pruning downstream jobs, another job, whose name starts
+  with ``wms_check_status``, runs after the subdag to check for a failure and
+  trigger the pruning.
+
+* ``noop`` ordering is implemented directly as DAGMan NOOP jobs.  These jobs
+  do not actually do anything, but provide a mechanism for telling HTCondor
+  about more job dependencies without using a large number (all-to-all) of
+  dependencies.
+
+
 Job Environment
 ^^^^^^^^^^^^^^^

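A quick way to see why ``noop`` ordering pays off is to count DAG edges. The sketch below is illustrative only (plain Python with invented job names), not part of the package:

```python
# N upstream jobs must all finish before M downstream jobs may start.
# Expressing that directly needs N*M PARENT/CHILD pairs; routing the
# dependency through a single NOOP node needs only N + M pairs.
def direct_edges(parents, children):
    return [(p, c) for p in parents for c in children]

def noop_edges(parents, children, noop="wms_noop_order1"):  # hypothetical node name
    return [(p, noop) for p in parents] + [(noop, c) for c in children]

parents = [f"calibrate_{i}" for i in range(100)]
children = [f"coadd_{i}" for i in range(100)]
print(len(direct_edges(parents, children)))  # 10000
print(len(noop_edges(parents, children)))    # 200
```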
python/lsst/ctrl/bps/htcondor/htcondor_service.py

@@ -36,7 +36,7 @@ import re
 from collections import defaultdict
 from enum import IntEnum, auto
 from pathlib import Path
-from typing import Any
+from typing import Any, cast

 import htcondor
 from packaging import version
@@ -44,8 +44,12 @@ from packaging import version
 from lsst.ctrl.bps import (
     BaseWmsService,
     BaseWmsWorkflow,
+    BpsConfig,
     GenericWorkflow,
+    GenericWorkflowGroup,
     GenericWorkflowJob,
+    GenericWorkflowNodeType,
+    GenericWorkflowNoopJob,
     WmsJobReport,
     WmsRunReport,
     WmsSpecificInfo,
@@ -60,8 +64,9 @@ from .lssthtc import (
     MISSING_ID,
     HTCDag,
     HTCJob,
-    JobStatus,
     NodeStatus,
+    WmsNodeType,
+    _update_rescue_file,
     condor_history,
     condor_q,
     condor_search,
@@ -175,17 +180,23 @@ class HTCondorService(BaseWmsService):
             Keyword arguments for the options.
         """
         dag = workflow.dag
-
         ver = version.parse(htc_version())
-        if ver >= version.parse("8.9.3"):
-            sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {})
-        else:
-            sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {})

         # For workflow portability, internal paths are all relative. Hence
         # the DAG needs to be submitted to HTCondor from inside the submit
         # directory.
         with chdir(workflow.submit_path):
+            try:
+                if ver >= version.parse("8.9.3"):
+                    sub = htc_create_submit_from_dag(dag.graph["dag_filename"], dag.graph["submit_options"])
+                else:
+                    sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], dag.graph["submit_options"])
+            except Exception:
+                _LOG.error(
+                    "Problems creating HTCondor submit object from filename: %s", dag.graph["dag_filename"]
+                )
+                raise
+
             _LOG.info("Submitting from directory: %s", os.getcwd())
             schedd_dag_info = htc_submit_dag(sub)
             if schedd_dag_info:
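The change above threads ``submit_options`` through and wraps the version-gated submit-object creation in a try/except that logs context before re-raising. A standalone sketch of the same pattern, using ``packaging.version`` and stand-in return values rather than the real ``htc_create_submit_from_*`` helpers:

```python
import logging
from packaging import version

_LOG = logging.getLogger(__name__)

def make_submit(dag_filename, submit_options, htc_version_str):
    """Illustrative stand-in for the version-gated submit creation."""
    try:
        if version.parse(htc_version_str) >= version.parse("8.9.3"):
            return ("from_dag", dag_filename, submit_options)   # newer API path
        return ("from_cmd", dag_filename, submit_options)       # fallback path
    except Exception:
        # Log enough context to find the bad DAG, then re-raise unchanged.
        _LOG.error("Problems creating HTCondor submit object from filename: %s", dag_filename)
        raise

print(make_submit("workflow.dag", {"do_recurse": True}, "23.0.1")[0])  # from_dag
print(make_submit("workflow.dag", {}, "8.8.9")[0])                     # from_cmd
```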
@@ -226,7 +237,7 @@ class HTCondorService(BaseWmsService):
                 None,
                 (
                     f"workflow with run id '{wms_workflow_id}' not found. "
-
+                    "Hint: use run's submit directory as the id instead"
                 ),
             )

@@ -266,7 +277,9 @@ class HTCondorService(BaseWmsService):
         )

         _LOG.info("Backing up select HTCondor files from previous run attempt")
-        htc_backup_files(wms_path, subdir="backups")
+        rescue_file = htc_backup_files(wms_path, subdir="backups")
+        if (wms_path / "subdags").exists():
+            _update_rescue_file(rescue_file)

         # For workflow portability, internal paths are all relative. Hence
         # the DAG needs to be resubmitted to HTCondor from inside the submit
@@ -563,66 +576,17 @@ class HTCondorWorkflow(BaseWmsWorkflow):
     def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
         # Docstring inherited
         htc_workflow = cls(generic_workflow.name, config)
-        htc_workflow.dag =
+        htc_workflow.dag = _generic_workflow_to_htcondor_dag(config, generic_workflow, out_prefix)

         _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
-
+        # Add extra attributes to top most DAG.
         htc_workflow.dag.add_attribs(
             {
                 "bps_wms_service": service_class,
                 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
-                "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
-                "bps_job_summary": create_count_summary(generic_workflow.job_counts),
             }
         )

-        _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
-        if isinstance(tmp_template, str):
-            subdir_template = defaultdict(lambda: tmp_template)
-        else:
-            subdir_template = tmp_template
-
-        # Create all DAG jobs
-        site_values = {}  # cache compute site specific values to reduce config lookups
-        for job_name in generic_workflow:
-            gwjob = generic_workflow.get_job(job_name)
-            if gwjob.compute_site not in site_values:
-                site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
-            htc_job = _create_job(
-                subdir_template[gwjob.label],
-                site_values[gwjob.compute_site],
-                generic_workflow,
-                gwjob,
-                out_prefix,
-            )
-            htc_workflow.dag.add_job(htc_job)
-
-        # Add job dependencies to the DAG
-        for job_name in generic_workflow:
-            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
-
-        # If final job exists in generic workflow, create DAG final job
-        final = generic_workflow.get_final()
-        if final and isinstance(final, GenericWorkflowJob):
-            if final.compute_site and final.compute_site not in site_values:
-                site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
-            final_htjob = _create_job(
-                subdir_template[final.label],
-                site_values[final.compute_site],
-                generic_workflow,
-                final,
-                out_prefix,
-            )
-            if "post" not in final_htjob.dagcmds:
-                final_htjob.dagcmds["post"] = (
-                    f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN"
-                )
-            htc_workflow.dag.add_final_job(final_htjob)
-        elif final and isinstance(final, GenericWorkflow):
-            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
-        elif final:
-            return TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
-
         return htc_workflow

     def write(self, out_prefix):
@@ -637,7 +601,7 @@ class HTCondorWorkflow(BaseWmsWorkflow):
         os.makedirs(out_prefix, exist_ok=True)

         # Write down the workflow in HTCondor format.
-        self.dag.write(out_prefix, "jobs/{self.label}")
+        self.dag.write(out_prefix, job_subdir="jobs/{self.label}")


 def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix):
@@ -668,8 +632,10 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
     if gwjob.tags:
         curvals.update(gwjob.tags)

-    subdir = subdir_template.format_map(curvals)
-    htc_job.
+    subdir = Path("jobs") / subdir_template.format_map(curvals)
+    htc_job.subdir = subdir
+    htc_job.subfile = f"{gwjob.name}.sub"
+    htc_job.add_dag_cmds({"dir": subdir})

     htc_job_cmds = {
         "universe": "vanilla",
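The new lines compute the per-job run subdirectory by filling a label-keyed template and rooting it under ``jobs/``. A small standalone illustration of that ``format_map``/``pathlib`` combination (template string and tag values are made up for the example):

```python
from collections import defaultdict
from pathlib import Path

# Hypothetical template and per-job values, mirroring the subdir computation.
subdir_template = defaultdict(lambda: "{label}/{visit}")
curvals = {"label": "calibrate", "visit": 903342}

subdir = Path("jobs") / subdir_template["calibrate"].format_map(curvals)
print(subdir)  # jobs/calibrate/903342
```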
@@ -681,8 +647,10 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
         # Exceeding memory sometimes triggering SIGBUS or SIGSEGV error. Tell
         # htcondor to put on hold any jobs which exited by a signal.
         "on_exit_hold": "ExitBySignal == true",
-        "on_exit_hold_reason":
-
+        "on_exit_hold_reason": (
+            'strcat("Job raised a signal ", string(ExitSignal), ". ", '
+            '"Handling signal as if job has gone over memory limit.")'
+        ),
         "on_exit_hold_subcode": "34",
     }

@@ -690,7 +658,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi

     # job stdout, stderr, htcondor user log.
     for key in ("output", "error", "log"):
-        htc_job_cmds[key] =
+        htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key[:3]}"
         _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

     htc_job_cmds.update(
@@ -817,7 +785,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
     # Handle command line
     if gwjob.executable.transfer_executable:
         jobcmds["transfer_executable"] = "True"
-        jobcmds["executable"] =
+        jobcmds["executable"] = gwjob.executable.src_uri
     else:
         jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)

@@ -974,7 +942,7 @@ def _replace_cmd_vars(arguments, gwjob):
     replacements = gwjob.cmdvals if gwjob.cmdvals is not None else {}
     try:
         arguments = arguments.format(**replacements)
-    except KeyError as exc:
+    except (KeyError, TypeError) as exc:  # TypeError in case None instead of {}
         _LOG.error("Could not replace command variables: replacement for %s not provided", str(exc))
         _LOG.debug("arguments: %s\ncmdvals: %s", arguments, replacements)
         raise
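For context on why ``TypeError`` joins ``KeyError`` here: ``str.format`` raises ``KeyError`` for a missing placeholder, while unpacking something that is not a mapping (such as ``None``) with ``**`` raises ``TypeError``. A minimal standalone demonstration, not package code:

```python
template = "run {task} on {site}"

try:
    template.format(**{"task": "calibrate"})  # "site" replacement missing
except KeyError as exc:
    print(f"missing replacement for {exc}")

try:
    template.format(**None)  # cmdvals was None instead of {}
except TypeError as exc:
    print(f"not a mapping: {exc}")
```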
@@ -1200,12 +1168,12 @@ def _get_info_from_schedd(
     return schedd_dag_info


-def _get_info_from_path(wms_path: str) -> tuple[str, dict[str, dict[str, Any]], str]:
+def _get_info_from_path(wms_path: str | os.PathLike) -> tuple[str, dict[str, dict[str, Any]], str]:
     """Gather run information from a given run directory.

     Parameters
     ----------
-    wms_path : `str`
+    wms_path : `str` or `os.PathLike`
         Directory containing HTCondor files.

     Returns
@@ -1263,9 +1231,9 @@ def _get_info_from_path(wms_path: str) -> tuple[str, dict[str, dict[str, Any]],
         schedd_name = next(iter(job_info))
         job_ad = next(iter(job_info[schedd_name].values()))
         job.update(job_ad)
-    except FileNotFoundError:
-        message = f"Could not find HTCondor files in '{wms_path}'"
-        _LOG.
+    except FileNotFoundError as err:
+        message = f"Could not find HTCondor files in '{wms_path}' ({err})"
+        _LOG.debug(message)
         messages.append(message)
     message = htc_check_dagman_output(wms_path)
     if message:
@@ -1298,8 +1266,9 @@ def _create_detailed_report_from_jobs(
         id and the value is a collection of report information for that run.
     """
     _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
-
-
+
+    dag_ad = jobs[wms_workflow_id]
+
     report = WmsRunReport(
         wms_id=f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}",
         global_wms_id=dag_ad.get("GlobalJobId", "MISS"),
@@ -1312,28 +1281,34 @@ def _create_detailed_report_from_jobs(
         operator=_get_owner(dag_ad),
         run_summary=_get_run_summary(dag_ad),
         state=_htc_status_to_wms_state(dag_ad),
+        total_number_jobs=0,
         jobs=[],
-
-
-        exit_code_summary=_get_exit_code_summary(jobs),
+        job_state_counts=dict.fromkeys(WmsStates, 0),
+        exit_code_summary={},
     )
+
+    payload_jobs = {}  # keep track for later processing
     specific_info = WmsSpecificInfo()
     for job_id, job_ad in jobs.items():
-        if
+        if job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [WmsNodeType.PAYLOAD, WmsNodeType.FINAL]:
             try:
+                name = job_ad.get("DAGNodeName", job_id)
+                wms_state = _htc_status_to_wms_state(job_ad)
                 job_report = WmsJobReport(
                     wms_id=job_id,
-                    name=
-                    label=job_ad.get("bps_job_label", pegasus_name_to_label(
-                    state=
+                    name=name,
+                    label=job_ad.get("bps_job_label", pegasus_name_to_label(name)),
+                    state=wms_state,
                 )
                 if job_report.label == "init":
                     job_report.label = "pipetaskInit"
+                report.job_state_counts[wms_state] += 1
                 report.jobs.append(job_report)
+                payload_jobs[job_id] = job_ad
             except KeyError as ex:
                 _LOG.error("Job missing key '%s': %s", str(ex), job_ad)
                 raise
-
+        elif is_service_job(job_ad):
             _LOG.debug(
                 "Found service job: id='%s', name='%s', label='%s', NodeStatus='%s', JobStatus='%s'",
                 job_id,
@@ -1344,13 +1319,11 @@ def _create_detailed_report_from_jobs(
             )
             _add_service_job_specific_info(job_ad, specific_info)

+    report.total_number_jobs = len(payload_jobs)
+    report.exit_code_summary = _get_exit_code_summary(payload_jobs)
     if specific_info:
         report.specific_info = specific_info

-    # Add the removed entry to restore the original content of the dictionary.
-    # The ordering of keys will be change permanently though.
-    jobs.update({wms_workflow_id: dag_ad})
-
     # Workflow will exit with non-zero DAG_STATUS if problem with
     # any of the wms jobs. So change FAILED to SUCCEEDED if all
     # payload jobs SUCCEEDED.
@@ -1450,6 +1423,7 @@ def _summary_report(user, hist, pass_thru, schedds=None):

     # Have list of DAGMan jobs, need to get run_report info.
     run_reports = {}
+    msg = ""
     for jobs in job_info.values():
         for job_id, job in jobs.items():
             total_jobs, state_counts = _get_state_counts_from_dag_job(job)
@@ -1482,7 +1456,7 @@ def _summary_report(user, hist, pass_thru, schedds=None):
             )
             run_reports[report.global_wms_id] = report

-    return run_reports,
+    return run_reports, msg


 def _add_run_info(wms_path, job):
@@ -1596,14 +1570,14 @@ def _get_exit_code_summary(jobs):
             exit_code = 0
             job_status = job_ad["JobStatus"]
             match job_status:
-                case JobStatus.COMPLETED | JobStatus.HELD:
+                case htcondor.JobStatus.COMPLETED | htcondor.JobStatus.HELD:
                     exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["ExitCode"]
                 case (
-                    JobStatus.IDLE
-                    | JobStatus.RUNNING
-                    | JobStatus.REMOVED
-                    | JobStatus.TRANSFERRING_OUTPUT
-                    | JobStatus.SUSPENDED
+                    htcondor.JobStatus.IDLE
+                    | htcondor.JobStatus.RUNNING
+                    | htcondor.JobStatus.REMOVED
+                    | htcondor.JobStatus.TRANSFERRING_OUTPUT
+                    | htcondor.JobStatus.SUSPENDED
                 ):
                     pass
                 case _:
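The edit swaps the locally defined ``JobStatus`` enum for ``htcondor.JobStatus`` while keeping the ``match`` style: dotted enum members act as value patterns and can be combined with ``|``. A self-contained sketch using a stand-in enum rather than the real binding:

```python
from enum import IntEnum

class JobStatus(IntEnum):  # stand-in for htcondor.JobStatus
    IDLE = 1
    RUNNING = 2
    REMOVED = 3
    COMPLETED = 4
    HELD = 5
    TRANSFERRING_OUTPUT = 6
    SUSPENDED = 7

def classify(status):
    match status:
        case JobStatus.COMPLETED | JobStatus.HELD:
            return "inspect exit code"
        case JobStatus.IDLE | JobStatus.RUNNING | JobStatus.TRANSFERRING_OUTPUT | JobStatus.SUSPENDED:
            return "still in flight"
        case _:
            return "no exit code expected"

print(classify(JobStatus.HELD))     # inspect exit code
print(classify(JobStatus.RUNNING))  # still in flight
```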
@@ -1639,16 +1613,13 @@ def _get_state_counts_from_jobs(
     """
     state_counts = dict.fromkeys(WmsStates, 0)
     for job_id, job_ad in jobs.items():
-        if job_id != wms_workflow_id and
+        if job_id != wms_workflow_id and job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [
+            WmsNodeType.PAYLOAD,
+            WmsNodeType.FINAL,
+        ]:
             state_counts[_htc_status_to_wms_state(job_ad)] += 1
-
-
-    if "NodesTotal" in jobs[wms_workflow_id]:
-        total_count = jobs[wms_workflow_id]["NodesTotal"]
-    else:
-        total_count = total_counted
+    total_count = sum(state_counts.values())

-    state_counts[WmsStates.UNREADY] += total_count - total_counted
     return total_count, state_counts


@@ -1746,27 +1717,28 @@ def _htc_job_status_to_wms_state(job):
     _LOG.debug(
         "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"])
     )
-    job_status = int(job["JobStatus"])
     wms_state = WmsStates.MISFIT
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if "JobStatus" in job and job["JobStatus"]:
+        job_status = int(job["JobStatus"])
+
+        _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
+        if job_status == htcondor.JobStatus.IDLE:
+            wms_state = WmsStates.PENDING
+        elif job_status == htcondor.JobStatus.RUNNING:
+            wms_state = WmsStates.RUNNING
+        elif job_status == htcondor.JobStatus.REMOVED:
+            wms_state = WmsStates.DELETED
+        elif job_status == htcondor.JobStatus.COMPLETED:
+            if (
+                (job.get("ExitBySignal", False) and job.get("ExitSignal", 0))
+                or job.get("ExitCode", 0)
+                or job.get("DAG_Status", 0)
+            ):
+                wms_state = WmsStates.FAILED
+            else:
+                wms_state = WmsStates.SUCCEEDED
+        elif job_status == htcondor.JobStatus.HELD:
+            wms_state = WmsStates.HELD

     return wms_state

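The added branch is essentially a lookup from HTCondor's numeric job status to a BPS run state, with an extra exit-code/DAG status check for completed jobs. A compact sketch of the same mapping idea, using stand-in types instead of the real ``htcondor`` and ``lsst.ctrl.bps`` enums:

```python
from enum import IntEnum

class JobStatus(IntEnum):  # stand-in; mirrors the usual HTCondor numbering
    IDLE = 1
    RUNNING = 2
    REMOVED = 3
    COMPLETED = 4
    HELD = 5

SIMPLE_STATUS_TO_STATE = {
    JobStatus.IDLE: "PENDING",
    JobStatus.RUNNING: "RUNNING",
    JobStatus.REMOVED: "DELETED",
    JobStatus.HELD: "HELD",
}

def to_wms_state(job_ad):
    status = job_ad.get("JobStatus")
    if not status:
        return "MISFIT"
    status = JobStatus(int(status))
    if status is JobStatus.COMPLETED:
        failed = (
            (job_ad.get("ExitBySignal", False) and job_ad.get("ExitSignal", 0))
            or job_ad.get("ExitCode", 0)
            or job_ad.get("DAG_Status", 0)
        )
        return "FAILED" if failed else "SUCCEEDED"
    return SIMPLE_STATUS_TO_STATE.get(status, "MISFIT")

print(to_wms_state({"JobStatus": 4, "ExitCode": 0}))  # SUCCEEDED
print(to_wms_state({"JobStatus": 5}))                 # HELD
```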
@@ -2212,6 +2184,186 @@ def is_service_job(job_ad: dict[str, Any]) -> bool:
     -----
     At the moment, HTCondor does not provide a native way to distinguish
     between payload and service jobs in the workflow. This code depends
-    on read_node_status adding
+    on read_node_status adding wms_node_type.
+    """
+    return job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) == WmsNodeType.SERVICE
+
+
+def _group_to_subdag(
+    config: BpsConfig, generic_workflow_group: GenericWorkflowGroup, out_prefix: str
+) -> HTCJob:
+    """Convert a generic workflow group to an HTCondor dag.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        Workflow configuration.
+    generic_workflow_group : `lsst.ctrl.bps.GenericWorkflowGroup`
+        The generic workflow group to convert.
+    out_prefix : `str`
+        Location prefix to be used when creating jobs.
+
+    Returns
+    -------
+    htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
+        Job for running the HTCondor dag.
     """
-
+    jobname = f"wms_{generic_workflow_group.name}"
+    htc_job = HTCJob(name=jobname, label=generic_workflow_group.label)
+    htc_job.add_dag_cmds({"dir": f"subdags/{jobname}"})
+    htc_job.subdag = _generic_workflow_to_htcondor_dag(config, generic_workflow_group, out_prefix)
+    if not generic_workflow_group.blocking:
+        htc_job.dagcmds["post"] = {
+            "defer": "",
+            "executable": f"{os.path.dirname(__file__)}/subdag_post.sh",
+            "arguments": f"{jobname} $RETURN",
+        }
+    return htc_job
+
+
+def _create_check_job(group_job_name: str, job_label: str) -> HTCJob:
+    """Create a job to check status of a group job.
+
+    Parameters
+    ----------
+    group_job_name : `str`
+        Name of the group job.
+    job_label : `str`
+        Label to use for the check status job.
+
+    Returns
+    -------
+    htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
+        Job description for the job to check group job status.
+    """
+    htc_job = HTCJob(name=f"wms_check_status_{group_job_name}", label=job_label)
+    htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/check_group_status.sub"
+    htc_job.add_dag_cmds({"dir": f"subdags/{group_job_name}", "vars": {"group_job_name": group_job_name}})
+
+    return htc_job
+
+
+def _generic_workflow_to_htcondor_dag(
+    config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str
+) -> HTCDag:
+    """Convert a GenericWorkflow to a HTCDag.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        Workflow configuration.
+    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
+        The GenericWorkflow to convert.
+    out_prefix : `str`
+        Location prefix where the HTCondor files will be written.
+
+    Returns
+    -------
+    dag : `lsst.ctrl.bps.htcondor.HTCDag`
+        The HTCDag representation of the given GenericWorkflow.
+    """
+    dag = HTCDag(name=generic_workflow.name)
+
+    _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
+    dag.add_attribs(generic_workflow.run_attrs)
+    dag.add_attribs(
+        {
+            "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
+            "bps_job_summary": create_count_summary(generic_workflow.job_counts),
+        }
+    )
+
+    _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
+    if isinstance(tmp_template, str):
+        subdir_template = defaultdict(lambda: tmp_template)
+    else:
+        subdir_template = tmp_template
+
+    # Create all DAG jobs
+    site_values = {}  # cache compute site specific values to reduce config lookups
+    for job_name in generic_workflow:
+        gwjob = generic_workflow.get_job(job_name)
+        if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD:
+            gwjob = cast(GenericWorkflowJob, gwjob)
+            if gwjob.compute_site not in site_values:
+                site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
+            htc_job = _create_job(
+                subdir_template[gwjob.label],
+                site_values[gwjob.compute_site],
+                generic_workflow,
+                gwjob,
+                out_prefix,
+            )
+        elif gwjob.node_type == GenericWorkflowNodeType.NOOP:
+            gwjob = cast(GenericWorkflowNoopJob, gwjob)
+            htc_job = HTCJob(f"wms_{gwjob.name}", label=gwjob.label)
+            htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/noop.sub"
+            htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
+            htc_job.add_dag_cmds({"noop": True})
+        elif gwjob.node_type == GenericWorkflowNodeType.GROUP:
+            gwjob = cast(GenericWorkflowGroup, gwjob)
+            htc_job = _group_to_subdag(config, gwjob, out_prefix)
+            # In case DAGMAN_GENERATE_SUBDAG_SUBMITS is False,
+            dag.graph["submit_options"]["do_recurse"] = True
+        else:
+            raise RuntimeError(f"Unsupported generic workflow node type {gwjob.node_type} ({gwjob.name})")
+        _LOG.debug("Calling adding job %s %s", htc_job.name, htc_job.label)
+        dag.add_job(htc_job)
+
+    # Add job dependencies to the DAG (be careful with wms_ jobs)
+    for job_name in generic_workflow:
+        gwjob = generic_workflow.get_job(job_name)
+        parent_name = (
+            gwjob.name if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD else f"wms_{gwjob.name}"
+        )
+        successor_jobs = [generic_workflow.get_job(j) for j in generic_workflow.successors(job_name)]
+        children_names = []
+        if gwjob.node_type == GenericWorkflowNodeType.GROUP:
+            gwjob = cast(GenericWorkflowGroup, gwjob)
+            group_children = []  # Dependencies between same group jobs
+            for sjob in successor_jobs:
+                if sjob.node_type == GenericWorkflowNodeType.GROUP and sjob.label == gwjob.label:
+                    group_children.append(f"wms_{sjob.name}")
+                elif sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
+                    children_names.append(sjob.name)
+                else:
+                    children_names.append(f"wms_{sjob.name}")
+            if group_children:
+                dag.add_job_relationships([parent_name], group_children)
+            if not gwjob.blocking:
+                # Since subdag will always succeed, need to add a special
+                # job that fails if group failed to block payload children.
+                check_job = _create_check_job(f"wms_{gwjob.name}", gwjob.label)
+                dag.add_job(check_job)
+                dag.add_job_relationships([f"wms_{gwjob.name}"], [check_job.name])
+                parent_name = check_job.name
+        else:
+            for sjob in successor_jobs:
+                if sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
+                    children_names.append(sjob.name)
+                else:
+                    children_names.append(f"wms_{sjob.name}")
+
+        dag.add_job_relationships([parent_name], children_names)
+
+    # If final job exists in generic workflow, create DAG final job
+    final = generic_workflow.get_final()
+    if final and isinstance(final, GenericWorkflowJob):
+        if final.compute_site and final.compute_site not in site_values:
+            site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
+        final_htjob = _create_job(
+            subdir_template[final.label], site_values[final.compute_site], generic_workflow, final, out_prefix
+        )
+        if "post" not in final_htjob.dagcmds:
+            final_htjob.dagcmds["post"] = {
+                "defer": "",
+                "executable": f"{os.path.dirname(__file__)}/final_post.sh",
+                "arguments": f"{final.name} $DAG_STATUS $RETURN",
+            }
+        dag.add_final_job(final_htjob)
+    elif final and isinstance(final, GenericWorkflow):
+        raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
+    elif final:
+        raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
+
+    return dag
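One naming convention runs through the new DAG builder above: only payload nodes keep their generic-workflow names, while NOOP, group, and check-status bookkeeping nodes get a ``wms_`` prefix, and DAG edges are expressed against those resolved names. A small sketch of that rule with simplified stand-in types:

```python
from enum import Enum, auto

class NodeType(Enum):  # simplified stand-in for GenericWorkflowNodeType
    PAYLOAD = auto()
    NOOP = auto()
    GROUP = auto()

def dag_node_name(name, node_type):
    """Payload jobs keep their names; WMS bookkeeping nodes get a wms_ prefix."""
    return name if node_type is NodeType.PAYLOAD else f"wms_{name}"

# Hypothetical workflow fragment: a payload job feeding a NOOP ordering node,
# which in turn gates a group job.
edges = [
    (dag_node_name("calibrate_903342", NodeType.PAYLOAD), dag_node_name("order1", NodeType.NOOP)),
    (dag_node_name("order1", NodeType.NOOP), dag_node_name("group_visit1", NodeType.GROUP)),
]
print(edges)
# [('calibrate_903342', 'wms_order1'), ('wms_order1', 'wms_group_visit1')]
```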