lsst-ctrl-bps-htcondor 29.0.1rc1-py3-none-any.whl → 29.1.0rc1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/ctrl/bps/htcondor/htcondor_service.py +438 -209
- lsst/ctrl/bps/htcondor/lssthtc.py +864 -261
- lsst/ctrl/bps/htcondor/version.py +1 -1
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc1.dist-info}/METADATA +1 -1
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc1.dist-info}/RECORD +12 -12
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc1.dist-info}/WHEEL +1 -1
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc1.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc1.dist-info}/licenses/LICENSE +0 -0
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc1.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc1.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc1.dist-info}/top_level.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc1.dist-info}/zip-safe +0 -0
lsst/ctrl/bps/htcondor/htcondor_service.py

@@ -34,9 +34,10 @@ import logging
 import os
 import re
 from collections import defaultdict
+from copy import deepcopy
 from enum import IntEnum, auto
 from pathlib import Path
-from typing import Any
+from typing import Any, cast

 import htcondor
 from packaging import version
@@ -44,8 +45,12 @@ from packaging import version
 from lsst.ctrl.bps import (
     BaseWmsService,
     BaseWmsWorkflow,
+    BpsConfig,
     GenericWorkflow,
+    GenericWorkflowGroup,
     GenericWorkflowJob,
+    GenericWorkflowNodeType,
+    GenericWorkflowNoopJob,
     WmsJobReport,
     WmsRunReport,
     WmsSpecificInfo,
@@ -60,8 +65,9 @@ from .lssthtc import (
     MISSING_ID,
     HTCDag,
     HTCJob,
-    JobStatus,
     NodeStatus,
+    WmsNodeType,
+    _update_rescue_file,
     condor_history,
     condor_q,
     condor_search,
@@ -175,17 +181,23 @@ class HTCondorService(BaseWmsService):
             Keyword arguments for the options.
         """
         dag = workflow.dag
-
         ver = version.parse(htc_version())
-        if ver >= version.parse("8.9.3"):
-            sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {})
-        else:
-            sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {})

         # For workflow portability, internal paths are all relative. Hence
         # the DAG needs to be submitted to HTCondor from inside the submit
         # directory.
         with chdir(workflow.submit_path):
+            try:
+                if ver >= version.parse("8.9.3"):
+                    sub = htc_create_submit_from_dag(dag.graph["dag_filename"], dag.graph["submit_options"])
+                else:
+                    sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], dag.graph["submit_options"])
+            except Exception:
+                _LOG.error(
+                    "Problems creating HTCondor submit object from filename: %s", dag.graph["dag_filename"]
+                )
+                raise
+
             _LOG.info("Submitting from directory: %s", os.getcwd())
             schedd_dag_info = htc_submit_dag(sub)
             if schedd_dag_info:
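The hunk above now forwards the DAG's `submit_options` when the submit object is built. A minimal sketch of the equivalent call, assuming `htc_create_submit_from_dag` wraps `htcondor.Submit.from_dag` (available since HTCondor 8.9.3, which matches the version check in the code); the file name is hypothetical and the option shown is the `do_recurse` key that a later hunk sets for workflows containing subdags.

```python
import htcondor

# Hypothetical inputs for illustration only.
dag_filename = "u/someuser/pipeline/workflow.dag"
submit_options = {"do_recurse": True}  # e.g. set by the plugin when the DAG contains subdags

# Build the DAGMan submit description the way the plugin is assumed to.
sub = htcondor.Submit.from_dag(dag_filename, submit_options)
print(sub)  # inspect the generated submit description before queueing it
```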
@@ -226,7 +238,7 @@ class HTCondorService(BaseWmsService):
                 None,
                 (
                     f"workflow with run id '{wms_workflow_id}' not found. "
-
+                    "Hint: use run's submit directory as the id instead"
                 ),
             )

@@ -266,7 +278,9 @@ class HTCondorService(BaseWmsService):
         )

         _LOG.info("Backing up select HTCondor files from previous run attempt")
-        htc_backup_files(wms_path, subdir="backups")
+        rescue_file = htc_backup_files(wms_path, subdir="backups")
+        if (wms_path / "subdags").exists():
+            _update_rescue_file(rescue_file)

         # For workflow portability, internal paths are all relative. Hence
         # the DAG needs to be resubmitted to HTCondor from inside the submit
@@ -318,7 +332,7 @@ class HTCondorService(BaseWmsService):

         Returns
         -------
-        job_ids : `list` [
+        job_ids : `list` [`~typing.Any`]
             Only job ids to be used by cancel and other functions. Typically
             this means top-level jobs (i.e., not children jobs).
         """
@@ -563,66 +577,17 @@ class HTCondorWorkflow(BaseWmsWorkflow):
     def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
         # Docstring inherited
         htc_workflow = cls(generic_workflow.name, config)
-        htc_workflow.dag =
+        htc_workflow.dag = _generic_workflow_to_htcondor_dag(config, generic_workflow, out_prefix)

         _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
-
+        # Add extra attributes to top most DAG.
         htc_workflow.dag.add_attribs(
             {
                 "bps_wms_service": service_class,
                 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
-                "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
-                "bps_job_summary": create_count_summary(generic_workflow.job_counts),
             }
         )

-        _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
-        if isinstance(tmp_template, str):
-            subdir_template = defaultdict(lambda: tmp_template)
-        else:
-            subdir_template = tmp_template
-
-        # Create all DAG jobs
-        site_values = {}  # cache compute site specific values to reduce config lookups
-        for job_name in generic_workflow:
-            gwjob = generic_workflow.get_job(job_name)
-            if gwjob.compute_site not in site_values:
-                site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
-            htc_job = _create_job(
-                subdir_template[gwjob.label],
-                site_values[gwjob.compute_site],
-                generic_workflow,
-                gwjob,
-                out_prefix,
-            )
-            htc_workflow.dag.add_job(htc_job)
-
-        # Add job dependencies to the DAG
-        for job_name in generic_workflow:
-            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
-
-        # If final job exists in generic workflow, create DAG final job
-        final = generic_workflow.get_final()
-        if final and isinstance(final, GenericWorkflowJob):
-            if final.compute_site and final.compute_site not in site_values:
-                site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
-            final_htjob = _create_job(
-                subdir_template[final.label],
-                site_values[final.compute_site],
-                generic_workflow,
-                final,
-                out_prefix,
-            )
-            if "post" not in final_htjob.dagcmds:
-                final_htjob.dagcmds["post"] = (
-                    f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN"
-                )
-            htc_workflow.dag.add_final_job(final_htjob)
-        elif final and isinstance(final, GenericWorkflow):
-            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
-        elif final:
-            return TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
-
         return htc_workflow

     def write(self, out_prefix):
@@ -637,18 +602,18 @@ class HTCondorWorkflow(BaseWmsWorkflow):
         os.makedirs(out_prefix, exist_ok=True)

         # Write down the workflow in HTCondor format.
-        self.dag.write(out_prefix, "jobs/{self.label}")
+        self.dag.write(out_prefix, job_subdir="jobs/{self.label}")


-def _create_job(subdir_template,
+def _create_job(subdir_template, cached_values, generic_workflow, gwjob, out_prefix):
     """Convert GenericWorkflow job nodes to DAG jobs.

     Parameters
     ----------
     subdir_template : `str`
         Template for making subdirs.
-
-        Site specific values
+    cached_values : `dict`
+        Site and label specific values.
     generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
         Generic workflow that is being converted.
     gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
@@ -668,8 +633,10 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
     if gwjob.tags:
         curvals.update(gwjob.tags)

-    subdir = subdir_template.format_map(curvals)
-    htc_job.
+    subdir = Path("jobs") / subdir_template.format_map(curvals)
+    htc_job.subdir = subdir
+    htc_job.subfile = f"{gwjob.name}.sub"
+    htc_job.add_dag_cmds({"dir": subdir})

     htc_job_cmds = {
         "universe": "vanilla",
@@ -681,20 +648,22 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
         # Exceeding memory sometimes triggering SIGBUS or SIGSEGV error. Tell
         # htcondor to put on hold any jobs which exited by a signal.
         "on_exit_hold": "ExitBySignal == true",
-        "on_exit_hold_reason":
-
+        "on_exit_hold_reason": (
+            'strcat("Job raised a signal ", string(ExitSignal), ". ", '
+            '"Handling signal as if job has gone over memory limit.")'
+        ),
         "on_exit_hold_subcode": "34",
     }

-    htc_job_cmds.update(_translate_job_cmds(
+    htc_job_cmds.update(_translate_job_cmds(cached_values, generic_workflow, gwjob))

     # job stdout, stderr, htcondor user log.
     for key in ("output", "error", "log"):
-        htc_job_cmds[key] =
+        htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key[:3]}"
         _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

     htc_job_cmds.update(
-        _handle_job_inputs(generic_workflow, gwjob.name,
+        _handle_job_inputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
     )

     # Add the job cmds dict to the job object.
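For illustration, a minimal sketch of the per-job submit commands assembled in the hunk above for a hypothetical job name; the name is made up, the command values are taken from the diff.

```python
# Hypothetical job name for illustration only.
gwjob_name = "calibrate_903342_10"

htc_job_cmds = {
    "universe": "vanilla",
    "on_exit_hold": "ExitBySignal == true",
    "on_exit_hold_reason": (
        'strcat("Job raised a signal ", string(ExitSignal), ". ", '
        '"Handling signal as if job has gone over memory limit.")'
    ),
    "on_exit_hold_subcode": "34",
}

# Job stdout, stderr, and the HTCondor user log follow a "<name>.$(Cluster).<ext>" pattern.
for key in ("output", "error", "log"):
    htc_job_cmds[key] = f"{gwjob_name}.$(Cluster).{key[:3]}"

print(htc_job_cmds["output"])  # calibrate_903342_10.$(Cluster).out
print(htc_job_cmds["error"])   # calibrate_903342_10.$(Cluster).err
print(htc_job_cmds["log"])     # calibrate_903342_10.$(Cluster).log
```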
@@ -705,7 +674,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
     # Add job attributes to job.
     _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
     htc_job.add_job_attrs(gwjob.attrs)
-    htc_job.add_job_attrs(
+    htc_job.add_job_attrs(cached_values["attrs"])
     htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
     htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})

@@ -717,8 +686,8 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):

     Parameters
     ----------
-    cached_vals : `dict` [`str`,
-        Config values common to jobs with same label.
+    cached_vals : `dict` [`str`, `~typing.Any`]
+        Config values common to jobs with same site or label.
     generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
         Generic workflow that contains job to being converted.
     gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
@@ -726,7 +695,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):

     Returns
     -------
-    htc_job_commands : `dict` [`str`,
+    htc_job_commands : `dict` [`str`, `~typing.Any`]
         Contains commands which can appear in the HTCondor submit description
         file.
     """
@@ -752,9 +721,6 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
         jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")

     # job commands that need modification
-    if gwjob.number_of_retries:
-        jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
-
     if gwjob.retry_unless_exit:
         if isinstance(gwjob.retry_unless_exit, int):
             jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
@@ -771,6 +737,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
     if gwjob.request_memory:
         jobcmds["request_memory"] = f"{gwjob.request_memory}"

+    memory_max = 0
     if gwjob.memory_multiplier:
         # Do not use try-except! At the moment, BpsConfig returns an empty
         # string if it does not contain the key.
@@ -797,13 +764,18 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
             gwjob.request_memory, gwjob.memory_multiplier, memory_max
         )

-
-
-
-
-
-
-
+    user_release_expr = cached_vals.get("releaseExpr", "")
+    if gwjob.number_of_retries is not None and gwjob.number_of_retries >= 0:
+        jobcmds["max_retries"] = gwjob.number_of_retries
+
+        # No point in adding periodic_release if 0 retries
+        if gwjob.number_of_retries > 0:
+            periodic_release = _create_periodic_release_expr(
+                gwjob.request_memory, gwjob.memory_multiplier, memory_max, user_release_expr
+            )
+            if periodic_release:
+                jobcmds["periodic_release"] = periodic_release
+
     jobcmds["periodic_remove"] = _create_periodic_remove_expr(
         gwjob.request_memory, gwjob.memory_multiplier, memory_max
     )
@@ -817,7 +789,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
     # Handle command line
     if gwjob.executable.transfer_executable:
         jobcmds["transfer_executable"] = "True"
-        jobcmds["executable"] =
+        jobcmds["executable"] = gwjob.executable.src_uri
     else:
         jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)

@@ -862,7 +834,7 @@ def _translate_dag_cmds(gwjob):

     Returns
     -------
-    dagcmds : `dict` [`str`,
+    dagcmds : `dict` [`str`, `~typing.Any`]
         DAGMan commands for the job.
     """
     # Values in the dag script that just are name mappings.
@@ -974,7 +946,7 @@ def _replace_cmd_vars(arguments, gwjob):
     replacements = gwjob.cmdvals if gwjob.cmdvals is not None else {}
     try:
         arguments = arguments.format(**replacements)
-    except KeyError as exc:
+    except (KeyError, TypeError) as exc:  # TypeError in case None instead of {}
         _LOG.error("Could not replace command variables: replacement for %s not provided", str(exc))
         _LOG.debug("arguments: %s\ncmdvals: %s", arguments, replacements)
         raise
@@ -1200,19 +1172,19 @@ def _get_info_from_schedd(
     return schedd_dag_info


-def _get_info_from_path(wms_path: str) -> tuple[str, dict[str, dict[str, Any]], str]:
+def _get_info_from_path(wms_path: str | os.PathLike) -> tuple[str, dict[str, dict[str, Any]], str]:
     """Gather run information from a given run directory.

     Parameters
     ----------
-    wms_path : `str`
+    wms_path : `str` or `os.PathLike`
         Directory containing HTCondor files.

     Returns
     -------
     wms_workflow_id : `str`
         The run id which is a DAGman job id.
-    jobs : `dict` [`str`, `dict` [`str`,
+    jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         Information about jobs read from files in the given directory.
         The key is the HTCondor id and the value is a dictionary of HTCondor
         keys and values.
@@ -1263,9 +1235,9 @@ def _get_info_from_path(wms_path: str) -> tuple[str, dict[str, dict[str, Any]],
             schedd_name = next(iter(job_info))
             job_ad = next(iter(job_info[schedd_name].values()))
             job.update(job_ad)
-    except FileNotFoundError:
-        message = f"Could not find HTCondor files in '{wms_path}'"
-        _LOG.
+    except FileNotFoundError as err:
+        message = f"Could not find HTCondor files in '{wms_path}' ({err})"
+        _LOG.debug(message)
         messages.append(message)
         message = htc_check_dagman_output(wms_path)
         if message:
@@ -1298,8 +1270,9 @@ def _create_detailed_report_from_jobs(
         id and the value is a collection of report information for that run.
     """
     _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
-
-
+
+    dag_ad = jobs[wms_workflow_id]
+
     report = WmsRunReport(
         wms_id=f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}",
         global_wms_id=dag_ad.get("GlobalJobId", "MISS"),
@@ -1312,28 +1285,34 @@ def _create_detailed_report_from_jobs(
         operator=_get_owner(dag_ad),
         run_summary=_get_run_summary(dag_ad),
         state=_htc_status_to_wms_state(dag_ad),
+        total_number_jobs=0,
         jobs=[],
-
-
-        exit_code_summary=_get_exit_code_summary(jobs),
+        job_state_counts=dict.fromkeys(WmsStates, 0),
+        exit_code_summary={},
     )
+
+    payload_jobs = {}  # keep track for later processing
     specific_info = WmsSpecificInfo()
     for job_id, job_ad in jobs.items():
-        if
+        if job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [WmsNodeType.PAYLOAD, WmsNodeType.FINAL]:
             try:
+                name = job_ad.get("DAGNodeName", job_id)
+                wms_state = _htc_status_to_wms_state(job_ad)
                 job_report = WmsJobReport(
                     wms_id=job_id,
-                    name=
-                    label=job_ad.get("bps_job_label", pegasus_name_to_label(
-                    state=
+                    name=name,
+                    label=job_ad.get("bps_job_label", pegasus_name_to_label(name)),
+                    state=wms_state,
                 )
                 if job_report.label == "init":
                     job_report.label = "pipetaskInit"
+                report.job_state_counts[wms_state] += 1
                 report.jobs.append(job_report)
+                payload_jobs[job_id] = job_ad
             except KeyError as ex:
                 _LOG.error("Job missing key '%s': %s", str(ex), job_ad)
                 raise
-
+        elif is_service_job(job_ad):
             _LOG.debug(
                 "Found service job: id='%s', name='%s', label='%s', NodeStatus='%s', JobStatus='%s'",
                 job_id,
@@ -1344,13 +1323,11 @@ def _create_detailed_report_from_jobs(
             )
             _add_service_job_specific_info(job_ad, specific_info)

+    report.total_number_jobs = len(payload_jobs)
+    report.exit_code_summary = _get_exit_code_summary(payload_jobs)
     if specific_info:
         report.specific_info = specific_info

-    # Add the removed entry to restore the original content of the dictionary.
-    # The ordering of keys will be change permanently though.
-    jobs.update({wms_workflow_id: dag_ad})
-
     # Workflow will exit with non-zero DAG_STATUS if problem with
     # any of the wms jobs. So change FAILED to SUCCEEDED if all
     # payload jobs SUCCEEDED.
@@ -1367,7 +1344,7 @@ def _add_service_job_specific_info(job_ad: dict[str, Any], specific_info: WmsSpe

     Parameters
     ----------
-    job_ad : `dict` [`str`,
+    job_ad : `dict` [`str`, `~typing.Any`]
         Provisioning job information.
     specific_info : `lsst.ctrl.bps.WmsSpecificInfo`
         Where to add message.
@@ -1450,6 +1427,7 @@ def _summary_report(user, hist, pass_thru, schedds=None):

     # Have list of DAGMan jobs, need to get run_report info.
     run_reports = {}
+    msg = ""
     for jobs in job_info.values():
         for job_id, job in jobs.items():
             total_jobs, state_counts = _get_state_counts_from_dag_job(job)
@@ -1482,7 +1460,7 @@ def _summary_report(user, hist, pass_thru, schedds=None):
             )
             run_reports[report.global_wms_id] = report

-    return run_reports,
+    return run_reports, msg


 def _add_run_info(wms_path, job):
@@ -1492,7 +1470,7 @@ def _add_run_info(wms_path, job):
     ----------
     wms_path : `str`
         Path to submit files for the run.
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.

     Raises
@@ -1528,7 +1506,7 @@ def _get_owner(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.

     Returns
@@ -1550,7 +1528,7 @@ def _get_run_summary(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.

     Returns
@@ -1596,14 +1574,14 @@ def _get_exit_code_summary(jobs):
         exit_code = 0
         job_status = job_ad["JobStatus"]
         match job_status:
-            case JobStatus.COMPLETED | JobStatus.HELD:
+            case htcondor.JobStatus.COMPLETED | htcondor.JobStatus.HELD:
                 exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["ExitCode"]
             case (
-                JobStatus.IDLE
-                | JobStatus.RUNNING
-                | JobStatus.REMOVED
-                | JobStatus.TRANSFERRING_OUTPUT
-                | JobStatus.SUSPENDED
+                htcondor.JobStatus.IDLE
+                | htcondor.JobStatus.RUNNING
+                | htcondor.JobStatus.REMOVED
+                | htcondor.JobStatus.TRANSFERRING_OUTPUT
+                | htcondor.JobStatus.SUSPENDED
             ):
                 pass
             case _:
@@ -1626,7 +1604,7 @@ def _get_state_counts_from_jobs(
     ----------
     wms_workflow_id : `str`
         HTCondor job id.
-    jobs : `dict [`dict` [`str`,
+    jobs : `dict [`dict` [`str`, `~typing.Any`]]
         HTCondor dag job information.

     Returns
@@ -1639,16 +1617,13 @@ def _get_state_counts_from_jobs(
     """
     state_counts = dict.fromkeys(WmsStates, 0)
     for job_id, job_ad in jobs.items():
-        if job_id != wms_workflow_id and
+        if job_id != wms_workflow_id and job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [
+            WmsNodeType.PAYLOAD,
+            WmsNodeType.FINAL,
+        ]:
             state_counts[_htc_status_to_wms_state(job_ad)] += 1
-
-
-    if "NodesTotal" in jobs[wms_workflow_id]:
-        total_count = jobs[wms_workflow_id]["NodesTotal"]
-    else:
-        total_count = total_counted
+    total_count = sum(state_counts.values())

-    state_counts[WmsStates.UNREADY] += total_count - total_counted
     return total_count, state_counts

@@ -1657,7 +1632,7 @@ def _get_state_counts_from_dag_job(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.

     Returns
@@ -1713,7 +1688,7 @@ def _htc_status_to_wms_state(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor job information.

     Returns
@@ -1735,7 +1710,7 @@ def _htc_job_status_to_wms_state(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor job information.

     Returns
@@ -1746,27 +1721,28 @@ def _htc_job_status_to_wms_state(job):
     _LOG.debug(
         "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"])
     )
-    job_status = int(job["JobStatus"])
     wms_state = WmsStates.MISFIT
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if "JobStatus" in job and job["JobStatus"]:
+        job_status = int(job["JobStatus"])
+
+        _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
+        if job_status == htcondor.JobStatus.IDLE:
+            wms_state = WmsStates.PENDING
+        elif job_status == htcondor.JobStatus.RUNNING:
+            wms_state = WmsStates.RUNNING
+        elif job_status == htcondor.JobStatus.REMOVED:
+            wms_state = WmsStates.DELETED
+        elif job_status == htcondor.JobStatus.COMPLETED:
+            if (
+                (job.get("ExitBySignal", False) and job.get("ExitSignal", 0))
+                or job.get("ExitCode", 0)
+                or job.get("DAG_Status", 0)
+            ):
+                wms_state = WmsStates.FAILED
+            else:
+                wms_state = WmsStates.SUCCEEDED
+        elif job_status == htcondor.JobStatus.HELD:
+            wms_state = WmsStates.HELD

     return wms_state

@@ -1776,7 +1752,7 @@ def _htc_node_status_to_wms_state(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor job information.

     Returns
@@ -1823,9 +1799,9 @@ def _update_jobs(jobs1, jobs2):

     Parameters
     ----------
-    jobs1 : `dict` [`str`, `dict` [`str`,
+    jobs1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         HTCondor job information to be updated.
-    jobs2 : `dict` [`str`, `dict` [`str`,
+    jobs2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         Additional HTCondor job information.
     """
     for job_id, job_ad in jobs2.items():
@@ -1965,34 +1941,39 @@ def _wms_id_to_dir(wms_id):
     return wms_path, id_type


-def _create_periodic_release_expr(
+def _create_periodic_release_expr(
+    memory: int, multiplier: float | None, limit: int, additional_expr: str = ""
+) -> str:
     """Construct an HTCondorAd expression for releasing held jobs.

-    The expression instruct HTCondor to release any job which was put on hold
-    due to exceeding memory requirements back to the job queue providing it
-    satisfies all of the conditions below:
-
-    * number of run attempts did not reach allowable number of retries,
-    * the memory requirements in the last failed run attempt did not reach
-      the specified memory limit.
-
     Parameters
     ----------
     memory : `int`
         Requested memory in MB.
-    multiplier : `float`
-        Memory growth rate between
+    multiplier : `float` or None
+        Memory growth rate between retries.
     limit : `int`
         Memory limit.
+    additional_expr : `str`, optional
+        Expression to add to periodic_release. Defaults to empty string.

     Returns
     -------
     expr : `str`
-        A string representing an HTCondor ClassAd expression for releasing
-        which have been held due to exceeding the memory requirements.
+        A string representing an HTCondor ClassAd expression for releasing job.
     """
-
-
+    _LOG.debug(
+        "periodic_release: memory: %s, multiplier: %s, limit: %s, additional_expr: %s",
+        memory,
+        multiplier,
+        limit,
+        additional_expr,
+    )
+
+    # ctrl_bps sets multiplier to None in the GenericWorkflow if
+    # memoryMultiplier <= 1, but checking value just in case.
+    if (not multiplier or multiplier <= 1) and not additional_expr:
+        return ""

     # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
     # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
@@ -2004,63 +1985,74 @@ def _create_periodic_release_expr(memory, multiplier, limit):
     # the entire expression should evaluate to FALSE when the job is not HELD.
     # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
     # but better safe than sorry.
-
-
-
-
-
+    is_held = "JobStatus == 5"
+    is_retry_allowed = "NumJobStarts <= JobMaxRetries"
+
+    mem_expr = ""
+    if memory and multiplier and multiplier > 1 and limit:
+        was_mem_exceeded = (
+            "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+            "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
+        )
+        was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
+        mem_expr = f"{was_mem_exceeded} && {was_below_limit}"
+
+    user_expr = ""
+    if additional_expr:
+        # Never auto release a job held by user.
+        user_expr = f"HoldReasonCode =!= 1 && {additional_expr}"
+
+    expr = f"{is_held} && {is_retry_allowed}"
+    if user_expr and mem_expr:
+        expr += f" && ({mem_expr} || {user_expr})"
+    elif user_expr:
+        expr += f" && {user_expr}"
+    elif mem_expr:
+        expr += f" && {mem_expr}"

-    expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
     return expr


 def _create_periodic_remove_expr(memory, multiplier, limit):
     """Construct an HTCondorAd expression for removing jobs from the queue.

-    The expression instruct HTCondor to remove any job which was put on hold
-    due to exceeding memory requirements from the job queue providing it
-    satisfies any of the conditions below:
-
-    * allowable number of retries was reached,
-    * the memory requirements during the last failed run attempt reached
-      the specified memory limit.
-
     Parameters
     ----------
     memory : `int`
         Requested memory in MB.
     multiplier : `float`
-        Memory growth rate between
+        Memory growth rate between retries.
     limit : `int`
         Memory limit.

     Returns
     -------
     expr : `str`
-        A string representing an HTCondor ClassAd expression for removing jobs
-        which were run at the maximal allowable memory and still exceeded
-        the memory requirements.
+        A string representing an HTCondor ClassAd expression for removing jobs.
     """
-
-
-
-    #
-    # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
-    # The special comparison operators ensure that all comparisons below will
-    # evaluate to FALSE in this case.
+    # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode'
+    # are UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
+    # The special comparison operators ensure that all comparisons below
+    # will evaluate to FALSE in this case.
     #
     # Note:
-    # May not be strictly necessary. Operators '&&' and '||' are not
-    # the entire expression should evaluate to FALSE when the
-    # According to ClassAd evaluation semantics
-    # but better safe than sorry.
-
-        "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
-        "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
-    )
+    # May not be strictly necessary. Operators '&&' and '||' are not
+    # strict so the entire expression should evaluate to FALSE when the
+    # job is not HELD. According to ClassAd evaluation semantics
+    # FALSE && UNDEFINED is FALSE, but better safe than sorry.
+    is_held = "JobStatus == 5"
+    is_retry_disallowed = "NumJobStarts > JobMaxRetries"

-
+    mem_expr = ""
+    if memory and multiplier and multiplier > 1 and limit:
+        mem_limit_expr = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
+
+        mem_expr = (  # Add || here so only added if adding memory expr
+            " || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+            f"|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && {mem_limit_expr})"
+        )
+
+    expr = f"{is_held} && ({is_retry_disallowed}{mem_expr})"
     return expr
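To make the new expression builders concrete, here is a minimal sketch reproducing the f-strings from this hunk for sample values (2048 MB requested, multiplier 2.0, 8192 MB limit, no per-label releaseExpr); the numbers are illustrative only.

```python
memory, multiplier, limit = 2048, 2.0, 8192  # illustrative values

# periodic_release: held (JobStatus == 5), retries left, held for memory, and still below the limit.
was_mem_exceeded = (
    "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
    "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
)
was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
periodic_release = f"JobStatus == 5 && NumJobStarts <= JobMaxRetries && {was_mem_exceeded} && {was_below_limit}"

# periodic_remove: held and either out of retries or already run at the memory limit.
mem_limit_expr = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
periodic_remove = (
    "JobStatus == 5 && (NumJobStarts > JobMaxRetries"
    " || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
    f"|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && {mem_limit_expr}))"
)

print(periodic_release)
print(periodic_remove)
```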
@@ -2072,7 +2064,7 @@ def _create_request_memory_expr(memory, multiplier, limit):
     memory : `int`
         Requested memory in MB.
     multiplier : `float`
-        Memory growth rate between
+        Memory growth rate between retries.
     limit : `int`
         Memory limit.

@@ -2147,7 +2139,7 @@ def _gather_site_values(config, compute_site):

     Returns
     -------
-    site_values : `dict` [`str`,
+    site_values : `dict` [`str`, `~typing.Any`]
         Values specific to the given site.
     """
     site_values = {"attrs": {}, "profile": {}}
@@ -2195,6 +2187,50 @@ def _gather_site_values(config, compute_site):
     return site_values


+def _gather_label_values(config: BpsConfig, label: str) -> dict[str, Any]:
+    """Gather values specific to given job label.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        BPS configuration that includes necessary submit/runtime
+        information.
+    label : `str`
+        GenericWorkflowJob label.
+
+    Returns
+    -------
+    values : `dict` [`str`, `~typing.Any`]
+        Values specific to the given job label.
+    """
+    values: dict[str, Any] = {"attrs": {}, "profile": {}}
+
+    search_opts = {}
+    profile_key = ""
+    if label == "finalJob":
+        search_opts["searchobj"] = config["finalJob"]
+        profile_key = ".finalJob.profile.condor"
+    elif label in config["cluster"]:
+        search_opts["curvals"] = {"curr_cluster": label}
+        profile_key = f".cluster.{label}.profile.condor"
+    elif label in config["pipetask"]:
+        search_opts["curvals"] = {"curr_pipetask": label}
+        profile_key = f".pipetask.{label}.profile.condor"
+
+    found, value = config.search("releaseExpr", opt=search_opts)
+    if found:
+        values["releaseExpr"] = value
+
+    if profile_key and profile_key in config:
+        for subkey, val in config[profile_key].items():
+            if subkey.startswith("+"):
+                values["attrs"][subkey[1:]] = val
+            else:
+                values["profile"][subkey] = val
+
+    return values
+
+
 def is_service_job(job_ad: dict[str, Any]) -> bool:
     """Determine if a job is a service one.

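A minimal sketch of the profile handling in `_gather_label_values` above: keys with a leading "+" become job ClassAd attributes, everything else becomes a plain submit command. The helper name and the config content below are hypothetical, made up for the example.

```python
def _split_condor_profile(profile_condor: dict) -> dict:
    """Mirror the attrs/profile split used by _gather_label_values (sketch only)."""
    values = {"attrs": {}, "profile": {}}
    for subkey, val in profile_condor.items():
        if subkey.startswith("+"):
            values["attrs"][subkey[1:]] = val  # "+Key" becomes ClassAd attribute "Key"
        else:
            values["profile"][subkey] = val  # plain submit-description command
    return values


# Hypothetical .pipetask.<label>.profile.condor section.
print(_split_condor_profile({"+JobNote": "isr", "request_disk": "10GB"}))
# {'attrs': {'JobNote': 'isr'}, 'profile': {'request_disk': '10GB'}}
```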
@@ -2212,6 +2248,199 @@ def is_service_job(job_ad: dict[str, Any]) -> bool:
     -----
     At the moment, HTCondor does not provide a native way to distinguish
     between payload and service jobs in the workflow. This code depends
-    on read_node_status adding
+    on read_node_status adding wms_node_type.
+    """
+    return job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) == WmsNodeType.SERVICE
+
+
+def _group_to_subdag(
+    config: BpsConfig, generic_workflow_group: GenericWorkflowGroup, out_prefix: str
+) -> HTCJob:
+    """Convert a generic workflow group to an HTCondor dag.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        Workflow configuration.
+    generic_workflow_group : `lsst.ctrl.bps.GenericWorkflowGroup`
+        The generic workflow group to convert.
+    out_prefix : `str`
+        Location prefix to be used when creating jobs.
+
+    Returns
+    -------
+    htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
+        Job for running the HTCondor dag.
+    """
+    jobname = f"wms_{generic_workflow_group.name}"
+    htc_job = HTCJob(name=jobname, label=generic_workflow_group.label)
+    htc_job.add_dag_cmds({"dir": f"subdags/{jobname}"})
+    htc_job.subdag = _generic_workflow_to_htcondor_dag(config, generic_workflow_group, out_prefix)
+    if not generic_workflow_group.blocking:
+        htc_job.dagcmds["post"] = {
+            "defer": "",
+            "executable": f"{os.path.dirname(__file__)}/subdag_post.sh",
+            "arguments": f"{jobname} $RETURN",
+        }
+    return htc_job
+
+
+def _create_check_job(group_job_name: str, job_label: str) -> HTCJob:
+    """Create a job to check status of a group job.
+
+    Parameters
+    ----------
+    group_job_name : `str`
+        Name of the group job.
+    job_label : `str`
+        Label to use for the check status job.
+
+    Returns
+    -------
+    htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
+        Job description for the job to check group job status.
     """
-
+    htc_job = HTCJob(name=f"wms_check_status_{group_job_name}", label=job_label)
+    htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/check_group_status.sub"
+    htc_job.add_dag_cmds({"dir": f"subdags/{group_job_name}", "vars": {"group_job_name": group_job_name}})
+
+    return htc_job
+
+
+def _generic_workflow_to_htcondor_dag(
+    config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str
+) -> HTCDag:
+    """Convert a GenericWorkflow to a HTCDag.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        Workflow configuration.
+    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
+        The GenericWorkflow to convert.
+    out_prefix : `str`
+        Location prefix where the HTCondor files will be written.
+
+    Returns
+    -------
+    dag : `lsst.ctrl.bps.htcondor.HTCDag`
+        The HTCDag representation of the given GenericWorkflow.
+    """
+    dag = HTCDag(name=generic_workflow.name)
+
+    _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
+    dag.add_attribs(generic_workflow.run_attrs)
+    dag.add_attribs(
+        {
+            "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
+            "bps_job_summary": create_count_summary(generic_workflow.job_counts),
+        }
+    )
+
+    _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
+    if isinstance(tmp_template, str):
+        subdir_template = defaultdict(lambda: tmp_template)
+    else:
+        subdir_template = tmp_template
+
+    # Create all DAG jobs
+    site_values = {}  # Cache compute site specific values to reduce config lookups.
+    cached_values = {}  # Cache label-specific values to reduce config lookups.
+    # Note: Can't use get_job_by_label because those only include payload jobs.
+    for job_name in generic_workflow:
+        gwjob = generic_workflow.get_job(job_name)
+        if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD:
+            gwjob = cast(GenericWorkflowJob, gwjob)
+            if gwjob.compute_site not in site_values:
+                site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
+            if gwjob.label not in cached_values:
+                cached_values[gwjob.label] = deepcopy(site_values[gwjob.compute_site])
+                cached_values[gwjob.label].update(_gather_label_values(config, gwjob.label))
+                _LOG.debug("cached: %s= %s", gwjob.label, cached_values[gwjob.label])
+            htc_job = _create_job(
+                subdir_template[gwjob.label],
+                cached_values[gwjob.label],
+                generic_workflow,
+                gwjob,
+                out_prefix,
+            )
+        elif gwjob.node_type == GenericWorkflowNodeType.NOOP:
+            gwjob = cast(GenericWorkflowNoopJob, gwjob)
+            htc_job = HTCJob(f"wms_{gwjob.name}", label=gwjob.label)
+            htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/noop.sub"
+            htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
+            htc_job.add_dag_cmds({"noop": True})
+        elif gwjob.node_type == GenericWorkflowNodeType.GROUP:
+            gwjob = cast(GenericWorkflowGroup, gwjob)
+            htc_job = _group_to_subdag(config, gwjob, out_prefix)
+            # In case DAGMAN_GENERATE_SUBDAG_SUBMITS is False,
+            dag.graph["submit_options"]["do_recurse"] = True
+        else:
+            raise RuntimeError(f"Unsupported generic workflow node type {gwjob.node_type} ({gwjob.name})")
+        _LOG.debug("Calling adding job %s %s", htc_job.name, htc_job.label)
+        dag.add_job(htc_job)
+
+    # Add job dependencies to the DAG (be careful with wms_ jobs)
+    for job_name in generic_workflow:
+        gwjob = generic_workflow.get_job(job_name)
+        parent_name = (
+            gwjob.name if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD else f"wms_{gwjob.name}"
+        )
+        successor_jobs = [generic_workflow.get_job(j) for j in generic_workflow.successors(job_name)]
+        children_names = []
+        if gwjob.node_type == GenericWorkflowNodeType.GROUP:
+            gwjob = cast(GenericWorkflowGroup, gwjob)
+            group_children = []  # Dependencies between same group jobs
+            for sjob in successor_jobs:
+                if sjob.node_type == GenericWorkflowNodeType.GROUP and sjob.label == gwjob.label:
+                    group_children.append(f"wms_{sjob.name}")
+                elif sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
+                    children_names.append(sjob.name)
+                else:
+                    children_names.append(f"wms_{sjob.name}")
+            if group_children:
+                dag.add_job_relationships([parent_name], group_children)
+            if not gwjob.blocking:
+                # Since subdag will always succeed, need to add a special
+                # job that fails if group failed to block payload children.
+                check_job = _create_check_job(f"wms_{gwjob.name}", gwjob.label)
+                dag.add_job(check_job)
+                dag.add_job_relationships([f"wms_{gwjob.name}"], [check_job.name])
+                parent_name = check_job.name
+        else:
+            for sjob in successor_jobs:
+                if sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
+                    children_names.append(sjob.name)
+                else:
+                    children_names.append(f"wms_{sjob.name}")
+
+        dag.add_job_relationships([parent_name], children_names)
+
+    # If final job exists in generic workflow, create DAG final job
+    final = generic_workflow.get_final()
+    if final and isinstance(final, GenericWorkflowJob):
+        if final.compute_site and final.compute_site not in site_values:
+            site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
+        if final.label not in cached_values:
+            cached_values[final.label] = deepcopy(site_values[final.compute_site])
+            cached_values[final.label].update(_gather_label_values(config, final.label))
+        final_htjob = _create_job(
+            subdir_template[final.label],
+            cached_values[final.label],
+            generic_workflow,
+            final,
+            out_prefix,
+        )
+        if "post" not in final_htjob.dagcmds:
+            final_htjob.dagcmds["post"] = {
+                "defer": "",
+                "executable": f"{os.path.dirname(__file__)}/final_post.sh",
+                "arguments": f"{final.name} $DAG_STATUS $RETURN",
+            }
+        dag.add_final_job(final_htjob)
+    elif final and isinstance(final, GenericWorkflow):
+        raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
+    elif final:
+        raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
+
+    return dag