lsst-ctrl-bps-htcondor 29.2025.3700-py3-none-any.whl → 29.2025.3900-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/ctrl/bps/htcondor/__init__.py +1 -0
- lsst/ctrl/bps/htcondor/common_utils.py +306 -0
- lsst/ctrl/bps/htcondor/htcondor_service.py +12 -2060
- lsst/ctrl/bps/htcondor/htcondor_workflow.py +89 -0
- lsst/ctrl/bps/htcondor/lssthtc.py +27 -1
- lsst/ctrl/bps/htcondor/prepare_utils.py +967 -0
- lsst/ctrl/bps/htcondor/provisioner.py +3 -2
- lsst/ctrl/bps/htcondor/report_utils.py +842 -0
- lsst/ctrl/bps/htcondor/version.py +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/METADATA +1 -1
- lsst_ctrl_bps_htcondor-29.2025.3900.dist-info/RECORD +23 -0
- lsst_ctrl_bps_htcondor-29.2025.3700.dist-info/RECORD +0 -19
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/WHEEL +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/licenses/LICENSE +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/top_level.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.3700.dist-info → lsst_ctrl_bps_htcondor-29.2025.3900.dist-info}/zip-safe +0 -0
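Taken together, the entries above describe largely a refactor: roughly two thousand lines leave `htcondor_service.py` and reappear in the new `common_utils.py`, `htcondor_workflow.py`, `prepare_utils.py`, and `report_utils.py` modules. A minimal sketch of the caller-side view; note that re-exporting the workflow class from the package root is an assumption based only on the one-line `__init__.py` change, not confirmed by this diff:

```python
# Hypothetical caller-side view of the refactor: the service keeps its import
# path, while the workflow class moves to a dedicated module (per the file
# list above). Whether __init__.py also re-exports it is an assumption.
from lsst.ctrl.bps.htcondor import HTCondorService
from lsst.ctrl.bps.htcondor.htcondor_workflow import HTCondorWorkflow
```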
lsst/ctrl/bps/htcondor/htcondor_service.py

@@ -27,93 +27,48 @@
 
 """Interface between generic workflow to HTCondor workflow system."""
 
-__all__ = ["HTCondorService"
+__all__ = ["HTCondorService"]
 
 
 import logging
 import os
-import re
-from collections import defaultdict
-from copy import deepcopy
-from enum import IntEnum, auto
 from pathlib import Path
-from typing import Any, cast
 
 import htcondor
 from packaging import version
 
 from lsst.ctrl.bps import (
     BaseWmsService,
-    BaseWmsWorkflow,
-    BpsConfig,
-    GenericWorkflow,
-    GenericWorkflowGroup,
-    GenericWorkflowJob,
-    GenericWorkflowNodeType,
-    GenericWorkflowNoopJob,
-    WmsJobReport,
-    WmsRunReport,
-    WmsSpecificInfo,
     WmsStates,
 )
-from lsst.ctrl.bps.bps_utils import chdir
+from lsst.ctrl.bps.bps_utils import chdir
 from lsst.daf.butler import Config
 from lsst.utils.timer import time_this
 
+from .common_utils import WmsIdType, _wms_id_to_cluster, _wms_id_to_dir, _wms_id_type
 from .htcondor_config import HTC_DEFAULTS_URI
+from .htcondor_workflow import HTCondorWorkflow
 from .lssthtc import (
-
-    HTCDag,
-    HTCJob,
-    NodeStatus,
-    WmsNodeType,
+    _locate_schedds,
     _update_rescue_file,
-    condor_history,
     condor_q,
-    condor_search,
-    condor_status,
     htc_backup_files,
-    htc_check_dagman_output,
     htc_create_submit_from_cmd,
     htc_create_submit_from_dag,
     htc_create_submit_from_file,
-    htc_escape,
     htc_submit_dag,
     htc_version,
-    pegasus_name_to_label,
-    read_dag_info,
-    read_dag_log,
     read_dag_status,
-    read_node_status,
-    summarize_dag,
     write_dag_info,
 )
 from .provisioner import Provisioner
-
-
-
-
-
-
-
-    """
-
-    LOCAL = auto()
-    """The id is HTCondor job's ClusterId (with optional '.ProcId').
-    """
-
-    GLOBAL = auto()
-    """Id is a HTCondor's global job id.
-    """
-
-    PATH = auto()
-    """Id is a submission path.
-    """
-
-
-DEFAULT_HTC_EXEC_PATT = ".*worker.*"
-"""Default pattern for searching execute machines in an HTCondor pool.
-"""
+from .report_utils import (
+    _get_status_from_id,
+    _get_status_from_path,
+    _report_from_id,
+    _report_from_path,
+    _summary_report,
+)
 
 _LOG = logging.getLogger(__name__)
 
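The hunk above also deletes the module-level `WmsIdType` enumeration and the `DEFAULT_HTC_EXEC_PATT` constant; `WmsIdType` is now imported from `.common_utils` (the constant's destination is not shown in this hunk). The `class` statement itself did not survive extraction, so the sketch below reconstructs the definition from the removed member lines; the `IntEnum` base comes from the dropped `from enum import IntEnum, auto` import, and any members beyond the three visible ones are unknown:

```python
from enum import IntEnum, auto


class WmsIdType(IntEnum):  # reconstruction; the class statement is not visible in the hunk
    LOCAL = auto()   # HTCondor job's ClusterId (with optional '.ProcId')
    GLOBAL = auto()  # HTCondor's global job id
    PATH = auto()    # a submission path


DEFAULT_HTC_EXEC_PATT = ".*worker.*"
"""Default pattern for searching execute machines in an HTCondor pool."""
```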
@@ -604,2006 +559,3 @@ class HTCondorService(BaseWmsService):
             status = 1
             message = f"Permission problem with {daemon_type} service."
         return status, message
-
-
-class HTCondorWorkflow(BaseWmsWorkflow):
-    """Single HTCondor workflow.
-
-    Parameters
-    ----------
-    name : `str`
-        Unique name for Workflow used when naming files.
-    config : `lsst.ctrl.bps.BpsConfig`
-        BPS configuration that includes necessary submit/runtime information.
-    """
-
-    def __init__(self, name, config=None):
-        super().__init__(name, config)
-        self.dag = None
-
-    @classmethod
-    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
-        # Docstring inherited
-        htc_workflow = cls(generic_workflow.name, config)
-        htc_workflow.dag = _generic_workflow_to_htcondor_dag(config, generic_workflow, out_prefix)
-
-        _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
-        # Add extra attributes to top most DAG.
-        htc_workflow.dag.add_attribs(
-            {
-                "bps_wms_service": service_class,
-                "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
-            }
-        )
-
-        return htc_workflow
-
-    def write(self, out_prefix):
-        """Output HTCondor DAGMan files needed for workflow submission.
-
-        Parameters
-        ----------
-        out_prefix : `str`
-            Directory prefix for HTCondor files.
-        """
-        self.submit_path = out_prefix
-        os.makedirs(out_prefix, exist_ok=True)
-
-        # Write down the workflow in HTCondor format.
-        self.dag.write(out_prefix, job_subdir="jobs/{self.label}")
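The removed class above now lives in `htcondor_workflow.py` (the `+89` entry in the file list). Its surface is small; a hedged sketch of how a service would drive it, using only the two methods visible in the deleted code (`config`, `generic_workflow`, and `out_prefix` are placeholders for real objects):

```python
# Sketch of the workflow lifecycle visible in the deleted methods; the
# surrounding objects (config, generic_workflow, out_prefix) are placeholders.
htc_workflow = HTCondorWorkflow.from_generic_workflow(
    config, generic_workflow, out_prefix, service_class="lsst.ctrl.bps.htcondor.HTCondorService"
)
htc_workflow.write(out_prefix)  # materializes the DAGMan files under out_prefix
```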
-
-
-def _create_job(subdir_template, cached_values, generic_workflow, gwjob, out_prefix):
-    """Convert GenericWorkflow job nodes to DAG jobs.
-
-    Parameters
-    ----------
-    subdir_template : `str`
-        Template for making subdirs.
-    cached_values : `dict`
-        Site and label specific values.
-    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
-        Generic workflow that is being converted.
-    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
-        The generic job to convert to a HTCondor job.
-    out_prefix : `str`
-        Directory prefix for HTCondor files.
-
-    Returns
-    -------
-    htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
-        The HTCondor job equivalent to the given generic job.
-    """
-    htc_job = HTCJob(gwjob.name, label=gwjob.label)
-
-    curvals = defaultdict(str)
-    curvals["label"] = gwjob.label
-    if gwjob.tags:
-        curvals.update(gwjob.tags)
-
-    subdir = Path("jobs") / subdir_template.format_map(curvals)
-    htc_job.subdir = subdir
-    htc_job.subfile = f"{gwjob.name}.sub"
-    htc_job.add_dag_cmds({"dir": subdir})
-
-    htc_job_cmds = {
-        "universe": "vanilla",
-        "should_transfer_files": "YES",
-        "when_to_transfer_output": "ON_EXIT_OR_EVICT",
-        "transfer_output_files": '""',  # Set to empty string to disable
-        "transfer_executable": "False",
-        "getenv": "True",
-        # Exceeding memory sometimes triggering SIGBUS or SIGSEGV error. Tell
-        # htcondor to put on hold any jobs which exited by a signal.
-        "on_exit_hold": "ExitBySignal == true",
-        "on_exit_hold_reason": (
-            'strcat("Job raised a signal ", string(ExitSignal), ". ", '
-            '"Handling signal as if job has gone over memory limit.")'
-        ),
-        "on_exit_hold_subcode": "34",
-    }
-
-    htc_job_cmds.update(_translate_job_cmds(cached_values, generic_workflow, gwjob))
-
-    # job stdout, stderr, htcondor user log.
-    for key in ("output", "error"):
-        if cached_values["overwriteJobFiles"]:
-            htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key[:3]}"
-        else:
-            htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).$$([NumJobStarts ?: 0]).{key[:3]}"
-        _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
-
-    key = "log"
-    htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key[:3]}"
-    _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
-
-    htc_job_cmds.update(
-        _handle_job_inputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
-    )
-
-    htc_job_cmds.update(
-        _handle_job_outputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
-    )
-
-    # Add the job cmds dict to the job object.
-    htc_job.add_job_cmds(htc_job_cmds)
-
-    htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))
-
-    # Add job attributes to job.
-    _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
-    htc_job.add_job_attrs(gwjob.attrs)
-    htc_job.add_job_attrs(cached_values["attrs"])
-    htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
-    htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
-
-    return htc_job
-
-
-def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
-    """Translate the job data that are one to one mapping
-
-    Parameters
-    ----------
-    cached_vals : `dict` [`str`, `~typing.Any`]
-        Config values common to jobs with same site or label.
-    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
-        Generic workflow that contains job to being converted.
-    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
-        Generic workflow job to be converted.
-
-    Returns
-    -------
-    htc_job_commands : `dict` [`str`, `~typing.Any`]
-        Contains commands which can appear in the HTCondor submit description
-        file.
-    """
-    # Values in the job script that just are name mappings.
-    job_translation = {
-        "mail_to": "notify_user",
-        "when_to_mail": "notification",
-        "request_cpus": "request_cpus",
-        "priority": "priority",
-        "category": "category",
-        "accounting_group": "accounting_group",
-        "accounting_user": "accounting_group_user",
-    }
-
-    jobcmds = {}
-    for gwkey, htckey in job_translation.items():
-        jobcmds[htckey] = getattr(gwjob, gwkey, None)
-
-    # If accounting info was not set explicitly, use site settings if any.
-    if not gwjob.accounting_group:
-        jobcmds["accounting_group"] = cached_vals.get("accountingGroup")
-    if not gwjob.accounting_user:
-        jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")
-
-    # job commands that need modification
-    if gwjob.retry_unless_exit:
-        if isinstance(gwjob.retry_unless_exit, int):
-            jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
-        elif isinstance(gwjob.retry_unless_exit, list):
-            jobcmds["retry_until"] = (
-                f"member(ExitCode, {{{','.join([str(x) for x in gwjob.retry_unless_exit])}}})"
-            )
-        else:
-            raise ValueError("retryUnlessExit must be an integer or a list of integers.")
-
-    if gwjob.request_disk:
-        jobcmds["request_disk"] = f"{gwjob.request_disk}MB"
-
-    if gwjob.request_memory:
-        jobcmds["request_memory"] = f"{gwjob.request_memory}"
-
-    memory_max = 0
-    if gwjob.memory_multiplier:
-        # Do not use try-except! At the moment, BpsConfig returns an empty
-        # string if it does not contain the key.
-        memory_limit = cached_vals["memoryLimit"]
-        if not memory_limit:
-            raise RuntimeError(
-                "Memory autoscaling enabled, but automatic detection of the memory limit "
-                "failed; setting it explicitly with 'memoryLimit' or changing worker node "
-                "search pattern 'executeMachinesPattern' might help."
-            )
-
-        # Set maximal amount of memory job can ask for.
-        #
-        # The check below assumes that 'memory_limit' was set to a value which
-        # realistically reflects actual physical limitations of a given compute
-        # resource.
-        memory_max = memory_limit
-        if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit:
-            memory_max = gwjob.request_memory_max
-
-        # Make job ask for more memory each time it failed due to insufficient
-        # memory requirements.
-        jobcmds["request_memory"] = _create_request_memory_expr(
-            gwjob.request_memory, gwjob.memory_multiplier, memory_max
-        )
-
-    user_release_expr = cached_vals.get("releaseExpr", "")
-    if gwjob.number_of_retries is not None and gwjob.number_of_retries >= 0:
-        jobcmds["max_retries"] = gwjob.number_of_retries
-
-        # No point in adding periodic_release if 0 retries
-        if gwjob.number_of_retries > 0:
-            periodic_release = _create_periodic_release_expr(
-                gwjob.request_memory, gwjob.memory_multiplier, memory_max, user_release_expr
-            )
-            if periodic_release:
-                jobcmds["periodic_release"] = periodic_release
-
-        jobcmds["periodic_remove"] = _create_periodic_remove_expr(
-            gwjob.request_memory, gwjob.memory_multiplier, memory_max
-        )
-
-    # Assume concurrency_limit implemented using HTCondor concurrency limits.
-    # May need to move to special site-specific implementation if sites use
-    # other mechanisms.
-    if gwjob.concurrency_limit:
-        jobcmds["concurrency_limit"] = gwjob.concurrency_limit
-
-    # Handle command line
-    if gwjob.executable.transfer_executable:
-        jobcmds["transfer_executable"] = "True"
-        jobcmds["executable"] = gwjob.executable.src_uri
-    else:
-        jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
-
-    if gwjob.arguments:
-        arguments = gwjob.arguments
-        arguments = _replace_cmd_vars(arguments, gwjob)
-        arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob)
-        arguments = _fix_env_var_syntax(arguments)
-        jobcmds["arguments"] = arguments
-
-    if gwjob.environment:
-        env_str = ""
-        for name, value in gwjob.environment.items():
-            if isinstance(value, str):
-                value2 = _replace_cmd_vars(value, gwjob)
-                value2 = _fix_env_var_syntax(value2)
-                value2 = htc_escape(value2)
-                env_str += f"{name}='{value2}' "  # Add single quotes to allow internal spaces
-            else:
-                env_str += f"{name}={value} "
-
-        # Process above added one trailing space
-        jobcmds["environment"] = env_str.rstrip()
-
-    # Add extra "pass-thru" job commands
-    if gwjob.profile:
-        for key, val in gwjob.profile.items():
-            jobcmds[key] = htc_escape(val)
-    for key, val in cached_vals["profile"].items():
-        jobcmds[key] = htc_escape(val)
-
-    return jobcmds
-
-
-def _translate_dag_cmds(gwjob):
-    """Translate job values into DAGMan commands.
-
-    Parameters
-    ----------
-    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
-        Job containing values to be translated.
-
-    Returns
-    -------
-    dagcmds : `dict` [`str`, `~typing.Any`]
-        DAGMan commands for the job.
-    """
-    # Values in the dag script that just are name mappings.
-    dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"}
-
-    dagcmds = {}
-    for gwkey, htckey in dag_translation.items():
-        dagcmds[htckey] = getattr(gwjob, gwkey, None)
-
-    # Still to be coded: vars "pre_cmdline", "post_cmdline"
-    return dagcmds
-
-
-def _fix_env_var_syntax(oldstr):
-    """Change ENV place holders to HTCondor Env var syntax.
-
-    Parameters
-    ----------
-    oldstr : `str`
-        String in which environment variable syntax is to be fixed.
-
-    Returns
-    -------
-    newstr : `str`
-        Given string with environment variable syntax fixed.
-    """
-    newstr = oldstr
-    for key in re.findall(r"<ENV:([^>]+)>", oldstr):
-        newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
-    return newstr
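`_fix_env_var_syntax` is small enough to exercise directly. A runnable round trip through the deleted helper (reproduced verbatim; where it lands after the refactor, likely `prepare_utils.py`, is an assumption):

```python
import re


def _fix_env_var_syntax(oldstr):
    # Verbatim copy of the deleted helper: rewrite BPS <ENV:...> placeholders
    # into HTCondor's $ENV(...) submit-file syntax.
    newstr = oldstr
    for key in re.findall(r"<ENV:([^>]+)>", oldstr):
        newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
    return newstr


assert _fix_env_var_syntax("<ENV:HOME>/bin/run --log <ENV:LOG_DIR>/run.log") == (
    "$ENV(HOME)/bin/run --log $ENV(LOG_DIR)/run.log"
)
```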
-
-
-def _replace_file_vars(use_shared, arguments, workflow, gwjob):
-    """Replace file placeholders in command line arguments with correct
-    physical file names.
-
-    Parameters
-    ----------
-    use_shared : `bool`
-        Whether HTCondor can assume shared filesystem.
-    arguments : `str`
-        Arguments string in which to replace file placeholders.
-    workflow : `lsst.ctrl.bps.GenericWorkflow`
-        Generic workflow that contains file information.
-    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
-        The job corresponding to the arguments.
-
-    Returns
-    -------
-    arguments : `str`
-        Given arguments string with file placeholders replaced.
-    """
-    # Replace input file placeholders with paths.
-    for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
-        if not gwfile.wms_transfer:
-            # Must assume full URI if in command line and told WMS is not
-            # responsible for transferring file.
-            uri = gwfile.src_uri
-        elif use_shared:
-            if gwfile.job_shared:
-                # Have shared filesystems and jobs can share file.
-                uri = gwfile.src_uri
-            else:
-                uri = os.path.basename(gwfile.src_uri)
-        else:  # Using push transfer
-            uri = os.path.basename(gwfile.src_uri)
-        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
-
-    # Replace output file placeholders with paths.
-    for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
-        if not gwfile.wms_transfer:
-            # Must assume full URI if in command line and told WMS is not
-            # responsible for transferring file.
-            uri = gwfile.src_uri
-        elif use_shared:
-            if gwfile.job_shared:
-                # Have shared filesystems and jobs can share file.
-                uri = gwfile.src_uri
-            else:
-                uri = os.path.basename(gwfile.src_uri)
-        else:  # Using push transfer
-            uri = os.path.basename(gwfile.src_uri)
-        arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
-    return arguments
-
-
-def _replace_cmd_vars(arguments, gwjob):
-    """Replace format-style placeholders in arguments.
-
-    Parameters
-    ----------
-    arguments : `str`
-        Arguments string in which to replace placeholders.
-    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
-        Job containing values to be used to replace placeholders
-        (in particular gwjob.cmdvals).
-
-    Returns
-    -------
-    arguments : `str`
-        Given arguments string with placeholders replaced.
-    """
-    replacements = gwjob.cmdvals if gwjob.cmdvals is not None else {}
-    try:
-        arguments = arguments.format(**replacements)
-    except (KeyError, TypeError) as exc:  # TypeError in case None instead of {}
-        _LOG.error("Could not replace command variables: replacement for %s not provided", str(exc))
-        _LOG.debug("arguments: %s\ncmdvals: %s", arguments, replacements)
-        raise
-    return arguments
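`_replace_cmd_vars` delegates to `str.format`, so a two-line illustration covers the contract (the placeholder names here are hypothetical, not taken from a real BPS job):

```python
# str.format-based substitution as performed by the deleted helper; a missing
# key raises KeyError, which the helper logs and re-raises.
arguments = "pipetask run -b {butlerConfig} -j {numProc}"
cmdvals = {"butlerConfig": "/repo/butler.yaml", "numProc": 4}
print(arguments.format(**cmdvals))  # pipetask run -b /repo/butler.yaml -j 4
```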
-
-
-def _handle_job_inputs(
-    generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str
-) -> dict[str, str]:
-    """Add job input files from generic workflow to job.
-
-    Parameters
-    ----------
-    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
-        The generic workflow (e.g., has executable name and arguments).
-    job_name : `str`
-        Unique name for the job.
-    use_shared : `bool`
-        Whether job has access to files via shared filesystem.
-    out_prefix : `str`
-        The root directory into which all WMS-specific files are written.
-
-    Returns
-    -------
-    htc_commands : `dict` [`str`, `str`]
-        HTCondor commands for the job submission script.
-    """
-    inputs = []
-    for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
-        _LOG.debug("src_uri=%s", gwf_file.src_uri)
-
-        uri = Path(gwf_file.src_uri)
-
-        # Note if use_shared and job_shared, don't need to transfer file.
-
-        if not use_shared:  # Copy file using push to job
-            inputs.append(str(uri))
-        elif not gwf_file.job_shared:  # Jobs require own copy
-            # if using shared filesystem, but still need copy in job. Use
-            # HTCondor's curl plugin for a local copy.
-            if uri.is_dir():
-                raise RuntimeError(
-                    f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}"
-                )
-            inputs.append(f"file://{uri}")
-
-    htc_commands = {}
-    if inputs:
-        htc_commands["transfer_input_files"] = ",".join(inputs)
-        _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
-    return htc_commands
-
-
-def _handle_job_outputs(
-    generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str
-) -> dict[str, str]:
-    """Add job output files from generic workflow to the job if any.
-
-    Parameters
-    ----------
-    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
-        The generic workflow (e.g., has executable name and arguments).
-    job_name : `str`
-        Unique name for the job.
-    use_shared : `bool`
-        Whether job has access to files via shared filesystem.
-    out_prefix : `str`
-        The root directory into which all WMS-specific files are written.
-
-    Returns
-    -------
-    htc_commands : `dict` [`str`, `str`]
-        HTCondor commands for the job submission script.
-    """
-    outputs = []
-    output_remaps = []
-    for gwf_file in generic_workflow.get_job_outputs(job_name, data=True, transfer_only=True):
-        _LOG.debug("src_uri=%s", gwf_file.src_uri)
-
-        uri = Path(gwf_file.src_uri)
-        if not use_shared:
-            outputs.append(uri.name)
-            output_remaps.append(f"{uri.name}={str(uri)}")
-
-    # Set to an empty string to disable and only update if there are output
-    # files to transfer. Otherwise, HTCondor will transfer back all files in
-    # the job's temporary working directory that have been modified or created
-    # by the job.
-    htc_commands = {"transfer_output_files": '""'}
-    if outputs:
-        htc_commands["transfer_output_files"] = ",".join(outputs)
-        _LOG.debug("transfer_output_files=%s", htc_commands["transfer_output_files"])
-
-        htc_commands["transfer_output_remaps"] = f'"{";".join(output_remaps)}"'
-        _LOG.debug("transfer_output_remaps=%s", htc_commands["transfer_output_remaps"])
-    return htc_commands
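To make the output-transfer branch concrete: for a job with one transferable output and no shared filesystem, the logic above yields submit commands shaped like the following (the file names are hypothetical):

```python
# Expected shape of the returned dict; note the literal quoting conventions
# the code uses ('""' disables transfers, remaps are wrapped in double quotes).
htc_commands = {
    "transfer_output_files": "quantum.qgraph",
    "transfer_output_remaps": '"quantum.qgraph=/submit/u/run1/quantum.qgraph"',
}
```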
-
-
-def _get_status_from_id(
-    wms_workflow_id: str, hist: float, schedds: dict[str, htcondor.Schedd]
-) -> tuple[WmsStates, str]:
-    """Gather run information using workflow id.
-
-    Parameters
-    ----------
-    wms_workflow_id : `str`
-        Limit to specific run based on id.
-    hist : `float`
-        Limit history search to this many days.
-    schedds : `dict` [ `str`, `htcondor.Schedd` ]
-        HTCondor schedulers which to query for job information. If empty
-        dictionary, all queries will be run against the local scheduler only.
-
-    Returns
-    -------
-    state : `lsst.ctrl.bps.WmsStates`
-        Status for the corresponding run.
-    message : `str`
-        Message with extra error information.
-    """
-    _LOG.debug("_get_status_from_id: id=%s, hist=%s, schedds=%s", wms_workflow_id, hist, schedds)
-
-    message = ""
-
-    # Collect information about the job by querying HTCondor schedd and
-    # HTCondor history.
-    schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
-    if len(schedd_dag_info) == 1:
-        schedd_name = next(iter(schedd_dag_info))
-        dag_id = next(iter(schedd_dag_info[schedd_name]))
-        dag_ad = schedd_dag_info[schedd_name][dag_id]
-        state = _htc_status_to_wms_state(dag_ad)
-    else:
-        state = WmsStates.UNKNOWN
-        message = f"DAGMan job {wms_workflow_id} not found in queue or history. Check id or try path."
-    return state, message
-
-
-def _get_status_from_path(wms_path: str | os.PathLike) -> tuple[WmsStates, str]:
-    """Gather run status from a given run directory.
-
-    Parameters
-    ----------
-    wms_path : `str` | `os.PathLike`
-        The directory containing the submit side files (e.g., HTCondor files).
-
-    Returns
-    -------
-    state : `lsst.ctrl.bps.WmsStates`
-        Status for the run.
-    message : `str`
-        Message to be printed.
-    """
-    wms_path = Path(wms_path).resolve()
-    message = ""
-    try:
-        wms_workflow_id, dag_ad = read_dag_log(wms_path)
-    except FileNotFoundError:
-        wms_workflow_id = MISSING_ID
-        message = f"DAGMan log not found in {wms_path}. Check path."
-
-    if wms_workflow_id == MISSING_ID:
-        state = WmsStates.UNKNOWN
-    else:
-        state = _htc_status_to_wms_state(dag_ad[wms_workflow_id])
-
-    return state, message
-
-
-def _report_from_path(wms_path):
-    """Gather run information from a given run directory.
-
-    Parameters
-    ----------
-    wms_path : `str`
-        The directory containing the submit side files (e.g., HTCondor files).
-
-    Returns
-    -------
-    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
-        Run information for the detailed report. The key is the HTCondor id
-        and the value is a collection of report information for that run.
-    message : `str`
-        Message to be printed with the summary report.
-    """
-    wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
-    if wms_workflow_id == MISSING_ID:
-        run_reports = {}
-    else:
-        run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
-    return run_reports, message
-
-
-def _report_from_id(wms_workflow_id, hist, schedds=None):
-    """Gather run information using workflow id.
-
-    Parameters
-    ----------
-    wms_workflow_id : `str`
-        Limit to specific run based on id.
-    hist : `float`
-        Limit history search to this many days.
-    schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
-        HTCondor schedulers which to query for job information. If None
-        (default), all queries will be run against the local scheduler only.
-
-    Returns
-    -------
-    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
-        Run information for the detailed report. The key is the HTCondor id
-        and the value is a collection of report information for that run.
-    message : `str`
-        Message to be printed with the summary report.
-    """
-    messages = []
-
-    # Collect information about the job by querying HTCondor schedd and
-    # HTCondor history.
-    schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
-    if len(schedd_dag_info) == 1:
-        # Extract the DAG info without altering the results of the query.
-        schedd_name = next(iter(schedd_dag_info))
-        dag_id = next(iter(schedd_dag_info[schedd_name]))
-        dag_ad = schedd_dag_info[schedd_name][dag_id]
-
-        # If the provided workflow id does not correspond to the one extracted
-        # from the DAGMan log file in the submit directory, rerun the query
-        # with the id found in the file.
-        #
-        # This is to cover the situation in which the user provided the old job
-        # id of a restarted run.
-        try:
-            path_dag_id, _ = read_dag_log(dag_ad["Iwd"])
-        except FileNotFoundError as exc:
-            # At the moment missing DAGMan log is pretty much a fatal error.
-            # So empty the DAG info to finish early (see the if statement
-            # below).
-            schedd_dag_info.clear()
-            messages.append(f"Cannot create the report for '{dag_id}': {exc}")
-        else:
-            if path_dag_id != dag_id:
-                schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds)
-                messages.append(
-                    f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. "
-                    "This normally occurs when a run is restarted. The report shown is for the most "
-                    f"recent status with run id '{path_dag_id}'"
-                )
-
-    if len(schedd_dag_info) == 0:
-        run_reports = {}
-    elif len(schedd_dag_info) == 1:
-        _, dag_info = schedd_dag_info.popitem()
-        dag_id, dag_ad = dag_info.popitem()
-
-        # Create a mapping between jobs and their classads. The keys will
-        # be of format 'ClusterId.ProcId'.
-        job_info = {dag_id: dag_ad}
-
-        # Find jobs (nodes) belonging to that DAGMan job.
-        job_constraint = f"DAGManJobId == {int(float(dag_id))}"
-        schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds)
-        if schedd_job_info:
-            _, node_info = schedd_job_info.popitem()
-            job_info.update(node_info)
-
-        # Collect additional pieces of information about jobs using HTCondor
-        # files in the submission directory.
-        _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"])
-        _update_jobs(job_info, path_jobs)
-        if message:
-            messages.append(message)
-        run_reports = _create_detailed_report_from_jobs(dag_id, job_info)
-    else:
-        ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()]
-        message = (
-            f"More than one job matches id '{wms_workflow_id}', "
-            f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids"
-        )
-        messages.append(message)
-        run_reports = {}
-
-    message = "\n".join(messages)
-    return run_reports, message
-
-
-def _get_info_from_schedd(
-    wms_workflow_id: str, hist: float, schedds: dict[str, htcondor.Schedd]
-) -> dict[str, dict[str, dict[str, Any]]]:
-    """Gather run information from HTCondor.
-
-    Parameters
-    ----------
-    wms_workflow_id : `str`
-        Limit to specific run based on id.
-    hist : `float`
-        Limit history search to this many days.
-    schedds : `dict` [ `str`, `htcondor.Schedd` ]
-        HTCondor schedulers which to query for job information. If empty
-        dictionary, all queries will be run against the local scheduler only.
-
-    Returns
-    -------
-    schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
-        Information about jobs satisfying the search criteria where for each
-        Scheduler, local HTCondor job ids are mapped to their respective
-        classads.
-    """
-    _LOG.debug("_get_info_from_schedd: id=%s, hist=%s, schedds=%s", wms_workflow_id, hist, schedds)
-
-    dag_constraint = 'regexp("dagman$", Cmd)'
-    try:
-        cluster_id = int(float(wms_workflow_id))
-    except ValueError:
-        dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"'
-    else:
-        dag_constraint += f" && ClusterId == {cluster_id}"
-
-    # With the current implementation of the condor_* functions the query
-    # will always return only one match per Scheduler.
-    #
-    # Even in the highly unlikely situation where HTCondor history (which
-    # condor_search queries too) is long enough to have jobs from before
-    # the cluster ids were rolled over (and as a result there is more then
-    # one job with the same cluster id) they will not show up in
-    # the results.
-    schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds)
-    return schedd_dag_info
-
-
-def _get_info_from_path(wms_path: str | os.PathLike) -> tuple[str, dict[str, dict[str, Any]], str]:
-    """Gather run information from a given run directory.
-
-    Parameters
-    ----------
-    wms_path : `str` or `os.PathLike`
-        Directory containing HTCondor files.
-
-    Returns
-    -------
-    wms_workflow_id : `str`
-        The run id which is a DAGman job id.
-    jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
-        Information about jobs read from files in the given directory.
-        The key is the HTCondor id and the value is a dictionary of HTCondor
-        keys and values.
-    message : `str`
-        Message to be printed with the summary report.
-    """
-    # Ensure path is absolute, in particular for folks helping
-    # debug failures that need to dig around submit files.
-    wms_path = Path(wms_path).resolve()
-
-    messages = []
-    try:
-        wms_workflow_id, jobs = read_dag_log(wms_path)
-        _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
-        _update_jobs(jobs, read_node_status(wms_path))
-        _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)
-
-        # Add more info for DAGman job
-        job = jobs[wms_workflow_id]
-        job.update(read_dag_status(wms_path))
-
-        job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
-        if "bps_run" not in job:
-            _add_run_info(wms_path, job)
-
-        message = htc_check_dagman_output(wms_path)
-        if message:
-            messages.append(message)
-        _LOG.debug(
-            "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"]
-        )
-
-        # Add extra pieces of information which cannot be found in HTCondor
-        # generated files like 'GlobalJobId'.
-        #
-        # Do not treat absence of this file as a serious error. Neither runs
-        # submitted with earlier versions of the plugin nor the runs submitted
-        # with Pegasus plugin will have it at the moment. However, once enough
-        # time passes and Pegasus plugin will have its own report() method
-        # (instead of sneakily using HTCondor's one), the lack of that file
-        # should be treated as seriously as lack of any other file.
-        try:
-            job_info = read_dag_info(wms_path)
-        except FileNotFoundError as exc:
-            message = f"Warn: Some information may not be available: {exc}"
-            messages.append(message)
-        else:
-            schedd_name = next(iter(job_info))
-            job_ad = next(iter(job_info[schedd_name].values()))
-            job.update(job_ad)
-    except FileNotFoundError as err:
-        message = f"Could not find HTCondor files in '{wms_path}' ({err})"
-        _LOG.debug(message)
-        messages.append(message)
-        message = htc_check_dagman_output(wms_path)
-        if message:
-            messages.append(message)
-        wms_workflow_id = MISSING_ID
-        jobs = {}
-
-    message = "\n".join([msg for msg in messages if msg])
-    _LOG.debug("wms_workflow_id = %s, jobs = %s", wms_workflow_id, jobs.keys())
-    _LOG.debug("message = %s", message)
-    return wms_workflow_id, jobs, message
-
-
-def _create_detailed_report_from_jobs(
-    wms_workflow_id: str, jobs: dict[str, dict[str, Any]]
-) -> dict[str, WmsRunReport]:
-    """Gather run information to be used in generating summary reports.
-
-    Parameters
-    ----------
-    wms_workflow_id : `str`
-        The run id to create the report for.
-    jobs : `dict` [`str`, `dict` [`str`, Any]]
-        Mapping HTCondor job id to job information.
-
-    Returns
-    -------
-    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
-        Run information for the detailed report. The key is the given HTCondor
-        id and the value is a collection of report information for that run.
-    """
-    _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
-
-    dag_ad = jobs[wms_workflow_id]
-
-    report = WmsRunReport(
-        wms_id=f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}",
-        global_wms_id=dag_ad.get("GlobalJobId", "MISS"),
-        path=dag_ad["Iwd"],
-        label=dag_ad.get("bps_job_label", "MISS"),
-        run=dag_ad.get("bps_run", "MISS"),
-        project=dag_ad.get("bps_project", "MISS"),
-        campaign=dag_ad.get("bps_campaign", "MISS"),
-        payload=dag_ad.get("bps_payload", "MISS"),
-        operator=_get_owner(dag_ad),
-        run_summary=_get_run_summary(dag_ad),
-        state=_htc_status_to_wms_state(dag_ad),
-        total_number_jobs=0,
-        jobs=[],
-        job_state_counts=dict.fromkeys(WmsStates, 0),
-        exit_code_summary={},
-    )
-
-    payload_jobs = {}  # keep track for later processing
-    specific_info = WmsSpecificInfo()
-    for job_id, job_ad in jobs.items():
-        if job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [WmsNodeType.PAYLOAD, WmsNodeType.FINAL]:
-            try:
-                name = job_ad.get("DAGNodeName", job_id)
-                wms_state = _htc_status_to_wms_state(job_ad)
-                job_report = WmsJobReport(
-                    wms_id=job_id,
-                    name=name,
-                    label=job_ad.get("bps_job_label", pegasus_name_to_label(name)),
-                    state=wms_state,
-                )
-                if job_report.label == "init":
-                    job_report.label = "pipetaskInit"
-                report.job_state_counts[wms_state] += 1
-                report.jobs.append(job_report)
-                payload_jobs[job_id] = job_ad
-            except KeyError as ex:
-                _LOG.error("Job missing key '%s': %s", str(ex), job_ad)
-                raise
-        elif is_service_job(job_ad):
-            _LOG.debug(
-                "Found service job: id='%s', name='%s', label='%s', NodeStatus='%s', JobStatus='%s'",
-                job_id,
-                job_ad["DAGNodeName"],
-                job_ad.get("bps_job_label", "MISS"),
-                job_ad.get("NodeStatus", "MISS"),
-                job_ad.get("JobStatus", "MISS"),
-            )
-            _add_service_job_specific_info(job_ad, specific_info)
-
-    report.total_number_jobs = len(payload_jobs)
-    report.exit_code_summary = _get_exit_code_summary(payload_jobs)
-    if specific_info:
-        report.specific_info = specific_info
-
-    # Workflow will exit with non-zero DAG_STATUS if problem with
-    # any of the wms jobs. So change FAILED to SUCCEEDED if all
-    # payload jobs SUCCEEDED.
-    if report.total_number_jobs == report.job_state_counts[WmsStates.SUCCEEDED]:
-        report.state = WmsStates.SUCCEEDED
-
-    run_reports = {report.wms_id: report}
-    _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
-    return run_reports
-
-
-def _add_service_job_specific_info(job_ad: dict[str, Any], specific_info: WmsSpecificInfo) -> None:
-    """Generate report information for service job.
-
-    Parameters
-    ----------
-    job_ad : `dict` [`str`, `~typing.Any`]
-        Provisioning job information.
-    specific_info : `lsst.ctrl.bps.WmsSpecificInfo`
-        Where to add message.
-    """
-    status_details = ""
-    job_status = _htc_status_to_wms_state(job_ad)
-
-    # Service jobs in queue are deleted when DAG is done.
-    # To get accurate status, need to check other info.
-    if (
-        job_status == WmsStates.DELETED
-        and "Reason" in job_ad
-        and (
-            "Removed by DAGMan" in job_ad["Reason"]
-            or "removed because <OtherJobRemoveRequirements = DAGManJobId =?=" in job_ad["Reason"]
-            or "DAG is exiting and writing rescue file." in job_ad["Reason"]
-        )
-    ):
-        if "HoldReason" in job_ad:
-            # HoldReason exists even if released, so check.
-            if "job_released_time" in job_ad and job_ad["job_held_time"] < job_ad["job_released_time"]:
-                # If released, assume running until deleted.
-                job_status = WmsStates.SUCCEEDED
-                status_details = ""
-            else:
-                # If job held when deleted by DAGMan, still want to
-                # report hold reason
-                status_details = f"(Job was held for the following reason: {job_ad['HoldReason']})"
-
-        else:
-            job_status = WmsStates.SUCCEEDED
-    elif job_status == WmsStates.SUCCEEDED:
-        status_details = "(Note: Finished before workflow.)"
-    elif job_status == WmsStates.HELD:
-        status_details = f"({job_ad['HoldReason']})"
-
-    template = "Status of {job_name}: {status} {status_details}"
-    context = {
-        "job_name": job_ad["DAGNodeName"],
-        "status": job_status.name,
-        "status_details": status_details,
-    }
-    specific_info.add_message(template=template, context=context)
-
-
-def _summary_report(user, hist, pass_thru, schedds=None):
-    """Gather run information to be used in generating summary reports.
-
-    Parameters
-    ----------
-    user : `str`
-        Run lookup restricted to given user.
-    hist : `float`
-        How many previous days to search for run information.
-    pass_thru : `str`
-        Advanced users can define the HTCondor constraint to be used
-        when searching queue and history.
-
-    Returns
-    -------
-    run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
-        Run information for the summary report. The keys are HTCondor ids and
-        the values are collections of report information for each run.
-    message : `str`
-        Message to be printed with the summary report.
-    """
-    # only doing summary report so only look for dagman jobs
-    if pass_thru:
-        constraint = pass_thru
-    else:
-        # Notes:
-        # * bps_isjob == 'True' isn't getting set for DAG jobs that are
-        #   manually restarted.
-        # * Any job with DAGManJobID isn't a DAG job
-        constraint = 'bps_isjob == "True" && JobUniverse == 7'
-        if user:
-            constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'
-
-    job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds)
-
-    # Have list of DAGMan jobs, need to get run_report info.
-    run_reports = {}
-    msg = ""
-    for jobs in job_info.values():
-        for job_id, job in jobs.items():
-            total_jobs, state_counts = _get_state_counts_from_dag_job(job)
-            # If didn't get from queue information (e.g., Kerberos bug),
-            # try reading from file.
-            if total_jobs == 0:
-                try:
-                    job.update(read_dag_status(job["Iwd"]))
-                    total_jobs, state_counts = _get_state_counts_from_dag_job(job)
-                except StopIteration:
-                    pass  # don't kill report can't find htcondor files
-
-            if "bps_run" not in job:
-                _add_run_info(job["Iwd"], job)
-            report = WmsRunReport(
-                wms_id=job_id,
-                global_wms_id=job["GlobalJobId"],
-                path=job["Iwd"],
-                label=job.get("bps_job_label", "MISS"),
-                run=job.get("bps_run", "MISS"),
-                project=job.get("bps_project", "MISS"),
-                campaign=job.get("bps_campaign", "MISS"),
-                payload=job.get("bps_payload", "MISS"),
-                operator=_get_owner(job),
-                run_summary=_get_run_summary(job),
-                state=_htc_status_to_wms_state(job),
-                jobs=[],
-                total_number_jobs=total_jobs,
-                job_state_counts=state_counts,
-            )
-            run_reports[report.global_wms_id] = report
-
-    return run_reports, msg
-
-
-def _add_run_info(wms_path, job):
-    """Find BPS run information elsewhere for runs without bps attributes.
-
-    Parameters
-    ----------
-    wms_path : `str`
-        Path to submit files for the run.
-    job : `dict` [`str`, `~typing.Any`]
-        HTCondor dag job information.
-
-    Raises
-    ------
-    StopIteration
-        If cannot find file it is looking for. Permission errors are
-        caught and job's run is marked with error.
-    """
-    path = Path(wms_path) / "jobs"
-    try:
-        subfile = next(path.glob("**/*.sub"))
-    except (StopIteration, PermissionError):
-        job["bps_run"] = "Unavailable"
-    else:
-        _LOG.debug("_add_run_info: subfile = %s", subfile)
-        try:
-            with open(subfile, encoding="utf-8") as fh:
-                for line in fh:
-                    if line.startswith("+bps_"):
-                        m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
-                        if m:
-                            _LOG.debug("Matching line: %s", line)
-                            job[m.group(1)] = m.group(2).replace('"', "")
-                        else:
-                            _LOG.debug("Could not parse attribute: %s", line)
-        except PermissionError:
-            job["bps_run"] = "PermissionError"
-    _LOG.debug("After adding job = %s", job)
-
-
-def _get_owner(job):
-    """Get the owner of a dag job.
-
-    Parameters
-    ----------
-    job : `dict` [`str`, `~typing.Any`]
-        HTCondor dag job information.
-
-    Returns
-    -------
-    owner : `str`
-        Owner of the dag job.
-    """
-    owner = job.get("bps_operator", None)
-    if not owner:
-        owner = job.get("Owner", None)
-        if not owner:
-            _LOG.warning("Could not get Owner from htcondor job: %s", job)
-            owner = "MISS"
-    return owner
-
-
-def _get_run_summary(job):
-    """Get the run summary for a job.
-
-    Parameters
-    ----------
-    job : `dict` [`str`, `~typing.Any`]
-        HTCondor dag job information.
-
-    Returns
-    -------
-    summary : `str`
-        Number of jobs per PipelineTask label in approximate pipeline order.
-        Format: <label>:<count>[;<label>:<count>]+
-    """
-    summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
-    if not summary:
-        summary, _, _ = summarize_dag(job["Iwd"])
-        if not summary:
-            _LOG.warning("Could not get run summary for htcondor job: %s", job)
-    _LOG.debug("_get_run_summary: summary=%s", summary)
-
-    # Workaround sometimes using init vs pipetaskInit
-    summary = summary.replace("init:", "pipetaskInit:")
-
-    if "pegasus_version" in job and "pegasus" not in summary:
-        summary += ";pegasus:0"
-
-    return summary
-
-
-def _get_exit_code_summary(jobs):
-    """Get the exit code summary for a run.
-
-    Parameters
-    ----------
-    jobs : `dict` [`str`, `dict` [`str`, Any]]
-        Mapping HTCondor job id to job information.
-
-    Returns
-    -------
-    summary : `dict` [`str`, `list` [`int`]]
-        Jobs' exit codes per job label.
-    """
-    summary = {}
-    for job_id, job_ad in jobs.items():
-        job_label = job_ad["bps_job_label"]
-        summary.setdefault(job_label, [])
-        try:
-            exit_code = 0
-            job_status = job_ad["JobStatus"]
-            match job_status:
-                case htcondor.JobStatus.COMPLETED | htcondor.JobStatus.HELD:
-                    exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["ExitCode"]
-                case (
-                    htcondor.JobStatus.IDLE
-                    | htcondor.JobStatus.RUNNING
-                    | htcondor.JobStatus.REMOVED
-                    | htcondor.JobStatus.TRANSFERRING_OUTPUT
-                    | htcondor.JobStatus.SUSPENDED
-                ):
-                    pass
-                case _:
-                    _LOG.debug("Unknown 'JobStatus' value ('%d') in classad for job '%s'", job_status, job_id)
-            if exit_code != 0:
-                summary[job_label].append(exit_code)
-        except KeyError as ex:
-            _LOG.debug("Attribute '%s' not found in the classad for job '%s'", ex, job_id)
-    return summary
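The summary produced above maps each job label to the non-zero exit codes seen for that label; completed or held jobs that exited on a signal contribute the signal number instead of an exit code. A hypothetical result:

```python
# Hypothetical _get_exit_code_summary result: every label gets an entry, but
# only failing jobs append codes (signal numbers when ExitBySignal was true).
summary = {
    "pipetaskInit": [],
    "isr": [1],
    "calibrate": [11],  # e.g., SIGSEGV reported via ExitSignal
}
```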
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
def _get_state_counts_from_jobs(
|
|
1754
|
-
wms_workflow_id: str, jobs: dict[str, dict[str, Any]]
|
|
1755
|
-
) -> tuple[int, dict[WmsStates, int]]:
|
|
1756
|
-
"""Count number of jobs per WMS state.
|
|
1757
|
-
|
|
1758
|
-
The workflow job and the service jobs are excluded from the count.
|
|
1759
|
-
|
|
1760
|
-
Parameters
|
|
1761
|
-
----------
|
|
1762
|
-
wms_workflow_id : `str`
|
|
1763
|
-
HTCondor job id.
|
|
1764
|
-
jobs : `dict [`dict` [`str`, `~typing.Any`]]
|
|
1765
|
-
HTCondor dag job information.
|
|
1766
|
-
|
|
1767
|
-
Returns
|
|
1768
|
-
-------
|
|
1769
|
-
total_count : `int`
|
|
1770
|
-
Total number of dag nodes.
|
|
1771
|
-
state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
|
|
1772
|
-
Keys are the different WMS states and values are counts of jobs
|
|
1773
|
-
that are in that WMS state.
|
|
1774
|
-
"""
|
|
1775
|
-
state_counts = dict.fromkeys(WmsStates, 0)
|
|
1776
|
-
for job_id, job_ad in jobs.items():
|
|
1777
|
-
if job_id != wms_workflow_id and job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [
|
|
1778
|
-
WmsNodeType.PAYLOAD,
|
|
1779
|
-
WmsNodeType.FINAL,
|
|
1780
|
-
]:
|
|
1781
|
-
state_counts[_htc_status_to_wms_state(job_ad)] += 1
|
|
1782
|
-
total_count = sum(state_counts.values())
|
|
1783
|
-
|
|
1784
|
-
return total_count, state_counts
|
|
1785
|
-
|
|
1786
|
-
|
|
1787
|
-
-def _get_state_counts_from_dag_job(job):
-    """Count number of jobs per WMS state.
-
-    Parameters
-    ----------
-    job : `dict` [`str`, `~typing.Any`]
-        HTCondor dag job information.
-
-    Returns
-    -------
-    total_count : `int`
-        Total number of dag nodes.
-    state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
-        Keys are the different WMS states and values are counts of jobs
-        that are in that WMS state.
-    """
-    _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
-    state_counts = dict.fromkeys(WmsStates, 0)
-    if "DAG_NodesReady" in job:
-        state_counts = {
-            WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
-            WmsStates.READY: job.get("DAG_NodesReady", 0),
-            WmsStates.HELD: job.get("DAG_JobsHeld", 0),
-            WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
-            WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
-            WmsStates.PRUNED: job.get("DAG_NodesFutile", 0),
-            WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0),
-        }
-        total_jobs = job.get("DAG_NodesTotal")
-        _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
-    elif "NodesFailed" in job:
-        state_counts = {
-            WmsStates.UNREADY: job.get("NodesUnready", 0),
-            WmsStates.READY: job.get("NodesReady", 0),
-            WmsStates.HELD: job.get("JobProcsHeld", 0),
-            WmsStates.SUCCEEDED: job.get("NodesDone", 0),
-            WmsStates.FAILED: job.get("NodesFailed", 0),
-            WmsStates.PRUNED: job.get("NodesFutile", 0),
-            WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0),
-        }
-        try:
-            total_jobs = job.get("NodesTotal")
-        except KeyError as ex:
-            _LOG.error("Job missing %s. job = %s", str(ex), job)
-            raise
-        _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
-    else:
-        # With Kerberos job auth and the Kerberos bug, a warning here would
-        # be printed for every DAG, so log at debug level instead.
-        _LOG.debug("Can't get job state counts %s", job["Iwd"])
-        total_jobs = 0
-
-    _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
-    return total_jobs, state_counts
-
-
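A sketch of the first branch with a hypothetical DAGMan classad (attribute values invented):

    from lsst.ctrl.bps import WmsStates

    dag_ad = {
        "DAG_NodesTotal": 10, "DAG_NodesUnready": 2, "DAG_NodesReady": 1,
        "DAG_JobsHeld": 0, "DAG_NodesDone": 5, "DAG_NodesFailed": 1,
        "DAG_NodesFutile": 1, "DAG_NodesPre": 0, "DAG_NodesPost": 0,
    }
    total, counts = _get_state_counts_from_dag_job(dag_ad)
    # total == 10; counts maps WmsStates.SUCCEEDED to 5, WmsStates.FAILED
    # to 1, WmsStates.PRUNED to 1, and so on.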
-def _htc_status_to_wms_state(job):
-    """Convert HTCondor job status to generic wms state.
-
-    Parameters
-    ----------
-    job : `dict` [`str`, `~typing.Any`]
-        HTCondor job information.
-
-    Returns
-    -------
-    wms_state : `lsst.ctrl.bps.WmsStates`
-        The WmsState equivalent of the given job's status.
-    """
-    wms_state = WmsStates.MISFIT
-    if "JobStatus" in job:
-        wms_state = _htc_job_status_to_wms_state(job)
-
-    if wms_state == WmsStates.MISFIT and "NodeStatus" in job:
-        wms_state = _htc_node_status_to_wms_state(job)
-    return wms_state
-
-
-def _htc_job_status_to_wms_state(job):
-    """Convert HTCondor job status to generic wms state.
-
-    Parameters
-    ----------
-    job : `dict` [`str`, `~typing.Any`]
-        HTCondor job information.
-
-    Returns
-    -------
-    wms_state : `lsst.ctrl.bps.WmsStates`
-        The WmsState equivalent of the given job's status.
-    """
-    _LOG.debug(
-        "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"])
-    )
-    wms_state = WmsStates.MISFIT
-    if "JobStatus" in job and job["JobStatus"]:
-        job_status = int(job["JobStatus"])
-
-        _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
-        if job_status == htcondor.JobStatus.IDLE:
-            wms_state = WmsStates.PENDING
-        elif job_status == htcondor.JobStatus.RUNNING:
-            wms_state = WmsStates.RUNNING
-        elif job_status == htcondor.JobStatus.REMOVED:
-            wms_state = WmsStates.DELETED
-        elif job_status == htcondor.JobStatus.COMPLETED:
-            if (
-                (job.get("ExitBySignal", False) and job.get("ExitSignal", 0))
-                or job.get("ExitCode", 0)
-                or job.get("DAG_Status", 0)
-            ):
-                wms_state = WmsStates.FAILED
-            else:
-                wms_state = WmsStates.SUCCEEDED
-        elif job_status == htcondor.JobStatus.HELD:
-            wms_state = WmsStates.HELD
-
-    return wms_state
-
-
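The COMPLETED branch is the subtle one: a finished job still maps to FAILED if it exited on a signal, returned a non-zero exit code, or carries a non-zero DAG_Status. A small sketch (classad values invented):

    import htcondor
    from lsst.ctrl.bps import WmsStates

    ad = {"ClusterId": 9231, "JobStatus": htcondor.JobStatus.COMPLETED,
          "ExitBySignal": False, "ExitCode": 0}
    assert _htc_job_status_to_wms_state(ad) == WmsStates.SUCCEEDED
    ad["ExitCode"] = 1
    assert _htc_job_status_to_wms_state(ad) == WmsStates.FAILED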
-def _htc_node_status_to_wms_state(job):
-    """Convert HTCondor node status to generic wms state.
-
-    Parameters
-    ----------
-    job : `dict` [`str`, `~typing.Any`]
-        HTCondor job information.
-
-    Returns
-    -------
-    wms_state : `lsst.ctrl.bps.WmsStates`
-        The WmsState equivalent of the given node's status.
-    """
-    wms_state = WmsStates.MISFIT
-    match job["NodeStatus"]:
-        case NodeStatus.NOT_READY:
-            wms_state = WmsStates.UNREADY
-        case NodeStatus.READY:
-            wms_state = WmsStates.READY
-        case NodeStatus.PRERUN:
-            wms_state = WmsStates.MISFIT
-        case NodeStatus.SUBMITTED:
-            if job["JobProcsHeld"]:
-                wms_state = WmsStates.HELD
-            elif job["StatusDetails"] == "not_idle":
-                wms_state = WmsStates.RUNNING
-            elif job["JobProcsQueued"]:
-                wms_state = WmsStates.PENDING
-        case NodeStatus.POSTRUN:
-            wms_state = WmsStates.MISFIT
-        case NodeStatus.DONE:
-            wms_state = WmsStates.SUCCEEDED
-        case NodeStatus.ERROR:
-            # Use job exit status instead of post script exit status.
-            if "DAGMAN error 0" in job["StatusDetails"]:
-                wms_state = WmsStates.SUCCEEDED
-            elif "ULOG_JOB_ABORTED" in job["StatusDetails"]:
-                wms_state = WmsStates.DELETED
-            else:
-                wms_state = WmsStates.FAILED
-        case NodeStatus.FUTILE:
-            wms_state = WmsStates.PRUNED
-    return wms_state
-
-
-def _update_jobs(jobs1, jobs2):
-    """Update jobs1 with info in jobs2.
-
-    (Basically an update for nested dictionaries.)
-
-    Parameters
-    ----------
-    jobs1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
-        HTCondor job information to be updated.
-    jobs2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
-        Additional HTCondor job information.
-    """
-    for job_id, job_ad in jobs2.items():
-        if job_id in jobs1:
-            jobs1[job_id].update(job_ad)
-        else:
-            jobs1[job_id] = job_ad
-
-
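A quick illustration of the nested update (pure Python, values invented):

    jobs1 = {"1.0": {"JobStatus": 1}, "2.0": {"JobStatus": 2}}
    jobs2 = {"2.0": {"ExitCode": 0}, "3.0": {"JobStatus": 5}}
    _update_jobs(jobs1, jobs2)
    # jobs1 == {"1.0": {"JobStatus": 1},
    #           "2.0": {"JobStatus": 2, "ExitCode": 0},
    #           "3.0": {"JobStatus": 5}}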
-def _wms_id_type(wms_id):
-    """Determine the type of the WMS id.
-
-    Parameters
-    ----------
-    wms_id : `str`
-        WMS id identifying a job.
-
-    Returns
-    -------
-    id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
-        Type of WMS id.
-    """
-    try:
-        int(float(wms_id))
-    except ValueError:
-        wms_path = Path(wms_id)
-        if wms_path.is_dir():
-            id_type = WmsIdType.PATH
-        else:
-            id_type = WmsIdType.GLOBAL
-    except TypeError:
-        id_type = WmsIdType.UNKNOWN
-    else:
-        id_type = WmsIdType.LOCAL
-    return id_type
-
-
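The classification is purely syntactic except for the directory check; for example (the ids below are made up):

    _wms_id_type("9231.0")     # parses as a number       -> WmsIdType.LOCAL
    _wms_id_type("/tmp")       # an existing directory    -> WmsIdType.PATH
    _wms_id_type("sched01#9231.0#1700000000")  # neither  -> WmsIdType.GLOBAL
    _wms_id_type(None)         # not a str or number      -> WmsIdType.UNKNOWN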
-def _wms_id_to_cluster(wms_id):
-    """Convert WMS id to cluster id.
-
-    Parameters
-    ----------
-    wms_id : `int` or `float` or `str`
-        HTCondor job id or path.
-
-    Returns
-    -------
-    schedd_ad : `classad.ClassAd`
-        ClassAd describing the scheduler managing the job with the given id.
-    cluster_id : `int`
-        HTCondor cluster id.
-    id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
-        The type of the provided id.
-    """
-    coll = htcondor.Collector()
-
-    schedd_ad = None
-    cluster_id = None
-    id_type = _wms_id_type(wms_id)
-    if id_type == WmsIdType.LOCAL:
-        schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
-        cluster_id = int(float(wms_id))
-    elif id_type == WmsIdType.GLOBAL:
-        constraint = f'GlobalJobId == "{wms_id}"'
-        schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)}
-        schedds = {name: htcondor.Schedd(ad) for name, ad in schedd_ads.items()}
-        job_info = condor_q(constraint=constraint, schedds=schedds)
-        if job_info:
-            schedd_name, job_rec = job_info.popitem()
-            job_id, _ = job_rec.popitem()
-            schedd_ad = schedd_ads[schedd_name]
-            cluster_id = int(float(job_id))
-    elif id_type == WmsIdType.PATH:
-        try:
-            job_info = read_dag_info(wms_id)
-        except (FileNotFoundError, PermissionError, OSError):
-            pass
-        else:
-            schedd_name, job_rec = job_info.popitem()
-            job_id, _ = job_rec.popitem()
-            schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name)
-            cluster_id = int(float(job_id))
-    else:
-        pass
-    return schedd_ad, cluster_id, id_type
-
-
-def _wms_id_to_dir(wms_id):
-    """Convert WMS id to a submit directory candidate.
-
-    The function does not check if the directory exists or if it is a valid
-    BPS submit directory.
-
-    Parameters
-    ----------
-    wms_id : `int` or `float` or `str`
-        HTCondor job id or path.
-
-    Returns
-    -------
-    wms_path : `pathlib.Path` or None
-        Submit directory candidate for the run with the given job id. If no
-        directory can be associated with the provided WMS id, it will be set
-        to None.
-    id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
-        The type of the provided id.
-
-    Raises
-    ------
-    TypeError
-        Raised if provided WMS id has invalid type.
-    """
-    coll = htcondor.Collector()
-    schedd_ads = []
-
-    constraint = None
-    wms_path = None
-    id_type = _wms_id_type(wms_id)
-    match id_type:
-        case WmsIdType.LOCAL:
-            constraint = f"ClusterId == {int(float(wms_id))}"
-            schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
-        case WmsIdType.GLOBAL:
-            constraint = f'GlobalJobId == "{wms_id}"'
-            schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
-        case WmsIdType.PATH:
-            wms_path = Path(wms_id).resolve()
-        case WmsIdType.UNKNOWN:
-            raise TypeError(f"Invalid job id type: {wms_id}")
-    if constraint is not None:
-        schedds = {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
-        job_info = condor_history(constraint=constraint, schedds=schedds, projection=["Iwd"])
-        if job_info:
-            _, job_rec = job_info.popitem()
-            _, job_ad = job_rec.popitem()
-            wms_path = Path(job_ad["Iwd"])
-    return wms_path, id_type
-
-
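Both id-resolution helpers share the same dispatch; for instance (id invented):

    wms_path, id_type = _wms_id_to_dir("9231.0")
    # Queries the local schedd's history for ClusterId == 9231 with an
    # "Iwd" projection and, if the job is found, returns
    # Path(job_ad["Iwd"]) together with WmsIdType.LOCAL.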
-def _create_periodic_release_expr(
-    memory: int, multiplier: float | None, limit: int, additional_expr: str = ""
-) -> str:
-    """Construct an HTCondor ClassAd expression for releasing held jobs.
-
-    Parameters
-    ----------
-    memory : `int`
-        Requested memory in MB.
-    multiplier : `float` or None
-        Memory growth rate between retries.
-    limit : `int`
-        Memory limit.
-    additional_expr : `str`, optional
-        Expression to add to periodic_release. Defaults to empty string.
-
-    Returns
-    -------
-    expr : `str`
-        A string representing an HTCondor ClassAd expression for releasing jobs.
-    """
-    _LOG.debug(
-        "periodic_release: memory: %s, multiplier: %s, limit: %s, additional_expr: %s",
-        memory,
-        multiplier,
-        limit,
-        additional_expr,
-    )
-
-    # ctrl_bps sets multiplier to None in the GenericWorkflow if
-    # memoryMultiplier <= 1, but checking value just in case.
-    if (not multiplier or multiplier <= 1) and not additional_expr:
-        return ""
-
-    # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
-    # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
-    # The special comparison operators ensure that all comparisons below will
-    # evaluate to FALSE in this case.
-    #
-    # Note:
-    # May not be strictly necessary. Operators '&&' and '||' are not strict so
-    # the entire expression should evaluate to FALSE when the job is not HELD.
-    # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
-    # but better safe than sorry.
-    is_held = "JobStatus == 5"
-    is_retry_allowed = "NumJobStarts <= JobMaxRetries"
-
-    mem_expr = ""
-    if memory and multiplier and multiplier > 1 and limit:
-        was_mem_exceeded = (
-            "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
-            "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
-        )
-        was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
-        mem_expr = f"{was_mem_exceeded} && {was_below_limit}"
-
-    user_expr = ""
-    if additional_expr:
-        # Never auto release a job held by user.
-        user_expr = f"HoldReasonCode =!= 1 && {additional_expr}"
-
-    expr = f"{is_held} && {is_retry_allowed}"
-    if user_expr and mem_expr:
-        expr += f" && ({mem_expr} || {user_expr})"
-    elif user_expr:
-        expr += f" && {user_expr}"
-    elif mem_expr:
-        expr += f" && {mem_expr}"
-
-    return expr
-
-
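Hand-expanding the pieces for, say, memory=2048, multiplier=2.0, limit=8192, and an empty additional_expr, the generated periodic_release expression reads (wrapped here for readability; ClassAd syntax is whitespace-insensitive):

    JobStatus == 5 && NumJobStarts <= JobMaxRetries
    && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
        || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
    && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192

that is, release a held job only while retries remain, the hold was for exceeding memory, and the previous memory request was still below the hard limit.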
-def _create_periodic_remove_expr(memory, multiplier, limit):
-    """Construct an HTCondor ClassAd expression for removing jobs from the queue.
-
-    Parameters
-    ----------
-    memory : `int`
-        Requested memory in MB.
-    multiplier : `float`
-        Memory growth rate between retries.
-    limit : `int`
-        Memory limit.
-
-    Returns
-    -------
-    expr : `str`
-        A string representing an HTCondor ClassAd expression for removing jobs.
-    """
-    # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode'
-    # are UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
-    # The special comparison operators ensure that all comparisons below
-    # will evaluate to FALSE in this case.
-    #
-    # Note:
-    # May not be strictly necessary. Operators '&&' and '||' are not
-    # strict so the entire expression should evaluate to FALSE when the
-    # job is not HELD. According to ClassAd evaluation semantics
-    # FALSE && UNDEFINED is FALSE, but better safe than sorry.
-    is_held = "JobStatus == 5"
-    is_retry_disallowed = "NumJobStarts > JobMaxRetries"
-
-    mem_expr = ""
-    if memory and multiplier and multiplier > 1 and limit:
-        mem_limit_expr = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
-
-        mem_expr = (  # Add "||" here so it is only included along with the memory expr
-            " || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
-            f"|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && {mem_limit_expr})"
-        )
-
-    expr = f"{is_held} && ({is_retry_disallowed}{mem_expr})"
-    return expr
-
-
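For the same hypothetical numbers (memory=2048, multiplier=2.0, limit=8192) the periodic_remove expression expands to:

    JobStatus == 5 && (NumJobStarts > JobMaxRetries
    || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
         || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
        && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192))

so a held job is removed once it is out of retries, or a memory-exceeded retry would already be capped at the limit.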
-def _create_request_memory_expr(memory, multiplier, limit):
-    """Construct an HTCondor ClassAd expression for safe memory scaling.
-
-    Parameters
-    ----------
-    memory : `int`
-        Requested memory in MB.
-    multiplier : `float`
-        Memory growth rate between retries.
-    limit : `int`
-        Memory limit.
-
-    Returns
-    -------
-    expr : `str`
-        A string representing an HTCondor ClassAd expression enabling safe
-        memory scaling between job retries.
-    """
-    # The check whether the job was held due to exceeding memory requirements
-    # will be made *after* the job was released back to the job queue (is in
-    # the IDLE state), hence the need to use `Last*` job ClassAds instead of
-    # the ones describing the job's current state.
-    #
-    # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is
-    # initially put in the job queue. The special comparison operators ensure
-    # that all comparisons below will evaluate to FALSE in this case.
-    was_mem_exceeded = (
-        "LastJobStatus =?= 5 "
-        "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 "
-        "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"
-    )
-
-    # If job runs the first time or was held for reasons other than exceeding
-    # the memory, set the required memory to the requested value or use
-    # the memory value measured by HTCondor (MemoryUsage) depending on
-    # whichever is greater.
-    expr = (
-        f"({was_mem_exceeded}) "
-        f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) "
-        f": max({{{memory}, MemoryUsage ?: 0}})"
-    )
-    return expr
-
-
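Again with memory=2048, multiplier=2.0, limit=8192, the request_memory expression expands to:

    (LastJobStatus =?= 5
     && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
         || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
    ? min({int(2048 * pow(2.0, NumJobStarts)), 8192})
    : max({2048, MemoryUsage ?: 0})

After one memory-exceeded start (NumJobStarts == 1) a released job would therefore request min(4096, 8192) = 4096 MB.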
-def _locate_schedds(locate_all=False):
-    """Locate Scheduler daemons in an HTCondor pool.
-
-    Parameters
-    ----------
-    locate_all : `bool`, optional
-        If True, all available schedulers in the HTCondor pool will be
-        located. False by default, which means the search is limited to the
-        Scheduler running on the local host.
-
-    Returns
-    -------
-    schedds : `dict` [`str`, `htcondor.Schedd`]
-        A mapping between Scheduler names and Python objects allowing for
-        interacting with them.
-    """
-    coll = htcondor.Collector()
-
-    schedd_ads = []
-    if locate_all:
-        schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
-    else:
-        schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
-    return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
-
-
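Typical use, relying only on the collector APIs already shown above:

    schedds = _locate_schedds()                     # just the local schedd
    all_schedds = _locate_schedds(locate_all=True)  # every schedd in the pool
    # The resulting mapping is what condor_q/condor_history take via their
    # `schedds` argument.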
-def _gather_site_values(config, compute_site):
-    """Gather values specific to given site.
-
-    Parameters
-    ----------
-    config : `lsst.ctrl.bps.BpsConfig`
-        BPS configuration that includes necessary submit/runtime
-        information.
-    compute_site : `str`
-        Compute site name.
-
-    Returns
-    -------
-    site_values : `dict` [`str`, `~typing.Any`]
-        Values specific to the given site.
-    """
-    site_values = {"attrs": {}, "profile": {}}
-    search_opts = {}
-    if compute_site:
-        search_opts["curvals"] = {"curr_site": compute_site}
-
-    # Determine the hard limit for the memory requirement.
-    found, limit = config.search("memoryLimit", opt=search_opts)
-    if not found:
-        search_opts["default"] = DEFAULT_HTC_EXEC_PATT
-        _, patt = config.search("executeMachinesPattern", opt=search_opts)
-        del search_opts["default"]
-
-        # To reduce the amount of data, ignore dynamic slots (if any) as,
-        # by definition, they cannot have more memory than
-        # the partitionable slot they are part of.
-        constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
-        pool_info = condor_status(constraint=constraint)
-        try:
-            limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
-        except ValueError:
-            _LOG.debug("No execute machine in the pool matches %s", patt)
-        if limit:
-            config[".bps_defined.memory_limit"] = limit
-
-    _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False})
-    site_values["memoryLimit"] = limit
-
-    found, value = config.search("accountingGroup", opt=search_opts)
-    if found:
-        site_values["accountingGroup"] = value
-    found, value = config.search("accountingUser", opt=search_opts)
-    if found:
-        site_values["accountingUser"] = value
-
-    key = f".site.{compute_site}.profile.condor"
-    if key in config:
-        for subkey, val in config[key].items():
-            if subkey.startswith("+"):
-                site_values["attrs"][subkey[1:]] = val
-            else:
-                site_values["profile"][subkey] = val
-
-    return site_values
-
-
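In the `.site.<name>.profile.condor` section a leading `+` marks a job ClassAd attribute while everything else is treated as a plain submit-file setting; a sketch with made-up keys:

    # Hypothetical fragment of a BPS config (YAML shown as comments):
    #
    #   site:
    #     mysite:
    #       profile:
    #         condor:
    #           "+JobCategory": bps
    #           request_cpus: 4
    #
    # _gather_site_values(config, "mysite") would then put
    # {"JobCategory": "bps"} under "attrs" and {"request_cpus": 4}
    # under "profile".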
-def _gather_label_values(config: BpsConfig, label: str) -> dict[str, Any]:
-    """Gather values specific to given job label.
-
-    Parameters
-    ----------
-    config : `lsst.ctrl.bps.BpsConfig`
-        BPS configuration that includes necessary submit/runtime
-        information.
-    label : `str`
-        GenericWorkflowJob label.
-
-    Returns
-    -------
-    values : `dict` [`str`, `~typing.Any`]
-        Values specific to the given job label.
-    """
-    values: dict[str, Any] = {"attrs": {}, "profile": {}}
-
-    search_opts = {}
-    profile_key = ""
-    if label == "finalJob":
-        search_opts["searchobj"] = config["finalJob"]
-        profile_key = ".finalJob.profile.condor"
-    elif label in config["cluster"]:
-        search_opts["curvals"] = {"curr_cluster": label}
-        profile_key = f".cluster.{label}.profile.condor"
-    elif label in config["pipetask"]:
-        search_opts["curvals"] = {"curr_pipetask": label}
-        profile_key = f".pipetask.{label}.profile.condor"
-
-    found, value = config.search("releaseExpr", opt=search_opts)
-    if found:
-        values["releaseExpr"] = value
-
-    found, value = config.search("overwriteJobFiles", opt=search_opts)
-    if found:
-        values["overwriteJobFiles"] = value
-    else:
-        values["overwriteJobFiles"] = True
-
-    if profile_key and profile_key in config:
-        for subkey, val in config[profile_key].items():
-            if subkey.startswith("+"):
-                values["attrs"][subkey[1:]] = val
-            else:
-                values["profile"][subkey] = val
-
-    return values
-
-
-def is_service_job(job_ad: dict[str, Any]) -> bool:
-    """Determine whether a job is a service job.
-
-    Parameters
-    ----------
-    job_ad : `dict` [`str`, Any]
-        Information about an HTCondor job.
-
-    Returns
-    -------
-    is_service_job : `bool`
-        `True` if the job is a service job, `False` otherwise.
-
-    Notes
-    -----
-    At the moment, HTCondor does not provide a native way to distinguish
-    between payload and service jobs in the workflow. This code depends
-    on `read_node_status` adding `wms_node_type` to the job ads.
-    """
-    return job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) == WmsNodeType.SERVICE
-
-
-def _group_to_subdag(
-    config: BpsConfig, generic_workflow_group: GenericWorkflowGroup, out_prefix: str
-) -> HTCJob:
-    """Convert a generic workflow group to an HTCondor dag.
-
-    Parameters
-    ----------
-    config : `lsst.ctrl.bps.BpsConfig`
-        Workflow configuration.
-    generic_workflow_group : `lsst.ctrl.bps.GenericWorkflowGroup`
-        The generic workflow group to convert.
-    out_prefix : `str`
-        Location prefix to be used when creating jobs.
-
-    Returns
-    -------
-    htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
-        Job for running the HTCondor dag.
-    """
-    jobname = f"wms_{generic_workflow_group.name}"
-    htc_job = HTCJob(name=jobname, label=generic_workflow_group.label)
-    htc_job.add_dag_cmds({"dir": f"subdags/{jobname}"})
-    htc_job.subdag = _generic_workflow_to_htcondor_dag(config, generic_workflow_group, out_prefix)
-    if not generic_workflow_group.blocking:
-        htc_job.dagcmds["post"] = {
-            "defer": "",
-            "executable": f"{os.path.dirname(__file__)}/subdag_post.sh",
-            "arguments": f"{jobname} $RETURN",
-        }
-    return htc_job
-
-
-def _create_check_job(group_job_name: str, job_label: str) -> HTCJob:
-    """Create a job to check status of a group job.
-
-    Parameters
-    ----------
-    group_job_name : `str`
-        Name of the group job.
-    job_label : `str`
-        Label to use for the check status job.
-
-    Returns
-    -------
-    htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
-        Job description for the job to check group job status.
-    """
-    htc_job = HTCJob(name=f"wms_check_status_{group_job_name}", label=job_label)
-    htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/check_group_status.sub"
-    htc_job.add_dag_cmds({"dir": f"subdags/{group_job_name}", "vars": {"group_job_name": group_job_name}})
-
-    return htc_job
-
-
-def _generic_workflow_to_htcondor_dag(
-    config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str
-) -> HTCDag:
-    """Convert a GenericWorkflow to a HTCDag.
-
-    Parameters
-    ----------
-    config : `lsst.ctrl.bps.BpsConfig`
-        Workflow configuration.
-    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
-        The GenericWorkflow to convert.
-    out_prefix : `str`
-        Location prefix where the HTCondor files will be written.
-
-    Returns
-    -------
-    dag : `lsst.ctrl.bps.htcondor.HTCDag`
-        The HTCDag representation of the given GenericWorkflow.
-    """
-    dag = HTCDag(name=generic_workflow.name)
-
-    _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
-    dag.add_attribs(generic_workflow.run_attrs)
-    dag.add_attribs(
-        {
-            "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
-            "bps_job_summary": create_count_summary(generic_workflow.job_counts),
-        }
-    )
-
-    _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
-    if isinstance(tmp_template, str):
-        subdir_template = defaultdict(lambda: tmp_template)
-    else:
-        subdir_template = tmp_template
-
-    # Create all DAG jobs
-    site_values = {}  # Cache compute site specific values to reduce config lookups.
-    cached_values = {}  # Cache label-specific values to reduce config lookups.
-    # Note: Can't use get_job_by_label because those only include payload jobs.
-    for job_name in generic_workflow:
-        gwjob = generic_workflow.get_job(job_name)
-        if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD:
-            gwjob = cast(GenericWorkflowJob, gwjob)
-            if gwjob.compute_site not in site_values:
-                site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
-            if gwjob.label not in cached_values:
-                cached_values[gwjob.label] = deepcopy(site_values[gwjob.compute_site])
-                cached_values[gwjob.label].update(_gather_label_values(config, gwjob.label))
-                _LOG.debug("cached: %s= %s", gwjob.label, cached_values[gwjob.label])
-            htc_job = _create_job(
-                subdir_template[gwjob.label],
-                cached_values[gwjob.label],
-                generic_workflow,
-                gwjob,
-                out_prefix,
-            )
-        elif gwjob.node_type == GenericWorkflowNodeType.NOOP:
-            gwjob = cast(GenericWorkflowNoopJob, gwjob)
-            htc_job = HTCJob(f"wms_{gwjob.name}", label=gwjob.label)
-            htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/noop.sub"
-            htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
-            htc_job.add_dag_cmds({"noop": True})
-        elif gwjob.node_type == GenericWorkflowNodeType.GROUP:
-            gwjob = cast(GenericWorkflowGroup, gwjob)
-            htc_job = _group_to_subdag(config, gwjob, out_prefix)
-            # In case DAGMAN_GENERATE_SUBDAG_SUBMITS is False, have submit
-            # files for subdags generated recursively.
-            dag.graph["submit_options"]["do_recurse"] = True
-        else:
-            raise RuntimeError(f"Unsupported generic workflow node type {gwjob.node_type} ({gwjob.name})")
-        _LOG.debug("Adding job %s %s", htc_job.name, htc_job.label)
-        dag.add_job(htc_job)
-
-    # Add job dependencies to the DAG (be careful with wms_ jobs)
-    for job_name in generic_workflow:
-        gwjob = generic_workflow.get_job(job_name)
-        parent_name = (
-            gwjob.name if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD else f"wms_{gwjob.name}"
-        )
-        successor_jobs = [generic_workflow.get_job(j) for j in generic_workflow.successors(job_name)]
-        children_names = []
-        if gwjob.node_type == GenericWorkflowNodeType.GROUP:
-            gwjob = cast(GenericWorkflowGroup, gwjob)
-            group_children = []  # Dependencies between jobs in the same group
-            for sjob in successor_jobs:
-                if sjob.node_type == GenericWorkflowNodeType.GROUP and sjob.label == gwjob.label:
-                    group_children.append(f"wms_{sjob.name}")
-                elif sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
-                    children_names.append(sjob.name)
-                else:
-                    children_names.append(f"wms_{sjob.name}")
-            if group_children:
-                dag.add_job_relationships([parent_name], group_children)
-            if not gwjob.blocking:
-                # Since subdag will always succeed, need to add a special
-                # job that fails if group failed to block payload children.
-                check_job = _create_check_job(f"wms_{gwjob.name}", gwjob.label)
-                dag.add_job(check_job)
-                dag.add_job_relationships([f"wms_{gwjob.name}"], [check_job.name])
-                parent_name = check_job.name
-        else:
-            for sjob in successor_jobs:
-                if sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
-                    children_names.append(sjob.name)
-                else:
-                    children_names.append(f"wms_{sjob.name}")
-
-        dag.add_job_relationships([parent_name], children_names)
-
-    # If final job exists in generic workflow, create DAG final job
-    final = generic_workflow.get_final()
-    if final and isinstance(final, GenericWorkflowJob):
-        if final.compute_site and final.compute_site not in site_values:
-            site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
-        if final.label not in cached_values:
-            cached_values[final.label] = deepcopy(site_values[final.compute_site])
-            cached_values[final.label].update(_gather_label_values(config, final.label))
-        final_htjob = _create_job(
-            subdir_template[final.label],
-            cached_values[final.label],
-            generic_workflow,
-            final,
-            out_prefix,
-        )
-        if "post" not in final_htjob.dagcmds:
-            final_htjob.dagcmds["post"] = {
-                "defer": "",
-                "executable": f"{os.path.dirname(__file__)}/final_post.sh",
-                "arguments": f"{final.name} $DAG_STATUS $RETURN",
-            }
-        dag.add_final_job(final_htjob)
-    elif final and isinstance(final, GenericWorkflow):
-        raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
-    elif final:
-        raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
-
-    return dag
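Taken together, a rough driver-side sketch of how these pieces fit (config and generic_workflow are assumed to come from the usual ctrl_bps submission machinery; out_prefix is an invented path):

    # Build the DAG, including subdags for group jobs and an optional
    # DAG-level final job.
    dag = _generic_workflow_to_htcondor_dag(config, generic_workflow, "submit/run1")
    # Group jobs force recursive subdag submission, so after the call
    # dag.graph["submit_options"] may contain {"do_recurse": True}.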