lsst-ctrl-bps-htcondor 29.2025.3800-py3-none-any.whl → 29.2025.3900-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,93 +27,48 @@
27
27
 
28
28
  """Interface between generic workflow to HTCondor workflow system."""
29
29
 
30
- __all__ = ["HTCondorService", "HTCondorWorkflow"]
30
+ __all__ = ["HTCondorService"]
31
31
 
32
32
 
33
33
  import logging
34
34
  import os
35
- import re
36
- from collections import defaultdict
37
- from copy import deepcopy
38
- from enum import IntEnum, auto
39
35
  from pathlib import Path
40
- from typing import Any, cast
41
36
 
42
37
  import htcondor
43
38
  from packaging import version
44
39
 
45
40
  from lsst.ctrl.bps import (
46
41
  BaseWmsService,
47
- BaseWmsWorkflow,
48
- BpsConfig,
49
- GenericWorkflow,
50
- GenericWorkflowGroup,
51
- GenericWorkflowJob,
52
- GenericWorkflowNodeType,
53
- GenericWorkflowNoopJob,
54
- WmsJobReport,
55
- WmsRunReport,
56
- WmsSpecificInfo,
57
42
  WmsStates,
58
43
  )
59
- from lsst.ctrl.bps.bps_utils import chdir, create_count_summary
44
+ from lsst.ctrl.bps.bps_utils import chdir
60
45
  from lsst.daf.butler import Config
61
46
  from lsst.utils.timer import time_this
62
47
 
48
+ from .common_utils import WmsIdType, _wms_id_to_cluster, _wms_id_to_dir, _wms_id_type
63
49
  from .htcondor_config import HTC_DEFAULTS_URI
50
+ from .htcondor_workflow import HTCondorWorkflow
64
51
  from .lssthtc import (
65
- MISSING_ID,
66
- HTCDag,
67
- HTCJob,
68
- NodeStatus,
69
- WmsNodeType,
52
+ _locate_schedds,
70
53
  _update_rescue_file,
71
- condor_history,
72
54
  condor_q,
73
- condor_search,
74
- condor_status,
75
55
  htc_backup_files,
76
- htc_check_dagman_output,
77
56
  htc_create_submit_from_cmd,
78
57
  htc_create_submit_from_dag,
79
58
  htc_create_submit_from_file,
80
- htc_escape,
81
59
  htc_submit_dag,
82
60
  htc_version,
83
- pegasus_name_to_label,
84
- read_dag_info,
85
- read_dag_log,
86
61
  read_dag_status,
87
- read_node_status,
88
- summarize_dag,
89
62
  write_dag_info,
90
63
  )
91
64
  from .provisioner import Provisioner
92
-
93
-
94
- class WmsIdType(IntEnum):
95
- """Type of valid WMS ids."""
96
-
97
- UNKNOWN = auto()
98
- """The type of id cannot be determined.
99
- """
100
-
101
- LOCAL = auto()
102
- """The id is HTCondor job's ClusterId (with optional '.ProcId').
103
- """
104
-
105
- GLOBAL = auto()
106
- """Id is a HTCondor's global job id.
107
- """
108
-
109
- PATH = auto()
110
- """Id is a submission path.
111
- """
112
-
113
-
114
- DEFAULT_HTC_EXEC_PATT = ".*worker.*"
115
- """Default pattern for searching execute machines in an HTCondor pool.
116
- """
65
+ from .report_utils import (
66
+ _get_status_from_id,
67
+ _get_status_from_path,
68
+ _report_from_id,
69
+ _report_from_path,
70
+ _summary_report,
71
+ )
117
72
 
118
73
  _LOG = logging.getLogger(__name__)
119
74
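The import hunk above summarizes the refactoring in this release: HTCondorWorkflow now comes from the new .htcondor_workflow module, the WMS id helpers (WmsIdType, _wms_id_type, _wms_id_to_cluster, _wms_id_to_dir) from .common_utils, the status/report helpers from .report_utils, and _locate_schedds from .lssthtc. A minimal sketch of what that layout implies for import paths, assuming the package root is lsst.ctrl.bps.htcondor (module names are taken from the relative imports above; the calls are purely illustrative):

    # Sketch only -- module paths inferred from the relative imports in this hunk.
    from lsst.ctrl.bps.htcondor.common_utils import WmsIdType, _wms_id_type
    from lsst.ctrl.bps.htcondor.htcondor_workflow import HTCondorWorkflow
    from lsst.ctrl.bps.htcondor.report_utils import _report_from_path

    id_type = _wms_id_type("1234.0")  # numeric ids classify as WmsIdType.LOCAL
    run_reports, message = _report_from_path("/path/to/submit/dir")  # placeholder path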
 
@@ -604,2006 +559,3 @@ class HTCondorService(BaseWmsService):
604
559
  status = 1
605
560
  message = f"Permission problem with {daemon_type} service."
606
561
  return status, message
607
-
608
-
609
- class HTCondorWorkflow(BaseWmsWorkflow):
610
- """Single HTCondor workflow.
611
-
612
- Parameters
613
- ----------
614
- name : `str`
615
- Unique name for Workflow used when naming files.
616
- config : `lsst.ctrl.bps.BpsConfig`
617
- BPS configuration that includes necessary submit/runtime information.
618
- """
619
-
620
- def __init__(self, name, config=None):
621
- super().__init__(name, config)
622
- self.dag = None
623
-
624
- @classmethod
625
- def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
626
- # Docstring inherited
627
- htc_workflow = cls(generic_workflow.name, config)
628
- htc_workflow.dag = _generic_workflow_to_htcondor_dag(config, generic_workflow, out_prefix)
629
-
630
- _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
631
- # Add extra attributes to top most DAG.
632
- htc_workflow.dag.add_attribs(
633
- {
634
- "bps_wms_service": service_class,
635
- "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
636
- }
637
- )
638
-
639
- return htc_workflow
640
-
641
- def write(self, out_prefix):
642
- """Output HTCondor DAGMan files needed for workflow submission.
643
-
644
- Parameters
645
- ----------
646
- out_prefix : `str`
647
- Directory prefix for HTCondor files.
648
- """
649
- self.submit_path = out_prefix
650
- os.makedirs(out_prefix, exist_ok=True)
651
-
652
- # Write down the workflow in HTCondor format.
653
- self.dag.write(out_prefix, job_subdir="jobs/{self.label}")
654
-
655
-
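The class removed above is not dropped from the package; per the import hunk at the top of the diff it now lives in the .htcondor_workflow module. A hedged sketch of how it is driven, based only on the signatures and docstrings shown here (the prefix and service-class strings are placeholders; the real call is made by the BPS service machinery):

    htc_workflow = HTCondorWorkflow.from_generic_workflow(
        config, generic_workflow, "/submit/u/user/run1", "lsst.ctrl.bps.htcondor.HTCondorService"
    )
    htc_workflow.write("/submit/u/user/run1")  # writes the DAGMan files under the submit prefix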
656
- def _create_job(subdir_template, cached_values, generic_workflow, gwjob, out_prefix):
657
- """Convert GenericWorkflow job nodes to DAG jobs.
658
-
659
- Parameters
660
- ----------
661
- subdir_template : `str`
662
- Template for making subdirs.
663
- cached_values : `dict`
664
- Site and label specific values.
665
- generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
666
- Generic workflow that is being converted.
667
- gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
668
- The generic job to convert to a HTCondor job.
669
- out_prefix : `str`
670
- Directory prefix for HTCondor files.
671
-
672
- Returns
673
- -------
674
- htc_job : `lsst.ctrl.bps.wms.htcondor.HTCJob`
675
- The HTCondor job equivalent to the given generic job.
676
- """
677
- htc_job = HTCJob(gwjob.name, label=gwjob.label)
678
-
679
- curvals = defaultdict(str)
680
- curvals["label"] = gwjob.label
681
- if gwjob.tags:
682
- curvals.update(gwjob.tags)
683
-
684
- subdir = Path("jobs") / subdir_template.format_map(curvals)
685
- htc_job.subdir = subdir
686
- htc_job.subfile = f"{gwjob.name}.sub"
687
- htc_job.add_dag_cmds({"dir": subdir})
688
-
689
- htc_job_cmds = {
690
- "universe": "vanilla",
691
- "should_transfer_files": "YES",
692
- "when_to_transfer_output": "ON_EXIT_OR_EVICT",
693
- "transfer_output_files": '""', # Set to empty string to disable
694
- "transfer_executable": "False",
695
- "getenv": "True",
696
- # Exceeding memory sometimes triggers a SIGBUS or SIGSEGV error. Tell
697
- # htcondor to put on hold any jobs which exited by a signal.
698
- "on_exit_hold": "ExitBySignal == true",
699
- "on_exit_hold_reason": (
700
- 'strcat("Job raised a signal ", string(ExitSignal), ". ", '
701
- '"Handling signal as if job has gone over memory limit.")'
702
- ),
703
- "on_exit_hold_subcode": "34",
704
- }
705
-
706
- htc_job_cmds.update(_translate_job_cmds(cached_values, generic_workflow, gwjob))
707
-
708
- # Combine stdout and stderr to reduce the number of files.
709
- for key in ("output", "error"):
710
- if cached_values["overwriteJobFiles"]:
711
- htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).out"
712
- else:
713
- htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).$$([NumJobStarts ?: 0]).out"
714
- _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
715
-
716
- key = "log"
717
- htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key}"
718
- _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
719
-
720
- htc_job_cmds.update(
721
- _handle_job_inputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
722
- )
723
-
724
- htc_job_cmds.update(
725
- _handle_job_outputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
726
- )
727
-
728
- # Add the job cmds dict to the job object.
729
- htc_job.add_job_cmds(htc_job_cmds)
730
-
731
- htc_job.add_dag_cmds(_translate_dag_cmds(gwjob))
732
-
733
- # Add job attributes to job.
734
- _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
735
- htc_job.add_job_attrs(gwjob.attrs)
736
- htc_job.add_job_attrs(cached_values["attrs"])
737
- htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
738
- htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
739
-
740
- return htc_job
741
-
742
-
743
- def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
744
- """Translate the job data that are one to one mapping
745
-
746
- Parameters
747
- ----------
748
- cached_vals : `dict` [`str`, `~typing.Any`]
749
- Config values common to jobs with same site or label.
750
- generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
751
- Generic workflow that contains the job being converted.
752
- gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
753
- Generic workflow job to be converted.
754
-
755
- Returns
756
- -------
757
- htc_job_commands : `dict` [`str`, `~typing.Any`]
758
- Contains commands which can appear in the HTCondor submit description
759
- file.
760
- """
761
- # Values in the job script that just are name mappings.
762
- job_translation = {
763
- "mail_to": "notify_user",
764
- "when_to_mail": "notification",
765
- "request_cpus": "request_cpus",
766
- "priority": "priority",
767
- "category": "category",
768
- "accounting_group": "accounting_group",
769
- "accounting_user": "accounting_group_user",
770
- }
771
-
772
- jobcmds = {}
773
- for gwkey, htckey in job_translation.items():
774
- jobcmds[htckey] = getattr(gwjob, gwkey, None)
775
-
776
- # If accounting info was not set explicitly, use site settings if any.
777
- if not gwjob.accounting_group:
778
- jobcmds["accounting_group"] = cached_vals.get("accountingGroup")
779
- if not gwjob.accounting_user:
780
- jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")
781
-
782
- # job commands that need modification
783
- if gwjob.retry_unless_exit:
784
- if isinstance(gwjob.retry_unless_exit, int):
785
- jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
786
- elif isinstance(gwjob.retry_unless_exit, list):
787
- jobcmds["retry_until"] = (
788
- f"member(ExitCode, {{{','.join([str(x) for x in gwjob.retry_unless_exit])}}})"
789
- )
790
- else:
791
- raise ValueError("retryUnlessExit must be an integer or a list of integers.")
792
-
793
- if gwjob.request_disk:
794
- jobcmds["request_disk"] = f"{gwjob.request_disk}MB"
795
-
796
- if gwjob.request_memory:
797
- jobcmds["request_memory"] = f"{gwjob.request_memory}"
798
-
799
- memory_max = 0
800
- if gwjob.memory_multiplier:
801
- # Do not use try-except! At the moment, BpsConfig returns an empty
802
- # string if it does not contain the key.
803
- memory_limit = cached_vals["memoryLimit"]
804
- if not memory_limit:
805
- raise RuntimeError(
806
- "Memory autoscaling enabled, but automatic detection of the memory limit "
807
- "failed; setting it explicitly with 'memoryLimit' or changing worker node "
808
- "search pattern 'executeMachinesPattern' might help."
809
- )
810
-
811
- # Set maximal amount of memory job can ask for.
812
- #
813
- # The check below assumes that 'memory_limit' was set to a value which
814
- # realistically reflects actual physical limitations of a given compute
815
- # resource.
816
- memory_max = memory_limit
817
- if gwjob.request_memory_max and gwjob.request_memory_max < memory_limit:
818
- memory_max = gwjob.request_memory_max
819
-
820
- # Make job ask for more memory each time it failed due to insufficient
821
- # memory requirements.
822
- jobcmds["request_memory"] = _create_request_memory_expr(
823
- gwjob.request_memory, gwjob.memory_multiplier, memory_max
824
- )
825
-
826
- user_release_expr = cached_vals.get("releaseExpr", "")
827
- if gwjob.number_of_retries is not None and gwjob.number_of_retries >= 0:
828
- jobcmds["max_retries"] = gwjob.number_of_retries
829
-
830
- # No point in adding periodic_release if 0 retries
831
- if gwjob.number_of_retries > 0:
832
- periodic_release = _create_periodic_release_expr(
833
- gwjob.request_memory, gwjob.memory_multiplier, memory_max, user_release_expr
834
- )
835
- if periodic_release:
836
- jobcmds["periodic_release"] = periodic_release
837
-
838
- jobcmds["periodic_remove"] = _create_periodic_remove_expr(
839
- gwjob.request_memory, gwjob.memory_multiplier, memory_max
840
- )
841
-
842
- # Assume concurrency_limit implemented using HTCondor concurrency limits.
843
- # May need to move to special site-specific implementation if sites use
844
- # other mechanisms.
845
- if gwjob.concurrency_limit:
846
- jobcmds["concurrency_limit"] = gwjob.concurrency_limit
847
-
848
- # Handle command line
849
- if gwjob.executable.transfer_executable:
850
- jobcmds["transfer_executable"] = "True"
851
- jobcmds["executable"] = gwjob.executable.src_uri
852
- else:
853
- jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
854
-
855
- if gwjob.arguments:
856
- arguments = gwjob.arguments
857
- arguments = _replace_cmd_vars(arguments, gwjob)
858
- arguments = _replace_file_vars(cached_vals["bpsUseShared"], arguments, generic_workflow, gwjob)
859
- arguments = _fix_env_var_syntax(arguments)
860
- jobcmds["arguments"] = arguments
861
-
862
- if gwjob.environment:
863
- env_str = ""
864
- for name, value in gwjob.environment.items():
865
- if isinstance(value, str):
866
- value2 = _replace_cmd_vars(value, gwjob)
867
- value2 = _fix_env_var_syntax(value2)
868
- value2 = htc_escape(value2)
869
- env_str += f"{name}='{value2}' " # Add single quotes to allow internal spaces
870
- else:
871
- env_str += f"{name}={value} "
872
-
873
- # Process above added one trailing space
874
- jobcmds["environment"] = env_str.rstrip()
875
-
876
- # Add extra "pass-thru" job commands
877
- if gwjob.profile:
878
- for key, val in gwjob.profile.items():
879
- jobcmds[key] = htc_escape(val)
880
- for key, val in cached_vals["profile"].items():
881
- jobcmds[key] = htc_escape(val)
882
-
883
- return jobcmds
884
-
885
-
886
- def _translate_dag_cmds(gwjob):
887
- """Translate job values into DAGMan commands.
888
-
889
- Parameters
890
- ----------
891
- gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
892
- Job containing values to be translated.
893
-
894
- Returns
895
- -------
896
- dagcmds : `dict` [`str`, `~typing.Any`]
897
- DAGMan commands for the job.
898
- """
899
- # Values in the dag script that just are name mappings.
900
- dag_translation = {"abort_on_value": "abort_dag_on", "abort_return_value": "abort_exit"}
901
-
902
- dagcmds = {}
903
- for gwkey, htckey in dag_translation.items():
904
- dagcmds[htckey] = getattr(gwjob, gwkey, None)
905
-
906
- # Still to be coded: vars "pre_cmdline", "post_cmdline"
907
- return dagcmds
908
-
909
-
910
- def _fix_env_var_syntax(oldstr):
911
- """Change ENV place holders to HTCondor Env var syntax.
912
-
913
- Parameters
914
- ----------
915
- oldstr : `str`
916
- String in which environment variable syntax is to be fixed.
917
-
918
- Returns
919
- -------
920
- newstr : `str`
921
- Given string with environment variable syntax fixed.
922
- """
923
- newstr = oldstr
924
- for key in re.findall(r"<ENV:([^>]+)>", oldstr):
925
- newstr = newstr.replace(rf"<ENV:{key}>", f"$ENV({key})")
926
- return newstr
927
-
928
-
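A quick illustration of the rewrite _fix_env_var_syntax performs (input string invented):

    _fix_env_var_syntax("cd <ENV:HOME>/work && echo <ENV:USER>")
    # returns "cd $ENV(HOME)/work && echo $ENV(USER)"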
929
- def _replace_file_vars(use_shared, arguments, workflow, gwjob):
930
- """Replace file placeholders in command line arguments with correct
931
- physical file names.
932
-
933
- Parameters
934
- ----------
935
- use_shared : `bool`
936
- Whether HTCondor can assume shared filesystem.
937
- arguments : `str`
938
- Arguments string in which to replace file placeholders.
939
- workflow : `lsst.ctrl.bps.GenericWorkflow`
940
- Generic workflow that contains file information.
941
- gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
942
- The job corresponding to the arguments.
943
-
944
- Returns
945
- -------
946
- arguments : `str`
947
- Given arguments string with file placeholders replaced.
948
- """
949
- # Replace input file placeholders with paths.
950
- for gwfile in workflow.get_job_inputs(gwjob.name, data=True, transfer_only=False):
951
- if not gwfile.wms_transfer:
952
- # Must assume full URI if in command line and told WMS is not
953
- # responsible for transferring file.
954
- uri = gwfile.src_uri
955
- elif use_shared:
956
- if gwfile.job_shared:
957
- # Have shared filesystems and jobs can share file.
958
- uri = gwfile.src_uri
959
- else:
960
- uri = os.path.basename(gwfile.src_uri)
961
- else: # Using push transfer
962
- uri = os.path.basename(gwfile.src_uri)
963
- arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
964
-
965
- # Replace output file placeholders with paths.
966
- for gwfile in workflow.get_job_outputs(gwjob.name, data=True, transfer_only=False):
967
- if not gwfile.wms_transfer:
968
- # Must assume full URI if in command line and told WMS is not
969
- # responsible for transferring file.
970
- uri = gwfile.src_uri
971
- elif use_shared:
972
- if gwfile.job_shared:
973
- # Have shared filesystems and jobs can share file.
974
- uri = gwfile.src_uri
975
- else:
976
- uri = os.path.basename(gwfile.src_uri)
977
- else: # Using push transfer
978
- uri = os.path.basename(gwfile.src_uri)
979
- arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
980
- return arguments
981
-
982
-
983
- def _replace_cmd_vars(arguments, gwjob):
984
- """Replace format-style placeholders in arguments.
985
-
986
- Parameters
987
- ----------
988
- arguments : `str`
989
- Arguments string in which to replace placeholders.
990
- gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
991
- Job containing values to be used to replace placeholders
992
- (in particular gwjob.cmdvals).
993
-
994
- Returns
995
- -------
996
- arguments : `str`
997
- Given arguments string with placeholders replaced.
998
- """
999
- replacements = gwjob.cmdvals if gwjob.cmdvals is not None else {}
1000
- try:
1001
- arguments = arguments.format(**replacements)
1002
- except (KeyError, TypeError) as exc: # TypeError in case None instead of {}
1003
- _LOG.error("Could not replace command variables: replacement for %s not provided", str(exc))
1004
- _LOG.debug("arguments: %s\ncmdvals: %s", arguments, replacements)
1005
- raise
1006
- return arguments
1007
-
1008
-
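_replace_cmd_vars is a thin wrapper around str.format, so its behavior is easy to picture (keys and values below are invented, not actual BPS cmdvals):

    "pipetask run -b {butlerConfig} -g {qgraphFile}".format(
        **{"butlerConfig": "repo/butler.yaml", "qgraphFile": "run.qgraph"}
    )
    # -> "pipetask run -b repo/butler.yaml -g run.qgraph"
    # A missing key raises KeyError, which the function logs and re-raises.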
1009
- def _handle_job_inputs(
1010
- generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str
1011
- ) -> dict[str, str]:
1012
- """Add job input files from generic workflow to job.
1013
-
1014
- Parameters
1015
- ----------
1016
- generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
1017
- The generic workflow (e.g., has executable name and arguments).
1018
- job_name : `str`
1019
- Unique name for the job.
1020
- use_shared : `bool`
1021
- Whether job has access to files via shared filesystem.
1022
- out_prefix : `str`
1023
- The root directory into which all WMS-specific files are written.
1024
-
1025
- Returns
1026
- -------
1027
- htc_commands : `dict` [`str`, `str`]
1028
- HTCondor commands for the job submission script.
1029
- """
1030
- inputs = []
1031
- for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
1032
- _LOG.debug("src_uri=%s", gwf_file.src_uri)
1033
-
1034
- uri = Path(gwf_file.src_uri)
1035
-
1036
- # Note if use_shared and job_shared, don't need to transfer file.
1037
-
1038
- if not use_shared: # Copy file using push to job
1039
- inputs.append(str(uri))
1040
- elif not gwf_file.job_shared: # Jobs require own copy
1041
- # if using shared filesystem, but still need copy in job. Use
1042
- # HTCondor's curl plugin for a local copy.
1043
- if uri.is_dir():
1044
- raise RuntimeError(
1045
- f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}"
1046
- )
1047
- inputs.append(f"file://{uri}")
1048
-
1049
- htc_commands = {}
1050
- if inputs:
1051
- htc_commands["transfer_input_files"] = ",".join(inputs)
1052
- _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
1053
- return htc_commands
1054
-
1055
-
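Sketch of the submit commands _handle_job_inputs produces in its two transfer cases (paths invented):

    # No shared filesystem: inputs are pushed to the worker with the job.
    {"transfer_input_files": "/submit/run1/in/butler.yaml,/submit/run1/in/run.qgraph"}
    # Shared filesystem but job_shared=False: file:// URIs so HTCondor's file
    # transfer plugin makes a local copy inside the job sandbox.
    {"transfer_input_files": "file:///submit/run1/in/butler.yaml,file:///submit/run1/in/run.qgraph"}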
1056
- def _handle_job_outputs(
1057
- generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str
1058
- ) -> dict[str, str]:
1059
- """Add job output files from generic workflow to the job if any.
1060
-
1061
- Parameters
1062
- ----------
1063
- generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
1064
- The generic workflow (e.g., has executable name and arguments).
1065
- job_name : `str`
1066
- Unique name for the job.
1067
- use_shared : `bool`
1068
- Whether job has access to files via shared filesystem.
1069
- out_prefix : `str`
1070
- The root directory into which all WMS-specific files are written.
1071
-
1072
- Returns
1073
- -------
1074
- htc_commands : `dict` [`str`, `str`]
1075
- HTCondor commands for the job submission script.
1076
- """
1077
- outputs = []
1078
- output_remaps = []
1079
- for gwf_file in generic_workflow.get_job_outputs(job_name, data=True, transfer_only=True):
1080
- _LOG.debug("src_uri=%s", gwf_file.src_uri)
1081
-
1082
- uri = Path(gwf_file.src_uri)
1083
- if not use_shared:
1084
- outputs.append(uri.name)
1085
- output_remaps.append(f"{uri.name}={str(uri)}")
1086
-
1087
- # Set to an empty string to disable and only update if there are output
1088
- # files to transfer. Otherwise, HTCondor will transfer back all files in
1089
- # the job’s temporary working directory that have been modified or created
1090
- # by the job.
1091
- htc_commands = {"transfer_output_files": '""'}
1092
- if outputs:
1093
- htc_commands["transfer_output_files"] = ",".join(outputs)
1094
- _LOG.debug("transfer_output_files=%s", htc_commands["transfer_output_files"])
1095
-
1096
- htc_commands["transfer_output_remaps"] = f'"{";".join(output_remaps)}"'
1097
- _LOG.debug("transfer_output_remaps=%s", htc_commands["transfer_output_remaps"])
1098
- return htc_commands
1099
-
1100
-
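The corresponding output-side sketch for _handle_job_outputs (paths invented): only named outputs are transferred back and remapped to their destinations, while with a shared filesystem the '""' default stands and automatic transfer of modified files stays disabled.

    {
        "transfer_output_files": "quantum_metadata.yaml,quantum.log",
        "transfer_output_remaps": '"quantum_metadata.yaml=/submit/run1/out/quantum_metadata.yaml;'
        'quantum.log=/submit/run1/out/quantum.log"',
    }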
1101
- def _get_status_from_id(
1102
- wms_workflow_id: str, hist: float, schedds: dict[str, htcondor.Schedd]
1103
- ) -> tuple[WmsStates, str]:
1104
- """Gather run information using workflow id.
1105
-
1106
- Parameters
1107
- ----------
1108
- wms_workflow_id : `str`
1109
- Limit to specific run based on id.
1110
- hist : `float`
1111
- Limit history search to this many days.
1112
- schedds : `dict` [ `str`, `htcondor.Schedd` ]
1113
- HTCondor schedulers which to query for job information. If empty
1114
- dictionary, all queries will be run against the local scheduler only.
1115
-
1116
- Returns
1117
- -------
1118
- state : `lsst.ctrl.bps.WmsStates`
1119
- Status for the corresponding run.
1120
- message : `str`
1121
- Message with extra error information.
1122
- """
1123
- _LOG.debug("_get_status_from_id: id=%s, hist=%s, schedds=%s", wms_workflow_id, hist, schedds)
1124
-
1125
- message = ""
1126
-
1127
- # Collect information about the job by querying HTCondor schedd and
1128
- # HTCondor history.
1129
- schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
1130
- if len(schedd_dag_info) == 1:
1131
- schedd_name = next(iter(schedd_dag_info))
1132
- dag_id = next(iter(schedd_dag_info[schedd_name]))
1133
- dag_ad = schedd_dag_info[schedd_name][dag_id]
1134
- state = _htc_status_to_wms_state(dag_ad)
1135
- else:
1136
- state = WmsStates.UNKNOWN
1137
- message = f"DAGMan job {wms_workflow_id} not found in queue or history. Check id or try path."
1138
- return state, message
1139
-
1140
-
1141
- def _get_status_from_path(wms_path: str | os.PathLike) -> tuple[WmsStates, str]:
1142
- """Gather run status from a given run directory.
1143
-
1144
- Parameters
1145
- ----------
1146
- wms_path : `str` | `os.PathLike`
1147
- The directory containing the submit side files (e.g., HTCondor files).
1148
-
1149
- Returns
1150
- -------
1151
- state : `lsst.ctrl.bps.WmsStates`
1152
- Status for the run.
1153
- message : `str`
1154
- Message to be printed.
1155
- """
1156
- wms_path = Path(wms_path).resolve()
1157
- message = ""
1158
- try:
1159
- wms_workflow_id, dag_ad = read_dag_log(wms_path)
1160
- except FileNotFoundError:
1161
- wms_workflow_id = MISSING_ID
1162
- message = f"DAGMan log not found in {wms_path}. Check path."
1163
-
1164
- if wms_workflow_id == MISSING_ID:
1165
- state = WmsStates.UNKNOWN
1166
- else:
1167
- state = _htc_status_to_wms_state(dag_ad[wms_workflow_id])
1168
-
1169
- return state, message
1170
-
1171
-
1172
- def _report_from_path(wms_path):
1173
- """Gather run information from a given run directory.
1174
-
1175
- Parameters
1176
- ----------
1177
- wms_path : `str`
1178
- The directory containing the submit side files (e.g., HTCondor files).
1179
-
1180
- Returns
1181
- -------
1182
- run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1183
- Run information for the detailed report. The key is the HTCondor id
1184
- and the value is a collection of report information for that run.
1185
- message : `str`
1186
- Message to be printed with the summary report.
1187
- """
1188
- wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
1189
- if wms_workflow_id == MISSING_ID:
1190
- run_reports = {}
1191
- else:
1192
- run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
1193
- return run_reports, message
1194
-
1195
-
1196
- def _report_from_id(wms_workflow_id, hist, schedds=None):
1197
- """Gather run information using workflow id.
1198
-
1199
- Parameters
1200
- ----------
1201
- wms_workflow_id : `str`
1202
- Limit to specific run based on id.
1203
- hist : `float`
1204
- Limit history search to this many days.
1205
- schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
1206
- HTCondor schedulers which to query for job information. If None
1207
- (default), all queries will be run against the local scheduler only.
1208
-
1209
- Returns
1210
- -------
1211
- run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1212
- Run information for the detailed report. The key is the HTCondor id
1213
- and the value is a collection of report information for that run.
1214
- message : `str`
1215
- Message to be printed with the summary report.
1216
- """
1217
- messages = []
1218
-
1219
- # Collect information about the job by querying HTCondor schedd and
1220
- # HTCondor history.
1221
- schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
1222
- if len(schedd_dag_info) == 1:
1223
- # Extract the DAG info without altering the results of the query.
1224
- schedd_name = next(iter(schedd_dag_info))
1225
- dag_id = next(iter(schedd_dag_info[schedd_name]))
1226
- dag_ad = schedd_dag_info[schedd_name][dag_id]
1227
-
1228
- # If the provided workflow id does not correspond to the one extracted
1229
- # from the DAGMan log file in the submit directory, rerun the query
1230
- # with the id found in the file.
1231
- #
1232
- # This is to cover the situation in which the user provided the old job
1233
- # id of a restarted run.
1234
- try:
1235
- path_dag_id, _ = read_dag_log(dag_ad["Iwd"])
1236
- except FileNotFoundError as exc:
1237
- # At the moment missing DAGMan log is pretty much a fatal error.
1238
- # So empty the DAG info to finish early (see the if statement
1239
- # below).
1240
- schedd_dag_info.clear()
1241
- messages.append(f"Cannot create the report for '{dag_id}': {exc}")
1242
- else:
1243
- if path_dag_id != dag_id:
1244
- schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds)
1245
- messages.append(
1246
- f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. "
1247
- "This normally occurs when a run is restarted. The report shown is for the most "
1248
- f"recent status with run id '{path_dag_id}'"
1249
- )
1250
-
1251
- if len(schedd_dag_info) == 0:
1252
- run_reports = {}
1253
- elif len(schedd_dag_info) == 1:
1254
- _, dag_info = schedd_dag_info.popitem()
1255
- dag_id, dag_ad = dag_info.popitem()
1256
-
1257
- # Create a mapping between jobs and their classads. The keys will
1258
- # be of format 'ClusterId.ProcId'.
1259
- job_info = {dag_id: dag_ad}
1260
-
1261
- # Find jobs (nodes) belonging to that DAGMan job.
1262
- job_constraint = f"DAGManJobId == {int(float(dag_id))}"
1263
- schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds)
1264
- if schedd_job_info:
1265
- _, node_info = schedd_job_info.popitem()
1266
- job_info.update(node_info)
1267
-
1268
- # Collect additional pieces of information about jobs using HTCondor
1269
- # files in the submission directory.
1270
- _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"])
1271
- _update_jobs(job_info, path_jobs)
1272
- if message:
1273
- messages.append(message)
1274
- run_reports = _create_detailed_report_from_jobs(dag_id, job_info)
1275
- else:
1276
- ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()]
1277
- message = (
1278
- f"More than one job matches id '{wms_workflow_id}', "
1279
- f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids"
1280
- )
1281
- messages.append(message)
1282
- run_reports = {}
1283
-
1284
- message = "\n".join(messages)
1285
- return run_reports, message
1286
-
1287
-
1288
- def _get_info_from_schedd(
1289
- wms_workflow_id: str, hist: float, schedds: dict[str, htcondor.Schedd]
1290
- ) -> dict[str, dict[str, dict[str, Any]]]:
1291
- """Gather run information from HTCondor.
1292
-
1293
- Parameters
1294
- ----------
1295
- wms_workflow_id : `str`
1296
- Limit to specific run based on id.
1297
- hist : `float`
1298
- Limit history search to this many days.
1299
- schedds : `dict` [ `str`, `htcondor.Schedd` ]
1300
- HTCondor schedulers which to query for job information. If empty
1301
- dictionary, all queries will be run against the local scheduler only.
1302
-
1303
- Returns
1304
- -------
1305
- schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
1306
- Information about jobs satisfying the search criteria where for each
1307
- Scheduler, local HTCondor job ids are mapped to their respective
1308
- classads.
1309
- """
1310
- _LOG.debug("_get_info_from_schedd: id=%s, hist=%s, schedds=%s", wms_workflow_id, hist, schedds)
1311
-
1312
- dag_constraint = 'regexp("dagman$", Cmd)'
1313
- try:
1314
- cluster_id = int(float(wms_workflow_id))
1315
- except ValueError:
1316
- dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"'
1317
- else:
1318
- dag_constraint += f" && ClusterId == {cluster_id}"
1319
-
1320
- # With the current implementation of the condor_* functions the query
1321
- # will always return only one match per Scheduler.
1322
- #
1323
- # Even in the highly unlikely situation where HTCondor history (which
1324
- # condor_search queries too) is long enough to have jobs from before
1325
- # the cluster ids were rolled over (and as a result there is more then
1326
- # one job with the same cluster id) they will not show up in
1327
- # the results.
1328
- schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds)
1329
- return schedd_dag_info
1330
-
1331
-
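The constraint assembled above therefore takes one of two forms (ids invented):

    'regexp("dagman$", Cmd) && ClusterId == 1234'                          # numeric id
    'regexp("dagman$", Cmd) && GlobalJobId == "sched1#1234.0#1700000000"'  # global id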
1332
- def _get_info_from_path(wms_path: str | os.PathLike) -> tuple[str, dict[str, dict[str, Any]], str]:
1333
- """Gather run information from a given run directory.
1334
-
1335
- Parameters
1336
- ----------
1337
- wms_path : `str` or `os.PathLike`
1338
- Directory containing HTCondor files.
1339
-
1340
- Returns
1341
- -------
1342
- wms_workflow_id : `str`
1343
- The run id which is a DAGman job id.
1344
- jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1345
- Information about jobs read from files in the given directory.
1346
- The key is the HTCondor id and the value is a dictionary of HTCondor
1347
- keys and values.
1348
- message : `str`
1349
- Message to be printed with the summary report.
1350
- """
1351
- # Ensure path is absolute, in particular for folks helping
1352
- # debug failures that need to dig around submit files.
1353
- wms_path = Path(wms_path).resolve()
1354
-
1355
- messages = []
1356
- try:
1357
- wms_workflow_id, jobs = read_dag_log(wms_path)
1358
- _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
1359
- _update_jobs(jobs, read_node_status(wms_path))
1360
- _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)
1361
-
1362
- # Add more info for DAGman job
1363
- job = jobs[wms_workflow_id]
1364
- job.update(read_dag_status(wms_path))
1365
-
1366
- job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
1367
- if "bps_run" not in job:
1368
- _add_run_info(wms_path, job)
1369
-
1370
- message = htc_check_dagman_output(wms_path)
1371
- if message:
1372
- messages.append(message)
1373
- _LOG.debug(
1374
- "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"]
1375
- )
1376
-
1377
- # Add extra pieces of information which cannot be found in HTCondor
1378
- # generated files like 'GlobalJobId'.
1379
- #
1380
- # Do not treat absence of this file as a serious error. Neither runs
1381
- # submitted with earlier versions of the plugin nor the runs submitted
1382
- # with Pegasus plugin will have it at the moment. However, once enough
1383
- # time passes and Pegasus plugin will have its own report() method
1384
- # (instead of sneakily using HTCondor's one), the lack of that file
1385
- # should be treated as seriously as lack of any other file.
1386
- try:
1387
- job_info = read_dag_info(wms_path)
1388
- except FileNotFoundError as exc:
1389
- message = f"Warn: Some information may not be available: {exc}"
1390
- messages.append(message)
1391
- else:
1392
- schedd_name = next(iter(job_info))
1393
- job_ad = next(iter(job_info[schedd_name].values()))
1394
- job.update(job_ad)
1395
- except FileNotFoundError as err:
1396
- message = f"Could not find HTCondor files in '{wms_path}' ({err})"
1397
- _LOG.debug(message)
1398
- messages.append(message)
1399
- message = htc_check_dagman_output(wms_path)
1400
- if message:
1401
- messages.append(message)
1402
- wms_workflow_id = MISSING_ID
1403
- jobs = {}
1404
-
1405
- message = "\n".join([msg for msg in messages if msg])
1406
- _LOG.debug("wms_workflow_id = %s, jobs = %s", wms_workflow_id, jobs.keys())
1407
- _LOG.debug("message = %s", message)
1408
- return wms_workflow_id, jobs, message
1409
-
1410
-
1411
- def _create_detailed_report_from_jobs(
1412
- wms_workflow_id: str, jobs: dict[str, dict[str, Any]]
1413
- ) -> dict[str, WmsRunReport]:
1414
- """Gather run information to be used in generating summary reports.
1415
-
1416
- Parameters
1417
- ----------
1418
- wms_workflow_id : `str`
1419
- The run id to create the report for.
1420
- jobs : `dict` [`str`, `dict` [`str`, Any]]
1421
- Mapping HTCondor job id to job information.
1422
-
1423
- Returns
1424
- -------
1425
- run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1426
- Run information for the detailed report. The key is the given HTCondor
1427
- id and the value is a collection of report information for that run.
1428
- """
1429
- _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
1430
-
1431
- dag_ad = jobs[wms_workflow_id]
1432
-
1433
- report = WmsRunReport(
1434
- wms_id=f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}",
1435
- global_wms_id=dag_ad.get("GlobalJobId", "MISS"),
1436
- path=dag_ad["Iwd"],
1437
- label=dag_ad.get("bps_job_label", "MISS"),
1438
- run=dag_ad.get("bps_run", "MISS"),
1439
- project=dag_ad.get("bps_project", "MISS"),
1440
- campaign=dag_ad.get("bps_campaign", "MISS"),
1441
- payload=dag_ad.get("bps_payload", "MISS"),
1442
- operator=_get_owner(dag_ad),
1443
- run_summary=_get_run_summary(dag_ad),
1444
- state=_htc_status_to_wms_state(dag_ad),
1445
- total_number_jobs=0,
1446
- jobs=[],
1447
- job_state_counts=dict.fromkeys(WmsStates, 0),
1448
- exit_code_summary={},
1449
- )
1450
-
1451
- payload_jobs = {} # keep track for later processing
1452
- specific_info = WmsSpecificInfo()
1453
- for job_id, job_ad in jobs.items():
1454
- if job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [WmsNodeType.PAYLOAD, WmsNodeType.FINAL]:
1455
- try:
1456
- name = job_ad.get("DAGNodeName", job_id)
1457
- wms_state = _htc_status_to_wms_state(job_ad)
1458
- job_report = WmsJobReport(
1459
- wms_id=job_id,
1460
- name=name,
1461
- label=job_ad.get("bps_job_label", pegasus_name_to_label(name)),
1462
- state=wms_state,
1463
- )
1464
- if job_report.label == "init":
1465
- job_report.label = "pipetaskInit"
1466
- report.job_state_counts[wms_state] += 1
1467
- report.jobs.append(job_report)
1468
- payload_jobs[job_id] = job_ad
1469
- except KeyError as ex:
1470
- _LOG.error("Job missing key '%s': %s", str(ex), job_ad)
1471
- raise
1472
- elif is_service_job(job_ad):
1473
- _LOG.debug(
1474
- "Found service job: id='%s', name='%s', label='%s', NodeStatus='%s', JobStatus='%s'",
1475
- job_id,
1476
- job_ad["DAGNodeName"],
1477
- job_ad.get("bps_job_label", "MISS"),
1478
- job_ad.get("NodeStatus", "MISS"),
1479
- job_ad.get("JobStatus", "MISS"),
1480
- )
1481
- _add_service_job_specific_info(job_ad, specific_info)
1482
-
1483
- report.total_number_jobs = len(payload_jobs)
1484
- report.exit_code_summary = _get_exit_code_summary(payload_jobs)
1485
- if specific_info:
1486
- report.specific_info = specific_info
1487
-
1488
- # Workflow will exit with non-zero DAG_STATUS if problem with
1489
- # any of the wms jobs. So change FAILED to SUCCEEDED if all
1490
- # payload jobs SUCCEEDED.
1491
- if report.total_number_jobs == report.job_state_counts[WmsStates.SUCCEEDED]:
1492
- report.state = WmsStates.SUCCEEDED
1493
-
1494
- run_reports = {report.wms_id: report}
1495
- _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
1496
- return run_reports
1497
-
1498
-
1499
- def _add_service_job_specific_info(job_ad: dict[str, Any], specific_info: WmsSpecificInfo) -> None:
1500
- """Generate report information for service job.
1501
-
1502
- Parameters
1503
- ----------
1504
- job_ad : `dict` [`str`, `~typing.Any`]
1505
- Provisioning job information.
1506
- specific_info : `lsst.ctrl.bps.WmsSpecificInfo`
1507
- Where to add message.
1508
- """
1509
- status_details = ""
1510
- job_status = _htc_status_to_wms_state(job_ad)
1511
-
1512
- # Service jobs in queue are deleted when DAG is done.
1513
- # To get accurate status, need to check other info.
1514
- if (
1515
- job_status == WmsStates.DELETED
1516
- and "Reason" in job_ad
1517
- and (
1518
- "Removed by DAGMan" in job_ad["Reason"]
1519
- or "removed because <OtherJobRemoveRequirements = DAGManJobId =?=" in job_ad["Reason"]
1520
- or "DAG is exiting and writing rescue file." in job_ad["Reason"]
1521
- )
1522
- ):
1523
- if "HoldReason" in job_ad:
1524
- # HoldReason exists even if released, so check.
1525
- if "job_released_time" in job_ad and job_ad["job_held_time"] < job_ad["job_released_time"]:
1526
- # If released, assume running until deleted.
1527
- job_status = WmsStates.SUCCEEDED
1528
- status_details = ""
1529
- else:
1530
- # If job held when deleted by DAGMan, still want to
1531
- # report hold reason
1532
- status_details = f"(Job was held for the following reason: {job_ad['HoldReason']})"
1533
-
1534
- else:
1535
- job_status = WmsStates.SUCCEEDED
1536
- elif job_status == WmsStates.SUCCEEDED:
1537
- status_details = "(Note: Finished before workflow.)"
1538
- elif job_status == WmsStates.HELD:
1539
- status_details = f"({job_ad['HoldReason']})"
1540
-
1541
- template = "Status of {job_name}: {status} {status_details}"
1542
- context = {
1543
- "job_name": job_ad["DAGNodeName"],
1544
- "status": job_status.name,
1545
- "status_details": status_details,
1546
- }
1547
- specific_info.add_message(template=template, context=context)
1548
-
1549
-
1550
- def _summary_report(user, hist, pass_thru, schedds=None):
1551
- """Gather run information to be used in generating summary reports.
1552
-
1553
- Parameters
1554
- ----------
1555
- user : `str`
1556
- Run lookup restricted to given user.
1557
- hist : `float`
1558
- How many previous days to search for run information.
1559
- pass_thru : `str`
1560
- Advanced users can define the HTCondor constraint to be used
1561
- when searching queue and history.
1562
-
1563
- Returns
1564
- -------
1565
- run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
1566
- Run information for the summary report. The keys are HTCondor ids and
1567
- the values are collections of report information for each run.
1568
- message : `str`
1569
- Message to be printed with the summary report.
1570
- """
1571
- # only doing summary report so only look for dagman jobs
1572
- if pass_thru:
1573
- constraint = pass_thru
1574
- else:
1575
- # Notes:
1576
- # * bps_isjob == 'True' isn't getting set for DAG jobs that are
1577
- # manually restarted.
1578
- # * Any job with DAGManJobID isn't a DAG job
1579
- constraint = 'bps_isjob == "True" && JobUniverse == 7'
1580
- if user:
1581
- constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'
1582
-
1583
- job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds)
1584
-
1585
- # Have list of DAGMan jobs, need to get run_report info.
1586
- run_reports = {}
1587
- msg = ""
1588
- for jobs in job_info.values():
1589
- for job_id, job in jobs.items():
1590
- total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1591
- # If didn't get from queue information (e.g., Kerberos bug),
1592
- # try reading from file.
1593
- if total_jobs == 0:
1594
- try:
1595
- job.update(read_dag_status(job["Iwd"]))
1596
- total_jobs, state_counts = _get_state_counts_from_dag_job(job)
1597
- except StopIteration:
1598
- pass # don't kill report if it can't find htcondor files
1599
-
1600
- if "bps_run" not in job:
1601
- _add_run_info(job["Iwd"], job)
1602
- report = WmsRunReport(
1603
- wms_id=job_id,
1604
- global_wms_id=job["GlobalJobId"],
1605
- path=job["Iwd"],
1606
- label=job.get("bps_job_label", "MISS"),
1607
- run=job.get("bps_run", "MISS"),
1608
- project=job.get("bps_project", "MISS"),
1609
- campaign=job.get("bps_campaign", "MISS"),
1610
- payload=job.get("bps_payload", "MISS"),
1611
- operator=_get_owner(job),
1612
- run_summary=_get_run_summary(job),
1613
- state=_htc_status_to_wms_state(job),
1614
- jobs=[],
1615
- total_number_jobs=total_jobs,
1616
- job_state_counts=state_counts,
1617
- )
1618
- run_reports[report.global_wms_id] = report
1619
-
1620
- return run_reports, msg
1621
-
1622
-
1623
- def _add_run_info(wms_path, job):
1624
- """Find BPS run information elsewhere for runs without bps attributes.
1625
-
1626
- Parameters
1627
- ----------
1628
- wms_path : `str`
1629
- Path to submit files for the run.
1630
- job : `dict` [`str`, `~typing.Any`]
1631
- HTCondor dag job information.
1632
-
1633
- Raises
1634
- ------
1635
- StopIteration
1636
- If the file it is looking for cannot be found. Permission errors are
1637
- caught and the job's run is marked with an error.
1638
- """
1639
- path = Path(wms_path) / "jobs"
1640
- try:
1641
- subfile = next(path.glob("**/*.sub"))
1642
- except (StopIteration, PermissionError):
1643
- job["bps_run"] = "Unavailable"
1644
- else:
1645
- _LOG.debug("_add_run_info: subfile = %s", subfile)
1646
- try:
1647
- with open(subfile, encoding="utf-8") as fh:
1648
- for line in fh:
1649
- if line.startswith("+bps_"):
1650
- m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
1651
- if m:
1652
- _LOG.debug("Matching line: %s", line)
1653
- job[m.group(1)] = m.group(2).replace('"', "")
1654
- else:
1655
- _LOG.debug("Could not parse attribute: %s", line)
1656
- except PermissionError:
1657
- job["bps_run"] = "PermissionError"
1658
- _LOG.debug("After adding job = %s", job)
1659
-
1660
-
1661
- def _get_owner(job):
1662
- """Get the owner of a dag job.
1663
-
1664
- Parameters
1665
- ----------
1666
- job : `dict` [`str`, `~typing.Any`]
1667
- HTCondor dag job information.
1668
-
1669
- Returns
1670
- -------
1671
- owner : `str`
1672
- Owner of the dag job.
1673
- """
1674
- owner = job.get("bps_operator", None)
1675
- if not owner:
1676
- owner = job.get("Owner", None)
1677
- if not owner:
1678
- _LOG.warning("Could not get Owner from htcondor job: %s", job)
1679
- owner = "MISS"
1680
- return owner
1681
-
1682
-
1683
- def _get_run_summary(job):
1684
- """Get the run summary for a job.
1685
-
1686
- Parameters
1687
- ----------
1688
- job : `dict` [`str`, `~typing.Any`]
1689
- HTCondor dag job information.
1690
-
1691
- Returns
1692
- -------
1693
- summary : `str`
1694
- Number of jobs per PipelineTask label in approximate pipeline order.
1695
- Format: <label>:<count>[;<label>:<count>]+
1696
- """
1697
- summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
1698
- if not summary:
1699
- summary, _, _ = summarize_dag(job["Iwd"])
1700
- if not summary:
1701
- _LOG.warning("Could not get run summary for htcondor job: %s", job)
1702
- _LOG.debug("_get_run_summary: summary=%s", summary)
1703
-
1704
- # Workaround sometimes using init vs pipetaskInit
1705
- summary = summary.replace("init:", "pipetaskInit:")
1706
-
1707
- if "pegasus_version" in job and "pegasus" not in summary:
1708
- summary += ";pegasus:0"
1709
-
1710
- return summary
1711
-
1712
-
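An example of the run-summary string in the format described above (labels and counts invented):

    "pipetaskInit:1;isr:24;characterizeImage:24;calibrate:24;finalJob:1"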
1713
- def _get_exit_code_summary(jobs):
1714
- """Get the exit code summary for a run.
1715
-
1716
- Parameters
1717
- ----------
1718
- jobs : `dict` [`str`, `dict` [`str`, Any]]
1719
- Mapping HTCondor job id to job information.
1720
-
1721
- Returns
1722
- -------
1723
- summary : `dict` [`str`, `list` [`int`]]
1724
- Jobs' exit codes per job label.
1725
- """
1726
- summary = {}
1727
- for job_id, job_ad in jobs.items():
1728
- job_label = job_ad["bps_job_label"]
1729
- summary.setdefault(job_label, [])
1730
- try:
1731
- exit_code = 0
1732
- job_status = job_ad["JobStatus"]
1733
- match job_status:
1734
- case htcondor.JobStatus.COMPLETED | htcondor.JobStatus.HELD:
1735
- exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["ExitCode"]
1736
- case (
1737
- htcondor.JobStatus.IDLE
1738
- | htcondor.JobStatus.RUNNING
1739
- | htcondor.JobStatus.REMOVED
1740
- | htcondor.JobStatus.TRANSFERRING_OUTPUT
1741
- | htcondor.JobStatus.SUSPENDED
1742
- ):
1743
- pass
1744
- case _:
1745
- _LOG.debug("Unknown 'JobStatus' value ('%d') in classad for job '%s'", job_status, job_id)
1746
- if exit_code != 0:
1747
- summary[job_label].append(exit_code)
1748
- except KeyError as ex:
1749
- _LOG.debug("Attribute '%s' not found in the classad for job '%s'", ex, job_id)
1750
- return summary
1751
-
1752
-
1753
- def _get_state_counts_from_jobs(
1754
- wms_workflow_id: str, jobs: dict[str, dict[str, Any]]
1755
- ) -> tuple[int, dict[WmsStates, int]]:
1756
- """Count number of jobs per WMS state.
1757
-
1758
- The workflow job and the service jobs are excluded from the count.
1759
-
1760
- Parameters
1761
- ----------
1762
- wms_workflow_id : `str`
1763
- HTCondor job id.
1764
- jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1765
- HTCondor dag job information.
1766
-
1767
- Returns
1768
- -------
1769
- total_count : `int`
1770
- Total number of dag nodes.
1771
- state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1772
- Keys are the different WMS states and values are counts of jobs
1773
- that are in that WMS state.
1774
- """
1775
- state_counts = dict.fromkeys(WmsStates, 0)
1776
- for job_id, job_ad in jobs.items():
1777
- if job_id != wms_workflow_id and job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [
1778
- WmsNodeType.PAYLOAD,
1779
- WmsNodeType.FINAL,
1780
- ]:
1781
- state_counts[_htc_status_to_wms_state(job_ad)] += 1
1782
- total_count = sum(state_counts.values())
1783
-
1784
- return total_count, state_counts
1785
-
1786
-
1787
- def _get_state_counts_from_dag_job(job):
1788
- """Count number of jobs per WMS state.
1789
-
1790
- Parameters
1791
- ----------
1792
- job : `dict` [`str`, `~typing.Any`]
1793
- HTCondor dag job information.
1794
-
1795
- Returns
1796
- -------
1797
- total_count : `int`
1798
- Total number of dag nodes.
1799
- state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
1800
- Keys are the different WMS states and values are counts of jobs
1801
- that are in that WMS state.
1802
- """
1803
- _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
1804
- state_counts = dict.fromkeys(WmsStates, 0)
1805
- if "DAG_NodesReady" in job:
1806
- state_counts = {
1807
- WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
1808
- WmsStates.READY: job.get("DAG_NodesReady", 0),
1809
- WmsStates.HELD: job.get("DAG_JobsHeld", 0),
1810
- WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
1811
- WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
1812
- WmsStates.PRUNED: job.get("DAG_NodesFutile", 0),
1813
- WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0),
1814
- }
1815
- total_jobs = job.get("DAG_NodesTotal")
1816
- _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
1817
- elif "NodesFailed" in job:
1818
- state_counts = {
1819
- WmsStates.UNREADY: job.get("NodesUnready", 0),
1820
- WmsStates.READY: job.get("NodesReady", 0),
1821
- WmsStates.HELD: job.get("JobProcsHeld", 0),
1822
- WmsStates.SUCCEEDED: job.get("NodesDone", 0),
1823
- WmsStates.FAILED: job.get("NodesFailed", 0),
1824
- WmsStates.PRUNED: job.get("NodesFutile", 0),
1825
- WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0),
1826
- }
1827
- try:
1828
- total_jobs = job.get("NodesTotal")
1829
- except KeyError as ex:
1830
- _LOG.error("Job missing %s. job = %s", str(ex), job)
1831
- raise
1832
- _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
1833
- else:
1834
- # With Kerberos job auth and the Kerberos bug, a warning here would be printed
1835
- # for every DAG.
1836
- _LOG.debug("Can't get job state counts %s", job["Iwd"])
1837
- total_jobs = 0
1838
-
1839
- _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
1840
- return total_jobs, state_counts
1841
-
1842
-
1843
- def _htc_status_to_wms_state(job):
1844
- """Convert HTCondor job status to generic wms state.
1845
-
1846
- Parameters
1847
- ----------
1848
- job : `dict` [`str`, `~typing.Any`]
1849
- HTCondor job information.
1850
-
1851
- Returns
1852
- -------
1853
- wms_state : `WmsStates`
1854
- The equivalent WmsState to given job's status.
1855
- """
1856
- wms_state = WmsStates.MISFIT
1857
- if "JobStatus" in job:
1858
- wms_state = _htc_job_status_to_wms_state(job)
1859
-
1860
- if wms_state == WmsStates.MISFIT and "NodeStatus" in job:
1861
- wms_state = _htc_node_status_to_wms_state(job)
1862
- return wms_state
1863
-
1864
-
1865
- def _htc_job_status_to_wms_state(job):
1866
- """Convert HTCondor job status to generic wms state.
1867
-
1868
- Parameters
1869
- ----------
1870
- job : `dict` [`str`, `~typing.Any`]
1871
- HTCondor job information.
1872
-
1873
- Returns
1874
- -------
1875
- wms_state : `lsst.ctrl.bps.WmsStates`
1876
- The equivalent WmsState to given job's status.
1877
- """
1878
- _LOG.debug(
1879
- "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"])
1880
- )
1881
- wms_state = WmsStates.MISFIT
1882
- if "JobStatus" in job and job["JobStatus"]:
1883
- job_status = int(job["JobStatus"])
1884
-
1885
- _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
1886
- if job_status == htcondor.JobStatus.IDLE:
1887
- wms_state = WmsStates.PENDING
1888
- elif job_status == htcondor.JobStatus.RUNNING:
1889
- wms_state = WmsStates.RUNNING
1890
- elif job_status == htcondor.JobStatus.REMOVED:
1891
- wms_state = WmsStates.DELETED
1892
- elif job_status == htcondor.JobStatus.COMPLETED:
1893
- if (
1894
- (job.get("ExitBySignal", False) and job.get("ExitSignal", 0))
1895
- or job.get("ExitCode", 0)
1896
- or job.get("DAG_Status", 0)
1897
- ):
1898
- wms_state = WmsStates.FAILED
1899
- else:
1900
- wms_state = WmsStates.SUCCEEDED
1901
- elif job_status == htcondor.JobStatus.HELD:
1902
- wms_state = WmsStates.HELD
1903
-
1904
- return wms_state
1905
-
1906
-
1907
- def _htc_node_status_to_wms_state(job):
1908
- """Convert HTCondor node status to generic wms state.
1909
-
1910
- Parameters
1911
- ----------
1912
- job : `dict` [`str`, `~typing.Any`]
1913
- HTCondor job information.
1914
-
1915
- Returns
1916
- -------
1917
- wms_state : `lsst.ctrl.bps.WmsStates`
1918
- The equivalent WmsState to given node's status.
1919
- """
1920
- wms_state = WmsStates.MISFIT
1921
- match job["NodeStatus"]:
1922
- case NodeStatus.NOT_READY:
1923
- wms_state = WmsStates.UNREADY
1924
- case NodeStatus.READY:
1925
- wms_state = WmsStates.READY
1926
- case NodeStatus.PRERUN:
1927
- wms_state = WmsStates.MISFIT
1928
- case NodeStatus.SUBMITTED:
1929
- if job["JobProcsHeld"]:
1930
- wms_state = WmsStates.HELD
1931
- elif job["StatusDetails"] == "not_idle":
1932
- wms_state = WmsStates.RUNNING
1933
- elif job["JobProcsQueued"]:
1934
- wms_state = WmsStates.PENDING
1935
- case NodeStatus.POSTRUN:
1936
- wms_state = WmsStates.MISFIT
1937
- case NodeStatus.DONE:
1938
- wms_state = WmsStates.SUCCEEDED
1939
- case NodeStatus.ERROR:
1940
- # Use job exit status instead of post script exit status.
1941
- if "DAGMAN error 0" in job["StatusDetails"]:
1942
- wms_state = WmsStates.SUCCEEDED
1943
- elif "ULOG_JOB_ABORTED" in job["StatusDetails"]:
1944
- wms_state = WmsStates.DELETED
1945
- else:
1946
- wms_state = WmsStates.FAILED
1947
- case NodeStatus.FUTILE:
1948
- wms_state = WmsStates.PRUNED
1949
- return wms_state
1950
-
1951
-
1952
- def _update_jobs(jobs1, jobs2):
1953
- """Update jobs1 with info in jobs2.
1954
-
1955
- (Basically an update for nested dictionaries.)
1956
-
1957
- Parameters
1958
- ----------
1959
- jobs1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1960
- HTCondor job information to be updated.
1961
- jobs2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1962
- Additional HTCondor job information.
1963
- """
1964
- for job_id, job_ad in jobs2.items():
1965
- if job_id in jobs1:
1966
- jobs1[job_id].update(job_ad)
1967
- else:
1968
- jobs1[job_id] = job_ad
1969
-
1970
-
1971
- def _wms_id_type(wms_id):
1972
- """Determine the type of the WMS id.
1973
-
1974
- Parameters
1975
- ----------
1976
- wms_id : `str`
1977
- WMS id identifying a job.
1978
-
1979
- Returns
1980
- -------
1981
- id_type : `lsst.ctrl.bps.htcondor.WmsIdType`
1982
- Type of WMS id.
1983
- """
1984
- try:
1985
- int(float(wms_id))
1986
- except ValueError:
1987
- wms_path = Path(wms_id)
1988
- if wms_path.is_dir():
1989
- id_type = WmsIdType.PATH
1990
- else:
1991
- id_type = WmsIdType.GLOBAL
1992
- except TypeError:
1993
- id_type = WmsIdType.UNKNOWN
1994
- else:
1995
- id_type = WmsIdType.LOCAL
1996
- return id_type
1997
-
1998
-
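Illustrative classifications for _wms_id_type (inputs invented):

    _wms_id_type("1234.0")                    # WmsIdType.LOCAL: parses as a number
    _wms_id_type("sched1#1234.0#1700000000")  # WmsIdType.GLOBAL: not a number, not an existing dir
    _wms_id_type("/data/bps/submit/run1")     # WmsIdType.PATH: only if the directory exists
    _wms_id_type(None)                        # WmsIdType.UNKNOWN: int(float(None)) raises TypeError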
1999
- def _wms_id_to_cluster(wms_id):
2000
- """Convert WMS id to cluster id.
2001
-
2002
- Parameters
2003
- ----------
2004
- wms_id : `int` or `float` or `str`
2005
- HTCondor job id or path.
2006
-
2007
- Returns
2008
- -------
2009
- schedd_ad : `classad.ClassAd`
2010
- ClassAd describing the scheduler managing the job with the given id.
2011
- cluster_id : `int`
2012
- HTCondor cluster id.
2013
- id_type : `lsst.ctrl.bps.wms.htcondor.IdType`
2014
- The type of the provided id.
2015
- """
2016
- coll = htcondor.Collector()
2017
-
2018
- schedd_ad = None
2019
- cluster_id = None
2020
- id_type = _wms_id_type(wms_id)
2021
- if id_type == WmsIdType.LOCAL:
2022
- schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
2023
- cluster_id = int(float(wms_id))
2024
- elif id_type == WmsIdType.GLOBAL:
2025
- constraint = f'GlobalJobId == "{wms_id}"'
2026
- schedd_ads = {ad["Name"]: ad for ad in coll.locateAll(htcondor.DaemonTypes.Schedd)}
2027
- schedds = {name: htcondor.Schedd(ad) for name, ad in schedd_ads.items()}
2028
- job_info = condor_q(constraint=constraint, schedds=schedds)
2029
- if job_info:
2030
- schedd_name, job_rec = job_info.popitem()
2031
- job_id, _ = job_rec.popitem()
2032
- schedd_ad = schedd_ads[schedd_name]
2033
- cluster_id = int(float(job_id))
2034
- elif id_type == WmsIdType.PATH:
2035
- try:
2036
- job_info = read_dag_info(wms_id)
2037
- except (FileNotFoundError, PermissionError, OSError):
2038
- pass
2039
- else:
2040
- schedd_name, job_rec = job_info.popitem()
2041
- job_id, _ = job_rec.popitem()
2042
- schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, schedd_name)
2043
- cluster_id = int(float(job_id))
2044
- else:
2045
- pass
2046
- return schedd_ad, cluster_id, id_type
2047
-
2048
-
2049
- def _wms_id_to_dir(wms_id):
2050
- """Convert WMS id to a submit directory candidate.
2051
-
2052
- The function does not check if the directory exists or if it is a valid
2053
- BPS submit directory.
2054
-
2055
- Parameters
2056
- ----------
2057
- wms_id : `int` or `float` or `str`
2058
- HTCondor job id or path.
2059
-
2060
- Returns
2061
- -------
2062
- wms_path : `pathlib.Path` or None
2063
- Submit directory candidate for the run with the given job id. If no
2064
- directory can be associated with the provided WMS id, it will be set
2065
- to None.
2066
- id_type : `lsst.ctrl.bps.wms.htcondor.IdType`
2067
- The type of the provided id.
2068
-
2069
- Raises
2070
- ------
2071
- TypeError
2072
- Raised if provided WMS id has invalid type.
2073
- """
2074
- coll = htcondor.Collector()
2075
- schedd_ads = []
2076
-
2077
- constraint = None
2078
- wms_path = None
2079
- id_type = _wms_id_type(wms_id)
2080
- match id_type:
2081
- case WmsIdType.LOCAL:
2082
- constraint = f"ClusterId == {int(float(wms_id))}"
2083
- schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
2084
- case WmsIdType.GLOBAL:
2085
- constraint = f'GlobalJobId == "{wms_id}"'
2086
- schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
2087
- case WmsIdType.PATH:
2088
- wms_path = Path(wms_id).resolve()
2089
- case WmsIdType.UNKNOWN:
2090
- raise TypeError(f"Invalid job id type: {wms_id}")
2091
- if constraint is not None:
2092
- schedds = {ad["name"]: htcondor.Schedd(ad) for ad in schedd_ads}
2093
- job_info = condor_history(constraint=constraint, schedds=schedds, projection=["Iwd"])
2094
- if job_info:
2095
- _, job_rec = job_info.popitem()
2096
- _, job_ad = job_rec.popitem()
2097
- wms_path = Path(job_ad["Iwd"])
2098
- return wms_path, id_type
2099
-
2100
-
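A sketch of the path branch, which needs no pool query when the directory exists (the module path and submit directory are hypothetical):

from lsst.ctrl.bps.htcondor.htcondor_service import WmsIdType, _wms_id_to_dir

wms_path, id_type = _wms_id_to_dir("submit/u/jdoe/pipelines_check/20250101T000000Z")
# If that directory exists: wms_path is its resolved Path and id_type is WmsIdType.PATH.
# For LOCAL/GLOBAL ids the schedd history is searched and wms_path is taken from the
# job's Iwd attribute; it is None when no matching job is found.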
2101
- def _create_periodic_release_expr(
2102
- memory: int, multiplier: float | None, limit: int, additional_expr: str = ""
2103
- ) -> str:
2104
- """Construct an HTCondorAd expression for releasing held jobs.
2105
-
2106
- Parameters
2107
- ----------
2108
- memory : `int`
2109
- Requested memory in MB.
2110
- multiplier : `float` or None
2111
- Memory growth rate between retries.
2112
- limit : `int`
2113
- Memory limit.
2114
- additional_expr : `str`, optional
2115
- Expression to add to periodic_release. Defaults to empty string.
2116
-
2117
- Returns
2118
- -------
2119
- expr : `str`
2120
- A string representing an HTCondor ClassAd expression for releasing a held job.
2121
- """
2122
- _LOG.debug(
2123
- "periodic_release: memory: %s, multiplier: %s, limit: %s, additional_expr: %s",
2124
- memory,
2125
- multiplier,
2126
- limit,
2127
- additional_expr,
2128
- )
2129
-
2130
- # ctrl_bps sets multiplier to None in the GenericWorkflow if
2131
- # memoryMultiplier <= 1, but checking value just in case.
2132
- if (not multiplier or multiplier <= 1) and not additional_expr:
2133
- return ""
2134
-
2135
- # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
2136
- # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
2137
- # The special comparison operators ensure that all comparisons below will
2138
- # evaluate to FALSE in this case.
2139
- #
2140
- # Note:
2141
- # May not be strictly necessary. Operators '&&' and '||' are not strict so
2142
- # the entire expression should evaluate to FALSE when the job is not HELD.
2143
- # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
2144
- # but better safe than sorry.
2145
- is_held = "JobStatus == 5"
2146
- is_retry_allowed = "NumJobStarts <= JobMaxRetries"
2147
-
2148
- mem_expr = ""
2149
- if memory and multiplier and multiplier > 1 and limit:
2150
- was_mem_exceeded = (
2151
- "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
2152
- "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
2153
- )
2154
- was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
2155
- mem_expr = f"{was_mem_exceeded} && {was_below_limit}"
2156
-
2157
- user_expr = ""
2158
- if additional_expr:
2159
- # Never auto release a job held by user.
2160
- user_expr = f"HoldReasonCode =!= 1 && {additional_expr}"
2161
-
2162
- expr = f"{is_held} && {is_retry_allowed}"
2163
- if user_expr and mem_expr:
2164
- expr += f" && ({mem_expr} || {user_expr})"
2165
- elif user_expr:
2166
- expr += f" && {user_expr}"
2167
- elif mem_expr:
2168
- expr += f" && {mem_expr}"
2169
-
2170
- return expr
2171
-
2172
-
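For concreteness, a sketch of the expression produced for a job that starts at 2048 MB, doubles its memory request on each retry, and is capped at 8192 MB (module path assumed; the user expression in the second call is purely illustrative):

from lsst.ctrl.bps.htcondor.htcondor_service import _create_periodic_release_expr

expr = _create_periodic_release_expr(memory=2048, multiplier=2.0, limit=8192)
# expr (wrapped here for readability):
#   JobStatus == 5 && NumJobStarts <= JobMaxRetries
#   && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#       || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#   && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192

# With only a user-supplied releaseExpr, jobs held by the user (HoldReasonCode 1)
# are still never auto-released:
expr = _create_periodic_release_expr(2048, None, 0, additional_expr="NumJobStarts < 3")
# -> JobStatus == 5 && NumJobStarts <= JobMaxRetries && HoldReasonCode =!= 1 && NumJobStarts < 3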
2173
- def _create_periodic_remove_expr(memory, multiplier, limit):
2174
- """Construct an HTCondorAd expression for removing jobs from the queue.
2175
-
2176
- Parameters
2177
- ----------
2178
- memory : `int`
2179
- Requested memory in MB.
2180
- multiplier : `float`
2181
- Memory growth rate between retries.
2182
- limit : `int`
2183
- Memory limit.
2184
-
2185
- Returns
2186
- -------
2187
- expr : `str`
2188
- A string representing an HTCondor ClassAd expression for removing jobs.
2189
- """
2190
- # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode'
2191
- # are UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
2192
- # The special comparison operators ensure that all comparisons below
2193
- # will evaluate to FALSE in this case.
2194
- #
2195
- # Note:
2196
- # May not be strictly necessary. Operators '&&' and '||' are not
2197
- # strict so the entire expression should evaluate to FALSE when the
2198
- # job is not HELD. According to ClassAd evaluation semantics
2199
- # FALSE && UNDEFINED is FALSE, but better safe than sorry.
2200
- is_held = "JobStatus == 5"
2201
- is_retry_disallowed = "NumJobStarts > JobMaxRetries"
2202
-
2203
- mem_expr = ""
2204
- if memory and multiplier and multiplier > 1 and limit:
2205
- mem_limit_expr = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
2206
-
2207
- mem_expr = (  # Leading '||' so the clause is only appended when the memory expression applies
2208
- " || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
2209
- f"|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && {mem_limit_expr})"
2210
- )
2211
-
2212
- expr = f"{is_held} && ({is_retry_disallowed}{mem_expr})"
2213
- return expr
2214
-
2215
-
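The same hypothetical inputs produce the companion removal expression; a sketch (module path assumed):

from lsst.ctrl.bps.htcondor.htcondor_service import _create_periodic_remove_expr

expr = _create_periodic_remove_expr(2048, 2.0, 8192)
# expr (wrapped here for readability):
#   JobStatus == 5 && (NumJobStarts > JobMaxRetries
#       || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
#            || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
#           && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192))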
2216
- def _create_request_memory_expr(memory, multiplier, limit):
2217
- """Construct an HTCondor ClassAd expression for safe memory scaling.
2218
-
2219
- Parameters
2220
- ----------
2221
- memory : `int`
2222
- Requested memory in MB.
2223
- multiplier : `float`
2224
- Memory growth rate between retries.
2225
- limit : `int`
2226
- Memory limit.
2227
-
2228
- Returns
2229
- -------
2230
- expr : `str`
2231
- A string representing an HTCondor ClassAd expression enabling safe
2232
- memory scaling between job retries.
2233
- """
2234
- # The check if the job was held due to exceeding memory requirements
2235
- will be made *after* the job was released back to the job queue (is in
2236
- # the IDLE state), hence the need to use `Last*` job ClassAds instead of
2237
- # the ones describing job's current state.
2238
- #
2239
- # Also, 'Last*' job ClassAds attributes are UNDEFINED when a job is
2240
- # initially put in the job queue. The special comparison operators ensure
2241
- # that all comparisons below will evaluate to FALSE in this case.
2242
- was_mem_exceeded = (
2243
- "LastJobStatus =?= 5 "
2244
- "&& (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0 "
2245
- "|| LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34)"
2246
- )
2247
-
2248
- # If the job runs for the first time or was held for reasons other than exceeding
2249
- # the memory, set the required memory to the requested value or use
2250
- # the memory value measured by HTCondor (MemoryUsage) depending on
2251
- # whichever is greater.
2252
- expr = (
2253
- f"({was_mem_exceeded}) "
2254
- f"? min({{int({memory} * pow({multiplier}, NumJobStarts)), {limit}}}) "
2255
- f": max({{{memory}, MemoryUsage ?: 0}})"
2256
- )
2257
- return expr
2258
-
2259
-
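A sketch of the resulting memory-scaling expression for the same hypothetical inputs (module path assumed):

from lsst.ctrl.bps.htcondor.htcondor_service import _create_request_memory_expr

expr = _create_request_memory_expr(2048, 2.0, 8192)
# expr (wrapped here for readability):
#   (LastJobStatus =?= 5
#    && (LastHoldReasonCode =?= 34 && LastHoldReasonSubCode =?= 0
#        || LastHoldReasonCode =?= 3 && LastHoldReasonSubCode =?= 34))
#   ? min({int(2048 * pow(2.0, NumJobStarts)), 8192})
#   : max({2048, MemoryUsage ?: 0})
# Typically assigned to the job's request_memory so each retry after a memory-related
# hold asks for more memory, up to the pool limit.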
2260
- def _locate_schedds(locate_all=False):
2261
- """Find out Scheduler daemons in an HTCondor pool.
2262
-
2263
- Parameters
2264
- ----------
2265
- locate_all : `bool`, optional
2266
- If True, all available schedulers in the HTCondor pool will be located.
2267
- False by default, which means that the search will be limited to looking
2268
- for the Scheduler running on the local host.
2269
-
2270
- Returns
2271
- -------
2272
- schedds : `dict` [`str`, `htcondor.Schedd`]
2273
- A mapping between Scheduler names and Python objects allowing for
2274
- interacting with them.
2275
- """
2276
- coll = htcondor.Collector()
2277
-
2278
- schedd_ads = []
2279
- if locate_all:
2280
- schedd_ads.extend(coll.locateAll(htcondor.DaemonTypes.Schedd))
2281
- else:
2282
- schedd_ads.append(coll.locate(htcondor.DaemonTypes.Schedd))
2283
- return {ad["Name"]: htcondor.Schedd(ad) for ad in schedd_ads}
2284
-
2285
-
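A usage sketch, assuming a reachable pool; the import paths follow the pre-refactor layout and the constraint is only an example:

from lsst.ctrl.bps.htcondor.htcondor_service import _locate_schedds
from lsst.ctrl.bps.htcondor.lssthtc import condor_q

schedds = _locate_schedds(locate_all=True)   # e.g. {"sched01.example.org": <htcondor.Schedd>, ...}
job_info = condor_q(constraint="JobUniverse == 7", schedds=schedds)  # scheduler-universe (DAGMan) jobs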
2286
- def _gather_site_values(config, compute_site):
2287
- """Gather values specific to given site.
2288
-
2289
- Parameters
2290
- ----------
2291
- config : `lsst.ctrl.bps.BpsConfig`
2292
- BPS configuration that includes necessary submit/runtime
2293
- information.
2294
- compute_site : `str`
2295
- Compute site name.
2296
-
2297
- Returns
2298
- -------
2299
- site_values : `dict` [`str`, `~typing.Any`]
2300
- Values specific to the given site.
2301
- """
2302
- site_values = {"attrs": {}, "profile": {}}
2303
- search_opts = {}
2304
- if compute_site:
2305
- search_opts["curvals"] = {"curr_site": compute_site}
2306
-
2307
- # Determine the hard limit for the memory requirement.
2308
- found, limit = config.search("memoryLimit", opt=search_opts)
2309
- if not found:
2310
- search_opts["default"] = DEFAULT_HTC_EXEC_PATT
2311
- _, patt = config.search("executeMachinesPattern", opt=search_opts)
2312
- del search_opts["default"]
2313
-
2314
- # To reduce the amount of data, ignore dynamic slots (if any) as,
2315
- # by definition, they cannot have more memory than
2316
- # the partitionable slot they are the part of.
2317
- constraint = f'SlotType != "Dynamic" && regexp("{patt}", Machine)'
2318
- pool_info = condor_status(constraint=constraint)
2319
- try:
2320
- limit = max(int(info["TotalSlotMemory"]) for info in pool_info.values())
2321
- except ValueError:
2322
- _LOG.debug("No execute machine in the pool matches %s", patt)
2323
- if limit:
2324
- config[".bps_defined.memory_limit"] = limit
2325
-
2326
- _, site_values["bpsUseShared"] = config.search("bpsUseShared", opt={"default": False})
2327
- site_values["memoryLimit"] = limit
2328
-
2329
- found, value = config.search("accountingGroup", opt=search_opts)
2330
- if found:
2331
- site_values["accountingGroup"] = value
2332
- found, value = config.search("accountingUser", opt=search_opts)
2333
- if found:
2334
- site_values["accountingUser"] = value
2335
-
2336
- key = f".site.{compute_site}.profile.condor"
2337
- if key in config:
2338
- for subkey, val in config[key].items():
2339
- if subkey.startswith("+"):
2340
- site_values["attrs"][subkey[1:]] = val
2341
- else:
2342
- site_values["profile"][subkey] = val
2343
-
2344
- return site_values
2345
-
2346
-
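To make the mapping concrete, a sketch under the assumption that `config` is the submission's BpsConfig and that the site name and profile entries below are purely hypothetical:

# Submit YAML fragment assumed to be present in `config`:
#
#   site:
#     example_site:
#       memoryLimit: 262144
#       profile:
#         condor:
#           +JOB_NODE_SET: "nodes1"
#           requirements: '(ALLOCATED_NODE_SET == "nodes1")'
#
site_values = _gather_site_values(config, "example_site")
# site_values == {
#     "attrs": {"JOB_NODE_SET": "nodes1"},          # "+" keys become job ClassAd attributes
#     "profile": {"requirements": '(ALLOCATED_NODE_SET == "nodes1")'},
#     "bpsUseShared": False,
#     "memoryLimit": 262144,
# }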
2347
- def _gather_label_values(config: BpsConfig, label: str) -> dict[str, Any]:
2348
- """Gather values specific to given job label.
2349
-
2350
- Parameters
2351
- ----------
2352
- config : `lsst.ctrl.bps.BpsConfig`
2353
- BPS configuration that includes necessary submit/runtime
2354
- information.
2355
- label : `str`
2356
- GenericWorkflowJob label.
2357
-
2358
- Returns
2359
- -------
2360
- values : `dict` [`str`, `~typing.Any`]
2361
- Values specific to the given job label.
2362
- """
2363
- values: dict[str, Any] = {"attrs": {}, "profile": {}}
2364
-
2365
- search_opts = {}
2366
- profile_key = ""
2367
- if label == "finalJob":
2368
- search_opts["searchobj"] = config["finalJob"]
2369
- profile_key = ".finalJob.profile.condor"
2370
- elif label in config["cluster"]:
2371
- search_opts["curvals"] = {"curr_cluster": label}
2372
- profile_key = f".cluster.{label}.profile.condor"
2373
- elif label in config["pipetask"]:
2374
- search_opts["curvals"] = {"curr_pipetask": label}
2375
- profile_key = f".pipetask.{label}.profile.condor"
2376
-
2377
- found, value = config.search("releaseExpr", opt=search_opts)
2378
- if found:
2379
- values["releaseExpr"] = value
2380
-
2381
- found, value = config.search("overwriteJobFiles", opt=search_opts)
2382
- if found:
2383
- values["overwriteJobFiles"] = value
2384
- else:
2385
- values["overwriteJobFiles"] = True
2386
-
2387
- if profile_key and profile_key in config:
2388
- for subkey, val in config[profile_key].items():
2389
- if subkey.startswith("+"):
2390
- values["attrs"][subkey[1:]] = val
2391
- else:
2392
- values["profile"][subkey] = val
2393
-
2394
- return values
2395
-
2396
-
2397
- def is_service_job(job_ad: dict[str, Any]) -> bool:
2398
- """Determine if a job is a service one.
2399
-
2400
- Parameters
2401
- ----------
2402
- job_ad : `dict` [`str`, Any]
2403
- Information about an HTCondor job.
2404
-
2405
- Returns
2406
- -------
2407
- is_service_job : `bool`
2408
- `True` if the job is a service job, `False` otherwise.
2409
-
2410
- Notes
2411
- -----
2412
- At the moment, HTCondor does not provide a native way to distinguish
2413
- between payload and service jobs in the workflow. This code depends
2414
- on read_node_status adding wms_node_type.
2415
- """
2416
- return job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) == WmsNodeType.SERVICE
2417
-
2418
-
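A minimal sketch; the job ads are fabricated, the import paths assume the pre-refactor layout, and `wms_node_type` is normally attached by `read_node_status`:

from lsst.ctrl.bps.htcondor.htcondor_service import is_service_job
from lsst.ctrl.bps.htcondor.lssthtc import WmsNodeType

service_ad = {"ClusterId": 9158, "wms_node_type": WmsNodeType.SERVICE}
other_ad = {"ClusterId": 9157}            # no wms_node_type -> treated as UNKNOWN

assert is_service_job(service_ad)
assert not is_service_job(other_ad)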
2419
- def _group_to_subdag(
2420
- config: BpsConfig, generic_workflow_group: GenericWorkflowGroup, out_prefix: str
2421
- ) -> HTCJob:
2422
- """Convert a generic workflow group to an HTCondor dag.
2423
-
2424
- Parameters
2425
- ----------
2426
- config : `lsst.ctrl.bps.BpsConfig`
2427
- Workflow configuration.
2428
- generic_workflow_group : `lsst.ctrl.bps.GenericWorkflowGroup`
2429
- The generic workflow group to convert.
2430
- out_prefix : `str`
2431
- Location prefix to be used when creating jobs.
2432
-
2433
- Returns
2434
- -------
2435
- htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
2436
- Job for running the HTCondor dag.
2437
- """
2438
- jobname = f"wms_{generic_workflow_group.name}"
2439
- htc_job = HTCJob(name=jobname, label=generic_workflow_group.label)
2440
- htc_job.add_dag_cmds({"dir": f"subdags/{jobname}"})
2441
- htc_job.subdag = _generic_workflow_to_htcondor_dag(config, generic_workflow_group, out_prefix)
2442
- if not generic_workflow_group.blocking:
2443
- htc_job.dagcmds["post"] = {
2444
- "defer": "",
2445
- "executable": f"{os.path.dirname(__file__)}/subdag_post.sh",
2446
- "arguments": f"{jobname} $RETURN",
2447
- }
2448
- return htc_job
2449
-
2450
-
2451
- def _create_check_job(group_job_name: str, job_label: str) -> HTCJob:
2452
- """Create a job to check status of a group job.
2453
-
2454
- Parameters
2455
- ----------
2456
- group_job_name : `str`
2457
- Name of the group job.
2458
- job_label : `str`
2459
- Label to use for the check status job.
2460
-
2461
- Returns
2462
- -------
2463
- htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
2464
- Job description for the job to check group job status.
2465
- """
2466
- htc_job = HTCJob(name=f"wms_check_status_{group_job_name}", label=job_label)
2467
- htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/check_group_status.sub"
2468
- htc_job.add_dag_cmds({"dir": f"subdags/{group_job_name}", "vars": {"group_job_name": group_job_name}})
2469
-
2470
- return htc_job
2471
-
2472
-
2473
- def _generic_workflow_to_htcondor_dag(
2474
- config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str
2475
- ) -> HTCDag:
2476
- """Convert a GenericWorkflow to a HTCDag.
2477
-
2478
- Parameters
2479
- ----------
2480
- config : `lsst.ctrl.bps.BpsConfig`
2481
- Workflow configuration.
2482
- generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
2483
- The GenericWorkflow to convert.
2484
- out_prefix : `str`
2485
- Location prefix where the HTCondor files will be written.
2486
-
2487
- Returns
2488
- -------
2489
- dag : `lsst.ctrl.bps.htcondor.HTCDag`
2490
- The HTCDag representation of the given GenericWorkflow.
2491
- """
2492
- dag = HTCDag(name=generic_workflow.name)
2493
-
2494
- _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
2495
- dag.add_attribs(generic_workflow.run_attrs)
2496
- dag.add_attribs(
2497
- {
2498
- "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
2499
- "bps_job_summary": create_count_summary(generic_workflow.job_counts),
2500
- }
2501
- )
2502
-
2503
- _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
2504
- if isinstance(tmp_template, str):
2505
- subdir_template = defaultdict(lambda: tmp_template)
2506
- else:
2507
- subdir_template = tmp_template
2508
-
2509
- # Create all DAG jobs
2510
- site_values = {} # Cache compute site specific values to reduce config lookups.
2511
- cached_values = {} # Cache label-specific values to reduce config lookups.
2512
- # Note: Can't use get_job_by_label because those only include payload jobs.
2513
- for job_name in generic_workflow:
2514
- gwjob = generic_workflow.get_job(job_name)
2515
- if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD:
2516
- gwjob = cast(GenericWorkflowJob, gwjob)
2517
- if gwjob.compute_site not in site_values:
2518
- site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
2519
- if gwjob.label not in cached_values:
2520
- cached_values[gwjob.label] = deepcopy(site_values[gwjob.compute_site])
2521
- cached_values[gwjob.label].update(_gather_label_values(config, gwjob.label))
2522
- _LOG.debug("cached: %s= %s", gwjob.label, cached_values[gwjob.label])
2523
- htc_job = _create_job(
2524
- subdir_template[gwjob.label],
2525
- cached_values[gwjob.label],
2526
- generic_workflow,
2527
- gwjob,
2528
- out_prefix,
2529
- )
2530
- elif gwjob.node_type == GenericWorkflowNodeType.NOOP:
2531
- gwjob = cast(GenericWorkflowNoopJob, gwjob)
2532
- htc_job = HTCJob(f"wms_{gwjob.name}", label=gwjob.label)
2533
- htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/noop.sub"
2534
- htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
2535
- htc_job.add_dag_cmds({"noop": True})
2536
- elif gwjob.node_type == GenericWorkflowNodeType.GROUP:
2537
- gwjob = cast(GenericWorkflowGroup, gwjob)
2538
- htc_job = _group_to_subdag(config, gwjob, out_prefix)
2539
- # In case DAGMAN_GENERATE_SUBDAG_SUBMITS is False, ask condor_submit_dag to
2540
- dag.graph["submit_options"]["do_recurse"] = True
2541
- else:
2542
- raise RuntimeError(f"Unsupported generic workflow node type {gwjob.node_type} ({gwjob.name})")
2543
- _LOG.debug("Calling adding job %s %s", htc_job.name, htc_job.label)
2544
- dag.add_job(htc_job)
2545
-
2546
- # Add job dependencies to the DAG (be careful with wms_ jobs)
2547
- for job_name in generic_workflow:
2548
- gwjob = generic_workflow.get_job(job_name)
2549
- parent_name = (
2550
- gwjob.name if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD else f"wms_{gwjob.name}"
2551
- )
2552
- successor_jobs = [generic_workflow.get_job(j) for j in generic_workflow.successors(job_name)]
2553
- children_names = []
2554
- if gwjob.node_type == GenericWorkflowNodeType.GROUP:
2555
- gwjob = cast(GenericWorkflowGroup, gwjob)
2556
- group_children = [] # Dependencies between same group jobs
2557
- for sjob in successor_jobs:
2558
- if sjob.node_type == GenericWorkflowNodeType.GROUP and sjob.label == gwjob.label:
2559
- group_children.append(f"wms_{sjob.name}")
2560
- elif sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
2561
- children_names.append(sjob.name)
2562
- else:
2563
- children_names.append(f"wms_{sjob.name}")
2564
- if group_children:
2565
- dag.add_job_relationships([parent_name], group_children)
2566
- if not gwjob.blocking:
2567
- # Since subdag will always succeed, need to add a special
2568
- # job that fails if group failed to block payload children.
2569
- check_job = _create_check_job(f"wms_{gwjob.name}", gwjob.label)
2570
- dag.add_job(check_job)
2571
- dag.add_job_relationships([f"wms_{gwjob.name}"], [check_job.name])
2572
- parent_name = check_job.name
2573
- else:
2574
- for sjob in successor_jobs:
2575
- if sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
2576
- children_names.append(sjob.name)
2577
- else:
2578
- children_names.append(f"wms_{sjob.name}")
2579
-
2580
- dag.add_job_relationships([parent_name], children_names)
2581
-
2582
- # If final job exists in generic workflow, create DAG final job
2583
- final = generic_workflow.get_final()
2584
- if final and isinstance(final, GenericWorkflowJob):
2585
- if final.compute_site and final.compute_site not in site_values:
2586
- site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
2587
- if final.label not in cached_values:
2588
- cached_values[final.label] = deepcopy(site_values[final.compute_site])
2589
- cached_values[final.label].update(_gather_label_values(config, final.label))
2590
- final_htjob = _create_job(
2591
- subdir_template[final.label],
2592
- cached_values[final.label],
2593
- generic_workflow,
2594
- final,
2595
- out_prefix,
2596
- )
2597
- if "post" not in final_htjob.dagcmds:
2598
- final_htjob.dagcmds["post"] = {
2599
- "defer": "",
2600
- "executable": f"{os.path.dirname(__file__)}/final_post.sh",
2601
- "arguments": f"{final.name} $DAG_STATUS $RETURN",
2602
- }
2603
- dag.add_final_job(final_htjob)
2604
- elif final and isinstance(final, GenericWorkflow):
2605
- raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
2606
- elif final:
2607
- raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
2608
-
2609
- return dag