lsst-ctrl-bps-htcondor 29.2025.2000__py3-none-any.whl → 29.2025.2200__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
@@ -34,6 +34,7 @@ import logging
  import os
  import re
  from collections import defaultdict
+ from copy import deepcopy
  from enum import IntEnum, auto
  from pathlib import Path
  from typing import Any, cast
@@ -331,7 +332,7 @@ class HTCondorService(BaseWmsService):

  Returns
  -------
- job_ids : `list` [`Any`]
+ job_ids : `list` [`~typing.Any`]
  Only job ids to be used by cancel and other functions. Typically
  this means top-level jobs (i.e., not children jobs).
  """
@@ -400,6 +401,54 @@ class HTCondorService(BaseWmsService):
  _LOG.debug("job_ids = %s", job_ids)
  return job_ids

+ def get_status(
+ self,
+ wms_workflow_id: str,
+ hist: float = 1,
+ is_global: bool = False,
+ ) -> tuple[WmsStates, str]:
+ """Return status of run based upon given constraints.
+
+ Parameters
+ ----------
+ wms_workflow_id : `str`
+ Limit to specific run based on id (queue id or path).
+ hist : `float`, optional
+ Limit history search to this many days. Defaults to 1.
+ is_global : `bool`, optional
+ If set, all job queues (and their histories) will be queried for
+ job information. Defaults to False which means that only the local
+ job queue will be queried.
+
+ Returns
+ -------
+ state : `lsst.ctrl.bps.WmsStates`
+ Status of single run from given information.
+ message : `str`
+ Extra message for status command to print. This could be pointers
+ to documentation or to WMS specific commands.
+ """
+ _LOG.debug("get_status: id=%s, hist=%s, is_global=%s", wms_workflow_id, hist, is_global)
+
+ id_type = _wms_id_type(wms_workflow_id)
+ _LOG.debug("id_type = %s", id_type.name)
+
+ if id_type == WmsIdType.LOCAL:
+ schedulers = _locate_schedds(locate_all=is_global)
+ _LOG.debug("schedulers = %s", schedulers)
+ state, message = _get_status_from_id(wms_workflow_id, hist, schedds=schedulers)
+ elif id_type == WmsIdType.GLOBAL:
+ schedulers = _locate_schedds(locate_all=True)
+ _LOG.debug("schedulers = %s", schedulers)
+ state, message = _get_status_from_id(wms_workflow_id, hist, schedds=schedulers)
+ elif id_type == WmsIdType.PATH:
+ state, message = _get_status_from_path(wms_workflow_id)
+ else:
+ state, message = WmsStates.UNKNOWN, "Invalid job id"
+ _LOG.debug("state: %s, %s", state, message)
+
+ return state, message
+
  def report(
  self,
  wms_workflow_id=None,
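The hunk above adds a new public `get_status` entry point that accepts either a queue id or a submit-directory path and returns a `(WmsStates, str)` pair. A minimal usage sketch follows (illustration only, not part of the diff; the `service` instance and the path are assumed to exist already):

    # Hypothetical illustration of the API added in this release.
    from lsst.ctrl.bps import WmsStates

    # `service` is assumed to be an already-constructed HTCondorService.
    state, message = service.get_status("/path/to/submit/run_dir", hist=2, is_global=False)
    if state == WmsStates.UNKNOWN:
        # The message carries extra hints, e.g. to check the id or try the path.
        print(f"Could not determine run status: {message}")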
@@ -604,15 +653,15 @@ class HTCondorWorkflow(BaseWmsWorkflow):
  self.dag.write(out_prefix, job_subdir="jobs/{self.label}")


- def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix):
+ def _create_job(subdir_template, cached_values, generic_workflow, gwjob, out_prefix):
  """Convert GenericWorkflow job nodes to DAG jobs.

  Parameters
  ----------
  subdir_template : `str`
  Template for making subdirs.
- site_values : `dict`
- Site specific values
+ cached_values : `dict`
+ Site and label specific values.
  generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
  Generic workflow that is being converted.
  gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
@@ -654,7 +703,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
  "on_exit_hold_subcode": "34",
  }

- htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob))
+ htc_job_cmds.update(_translate_job_cmds(cached_values, generic_workflow, gwjob))

  # job stdout, stderr, htcondor user log.
  for key in ("output", "error", "log"):
@@ -662,7 +711,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
  _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

  htc_job_cmds.update(
- _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix)
+ _handle_job_inputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
  )

  # Add the job cmds dict to the job object.
@@ -673,7 +722,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
  # Add job attributes to job.
  _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
  htc_job.add_job_attrs(gwjob.attrs)
- htc_job.add_job_attrs(site_values["attrs"])
+ htc_job.add_job_attrs(cached_values["attrs"])
  htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
  htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})

@@ -685,8 +734,8 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):

  Parameters
  ----------
- cached_vals : `dict` [`str`, `Any`]
- Config values common to jobs with same label.
+ cached_vals : `dict` [`str`, `~typing.Any`]
+ Config values common to jobs with same site or label.
  generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
  Generic workflow that contains job to being converted.
  gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
@@ -694,7 +743,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):

  Returns
  -------
- htc_job_commands : `dict` [`str`, `Any`]
+ htc_job_commands : `dict` [`str`, `~typing.Any`]
  Contains commands which can appear in the HTCondor submit description
  file.
  """
@@ -720,9 +769,6 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
  jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")

  # job commands that need modification
- if gwjob.number_of_retries:
- jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
-
  if gwjob.retry_unless_exit:
  if isinstance(gwjob.retry_unless_exit, int):
  jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
@@ -739,6 +785,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
  if gwjob.request_memory:
  jobcmds["request_memory"] = f"{gwjob.request_memory}"

+ memory_max = 0
  if gwjob.memory_multiplier:
  # Do not use try-except! At the moment, BpsConfig returns an empty
  # string if it does not contain the key.
@@ -765,13 +812,18 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
  gwjob.request_memory, gwjob.memory_multiplier, memory_max
  )

- # Periodically release jobs which are being held due to exceeding
- # memory. Stop doing that (by removing the job from the HTCondor queue)
- # after the maximal number of retries has been reached or the job was
- # already run at maximal allowed memory.
- jobcmds["periodic_release"] = _create_periodic_release_expr(
- gwjob.request_memory, gwjob.memory_multiplier, memory_max
- )
+ user_release_expr = cached_vals.get("releaseExpr", "")
+ if gwjob.number_of_retries is not None and gwjob.number_of_retries >= 0:
+ jobcmds["max_retries"] = gwjob.number_of_retries
+
+ # No point in adding periodic_release if 0 retries
+ if gwjob.number_of_retries > 0:
+ periodic_release = _create_periodic_release_expr(
+ gwjob.request_memory, gwjob.memory_multiplier, memory_max, user_release_expr
+ )
+ if periodic_release:
+ jobcmds["periodic_release"] = periodic_release
+
  jobcmds["periodic_remove"] = _create_periodic_remove_expr(
  gwjob.request_memory, gwjob.memory_multiplier, memory_max
  )
@@ -830,7 +882,7 @@ def _translate_dag_cmds(gwjob):

  Returns
  -------
- dagcmds : `dict` [`str`, `Any`]
+ dagcmds : `dict` [`str`, `~typing.Any`]
  DAGMan commands for the job.
  """
  # Values in the dag script that just are name mappings.
@@ -1010,6 +1062,77 @@ def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_sha
  return htc_commands


+ def _get_status_from_id(
+ wms_workflow_id: str, hist: float, schedds: dict[str, htcondor.Schedd]
+ ) -> tuple[WmsStates, str]:
+ """Gather run information using workflow id.
+
+ Parameters
+ ----------
+ wms_workflow_id : `str`
+ Limit to specific run based on id.
+ hist : `float`
+ Limit history search to this many days.
+ schedds : `dict` [ `str`, `htcondor.Schedd` ]
+ HTCondor schedulers which to query for job information. If empty
+ dictionary, all queries will be run against the local scheduler only.
+
+ Returns
+ -------
+ state : `lsst.ctrl.bps.WmsStates`
+ Status for the corresponding run.
+ message : `str`
+ Message with extra error information.
+ """
+ _LOG.debug("_get_status_from_id: id=%s, hist=%s, schedds=%s", wms_workflow_id, hist, schedds)
+
+ message = ""
+
+ # Collect information about the job by querying HTCondor schedd and
+ # HTCondor history.
+ schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
+ if len(schedd_dag_info) == 1:
+ schedd_name = next(iter(schedd_dag_info))
+ dag_id = next(iter(schedd_dag_info[schedd_name]))
+ dag_ad = schedd_dag_info[schedd_name][dag_id]
+ state = _htc_status_to_wms_state(dag_ad)
+ else:
+ state = WmsStates.UNKNOWN
+ message = f"DAGMan job {wms_workflow_id} not found in queue or history. Check id or try path."
+ return state, message
+
+
+ def _get_status_from_path(wms_path: str | os.PathLike) -> tuple[WmsStates, str]:
+ """Gather run status from a given run directory.
+
+ Parameters
+ ----------
+ wms_path : `str` | `os.PathLike`
+ The directory containing the submit side files (e.g., HTCondor files).
+
+ Returns
+ -------
+ state : `lsst.ctrl.bps.WmsStates`
+ Status for the run.
+ message : `str`
+ Message to be printed.
+ """
+ wms_path = Path(wms_path).resolve()
+ message = ""
+ try:
+ wms_workflow_id, dag_ad = read_dag_log(wms_path)
+ except FileNotFoundError:
+ wms_workflow_id = MISSING_ID
+ message = f"DAGMan log not found in {wms_path}. Check path."
+
+ if wms_workflow_id == MISSING_ID:
+ state = WmsStates.UNKNOWN
+ else:
+ state = _htc_status_to_wms_state(dag_ad[wms_workflow_id])
+
+ return state, message
+
+
  def _report_from_path(wms_path):
  """Gather run information from a given run directory.

@@ -1135,11 +1258,11 @@ def _get_info_from_schedd(
  ----------
  wms_workflow_id : `str`
  Limit to specific run based on id.
- hist : `int`
+ hist : `float`
  Limit history search to this many days.
- schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
- HTCondor schedulers which to query for job information. If None
- (default), all queries will be run against the local scheduler only.
+ schedds : `dict` [ `str`, `htcondor.Schedd` ]
+ HTCondor schedulers which to query for job information. If empty
+ dictionary, all queries will be run against the local scheduler only.

  Returns
  -------
@@ -1148,6 +1271,8 @@ def _get_info_from_schedd(
  Scheduler, local HTCondor job ids are mapped to their respective
  classads.
  """
+ _LOG.debug("_get_info_from_schedd: id=%s, hist=%s, schedds=%s", wms_workflow_id, hist, schedds)
+
  dag_constraint = 'regexp("dagman$", Cmd)'
  try:
  cluster_id = int(float(wms_workflow_id))
@@ -1180,7 +1305,7 @@ def _get_info_from_path(wms_path: str | os.PathLike) -> tuple[str, dict[str, dic
  -------
  wms_workflow_id : `str`
  The run id which is a DAGman job id.
- jobs : `dict` [`str`, `dict` [`str`, `Any`]]
+ jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
  Information about jobs read from files in the given directory.
  The key is the HTCondor id and the value is a dictionary of HTCondor
  keys and values.
@@ -1340,7 +1465,7 @@ def _add_service_job_specific_info(job_ad: dict[str, Any], specific_info: WmsSpe

  Parameters
  ----------
- job_ad : `dict` [`str`, `Any`]
+ job_ad : `dict` [`str`, `~typing.Any`]
  Provisioning job information.
  specific_info : `lsst.ctrl.bps.WmsSpecificInfo`
  Where to add message.
@@ -1466,7 +1591,7 @@ def _add_run_info(wms_path, job):
  ----------
  wms_path : `str`
  Path to submit files for the run.
- job : `dict` [`str`, `Any`]
+ job : `dict` [`str`, `~typing.Any`]
  HTCondor dag job information.

  Raises
@@ -1502,7 +1627,7 @@ def _get_owner(job):

  Parameters
  ----------
- job : `dict` [`str`, `Any`]
+ job : `dict` [`str`, `~typing.Any`]
  HTCondor dag job information.

  Returns
@@ -1524,7 +1649,7 @@ def _get_run_summary(job):

  Parameters
  ----------
- job : `dict` [`str`, `Any`]
+ job : `dict` [`str`, `~typing.Any`]
  HTCondor dag job information.

  Returns
@@ -1600,7 +1725,7 @@ def _get_state_counts_from_jobs(
  ----------
  wms_workflow_id : `str`
  HTCondor job id.
- jobs : `dict [`dict` [`str`, `Any`]]
+ jobs : `dict [`dict` [`str`, `~typing.Any`]]
  HTCondor dag job information.

  Returns
@@ -1628,7 +1753,7 @@ def _get_state_counts_from_dag_job(job):

  Parameters
  ----------
- job : `dict` [`str`, `Any`]
+ job : `dict` [`str`, `~typing.Any`]
  HTCondor dag job information.

  Returns
@@ -1684,7 +1809,7 @@ def _htc_status_to_wms_state(job):

  Parameters
  ----------
- job : `dict` [`str`, `Any`]
+ job : `dict` [`str`, `~typing.Any`]
  HTCondor job information.

  Returns
@@ -1706,7 +1831,7 @@ def _htc_job_status_to_wms_state(job):

  Parameters
  ----------
- job : `dict` [`str`, `Any`]
+ job : `dict` [`str`, `~typing.Any`]
  HTCondor job information.

  Returns
@@ -1748,7 +1873,7 @@ def _htc_node_status_to_wms_state(job):

  Parameters
  ----------
- job : `dict` [`str`, `Any`]
+ job : `dict` [`str`, `~typing.Any`]
  HTCondor job information.

  Returns
@@ -1795,9 +1920,9 @@ def _update_jobs(jobs1, jobs2):

  Parameters
  ----------
- jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
+ jobs1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
  HTCondor job information to be updated.
- jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
+ jobs2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
  Additional HTCondor job information.
  """
  for job_id, job_ad in jobs2.items():
@@ -1937,34 +2062,39 @@ def _wms_id_to_dir(wms_id):
  return wms_path, id_type


- def _create_periodic_release_expr(memory, multiplier, limit):
+ def _create_periodic_release_expr(
+ memory: int, multiplier: float | None, limit: int, additional_expr: str = ""
+ ) -> str:
  """Construct an HTCondorAd expression for releasing held jobs.

- The expression instruct HTCondor to release any job which was put on hold
- due to exceeding memory requirements back to the job queue providing it
- satisfies all of the conditions below:
-
- * number of run attempts did not reach allowable number of retries,
- * the memory requirements in the last failed run attempt did not reach
- the specified memory limit.
-
  Parameters
  ----------
  memory : `int`
  Requested memory in MB.
- multiplier : `float`
- Memory growth rate between retires.
+ multiplier : `float` or None
+ Memory growth rate between retries.
  limit : `int`
  Memory limit.
+ additional_expr : `str`, optional
+ Expression to add to periodic_release. Defaults to empty string.

  Returns
  -------
  expr : `str`
- A string representing an HTCondor ClassAd expression for releasing jobs
- which have been held due to exceeding the memory requirements.
+ A string representing an HTCondor ClassAd expression for releasing job.
  """
- is_retry_allowed = "NumJobStarts <= JobMaxRetries"
- was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
+ _LOG.debug(
+ "periodic_release: memory: %s, multiplier: %s, limit: %s, additional_expr: %s",
+ memory,
+ multiplier,
+ limit,
+ additional_expr,
+ )
+
+ # ctrl_bps sets multiplier to None in the GenericWorkflow if
+ # memoryMultiplier <= 1, but checking value just in case.
+ if (not multiplier or multiplier <= 1) and not additional_expr:
+ return ""

  # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
  # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
@@ -1976,63 +2106,74 @@ def _create_periodic_release_expr(memory, multiplier, limit):
  # the entire expression should evaluate to FALSE when the job is not HELD.
  # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
  # but better safe than sorry.
- was_mem_exceeded = (
- "JobStatus == 5 "
- "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
- "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
- )
+ is_held = "JobStatus == 5"
+ is_retry_allowed = "NumJobStarts <= JobMaxRetries"
+
+ mem_expr = ""
+ if memory and multiplier and multiplier > 1 and limit:
+ was_mem_exceeded = (
+ "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+ "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
+ )
+ was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
+ mem_expr = f"{was_mem_exceeded} && {was_below_limit}"
+
+ user_expr = ""
+ if additional_expr:
+ # Never auto release a job held by user.
+ user_expr = f"HoldReasonCode =!= 1 && {additional_expr}"
+
+ expr = f"{is_held} && {is_retry_allowed}"
+ if user_expr and mem_expr:
+ expr += f" && ({mem_expr} || {user_expr})"
+ elif user_expr:
+ expr += f" && {user_expr}"
+ elif mem_expr:
+ expr += f" && {mem_expr}"

- expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
  return expr


  def _create_periodic_remove_expr(memory, multiplier, limit):
  """Construct an HTCondorAd expression for removing jobs from the queue.

- The expression instruct HTCondor to remove any job which was put on hold
- due to exceeding memory requirements from the job queue providing it
- satisfies any of the conditions below:
-
- * allowable number of retries was reached,
- * the memory requirements during the last failed run attempt reached
- the specified memory limit.
-
  Parameters
  ----------
  memory : `int`
  Requested memory in MB.
  multiplier : `float`
- Memory growth rate between retires.
+ Memory growth rate between retries.
  limit : `int`
  Memory limit.

  Returns
  -------
  expr : `str`
- A string representing an HTCondor ClassAd expression for removing jobs
- which were run at the maximal allowable memory and still exceeded
- the memory requirements.
+ A string representing an HTCondor ClassAd expression for removing jobs.
  """
- is_retry_disallowed = "NumJobStarts > JobMaxRetries"
- was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
-
- # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
- # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
- # The special comparison operators ensure that all comparisons below will
- # evaluate to FALSE in this case.
+ # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode'
+ # are UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
+ # The special comparison operators ensure that all comparisons below
+ # will evaluate to FALSE in this case.
  #
  # Note:
- # May not be strictly necessary. Operators '&&' and '||' are not strict so
- # the entire expression should evaluate to FALSE when the job is not HELD.
- # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
- # but better safe than sorry.
- was_mem_exceeded = (
- "JobStatus == 5 "
- "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
- "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
- )
+ # May not be strictly necessary. Operators '&&' and '||' are not
+ # strict so the entire expression should evaluate to FALSE when the
+ # job is not HELD. According to ClassAd evaluation semantics
+ # FALSE && UNDEFINED is FALSE, but better safe than sorry.
+ is_held = "JobStatus == 5"
+ is_retry_disallowed = "NumJobStarts > JobMaxRetries"
+
+ mem_expr = ""
+ if memory and multiplier and multiplier > 1 and limit:
+ mem_limit_expr = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
+
+ mem_expr = ( # Add || here so only added if adding memory expr
+ " || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+ f"|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && {mem_limit_expr})"
+ )

- expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})"
+ expr = f"{is_held} && ({is_retry_disallowed}{mem_expr})"
  return expr

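Taken together, the two hunks above rework how the periodic_release and periodic_remove expressions are built: both now start from the JobStatus == 5 and retry checks, add the memory clause only when memory, a multiplier greater than 1, and a limit are all set, and the release helper can fold in a user-supplied releaseExpr while refusing to auto-release jobs held by a user (HoldReasonCode 1). A rough illustration of how the refactored helpers compose the expressions (the values below are made up, and the helpers are private to htcondor_service.py):

    # Hypothetical illustration; the numeric values are not from the package.
    release = _create_periodic_release_expr(
        memory=2048, multiplier=2.0, limit=8192, additional_expr="NumJobStarts < 3"
    )
    # `release` has the shape
    #   JobStatus == 5 && NumJobStarts <= JobMaxRetries && (<memory clause> || <user clause>)
    # where the user clause is guarded by "HoldReasonCode =!= 1" so a job held
    # by a user is never auto-released.

    remove = _create_periodic_remove_expr(memory=2048, multiplier=2.0, limit=8192)
    # `remove` has the shape
    #   JobStatus == 5 && (NumJobStarts > JobMaxRetries || <memory-limit clause>)
    # and omits the memory-limit clause entirely when no memory scaling is configured.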
@@ -2044,7 +2185,7 @@ def _create_request_memory_expr(memory, multiplier, limit):
  memory : `int`
  Requested memory in MB.
  multiplier : `float`
- Memory growth rate between retires.
+ Memory growth rate between retries.
  limit : `int`
  Memory limit.

@@ -2119,7 +2260,7 @@ def _gather_site_values(config, compute_site):

  Returns
  -------
- site_values : `dict` [`str`, `Any`]
+ site_values : `dict` [`str`, `~typing.Any`]
  Values specific to the given site.
  """
  site_values = {"attrs": {}, "profile": {}}
@@ -2167,6 +2308,50 @@ def _gather_site_values(config, compute_site):
  return site_values


+ def _gather_label_values(config: BpsConfig, label: str) -> dict[str, Any]:
+ """Gather values specific to given job label.
+
+ Parameters
+ ----------
+ config : `lsst.ctrl.bps.BpsConfig`
+ BPS configuration that includes necessary submit/runtime
+ information.
+ label : `str`
+ GenericWorkflowJob label.
+
+ Returns
+ -------
+ values : `dict` [`str`, `~typing.Any`]
+ Values specific to the given job label.
+ """
+ values: dict[str, Any] = {"attrs": {}, "profile": {}}
+
+ search_opts = {}
+ profile_key = ""
+ if label == "finalJob":
+ search_opts["searchobj"] = config["finalJob"]
+ profile_key = ".finalJob.profile.condor"
+ elif label in config["cluster"]:
+ search_opts["curvals"] = {"curr_cluster": label}
+ profile_key = f".cluster.{label}.profile.condor"
+ elif label in config["pipetask"]:
+ search_opts["curvals"] = {"curr_pipetask": label}
+ profile_key = f".pipetask.{label}.profile.condor"
+
+ found, value = config.search("releaseExpr", opt=search_opts)
+ if found:
+ values["releaseExpr"] = value
+
+ if profile_key and profile_key in config:
+ for subkey, val in config[profile_key].items():
+ if subkey.startswith("+"):
+ values["attrs"][subkey[1:]] = val
+ else:
+ values["profile"][subkey] = val
+
+ return values
+
+
  def is_service_job(job_ad: dict[str, Any]) -> bool:
  """Determine if a job is a service one.

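The new _gather_label_values helper mirrors _gather_site_values but keys off the job label (cluster, pipetask, or finalJob), picking up a per-label releaseExpr plus any ".profile.condor" entries, with "+"-prefixed profile keys becoming job attributes. A sketch of the dictionary shape it returns (the keys and values below are illustrative, not from the package):

    # Illustrative only: shape of the dict returned for a pipetask label whose
    # config carries a releaseExpr and a .profile.condor section.
    label_values = {
        "attrs": {"MyCustomAd": "value"},    # from profile keys starting with "+"
        "profile": {"request_cpus": "2"},    # remaining .profile.condor keys
        "releaseExpr": "NumJobStarts < 5",   # present only if the config search finds one
    }

As the later hunk shows, these label values are layered onto a deep copy of the compute-site values with dict.update, so label-level entries take precedence over site-level ones for the same top-level key.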
@@ -2280,16 +2465,22 @@ def _generic_workflow_to_htcondor_dag(
  subdir_template = tmp_template

  # Create all DAG jobs
- site_values = {} # cache compute site specific values to reduce config lookups
+ site_values = {} # Cache compute site specific values to reduce config lookups.
+ cached_values = {} # Cache label-specific values to reduce config lookups.
+ # Note: Can't use get_job_by_label because those only include payload jobs.
  for job_name in generic_workflow:
  gwjob = generic_workflow.get_job(job_name)
  if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD:
  gwjob = cast(GenericWorkflowJob, gwjob)
  if gwjob.compute_site not in site_values:
  site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
+ if gwjob.label not in cached_values:
+ cached_values[gwjob.label] = deepcopy(site_values[gwjob.compute_site])
+ cached_values[gwjob.label].update(_gather_label_values(config, gwjob.label))
+ _LOG.debug("cached: %s= %s", gwjob.label, cached_values[gwjob.label])
  htc_job = _create_job(
  subdir_template[gwjob.label],
- site_values[gwjob.compute_site],
+ cached_values[gwjob.label],
  generic_workflow,
  gwjob,
  out_prefix,
@@ -2351,8 +2542,15 @@ def _generic_workflow_to_htcondor_dag(
  if final and isinstance(final, GenericWorkflowJob):
  if final.compute_site and final.compute_site not in site_values:
  site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
+ if final.label not in cached_values:
+ cached_values[final.label] = deepcopy(site_values[final.compute_site])
+ cached_values[final.label].update(_gather_label_values(config, final.label))
  final_htjob = _create_job(
- subdir_template[final.label], site_values[final.compute_site], generic_workflow, final, out_prefix
+ subdir_template[final.label],
+ cached_values[final.label],
+ generic_workflow,
+ final,
+ out_prefix,
  )
  if "post" not in final_htjob.dagcmds:
  final_htjob.dagcmds["post"] = {
@@ -1376,6 +1376,7 @@ def condor_search(constraint=None, hist=None, schedds=None):

  job_info = condor_q(constraint=constraint, schedds=schedds)
  if hist is not None:
+ _LOG.debug("Searching history going back %s days", hist)
  epoch = (datetime.now() - timedelta(days=hist)).timestamp()
  constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
  hist_info = condor_history(constraint, schedds=schedds)
@@ -1,2 +1,2 @@
  __all__ = ["__version__"]
- __version__ = "29.2025.2000"
+ __version__ = "29.2025.2200"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lsst-ctrl-bps-htcondor
- Version: 29.2025.2000
+ Version: 29.2025.2200
  Summary: HTCondor plugin for lsst-ctrl-bps.
  Author-email: Rubin Observatory Data Management <dm-admin@lists.lsst.org>
  License: BSD 3-Clause License
@@ -2,18 +2,18 @@ lsst/ctrl/bps/htcondor/__init__.py,sha256=1gTmOVLJILvBqgqHVECo8uqoX8e4fiTeH_dHBU
  lsst/ctrl/bps/htcondor/final_post.sh,sha256=chfaQV6Q7rGsK-8Hx58ch52m-PofvBanrl7VwCssHec,248
  lsst/ctrl/bps/htcondor/handlers.py,sha256=2gM3Ac00in4ob9ckcP331W1LSEjs9UDKIqt4MULA4bg,11196
  lsst/ctrl/bps/htcondor/htcondor_config.py,sha256=c4lCiYEwEXFdxgbMfEkbDm4LrvkRMF31SqLtQqzqIV4,1523
- lsst/ctrl/bps/htcondor/htcondor_service.py,sha256=-qf7FG1yvxawf9vHfUMZtofVrrK06CMbviarLRXGg_0,88308
- lsst/ctrl/bps/htcondor/lssthtc.py,sha256=Rsfr7ZehZHiRWmF-8FMDReZQrGMtKii7CO2O8Vu9hYg,80420
+ lsst/ctrl/bps/htcondor/htcondor_service.py,sha256=4_jm0lIZw3mYXyays1IWyg3pFwpODR_-g6CLIepXu7w,95330
+ lsst/ctrl/bps/htcondor/lssthtc.py,sha256=pYxcA5jicuJs1RnhusSuMFrOU92Xy1fb-tleZ9m784Y,80485
  lsst/ctrl/bps/htcondor/provisioner.py,sha256=hPN8YJUtwNHQylw68kfskF1S2vCeQvztF8W0d_QKqqM,7851
- lsst/ctrl/bps/htcondor/version.py,sha256=bT9RvvieKvOl5VImh838xbe0jWwzezTNwq19FWhF5NU,55
+ lsst/ctrl/bps/htcondor/version.py,sha256=RaoK8ADNKLNvY-bXCQnWhCS6HwZ0pWKOeKdmgzS7d6Y,55
  lsst/ctrl/bps/htcondor/etc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  lsst/ctrl/bps/htcondor/etc/htcondor_defaults.yaml,sha256=xDRts4vHKov2PE_JRh-0nF3jfuNJXtKBXZqveASp_iA,1422
- lsst_ctrl_bps_htcondor-29.2025.2000.dist-info/licenses/COPYRIGHT,sha256=Lc6NoAEFQ65v_SmtS9NwfHTOuSUtC2Umbjv5zyowiQM,61
- lsst_ctrl_bps_htcondor-29.2025.2000.dist-info/licenses/LICENSE,sha256=pRExkS03v0MQW-neNfIcaSL6aiAnoLxYgtZoFzQ6zkM,232
- lsst_ctrl_bps_htcondor-29.2025.2000.dist-info/licenses/bsd_license.txt,sha256=7MIcv8QRX9guUtqPSBDMPz2SnZ5swI-xZMqm_VDSfxY,1606
- lsst_ctrl_bps_htcondor-29.2025.2000.dist-info/licenses/gpl-v3.0.txt,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
- lsst_ctrl_bps_htcondor-29.2025.2000.dist-info/METADATA,sha256=6brPWGgVcrgiUBsCbbloAFpbz6zzOaPIQ8LdXM3JQCU,2139
- lsst_ctrl_bps_htcondor-29.2025.2000.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
- lsst_ctrl_bps_htcondor-29.2025.2000.dist-info/top_level.txt,sha256=eUWiOuVVm9wwTrnAgiJT6tp6HQHXxIhj2QSZ7NYZH80,5
- lsst_ctrl_bps_htcondor-29.2025.2000.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
- lsst_ctrl_bps_htcondor-29.2025.2000.dist-info/RECORD,,
+ lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/licenses/COPYRIGHT,sha256=Lc6NoAEFQ65v_SmtS9NwfHTOuSUtC2Umbjv5zyowiQM,61
+ lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/licenses/LICENSE,sha256=pRExkS03v0MQW-neNfIcaSL6aiAnoLxYgtZoFzQ6zkM,232
+ lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/licenses/bsd_license.txt,sha256=7MIcv8QRX9guUtqPSBDMPz2SnZ5swI-xZMqm_VDSfxY,1606
+ lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/licenses/gpl-v3.0.txt,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/METADATA,sha256=Ls4Ipu4B4iFbZ4rozJhOoBeuOOR6Jg6E3X31dWa2qU0,2139
+ lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/top_level.txt,sha256=eUWiOuVVm9wwTrnAgiJT6tp6HQHXxIhj2QSZ7NYZH80,5
+ lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+ lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.7.1)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
