lsst-ctrl-bps-htcondor 29.2025.2000__tar.gz → 29.2025.2100__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {lsst_ctrl_bps_htcondor-29.2025.2000/python/lsst_ctrl_bps_htcondor.egg-info → lsst_ctrl_bps_htcondor-29.2025.2100}/PKG-INFO +1 -1
  2. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/doc/lsst.ctrl.bps.htcondor/userguide.rst +35 -2
  3. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/htcondor_service.py +165 -88
  4. lsst_ctrl_bps_htcondor-29.2025.2100/python/lsst/ctrl/bps/htcondor/version.py +2 -0
  5. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100/python/lsst_ctrl_bps_htcondor.egg-info}/PKG-INFO +1 -1
  6. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/tests/test_htcondor_service.py +144 -1
  7. lsst_ctrl_bps_htcondor-29.2025.2000/python/lsst/ctrl/bps/htcondor/version.py +0 -2
  8. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/COPYRIGHT +0 -0
  9. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/LICENSE +0 -0
  10. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/MANIFEST.in +0 -0
  11. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/README.rst +0 -0
  12. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/bsd_license.txt +0 -0
  13. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/doc/lsst.ctrl.bps.htcondor/CHANGES.rst +0 -0
  14. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/doc/lsst.ctrl.bps.htcondor/index.rst +0 -0
  15. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/gpl-v3.0.txt +0 -0
  16. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/pyproject.toml +0 -0
  17. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/__init__.py +0 -0
  18. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/etc/__init__.py +0 -0
  19. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/etc/htcondor_defaults.yaml +0 -0
  20. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/final_post.sh +0 -0
  21. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/handlers.py +0 -0
  22. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/htcondor_config.py +0 -0
  23. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/lssthtc.py +0 -0
  24. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/provisioner.py +0 -0
  25. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst_ctrl_bps_htcondor.egg-info/SOURCES.txt +0 -0
  26. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst_ctrl_bps_htcondor.egg-info/dependency_links.txt +0 -0
  27. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst_ctrl_bps_htcondor.egg-info/requires.txt +0 -0
  28. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst_ctrl_bps_htcondor.egg-info/top_level.txt +0 -0
  29. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst_ctrl_bps_htcondor.egg-info/zip-safe +0 -0
  30. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/setup.cfg +0 -0
  31. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/tests/test_handlers.py +0 -0
  32. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/tests/test_lssthtc.py +0 -0
  33. {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/tests/test_provisioner.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lsst-ctrl-bps-htcondor
- Version: 29.2025.2000
+ Version: 29.2025.2100
  Summary: HTCondor plugin for lsst-ctrl-bps.
  Author-email: Rubin Observatory Data Management <dm-admin@lists.lsst.org>
  License: BSD 3-Clause License
@@ -476,12 +476,42 @@ for a specific reason, here, the memory usage exceeded memory limits
  .. note::
 
     By default, BPS will automatically retry jobs that failed due to the out of
-    memory error (see `Automatic memory scaling`__ section in **ctrl_bps**
+    memory error (see `Automatic memory scaling`_ section in **ctrl_bps**
     documentation for more information regarding this topic) and the issues
     illustrated by the above examples should only occur if automatic memory
     scaling was explicitly disabled in the submit YAML file.
 
- .. __: https://pipelines.lsst.io/v/weekly/modules/lsst.ctrl.bps/quickstart.html#automatic-memory-scaling
+
+ Automatic Releasing of Held Jobs
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ Often, simply releasing held jobs to try again is successful because the
+ system issues are transient.
+
+ ``releaseExpr`` can be set in the submit YAML to add automatic release
+ conditions. Like other BPS config values, it can be set globally or for a
+ specific cluster or pipetask. The number of retries is still limited by
+ ``numberOfRetries``, and all held jobs count towards this limit no matter
+ the reason. The plugin prohibits the automatic release of jobs held by a
+ user.
+
+ Example expressions:
+
+ * ``releaseExpr: "True"`` - always release a held job unless it was held
+   by a user.
+ * ``releaseExpr: "HoldReasonCode =?= 7"`` - release jobs where the standard
+   output file for the job could not be opened.
+
+ For more information about expressions, see the HTCondor documentation:
+
+ * HTCondor `ClassAd expressions`_
+ * the list of `HoldReasonCodes`_
+
+ .. warning::
+
+    System problems should still be tracked and reported. All of the hold
+    reasons for a single completed run can be found via
+    ``grep -A 2 held <submit dir>/*.nodes.log``.
+
 
  .. _htc-plugin-troubleshooting:
 
@@ -535,3 +565,6 @@ complete your run.
  .. _condor_release: https://htcondor.readthedocs.io/en/latest/man-pages/condor_release.html
  .. _condor_rm: https://htcondor.readthedocs.io/en/latest/man-pages/condor_rm.html
  .. _lsst_distrib: https://github.com/lsst/lsst_distrib.git
+ .. _Automatic memory scaling: https://pipelines.lsst.io/v/weekly/modules/lsst.ctrl.bps/quickstart.html#automatic-memory-scaling
+ .. _HoldReasonCodes: https://htcondor.readthedocs.io/en/latest/classad-attributes/job-classad-attributes.html#HoldReasonCode
+ .. _ClassAd expressions: https://htcondor.readthedocs.io/en/latest/classads/classad-mechanism.html#classad-evaluation-semantics
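
For illustration, a minimal sketch of how the ``releaseExpr`` documented above
might appear in a submit YAML; the pipetask label ``isr`` is hypothetical, and
only ``releaseExpr`` and ``numberOfRetries`` are keys taken from this diff:

    numberOfRetries: 3
    # Global default: release jobs whose standard output file could not
    # be opened (HoldReasonCode 7).
    releaseExpr: "HoldReasonCode =?= 7"
    pipetask:
      isr:
        # Per-pipetask override: always release unless held by a user.
        releaseExpr: "True"
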
@@ -34,6 +34,7 @@ import logging
  import os
  import re
  from collections import defaultdict
+ from copy import deepcopy
  from enum import IntEnum, auto
  from pathlib import Path
  from typing import Any, cast
@@ -331,7 +332,7 @@ class HTCondorService(BaseWmsService):
 
          Returns
          -------
-         job_ids : `list` [`Any`]
+         job_ids : `list` [`~typing.Any`]
              Only job ids to be used by cancel and other functions. Typically
              this means top-level jobs (i.e., not children jobs).
          """
@@ -604,15 +605,15 @@ class HTCondorWorkflow(BaseWmsWorkflow):
          self.dag.write(out_prefix, job_subdir="jobs/{self.label}")
 
 
- def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix):
+ def _create_job(subdir_template, cached_values, generic_workflow, gwjob, out_prefix):
      """Convert GenericWorkflow job nodes to DAG jobs.
 
      Parameters
      ----------
      subdir_template : `str`
          Template for making subdirs.
-     site_values : `dict`
-         Site specific values
+     cached_values : `dict`
+         Site and label specific values.
      generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
          Generic workflow that is being converted.
      gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
@@ -654,7 +655,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
          "on_exit_hold_subcode": "34",
      }
 
-     htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob))
+     htc_job_cmds.update(_translate_job_cmds(cached_values, generic_workflow, gwjob))
 
      # job stdout, stderr, htcondor user log.
      for key in ("output", "error", "log"):
@@ -662,7 +663,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
          _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
 
      htc_job_cmds.update(
-         _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix)
+         _handle_job_inputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
      )
 
      # Add the job cmds dict to the job object.
@@ -673,7 +674,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
      # Add job attributes to job.
      _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
      htc_job.add_job_attrs(gwjob.attrs)
-     htc_job.add_job_attrs(site_values["attrs"])
+     htc_job.add_job_attrs(cached_values["attrs"])
      htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
      htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
 
@@ -685,8 +686,8 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
 
      Parameters
      ----------
-     cached_vals : `dict` [`str`, `Any`]
-         Config values common to jobs with same label.
+     cached_vals : `dict` [`str`, `~typing.Any`]
+         Config values common to jobs with same site or label.
      generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
          Generic workflow that contains job to being converted.
      gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
@@ -694,7 +695,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
 
      Returns
      -------
-     htc_job_commands : `dict` [`str`, `Any`]
+     htc_job_commands : `dict` [`str`, `~typing.Any`]
          Contains commands which can appear in the HTCondor submit description
          file.
      """
@@ -720,9 +721,6 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
      jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")
 
      # job commands that need modification
-     if gwjob.number_of_retries:
-         jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
-
      if gwjob.retry_unless_exit:
          if isinstance(gwjob.retry_unless_exit, int):
              jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
@@ -739,6 +737,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
      if gwjob.request_memory:
          jobcmds["request_memory"] = f"{gwjob.request_memory}"
 
+     memory_max = 0
      if gwjob.memory_multiplier:
          # Do not use try-except! At the moment, BpsConfig returns an empty
          # string if it does not contain the key.
@@ -765,13 +764,18 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
              gwjob.request_memory, gwjob.memory_multiplier, memory_max
          )
 
-     # Periodically release jobs which are being held due to exceeding
-     # memory. Stop doing that (by removing the job from the HTCondor queue)
-     # after the maximal number of retries has been reached or the job was
-     # already run at maximal allowed memory.
-     jobcmds["periodic_release"] = _create_periodic_release_expr(
-         gwjob.request_memory, gwjob.memory_multiplier, memory_max
-     )
+     user_release_expr = cached_vals.get("releaseExpr", "")
+     if gwjob.number_of_retries is not None and gwjob.number_of_retries >= 0:
+         jobcmds["max_retries"] = gwjob.number_of_retries
+
+         # No point in adding periodic_release if 0 retries.
+         if gwjob.number_of_retries > 0:
+             periodic_release = _create_periodic_release_expr(
+                 gwjob.request_memory, gwjob.memory_multiplier, memory_max, user_release_expr
+             )
+             if periodic_release:
+                 jobcmds["periodic_release"] = periodic_release
+
      jobcmds["periodic_remove"] = _create_periodic_remove_expr(
          gwjob.request_memory, gwjob.memory_multiplier, memory_max
      )
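
A usage sketch of the new retry/release wiring, mirroring the values of the
``testPeriodicRelease`` unit test added later in this diff (the import of
``htcondor_service`` follows the test module's usage and may need adjusting
to your checkout):

    from lsst.ctrl.bps import GenericWorkflowExec, GenericWorkflowJob
    from lsst.ctrl.bps.htcondor import htcondor_service

    gw_exec = GenericWorkflowExec("test_exec", "/dummy/dir/pipetask")
    gwjob = GenericWorkflowJob("periodicRelease", "label1", executable=gw_exec)
    gwjob.request_memory = 2048       # MB
    gwjob.memory_multiplier = 2
    gwjob.number_of_retries = 3

    cached_vals = {"profile": {}, "bpsUseShared": True, "memoryLimit": 32768}
    cmds = htcondor_service._translate_job_cmds(cached_vals, None, gwjob)
    print(cmds["max_retries"])        # 3
    print(cmds["periodic_release"])   # memory-based release expression
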
@@ -830,7 +834,7 @@ def _translate_dag_cmds(gwjob):
 
      Returns
      -------
-     dagcmds : `dict` [`str`, `Any`]
+     dagcmds : `dict` [`str`, `~typing.Any`]
          DAGMan commands for the job.
      """
      # Values in the dag script that just are name mappings.
@@ -1180,7 +1184,7 @@ def _get_info_from_path(wms_path: str | os.PathLike) -> tuple[str, dict[str, dic
      -------
      wms_workflow_id : `str`
          The run id which is a DAGman job id.
-     jobs : `dict` [`str`, `dict` [`str`, `Any`]]
+     jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
          Information about jobs read from files in the given directory.
          The key is the HTCondor id and the value is a dictionary of HTCondor
          keys and values.
@@ -1340,7 +1344,7 @@ def _add_service_job_specific_info(job_ad: dict[str, Any], specific_info: WmsSpe
 
      Parameters
      ----------
-     job_ad : `dict` [`str`, `Any`]
+     job_ad : `dict` [`str`, `~typing.Any`]
          Provisioning job information.
      specific_info : `lsst.ctrl.bps.WmsSpecificInfo`
          Where to add message.
@@ -1466,7 +1470,7 @@ def _add_run_info(wms_path, job):
      ----------
      wms_path : `str`
          Path to submit files for the run.
-     job : `dict` [`str`, `Any`]
+     job : `dict` [`str`, `~typing.Any`]
          HTCondor dag job information.
 
      Raises
@@ -1502,7 +1506,7 @@ def _get_owner(job):
 
      Parameters
      ----------
-     job : `dict` [`str`, `Any`]
+     job : `dict` [`str`, `~typing.Any`]
          HTCondor dag job information.
 
      Returns
@@ -1524,7 +1528,7 @@ def _get_run_summary(job):
 
      Parameters
      ----------
-     job : `dict` [`str`, `Any`]
+     job : `dict` [`str`, `~typing.Any`]
          HTCondor dag job information.
 
      Returns
@@ -1600,7 +1604,7 @@ def _get_state_counts_from_jobs(
      ----------
      wms_workflow_id : `str`
          HTCondor job id.
-     jobs : `dict [`dict` [`str`, `Any`]]
+     jobs : `dict [`dict` [`str`, `~typing.Any`]]
          HTCondor dag job information.
 
      Returns
@@ -1628,7 +1632,7 @@ def _get_state_counts_from_dag_job(job):
 
      Parameters
      ----------
-     job : `dict` [`str`, `Any`]
+     job : `dict` [`str`, `~typing.Any`]
          HTCondor dag job information.
 
      Returns
@@ -1684,7 +1688,7 @@ def _htc_status_to_wms_state(job):
 
      Parameters
      ----------
-     job : `dict` [`str`, `Any`]
+     job : `dict` [`str`, `~typing.Any`]
          HTCondor job information.
 
      Returns
@@ -1706,7 +1710,7 @@ def _htc_job_status_to_wms_state(job):
 
      Parameters
      ----------
-     job : `dict` [`str`, `Any`]
+     job : `dict` [`str`, `~typing.Any`]
          HTCondor job information.
 
      Returns
@@ -1748,7 +1752,7 @@ def _htc_node_status_to_wms_state(job):
 
      Parameters
      ----------
-     job : `dict` [`str`, `Any`]
+     job : `dict` [`str`, `~typing.Any`]
          HTCondor job information.
 
      Returns
@@ -1795,9 +1799,9 @@ def _update_jobs(jobs1, jobs2):
 
      Parameters
      ----------
-     jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
+     jobs1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
          HTCondor job information to be updated.
-     jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
+     jobs2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
          Additional HTCondor job information.
      """
      for job_id, job_ad in jobs2.items():
@@ -1937,34 +1941,39 @@ def _wms_id_to_dir(wms_id):
      return wms_path, id_type
 
 
- def _create_periodic_release_expr(memory, multiplier, limit):
+ def _create_periodic_release_expr(
+     memory: int, multiplier: float | None, limit: int, additional_expr: str = ""
+ ) -> str:
      """Construct an HTCondorAd expression for releasing held jobs.
 
-     The expression instruct HTCondor to release any job which was put on hold
-     due to exceeding memory requirements back to the job queue providing it
-     satisfies all of the conditions below:
-
-     * number of run attempts did not reach allowable number of retries,
-     * the memory requirements in the last failed run attempt did not reach
-       the specified memory limit.
-
      Parameters
      ----------
      memory : `int`
          Requested memory in MB.
-     multiplier : `float`
-         Memory growth rate between retires.
+     multiplier : `float` or None
+         Memory growth rate between retries.
      limit : `int`
          Memory limit.
+     additional_expr : `str`, optional
+         Expression to add to periodic_release. Defaults to empty string.
 
      Returns
      -------
      expr : `str`
-         A string representing an HTCondor ClassAd expression for releasing jobs
-         which have been held due to exceeding the memory requirements.
+         A string representing an HTCondor ClassAd expression for releasing job.
      """
-     is_retry_allowed = "NumJobStarts <= JobMaxRetries"
-     was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
+     _LOG.debug(
+         "periodic_release: memory: %s, multiplier: %s, limit: %s, additional_expr: %s",
+         memory,
+         multiplier,
+         limit,
+         additional_expr,
+     )
+
+     # ctrl_bps sets multiplier to None in the GenericWorkflow if
+     # memoryMultiplier <= 1, but checking value just in case.
+     if (not multiplier or multiplier <= 1) and not additional_expr:
+         return ""
 
      # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
      # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
@@ -1976,63 +1985,74 @@ def _create_periodic_release_expr(memory, multiplier, limit):
      # the entire expression should evaluate to FALSE when the job is not HELD.
      # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
      # but better safe than sorry.
-     was_mem_exceeded = (
-         "JobStatus == 5 "
-         "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
-         "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
-     )
+     is_held = "JobStatus == 5"
+     is_retry_allowed = "NumJobStarts <= JobMaxRetries"
+
+     mem_expr = ""
+     if memory and multiplier and multiplier > 1 and limit:
+         was_mem_exceeded = (
+             "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+             "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
+         )
+         was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
+         mem_expr = f"{was_mem_exceeded} && {was_below_limit}"
+
+     user_expr = ""
+     if additional_expr:
+         # Never auto release a job held by user.
+         user_expr = f"HoldReasonCode =!= 1 && {additional_expr}"
+
+     expr = f"{is_held} && {is_retry_allowed}"
+     if user_expr and mem_expr:
+         expr += f" && ({mem_expr} || {user_expr})"
+     elif user_expr:
+         expr += f" && {user_expr}"
+     elif mem_expr:
+         expr += f" && {mem_expr}"
 
-     expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
      return expr
 
 
 def _create_periodic_remove_expr(memory, multiplier, limit):
      """Construct an HTCondorAd expression for removing jobs from the queue.
 
-     The expression instruct HTCondor to remove any job which was put on hold
-     due to exceeding memory requirements from the job queue providing it
-     satisfies any of the conditions below:
-
-     * allowable number of retries was reached,
-     * the memory requirements during the last failed run attempt reached
-       the specified memory limit.
-
      Parameters
      ----------
      memory : `int`
          Requested memory in MB.
      multiplier : `float`
-         Memory growth rate between retires.
+         Memory growth rate between retries.
      limit : `int`
          Memory limit.
 
      Returns
      -------
      expr : `str`
-         A string representing an HTCondor ClassAd expression for removing jobs
-         which were run at the maximal allowable memory and still exceeded
-         the memory requirements.
+         A string representing an HTCondor ClassAd expression for removing jobs.
      """
-     is_retry_disallowed = "NumJobStarts > JobMaxRetries"
-     was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
-
-     # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
-     # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
-     # The special comparison operators ensure that all comparisons below will
-     # evaluate to FALSE in this case.
+     # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode'
+     # are UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
+     # The special comparison operators ensure that all comparisons below
+     # will evaluate to FALSE in this case.
      #
      # Note:
-     # May not be strictly necessary. Operators '&&' and '||' are not strict so
-     # the entire expression should evaluate to FALSE when the job is not HELD.
-     # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
-     # but better safe than sorry.
-     was_mem_exceeded = (
-         "JobStatus == 5 "
-         "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
-         "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
-     )
+     # May not be strictly necessary. Operators '&&' and '||' are not
+     # strict so the entire expression should evaluate to FALSE when the
+     # job is not HELD. According to ClassAd evaluation semantics
+     # FALSE && UNDEFINED is FALSE, but better safe than sorry.
+     is_held = "JobStatus == 5"
+     is_retry_disallowed = "NumJobStarts > JobMaxRetries"
+
+     mem_expr = ""
+     if memory and multiplier and multiplier > 1 and limit:
+         mem_limit_expr = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
+
+         mem_expr = (  # Add || here so only added if adding memory expr.
+             " || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+             f"|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && {mem_limit_expr})"
+         )
 
-     expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})"
+     expr = f"{is_held} && ({is_retry_disallowed}{mem_expr})"
      return expr
 
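The net effect of the two builders is easiest to see from concrete calls; the
sample values (2048 MB request, multiplier 2, 32768 MB limit) come from the
unit tests added later in this diff:

    from lsst.ctrl.bps.htcondor import htcondor_service

    # Memory-scaling clause only (no user expression given):
    print(htcondor_service._create_periodic_release_expr(2048, 2, 32768))
    # -> JobStatus == 5 && NumJobStarts <= JobMaxRetries &&
    #    (memory hold codes) && min({int(2048 * pow(2, NumJobStarts - 1)), 32768}) < 32768

    # User expression only; a multiplier of 1 disables the memory clause, and
    # HoldReasonCode =!= 1 keeps jobs held by a user from being auto-released:
    print(htcondor_service._create_periodic_release_expr(2048, 1, 32768, "True"))
    # -> JobStatus == 5 && NumJobStarts <= JobMaxRetries && HoldReasonCode =!= 1 && True
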
@@ -2044,7 +2064,7 @@ def _create_request_memory_expr(memory, multiplier, limit):
      memory : `int`
          Requested memory in MB.
      multiplier : `float`
-         Memory growth rate between retires.
+         Memory growth rate between retries.
      limit : `int`
          Memory limit.
 
@@ -2119,7 +2139,7 @@ def _gather_site_values(config, compute_site):
 
      Returns
      -------
-     site_values : `dict` [`str`, `Any`]
+     site_values : `dict` [`str`, `~typing.Any`]
          Values specific to the given site.
      """
      site_values = {"attrs": {}, "profile": {}}
@@ -2167,6 +2187,50 @@ def _gather_site_values(config, compute_site):
      return site_values
 
 
+ def _gather_label_values(config: BpsConfig, label: str) -> dict[str, Any]:
+     """Gather values specific to given job label.
+
+     Parameters
+     ----------
+     config : `lsst.ctrl.bps.BpsConfig`
+         BPS configuration that includes necessary submit/runtime
+         information.
+     label : `str`
+         GenericWorkflowJob label.
+
+     Returns
+     -------
+     values : `dict` [`str`, `~typing.Any`]
+         Values specific to the given job label.
+     """
+     values: dict[str, Any] = {"attrs": {}, "profile": {}}
+
+     search_opts = {}
+     profile_key = ""
+     if label == "finalJob":
+         search_opts["searchobj"] = config["finalJob"]
+         profile_key = ".finalJob.profile.condor"
+     elif label in config["cluster"]:
+         search_opts["curvals"] = {"curr_cluster": label}
+         profile_key = f".cluster.{label}.profile.condor"
+     elif label in config["pipetask"]:
+         search_opts["curvals"] = {"curr_pipetask": label}
+         profile_key = f".pipetask.{label}.profile.condor"
+
+     found, value = config.search("releaseExpr", opt=search_opts)
+     if found:
+         values["releaseExpr"] = value
+
+     if profile_key and profile_key in config:
+         for subkey, val in config[profile_key].items():
+             if subkey.startswith("+"):
+                 values["attrs"][subkey[1:]] = val
+             else:
+                 values["profile"][subkey] = val
+
+     return values
+
+
 def is_service_job(job_ad: dict[str, Any]) -> bool:
      """Determine if a job is a service one.
 
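A usage sketch for the new ``_gather_label_values`` helper, condensed from the
``GatherLabelValuesTestCase`` added below; the real tests also pass
``search_order``, ``defaults``, and ``wms_service_class_fqn`` to ``BpsConfig``,
elided here, so a bare constructor call may behave slightly differently:

    from lsst.ctrl.bps import BpsConfig
    from lsst.ctrl.bps.htcondor import htcondor_service

    config = BpsConfig(
        {
            "cluster": {"label1": {"releaseExpr": "cluster_val"}},
            "pipetask": {"label1": {"releaseExpr": "pipetask_val"}},
        }
    )
    values = htcondor_service._gather_label_values(config, "label1")
    # The cluster section wins when a label appears in both:
    # {"attrs": {}, "profile": {}, "releaseExpr": "cluster_val"}
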
@@ -2280,16 +2344,22 @@ def _generic_workflow_to_htcondor_dag(
          subdir_template = tmp_template
 
      # Create all DAG jobs
-     site_values = {}  # cache compute site specific values to reduce config lookups
+     site_values = {}  # Cache compute site specific values to reduce config lookups.
+     cached_values = {}  # Cache label-specific values to reduce config lookups.
+     # Note: Can't use get_job_by_label because those only include payload jobs.
      for job_name in generic_workflow:
          gwjob = generic_workflow.get_job(job_name)
          if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD:
              gwjob = cast(GenericWorkflowJob, gwjob)
              if gwjob.compute_site not in site_values:
                  site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
+             if gwjob.label not in cached_values:
+                 cached_values[gwjob.label] = deepcopy(site_values[gwjob.compute_site])
+                 cached_values[gwjob.label].update(_gather_label_values(config, gwjob.label))
+                 _LOG.debug("cached: %s= %s", gwjob.label, cached_values[gwjob.label])
              htc_job = _create_job(
                  subdir_template[gwjob.label],
-                 site_values[gwjob.compute_site],
+                 cached_values[gwjob.label],
                  generic_workflow,
                  gwjob,
                  out_prefix,
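
The caching introduced above, reduced to plain dicts as a minimal sketch (the
label and site names are hypothetical): site-level values are gathered once
per compute site, then copied and overlaid with label-level overrides once per
job label; ``deepcopy`` keeps the nested ``attrs``/``profile`` dicts of labels
sharing a site independent of each other:

    from copy import deepcopy

    site_values = {"site1": {"attrs": {}, "profile": {}, "bpsUseShared": True}}
    cached_values = {}

    label, site = "label1", "site1"
    if label not in cached_values:
        cached_values[label] = deepcopy(site_values[site])
        # Stand-in for _gather_label_values(config, label):
        cached_values[label].update({"releaseExpr": "True"})
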
@@ -2351,8 +2421,15 @@ def _generic_workflow_to_htcondor_dag(
      if final and isinstance(final, GenericWorkflowJob):
          if final.compute_site and final.compute_site not in site_values:
              site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
+         if final.label not in cached_values:
+             cached_values[final.label] = deepcopy(site_values[final.compute_site])
+             cached_values[final.label].update(_gather_label_values(config, final.label))
          final_htjob = _create_job(
-             subdir_template[final.label], site_values[final.compute_site], generic_workflow, final, out_prefix
+             subdir_template[final.label],
+             cached_values[final.label],
+             generic_workflow,
+             final,
+             out_prefix,
          )
          if "post" not in final_htjob.dagcmds:
              final_htjob.dagcmds["post"] = {
@@ -0,0 +1,2 @@
+ __all__ = ["__version__"]
+ __version__ = "29.2025.2100"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lsst-ctrl-bps-htcondor
- Version: 29.2025.2000
+ Version: 29.2025.2100
  Summary: HTCondor plugin for lsst-ctrl-bps.
  Author-email: Rubin Observatory Data Management <dm-admin@lists.lsst.org>
  License: BSD 3-Clause License
@@ -352,7 +352,7 @@ class TranslateJobCmdsTestCase(unittest.TestCase):
 
      def setUp(self):
          self.gw_exec = GenericWorkflowExec("test_exec", "/dummy/dir/pipetask")
-         self.cached_vals = {"profile": {}, "bpsUseShared": True}
+         self.cached_vals = {"profile": {}, "bpsUseShared": True, "memoryLimit": 32768}
 
      def testRetryUnlessNone(self):
          gwjob = GenericWorkflowJob("retryUnless", "label1", executable=self.gw_exec)
@@ -409,6 +409,30 @@ class TranslateJobCmdsTestCase(unittest.TestCase):
          htc_commands = htcondor_service._translate_job_cmds(self.cached_vals, None, gwjob)
          self.assertEqual(htc_commands["environment"], "TEST_ENV_VAR='$ENV(CTRL_BPS_DIR)/tests'")
 
+     def testPeriodicRelease(self):
+         gwjob = GenericWorkflowJob("periodicRelease", "label1", executable=self.gw_exec)
+         gwjob.request_memory = 2048
+         gwjob.memory_multiplier = 2
+         gwjob.number_of_retries = 3
+         htc_commands = htcondor_service._translate_job_cmds(self.cached_vals, None, gwjob)
+         release = (
+             "JobStatus == 5 && NumJobStarts <= JobMaxRetries && "
+             "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 || "
+             "HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && "
+             "min({int(2048 * pow(2, NumJobStarts - 1)), 32768}) < 32768"
+         )
+         self.assertEqual(htc_commands["periodic_release"], release)
+
+     def testPeriodicRemoveNoRetries(self):
+         gwjob = GenericWorkflowJob("periodicRelease", "label1", executable=self.gw_exec)
+         gwjob.request_memory = 2048
+         gwjob.memory_multiplier = 1
+         gwjob.number_of_retries = 0
+         htc_commands = htcondor_service._translate_job_cmds(self.cached_vals, None, gwjob)
+         remove = "JobStatus == 5 && (NumJobStarts > JobMaxRetries)"
+         self.assertEqual(htc_commands["periodic_remove"], remove)
+         self.assertEqual(htc_commands["max_retries"], 0)
+
 
 class GetStateCountsFromDagJobTestCase(unittest.TestCase):
      """Test counting number of jobs per WMS state."""
@@ -1135,6 +1159,53 @@ class GatherSiteValuesTestCase(unittest.TestCase):
          self.assertEqual(results["memoryLimit"], BPS_DEFAULTS["memoryLimit"])
 
 
+ class GatherLabelValuesTestCase(unittest.TestCase):
+     """Test _gather_label_values function."""
+
+     def testClusterLabel(self):
+         # Test cluster value overrides pipetask.
+         label = "label1"
+         config = BpsConfig(
+             {
+                 "cluster": {
+                     "label1": {"releaseExpr": "cluster_val", "profile": {"condor": {"prof_val1": 3}}}
+                 },
+                 "pipetask": {"label1": {"releaseExpr": "pipetask_val"}},
+             },
+             search_order=BPS_SEARCH_ORDER,
+             defaults=BPS_DEFAULTS,
+             wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
+         )
+         results = htcondor_service._gather_label_values(config, label)
+         self.assertEqual(results, {"attrs": {}, "profile": {"prof_val1": 3}, "releaseExpr": "cluster_val"})
+
+     def testPipetaskLabel(self):
+         label = "label1"
+         config = BpsConfig(
+             {
+                 "pipetask": {
+                     "label1": {"releaseExpr": "pipetask_val", "profile": {"condor": {"prof_val1": 3}}}
+                 }
+             },
+             search_order=BPS_SEARCH_ORDER,
+             defaults=BPS_DEFAULTS,
+             wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
+         )
+         results = htcondor_service._gather_label_values(config, label)
+         self.assertEqual(results, {"attrs": {}, "profile": {"prof_val1": 3}, "releaseExpr": "pipetask_val"})
+
+     def testNoSection(self):
+         label = "notThere"
+         config = BpsConfig(
+             {},
+             search_order=BPS_SEARCH_ORDER,
+             defaults=BPS_DEFAULTS,
+             wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
+         )
+         results = htcondor_service._gather_label_values(config, label)
+         self.assertEqual(results, {"attrs": {}, "profile": {}})
+
+
 class CreateCheckJobTestCase(unittest.TestCase):
      """Test _create_check_job function."""
 
@@ -1147,5 +1218,77 @@ class CreateCheckJobTestCase(unittest.TestCase):
          self.assertIn("check_group_status.sub", job.subfile)
 
 
+ class CreatePeriodicReleaseExprTestCase(unittest.TestCase):
+     """Test _create_periodic_release_expr function."""
+
+     def testNoReleaseExpr(self):
+         results = htcondor_service._create_periodic_release_expr(2048, 1, 32768, "")
+         self.assertEqual(results, "")
+
+     def testMultiplierNone(self):
+         results = htcondor_service._create_periodic_release_expr(2048, None, 32768, "")
+         self.assertEqual(results, "")
+
+     def testJustMemoryReleaseExpr(self):
+         self.maxDiff = None  # So test error shows entire strings.
+         results = htcondor_service._create_periodic_release_expr(2048, 2, 32768, "")
+         truth = (
+             "JobStatus == 5 && NumJobStarts <= JobMaxRetries && "
+             "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 || "
+             "HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && "
+             "min({int(2048 * pow(2, NumJobStarts - 1)), 32768}) < 32768"
+         )
+         self.assertEqual(results, truth)
+
+     def testJustUserReleaseExpr(self):
+         results = htcondor_service._create_periodic_release_expr(2048, 1, 32768, "True")
+         truth = "JobStatus == 5 && NumJobStarts <= JobMaxRetries && HoldReasonCode =!= 1 && True"
+         self.assertEqual(results, truth)
+
+     def testJustUserReleaseExprMultiplierNone(self):
+         results = htcondor_service._create_periodic_release_expr(2048, None, 32768, "True")
+         truth = "JobStatus == 5 && NumJobStarts <= JobMaxRetries && HoldReasonCode =!= 1 && True"
+         self.assertEqual(results, truth)
+
+     def testMemoryAndUserReleaseExpr(self):
+         self.maxDiff = None  # So test error shows entire strings.
+         results = htcondor_service._create_periodic_release_expr(2048, 2, 32768, "True")
+         truth = (
+             "JobStatus == 5 && NumJobStarts <= JobMaxRetries && "
+             "((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 || "
+             "HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && "
+             "min({int(2048 * pow(2, NumJobStarts - 1)), 32768}) < 32768 || "
+             "HoldReasonCode =!= 1 && True)"
+         )
+         self.assertEqual(results, truth)
+
+
+ class CreatePeriodicRemoveExprTestCase(unittest.TestCase):
+     """Test _create_periodic_remove_expr function."""
+
+     def testBasicRemoveExpr(self):
+         """Function assumes only called if max_retries >= 0."""
+         results = htcondor_service._create_periodic_remove_expr(2048, 1, 32768)
+         truth = "JobStatus == 5 && (NumJobStarts > JobMaxRetries)"
+         self.assertEqual(results, truth)
+
+     def testBasicRemoveExprMultiplierNone(self):
+         """Function assumes only called if max_retries >= 0."""
+         results = htcondor_service._create_periodic_remove_expr(2048, None, 32768)
+         truth = "JobStatus == 5 && (NumJobStarts > JobMaxRetries)"
+         self.assertEqual(results, truth)
+
+     def testMemoryRemoveExpr(self):
+         self.maxDiff = None  # So test error shows entire strings.
+         results = htcondor_service._create_periodic_remove_expr(2048, 2, 32768)
+         truth = (
+             "JobStatus == 5 && (NumJobStarts > JobMaxRetries || "
+             "((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 || "
+             "HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && "
+             "min({int(2048 * pow(2, NumJobStarts - 1)), 32768}) == 32768))"
+         )
+         self.assertEqual(results, truth)
+
+
 if __name__ == "__main__":
      unittest.main()
@@ -1,2 +0,0 @@
- __all__ = ["__version__"]
- __version__ = "29.2025.2000"