lsst-ctrl-bps-htcondor 29.2025.2000__tar.gz → 29.2025.2100__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lsst_ctrl_bps_htcondor-29.2025.2000/python/lsst_ctrl_bps_htcondor.egg-info → lsst_ctrl_bps_htcondor-29.2025.2100}/PKG-INFO +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/doc/lsst.ctrl.bps.htcondor/userguide.rst +35 -2
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/htcondor_service.py +165 -88
- lsst_ctrl_bps_htcondor-29.2025.2100/python/lsst/ctrl/bps/htcondor/version.py +2 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100/python/lsst_ctrl_bps_htcondor.egg-info}/PKG-INFO +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/tests/test_htcondor_service.py +144 -1
- lsst_ctrl_bps_htcondor-29.2025.2000/python/lsst/ctrl/bps/htcondor/version.py +0 -2
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/COPYRIGHT +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/LICENSE +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/MANIFEST.in +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/README.rst +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/bsd_license.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/doc/lsst.ctrl.bps.htcondor/CHANGES.rst +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/doc/lsst.ctrl.bps.htcondor/index.rst +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/gpl-v3.0.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/pyproject.toml +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/__init__.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/etc/__init__.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/etc/htcondor_defaults.yaml +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/final_post.sh +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/handlers.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/htcondor_config.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/lssthtc.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/provisioner.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst_ctrl_bps_htcondor.egg-info/SOURCES.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst_ctrl_bps_htcondor.egg-info/dependency_links.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst_ctrl_bps_htcondor.egg-info/requires.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst_ctrl_bps_htcondor.egg-info/top_level.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst_ctrl_bps_htcondor.egg-info/zip-safe +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/setup.cfg +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/tests/test_handlers.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/tests/test_lssthtc.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/tests/test_provisioner.py +0 -0
{lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/doc/lsst.ctrl.bps.htcondor/userguide.rst
RENAMED

@@ -476,12 +476,42 @@ for a specific reason, here, the memory usage exceeded memory limits
 .. note::

    By default, BPS will automatically retry jobs that failed due to the out of
-   memory error (see `Automatic memory scaling`
+   memory error (see `Automatic memory scaling`_ section in **ctrl_bps**
    documentation for more information regarding this topic) and the issues
    illustrated by the above examples should only occur if automatic memory
    scaling was explicitly disabled in the submit YAML file.

-
+
+Automatic Releasing of Held Jobs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Many times, releasing the jobs to just try again is successful because the
+system issues are transient.
+
+``releaseExpr`` can be set in the submit YAML to add automatic release
+conditions.  Like other BPS config values, this can be set globally or
+set for a specific cluster or pipetask.  The number of retries is still
+limited by ``numberOfRetries``.  All held jobs count towards this
+limit regardless of the reason.  The plugin prohibits the automatic
+release of jobs held by the user.
+
+Example expressions:
+
+* ``releaseExpr: "True"`` - will always release a held job unless it was held by the user.
+* ``releaseExpr: "HoldReasonCode =?= 7"`` - release jobs where the standard
+  output file for the job could not be opened.
+
+For more information about expressions, see the HTCondor documentation:
+
+* HTCondor `ClassAd expressions`_
+* list of `HoldReasonCodes`_
+
+.. warning::
+
+   System problems should still be tracked and reported.  All of the
+   hold reasons for a single completed run can be found via ``grep -A
+   2 held <submit dir>/*.nodes.log``.
+

 .. _htc-plugin-troubleshooting:

@@ -535,3 +565,6 @@ complete your run.
 .. _condor_release: https://htcondor.readthedocs.io/en/latest/man-pages/condor_release.html
 .. _condor_rm: https://htcondor.readthedocs.io/en/latest/man-pages/condor_rm.html
 .. _lsst_distrib: https://github.com/lsst/lsst_distrib.git
+.. _Automatic memory scaling: https://pipelines.lsst.io/v/weekly/modules/lsst.ctrl.bps/quickstart.html#automatic-memory-scaling
+.. _HoldReasonCodes: https://htcondor.readthedocs.io/en/latest/classad-attributes/job-classad-attributes.html#HoldReasonCode
+.. _ClassAd expressions: https://htcondor.readthedocs.io/en/latest/classads/classad-mechanism.html#classad-evaluation-semantics
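The expressions that the plugin composes from these settings are easiest to see spelled out. Below is an illustrative sketch (not plugin code): the combined strings mirror `_create_periodic_release_expr` in the htcondor_service.py changes that follow, and the exact values are asserted by the new unit tests at the end of this diff. It assumes retries are enabled and automatic memory scaling is off.

```python
# Illustrative only: composed periodic_release expressions, mirroring
# _create_periodic_release_expr in this release.  "HoldReasonCode =!= 1"
# is the guard that prevents auto-releasing jobs held by the user.
guard = "JobStatus == 5 && NumJobStarts <= JobMaxRetries && HoldReasonCode =!= 1"

# releaseExpr: "True" -> release any held job except user-held ones.
expr_always = f"{guard} && True"

# releaseExpr: "HoldReasonCode =?= 7" -> release only jobs held because
# the job's standard output file could not be opened.
expr_stdout = f"{guard} && HoldReasonCode =?= 7"

print(expr_always)
print(expr_stdout)
```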
{lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/python/lsst/ctrl/bps/htcondor/htcondor_service.py
RENAMED

@@ -34,6 +34,7 @@ import logging
 import os
 import re
 from collections import defaultdict
+from copy import deepcopy
 from enum import IntEnum, auto
 from pathlib import Path
 from typing import Any, cast
@@ -331,7 +332,7 @@ class HTCondorService(BaseWmsService):

         Returns
         -------
-        job_ids : `list` [
+        job_ids : `list` [`~typing.Any`]
             Only job ids to be used by cancel and other functions. Typically
             this means top-level jobs (i.e., not children jobs).
         """
@@ -604,15 +605,15 @@ class HTCondorWorkflow(BaseWmsWorkflow):
         self.dag.write(out_prefix, job_subdir="jobs/{self.label}")


-def _create_job(subdir_template,
+def _create_job(subdir_template, cached_values, generic_workflow, gwjob, out_prefix):
     """Convert GenericWorkflow job nodes to DAG jobs.

     Parameters
     ----------
     subdir_template : `str`
         Template for making subdirs.
-
-        Site specific values
+    cached_values : `dict`
+        Site and label specific values.
     generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
         Generic workflow that is being converted.
     gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
@@ -654,7 +655,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
         "on_exit_hold_subcode": "34",
     }

-    htc_job_cmds.update(_translate_job_cmds(
+    htc_job_cmds.update(_translate_job_cmds(cached_values, generic_workflow, gwjob))

     # job stdout, stderr, htcondor user log.
     for key in ("output", "error", "log"):
@@ -662,7 +663,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
         _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

     htc_job_cmds.update(
-        _handle_job_inputs(generic_workflow, gwjob.name,
+        _handle_job_inputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
     )

     # Add the job cmds dict to the job object.
@@ -673,7 +674,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
     # Add job attributes to job.
     _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
     htc_job.add_job_attrs(gwjob.attrs)
-    htc_job.add_job_attrs(
+    htc_job.add_job_attrs(cached_values["attrs"])
     htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
     htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})

@@ -685,8 +686,8 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):

     Parameters
     ----------
-    cached_vals : `dict` [`str`,
-        Config values common to jobs with same label.
+    cached_vals : `dict` [`str`, `~typing.Any`]
+        Config values common to jobs with same site or label.
     generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
         Generic workflow that contains job to being converted.
     gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
@@ -694,7 +695,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):

     Returns
     -------
-    htc_job_commands : `dict` [`str`,
+    htc_job_commands : `dict` [`str`, `~typing.Any`]
         Contains commands which can appear in the HTCondor submit description
         file.
     """
@@ -720,9 +721,6 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
     jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")

     # job commands that need modification
-    if gwjob.number_of_retries:
-        jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
-
     if gwjob.retry_unless_exit:
         if isinstance(gwjob.retry_unless_exit, int):
             jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
@@ -739,6 +737,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
     if gwjob.request_memory:
         jobcmds["request_memory"] = f"{gwjob.request_memory}"

+    memory_max = 0
     if gwjob.memory_multiplier:
         # Do not use try-except! At the moment, BpsConfig returns an empty
         # string if it does not contain the key.
@@ -765,13 +764,18 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
             gwjob.request_memory, gwjob.memory_multiplier, memory_max
         )

-
-
-
-
-
-
-
+    user_release_expr = cached_vals.get("releaseExpr", "")
+    if gwjob.number_of_retries is not None and gwjob.number_of_retries >= 0:
+        jobcmds["max_retries"] = gwjob.number_of_retries
+
+        # No point in adding periodic_release if 0 retries
+        if gwjob.number_of_retries > 0:
+            periodic_release = _create_periodic_release_expr(
+                gwjob.request_memory, gwjob.memory_multiplier, memory_max, user_release_expr
+            )
+            if periodic_release:
+                jobcmds["periodic_release"] = periodic_release
+
     jobcmds["periodic_remove"] = _create_periodic_remove_expr(
         gwjob.request_memory, gwjob.memory_multiplier, memory_max
     )
@@ -830,7 +834,7 @@ def _translate_dag_cmds(gwjob):

     Returns
     -------
-    dagcmds : `dict` [`str`,
+    dagcmds : `dict` [`str`, `~typing.Any`]
         DAGMan commands for the job.
     """
     # Values in the dag script that just are name mappings.
@@ -1180,7 +1184,7 @@ def _get_info_from_path(wms_path: str | os.PathLike) -> tuple[str, dict[str, dic
     -------
     wms_workflow_id : `str`
         The run id which is a DAGman job id.
-    jobs : `dict` [`str`, `dict` [`str`,
+    jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         Information about jobs read from files in the given directory.
         The key is the HTCondor id and the value is a dictionary of HTCondor
         keys and values.
@@ -1340,7 +1344,7 @@ def _add_service_job_specific_info(job_ad: dict[str, Any], specific_info: WmsSpe

     Parameters
     ----------
-    job_ad : `dict` [`str`,
+    job_ad : `dict` [`str`, `~typing.Any`]
         Provisioning job information.
     specific_info : `lsst.ctrl.bps.WmsSpecificInfo`
         Where to add message.
@@ -1466,7 +1470,7 @@ def _add_run_info(wms_path, job):
     ----------
     wms_path : `str`
         Path to submit files for the run.
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.

     Raises
@@ -1502,7 +1506,7 @@ def _get_owner(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.

     Returns
@@ -1524,7 +1528,7 @@ def _get_run_summary(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.

     Returns
@@ -1600,7 +1604,7 @@ def _get_state_counts_from_jobs(
     ----------
     wms_workflow_id : `str`
         HTCondor job id.
-    jobs : `dict [`dict` [`str`,
+    jobs : `dict [`dict` [`str`, `~typing.Any`]]
         HTCondor dag job information.

     Returns
@@ -1628,7 +1632,7 @@ def _get_state_counts_from_dag_job(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.

     Returns
@@ -1684,7 +1688,7 @@ def _htc_status_to_wms_state(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor job information.

     Returns
@@ -1706,7 +1710,7 @@ def _htc_job_status_to_wms_state(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor job information.

     Returns
@@ -1748,7 +1752,7 @@ def _htc_node_status_to_wms_state(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor job information.

     Returns
@@ -1795,9 +1799,9 @@ def _update_jobs(jobs1, jobs2):

     Parameters
     ----------
-    jobs1 : `dict` [`str`, `dict` [`str`,
+    jobs1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         HTCondor job information to be updated.
-    jobs2 : `dict` [`str`, `dict` [`str`,
+    jobs2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         Additional HTCondor job information.
     """
     for job_id, job_ad in jobs2.items():
@@ -1937,34 +1941,39 @@ def _wms_id_to_dir(wms_id):
     return wms_path, id_type


-def _create_periodic_release_expr(
+def _create_periodic_release_expr(
+    memory: int, multiplier: float | None, limit: int, additional_expr: str = ""
+) -> str:
     """Construct an HTCondorAd expression for releasing held jobs.

-    The expression instruct HTCondor to release any job which was put on hold
-    due to exceeding memory requirements back to the job queue providing it
-    satisfies all of the conditions below:
-
-    * number of run attempts did not reach allowable number of retries,
-    * the memory requirements in the last failed run attempt did not reach
-      the specified memory limit.
-
     Parameters
     ----------
     memory : `int`
         Requested memory in MB.
-    multiplier : `float`
-        Memory growth rate between
+    multiplier : `float` or None
+        Memory growth rate between retries.
     limit : `int`
         Memory limit.
+    additional_expr : `str`, optional
+        Expression to add to periodic_release. Defaults to empty string.

     Returns
     -------
     expr : `str`
-        A string representing an HTCondor ClassAd expression for releasing
-        which have been held due to exceeding the memory requirements.
+        A string representing an HTCondor ClassAd expression for releasing job.
     """
-
-
+    _LOG.debug(
+        "periodic_release: memory: %s, multiplier: %s, limit: %s, additional_expr: %s",
+        memory,
+        multiplier,
+        limit,
+        additional_expr,
+    )
+
+    # ctrl_bps sets multiplier to None in the GenericWorkflow if
+    # memoryMultiplier <= 1, but checking value just in case.
+    if (not multiplier or multiplier <= 1) and not additional_expr:
+        return ""

     # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
     # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
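A hypothetical call against the new signature, with argument values borrowed from the new unit tests later in this diff (the expected string follows from the `user_expr` branch shown in the next hunk):

```python
# Hypothetical usage of the new signature above; with multiplier None the
# memory clause is skipped and only the user expression (guarded against
# user-held jobs) is appended.
expr = _create_periodic_release_expr(2048, None, 32768, "HoldReasonCode =?= 7")
# expr == ("JobStatus == 5 && NumJobStarts <= JobMaxRetries && "
#          "HoldReasonCode =!= 1 && HoldReasonCode =?= 7")
```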
@@ -1976,63 +1985,74 @@ def _create_periodic_release_expr(memory, multiplier, limit):
     # the entire expression should evaluate to FALSE when the job is not HELD.
     # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
     # but better safe than sorry.
-
-
-
-
-
+    is_held = "JobStatus == 5"
+    is_retry_allowed = "NumJobStarts <= JobMaxRetries"
+
+    mem_expr = ""
+    if memory and multiplier and multiplier > 1 and limit:
+        was_mem_exceeded = (
+            "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+            "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
+        )
+        was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
+        mem_expr = f"{was_mem_exceeded} && {was_below_limit}"
+
+    user_expr = ""
+    if additional_expr:
+        # Never auto release a job held by user.
+        user_expr = f"HoldReasonCode =!= 1 && {additional_expr}"
+
+    expr = f"{is_held} && {is_retry_allowed}"
+    if user_expr and mem_expr:
+        expr += f" && ({mem_expr} || {user_expr})"
+    elif user_expr:
+        expr += f" && {user_expr}"
+    elif mem_expr:
+        expr += f" && {mem_expr}"

-    expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
     return expr


 def _create_periodic_remove_expr(memory, multiplier, limit):
     """Construct an HTCondorAd expression for removing jobs from the queue.

-    The expression instruct HTCondor to remove any job which was put on hold
-    due to exceeding memory requirements from the job queue providing it
-    satisfies any of the conditions below:
-
-    * allowable number of retries was reached,
-    * the memory requirements during the last failed run attempt reached
-      the specified memory limit.
-
     Parameters
     ----------
     memory : `int`
         Requested memory in MB.
     multiplier : `float`
-        Memory growth rate between
+        Memory growth rate between retries.
     limit : `int`
         Memory limit.

     Returns
     -------
     expr : `str`
-        A string representing an HTCondor ClassAd expression for removing jobs
-        which were run at the maximal allowable memory and still exceeded
-        the memory requirements.
+        A string representing an HTCondor ClassAd expression for removing jobs.
     """
-
-
-
-    #
-    # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
-    # The special comparison operators ensure that all comparisons below will
-    # evaluate to FALSE in this case.
+    # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode'
+    # are UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
+    # The special comparison operators ensure that all comparisons below
+    # will evaluate to FALSE in this case.
     #
     # Note:
-    # May not be strictly necessary. Operators '&&' and '||' are not
-    # the entire expression should evaluate to FALSE when the
-    # According to ClassAd evaluation semantics
-    # but better safe than sorry.
-
-
-
-
-
+    # May not be strictly necessary. Operators '&&' and '||' are not
+    # strict so the entire expression should evaluate to FALSE when the
+    # job is not HELD. According to ClassAd evaluation semantics
+    # FALSE && UNDEFINED is FALSE, but better safe than sorry.
+    is_held = "JobStatus == 5"
+    is_retry_disallowed = "NumJobStarts > JobMaxRetries"
+
+    mem_expr = ""
+    if memory and multiplier and multiplier > 1 and limit:
+        mem_limit_expr = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
+
+        mem_expr = (  # Add || here so only added if adding memory expr
+            " || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+            f"|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && {mem_limit_expr})"
+        )

-    expr = f"{
+    expr = f"{is_held} && ({is_retry_disallowed}{mem_expr})"
     return expr


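The min/pow arithmetic shared by the release and remove expressions above is easier to follow outside ClassAd syntax. A minimal sketch, assuming a job held for exceeding memory and ignoring the `numberOfRetries` cap:

```python
# Sketch of the scaling encoded in the expressions above: after attempt n
# (NumJobStarts == n), the memory the job ran with was
# min(int(memory * multiplier**(n - 1)), limit).  While that value is still
# below the limit the job is released so the next attempt can request more
# memory; once it has reached the limit the job is removed instead.
def memory_for_attempt(memory: int, multiplier: float, limit: int, n: int) -> int:
    return min(int(memory * multiplier ** (n - 1)), limit)

memory, multiplier, limit = 2048, 2.0, 32768
for n in range(1, 6):
    used = memory_for_attempt(memory, multiplier, limit, n)
    verdict = "release" if used < limit else "remove"
    print(f"attempt {n}: ran with {used} MB -> {verdict}")
```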
@@ -2044,7 +2064,7 @@ def _create_request_memory_expr(memory, multiplier, limit):
     memory : `int`
         Requested memory in MB.
     multiplier : `float`
-        Memory growth rate between
+        Memory growth rate between retries.
     limit : `int`
         Memory limit.

@@ -2119,7 +2139,7 @@ def _gather_site_values(config, compute_site):

     Returns
     -------
-    site_values : `dict` [`str`,
+    site_values : `dict` [`str`, `~typing.Any`]
         Values specific to the given site.
     """
     site_values = {"attrs": {}, "profile": {}}
@@ -2167,6 +2187,50 @@ def _gather_site_values(config, compute_site):
     return site_values


+def _gather_label_values(config: BpsConfig, label: str) -> dict[str, Any]:
+    """Gather values specific to given job label.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        BPS configuration that includes necessary submit/runtime
+        information.
+    label : `str`
+        GenericWorkflowJob label.
+
+    Returns
+    -------
+    values : `dict` [`str`, `~typing.Any`]
+        Values specific to the given job label.
+    """
+    values: dict[str, Any] = {"attrs": {}, "profile": {}}
+
+    search_opts = {}
+    profile_key = ""
+    if label == "finalJob":
+        search_opts["searchobj"] = config["finalJob"]
+        profile_key = ".finalJob.profile.condor"
+    elif label in config["cluster"]:
+        search_opts["curvals"] = {"curr_cluster": label}
+        profile_key = f".cluster.{label}.profile.condor"
+    elif label in config["pipetask"]:
+        search_opts["curvals"] = {"curr_pipetask": label}
+        profile_key = f".pipetask.{label}.profile.condor"
+
+    found, value = config.search("releaseExpr", opt=search_opts)
+    if found:
+        values["releaseExpr"] = value
+
+    if profile_key and profile_key in config:
+        for subkey, val in config[profile_key].items():
+            if subkey.startswith("+"):
+                values["attrs"][subkey[1:]] = val
+            else:
+                values["profile"][subkey] = val
+
+    return values
+
+
 def is_service_job(job_ad: dict[str, Any]) -> bool:
     """Determine if a job is a service one.

@@ -2280,16 +2344,22 @@ def _generic_workflow_to_htcondor_dag(
         subdir_template = tmp_template

     # Create all DAG jobs
-    site_values = {} #
+    site_values = {}  # Cache compute site specific values to reduce config lookups.
+    cached_values = {}  # Cache label-specific values to reduce config lookups.
+    # Note: Can't use get_job_by_label because those only include payload jobs.
     for job_name in generic_workflow:
         gwjob = generic_workflow.get_job(job_name)
         if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD:
             gwjob = cast(GenericWorkflowJob, gwjob)
             if gwjob.compute_site not in site_values:
                 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
+            if gwjob.label not in cached_values:
+                cached_values[gwjob.label] = deepcopy(site_values[gwjob.compute_site])
+                cached_values[gwjob.label].update(_gather_label_values(config, gwjob.label))
+                _LOG.debug("cached: %s= %s", gwjob.label, cached_values[gwjob.label])
             htc_job = _create_job(
                 subdir_template[gwjob.label],
-
+                cached_values[gwjob.label],
                 generic_workflow,
                 gwjob,
                 out_prefix,
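One plausible reading of the `deepcopy` in the caching above (an assumption; the source does not spell this out): it keeps each label's cached dictionary independent of the shared per-site one, so in-place updates cannot leak across labels that share a compute site:

```python
# Sketch (stdlib only): without deepcopy, the nested "attrs"/"profile"
# dicts would be shared between the site cache and every label cache,
# so an in-place update for one label would silently affect the others.
from copy import deepcopy

site = {"attrs": {}, "profile": {}, "bpsUseShared": True}
label_a = deepcopy(site)
label_a["attrs"]["bps_special"] = "x"   # label-specific attribute
assert site["attrs"] == {}              # shared site cache unchanged

shallow = dict(site)                    # a plain copy would alias "attrs"
shallow["attrs"]["bps_special"] = "x"
assert site["attrs"] != {}              # mutation leaked into site cache
```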
@@ -2351,8 +2421,15 @@ def _generic_workflow_to_htcondor_dag(
     if final and isinstance(final, GenericWorkflowJob):
         if final.compute_site and final.compute_site not in site_values:
             site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
+        if final.label not in cached_values:
+            cached_values[final.label] = deepcopy(site_values[final.compute_site])
+            cached_values[final.label].update(_gather_label_values(config, final.label))
         final_htjob = _create_job(
-            subdir_template[final.label],
+            subdir_template[final.label],
+            cached_values[final.label],
+            generic_workflow,
+            final,
+            out_prefix,
         )
         if "post" not in final_htjob.dagcmds:
             final_htjob.dagcmds["post"] = {
{lsst_ctrl_bps_htcondor-29.2025.2000 → lsst_ctrl_bps_htcondor-29.2025.2100}/tests/test_htcondor_service.py
RENAMED

@@ -352,7 +352,7 @@ class TranslateJobCmdsTestCase(unittest.TestCase):

     def setUp(self):
         self.gw_exec = GenericWorkflowExec("test_exec", "/dummy/dir/pipetask")
-        self.cached_vals = {"profile": {}, "bpsUseShared": True}
+        self.cached_vals = {"profile": {}, "bpsUseShared": True, "memoryLimit": 32768}

     def testRetryUnlessNone(self):
         gwjob = GenericWorkflowJob("retryUnless", "label1", executable=self.gw_exec)
@@ -409,6 +409,30 @@ class TranslateJobCmdsTestCase(unittest.TestCase):
         htc_commands = htcondor_service._translate_job_cmds(self.cached_vals, None, gwjob)
         self.assertEqual(htc_commands["environment"], "TEST_ENV_VAR='$ENV(CTRL_BPS_DIR)/tests'")

+    def testPeriodicRelease(self):
+        gwjob = GenericWorkflowJob("periodicRelease", "label1", executable=self.gw_exec)
+        gwjob.request_memory = 2048
+        gwjob.memory_multiplier = 2
+        gwjob.number_of_retries = 3
+        htc_commands = htcondor_service._translate_job_cmds(self.cached_vals, None, gwjob)
+        release = (
+            "JobStatus == 5 && NumJobStarts <= JobMaxRetries && "
+            "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 || "
+            "HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && "
+            "min({int(2048 * pow(2, NumJobStarts - 1)), 32768}) < 32768"
+        )
+        self.assertEqual(htc_commands["periodic_release"], release)
+
+    def testPeriodicRemoveNoRetries(self):
+        gwjob = GenericWorkflowJob("periodicRelease", "label1", executable=self.gw_exec)
+        gwjob.request_memory = 2048
+        gwjob.memory_multiplier = 1
+        gwjob.number_of_retries = 0
+        htc_commands = htcondor_service._translate_job_cmds(self.cached_vals, None, gwjob)
+        remove = "JobStatus == 5 && (NumJobStarts > JobMaxRetries)"
+        self.assertEqual(htc_commands["periodic_remove"], remove)
+        self.assertEqual(htc_commands["max_retries"], 0)
+

 class GetStateCountsFromDagJobTestCase(unittest.TestCase):
     """Test counting number of jobs per WMS state."""
@@ -1135,6 +1159,53 @@ class GatherSiteValuesTestCase(unittest.TestCase):
         self.assertEqual(results["memoryLimit"], BPS_DEFAULTS["memoryLimit"])


+class GatherLabelValuesTestCase(unittest.TestCase):
+    """Test _gather_labels_values function."""
+
+    def testClusterLabel(self):
+        # Test cluster value overrides pipetask.
+        label = "label1"
+        config = BpsConfig(
+            {
+                "cluster": {
+                    "label1": {"releaseExpr": "cluster_val", "profile": {"condor": {"prof_val1": 3}}}
+                },
+                "pipetask": {"label1": {"releaseExpr": "pipetask_val"}},
+            },
+            search_order=BPS_SEARCH_ORDER,
+            defaults=BPS_DEFAULTS,
+            wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
+        )
+        results = htcondor_service._gather_label_values(config, label)
+        self.assertEqual(results, {"attrs": {}, "profile": {"prof_val1": 3}, "releaseExpr": "cluster_val"})
+
+    def testPipetaskLabel(self):
+        label = "label1"
+        config = BpsConfig(
+            {
+                "pipetask": {
+                    "label1": {"releaseExpr": "pipetask_val", "profile": {"condor": {"prof_val1": 3}}}
+                }
+            },
+            search_order=BPS_SEARCH_ORDER,
+            defaults=BPS_DEFAULTS,
+            wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
+        )
+        results = htcondor_service._gather_label_values(config, label)
+        self.assertEqual(results, {"attrs": {}, "profile": {"prof_val1": 3}, "releaseExpr": "pipetask_val"})
+
+    def testNoSection(self):
+        label = "notThere"
+        config = BpsConfig(
+            {},
+            search_order=BPS_SEARCH_ORDER,
+            defaults=BPS_DEFAULTS,
+            wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
+        )
+        results = htcondor_service._gather_label_values(config, label)
+        self.assertEqual(results, {"attrs": {}, "profile": {}})
+
+
 class CreateCheckJobTestCase(unittest.TestCase):
     """Test _create_check_job function."""

@@ -1147,5 +1218,77 @@
         self.assertIn("check_group_status.sub", job.subfile)


+class CreatePeriodicReleaseExprTestCase(unittest.TestCase):
+    """Test _create_periodic_release_expr function."""
+
+    def testNoReleaseExpr(self):
+        results = htcondor_service._create_periodic_release_expr(2048, 1, 32768, "")
+        self.assertEqual(results, "")
+
+    def testMultiplierNone(self):
+        results = htcondor_service._create_periodic_release_expr(2048, None, 32768, "")
+        self.assertEqual(results, "")
+
+    def testJustMemoryReleaseExpr(self):
+        self.maxDiff = None  # so test error shows entire strings
+        results = htcondor_service._create_periodic_release_expr(2048, 2, 32768, "")
+        truth = (
+            "JobStatus == 5 && NumJobStarts <= JobMaxRetries && "
+            "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 || "
+            "HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && "
+            "min({int(2048 * pow(2, NumJobStarts - 1)), 32768}) < 32768"
+        )
+        self.assertEqual(results, truth)
+
+    def testJustUserReleaseExpr(self):
+        results = htcondor_service._create_periodic_release_expr(2048, 1, 32768, "True")
+        truth = "JobStatus == 5 && NumJobStarts <= JobMaxRetries && HoldReasonCode =!= 1 && True"
+        self.assertEqual(results, truth)
+
+    def testJustUserReleaseExprMultiplierNone(self):
+        results = htcondor_service._create_periodic_release_expr(2048, None, 32768, "True")
+        truth = "JobStatus == 5 && NumJobStarts <= JobMaxRetries && HoldReasonCode =!= 1 && True"
+        self.assertEqual(results, truth)
+
+    def testMemoryAndUserReleaseExpr(self):
+        self.maxDiff = None  # so test error shows entire strings
+        results = htcondor_service._create_periodic_release_expr(2048, 2, 32768, "True")
+        truth = (
+            "JobStatus == 5 && NumJobStarts <= JobMaxRetries && "
+            "((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 || "
+            "HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && "
+            "min({int(2048 * pow(2, NumJobStarts - 1)), 32768}) < 32768 || "
+            "HoldReasonCode =!= 1 && True)"
+        )
+        self.assertEqual(results, truth)
+
+
+class CreatePeriodicRemoveExprTestCase(unittest.TestCase):
+    """Test _create_periodic_release_expr function."""
+
+    def testBasicRemoveExpr(self):
+        """Function assumes only called if max_retries >= 0."""
+        results = htcondor_service._create_periodic_remove_expr(2048, 1, 32768)
+        truth = "JobStatus == 5 && (NumJobStarts > JobMaxRetries)"
+        self.assertEqual(results, truth)
+
+    def testBasicRemoveExprMultiplierNone(self):
+        """Function assumes only called if max_retries >= 0."""
+        results = htcondor_service._create_periodic_remove_expr(2048, None, 32768)
+        truth = "JobStatus == 5 && (NumJobStarts > JobMaxRetries)"
+        self.assertEqual(results, truth)
+
+    def testMemoryRemoveExpr(self):
+        self.maxDiff = None  # so test error shows entire strings
+        results = htcondor_service._create_periodic_remove_expr(2048, 2, 32768)
+        truth = (
+            "JobStatus == 5 && (NumJobStarts > JobMaxRetries || "
+            "((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 || "
+            "HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && "
+            "min({int(2048 * pow(2, NumJobStarts - 1)), 32768}) == 32768))"
+        )
+        self.assertEqual(results, truth)
+
+
 if __name__ == "__main__":
     unittest.main()