lsst-ctrl-bps-htcondor 29.2025.2000-py3-none-any.whl → 29.2025.2200-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- lsst/ctrl/bps/htcondor/htcondor_service.py +290 -92
- lsst/ctrl/bps/htcondor/lssthtc.py +1 -0
- lsst/ctrl/bps/htcondor/version.py +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.2000.dist-info → lsst_ctrl_bps_htcondor-29.2025.2200.dist-info}/METADATA +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.2000.dist-info → lsst_ctrl_bps_htcondor-29.2025.2200.dist-info}/RECORD +12 -12
- {lsst_ctrl_bps_htcondor-29.2025.2000.dist-info → lsst_ctrl_bps_htcondor-29.2025.2200.dist-info}/WHEEL +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.2000.dist-info → lsst_ctrl_bps_htcondor-29.2025.2200.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000.dist-info → lsst_ctrl_bps_htcondor-29.2025.2200.dist-info}/licenses/LICENSE +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000.dist-info → lsst_ctrl_bps_htcondor-29.2025.2200.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000.dist-info → lsst_ctrl_bps_htcondor-29.2025.2200.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000.dist-info → lsst_ctrl_bps_htcondor-29.2025.2200.dist-info}/top_level.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2000.dist-info → lsst_ctrl_bps_htcondor-29.2025.2200.dist-info}/zip-safe +0 -0
lsst/ctrl/bps/htcondor/htcondor_service.py

@@ -34,6 +34,7 @@ import logging
 import os
 import re
 from collections import defaultdict
+from copy import deepcopy
 from enum import IntEnum, auto
 from pathlib import Path
 from typing import Any, cast
@@ -331,7 +332,7 @@ class HTCondorService(BaseWmsService):

         Returns
         -------
-        job_ids : `list` [
+        job_ids : `list` [`~typing.Any`]
             Only job ids to be used by cancel and other functions. Typically
             this means top-level jobs (i.e., not children jobs).
         """
@@ -400,6 +401,54 @@ class HTCondorService(BaseWmsService):
         _LOG.debug("job_ids = %s", job_ids)
         return job_ids

+    def get_status(
+        self,
+        wms_workflow_id: str,
+        hist: float = 1,
+        is_global: bool = False,
+    ) -> tuple[WmsStates, str]:
+        """Return status of run based upon given constraints.
+
+        Parameters
+        ----------
+        wms_workflow_id : `str`
+            Limit to specific run based on id (queue id or path).
+        hist : `float`, optional
+            Limit history search to this many days. Defaults to 1.
+        is_global : `bool`, optional
+            If set, all job queues (and their histories) will be queried for
+            job information. Defaults to False which means that only the local
+            job queue will be queried.
+
+        Returns
+        -------
+        state : `lsst.ctrl.bps.WmsStates`
+            Status of single run from given information.
+        message : `str`
+            Extra message for status command to print. This could be pointers
+            to documentation or to WMS specific commands.
+        """
+        _LOG.debug("get_status: id=%s, hist=%s, is_global=%s", wms_workflow_id, hist, is_global)
+
+        id_type = _wms_id_type(wms_workflow_id)
+        _LOG.debug("id_type = %s", id_type.name)
+
+        if id_type == WmsIdType.LOCAL:
+            schedulers = _locate_schedds(locate_all=is_global)
+            _LOG.debug("schedulers = %s", schedulers)
+            state, message = _get_status_from_id(wms_workflow_id, hist, schedds=schedulers)
+        elif id_type == WmsIdType.GLOBAL:
+            schedulers = _locate_schedds(locate_all=True)
+            _LOG.debug("schedulers = %s", schedulers)
+            state, message = _get_status_from_id(wms_workflow_id, hist, schedds=schedulers)
+        elif id_type == WmsIdType.PATH:
+            state, message = _get_status_from_path(wms_workflow_id)
+        else:
+            state, message = WmsStates.UNKNOWN, "Invalid job id"
+        _LOG.debug("state: %s, %s", state, message)
+
+        return state, message
+
     def report(
         self,
         wms_workflow_id=None,
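The new `get_status` entry point returns only the overall run state plus a short message, a lighter-weight alternative to `report`. A minimal usage sketch (the empty config and the run id are illustrative, not taken from this diff):

    from lsst.ctrl.bps import BpsConfig, WmsStates
    from lsst.ctrl.bps.htcondor import HTCondorService

    service = HTCondorService(BpsConfig({}))  # hypothetical, minimal config
    state, message = service.get_status("1234.0", hist=2, is_global=True)
    if state == WmsStates.UNKNOWN:
        print(message)  # e.g., "Invalid job id" or a hint to try the submit path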
@@ -604,15 +653,15 @@ class HTCondorWorkflow(BaseWmsWorkflow):
         self.dag.write(out_prefix, job_subdir="jobs/{self.label}")


-def _create_job(subdir_template,
+def _create_job(subdir_template, cached_values, generic_workflow, gwjob, out_prefix):
     """Convert GenericWorkflow job nodes to DAG jobs.

     Parameters
     ----------
     subdir_template : `str`
         Template for making subdirs.
-
-        Site specific values
+    cached_values : `dict`
+        Site and label specific values.
     generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
         Generic workflow that is being converted.
     gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
@@ -654,7 +703,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
         "on_exit_hold_subcode": "34",
     }

-    htc_job_cmds.update(_translate_job_cmds(
+    htc_job_cmds.update(_translate_job_cmds(cached_values, generic_workflow, gwjob))

     # job stdout, stderr, htcondor user log.
     for key in ("output", "error", "log"):
@@ -662,7 +711,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
         _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])

     htc_job_cmds.update(
-        _handle_job_inputs(generic_workflow, gwjob.name,
+        _handle_job_inputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
     )

     # Add the job cmds dict to the job object.
@@ -673,7 +722,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
     # Add job attributes to job.
     _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
     htc_job.add_job_attrs(gwjob.attrs)
-    htc_job.add_job_attrs(
+    htc_job.add_job_attrs(cached_values["attrs"])
     htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
     htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})

@@ -685,8 +734,8 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):

     Parameters
     ----------
-    cached_vals : `dict` [`str`,
-        Config values common to jobs with same label.
+    cached_vals : `dict` [`str`, `~typing.Any`]
+        Config values common to jobs with same site or label.
     generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
         Generic workflow that contains job to being converted.
     gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
@@ -694,7 +743,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):

     Returns
     -------
-    htc_job_commands : `dict` [`str`,
+    htc_job_commands : `dict` [`str`, `~typing.Any`]
         Contains commands which can appear in the HTCondor submit description
         file.
     """
@@ -720,9 +769,6 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
     jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")

     # job commands that need modification
-    if gwjob.number_of_retries:
-        jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
-
     if gwjob.retry_unless_exit:
         if isinstance(gwjob.retry_unless_exit, int):
             jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
@@ -739,6 +785,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
     if gwjob.request_memory:
         jobcmds["request_memory"] = f"{gwjob.request_memory}"

+    memory_max = 0
     if gwjob.memory_multiplier:
         # Do not use try-except! At the moment, BpsConfig returns an empty
         # string if it does not contain the key.
@@ -765,13 +812,18 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
             gwjob.request_memory, gwjob.memory_multiplier, memory_max
         )

-
-
-
-
-
-
-
+    user_release_expr = cached_vals.get("releaseExpr", "")
+    if gwjob.number_of_retries is not None and gwjob.number_of_retries >= 0:
+        jobcmds["max_retries"] = gwjob.number_of_retries
+
+        # No point in adding periodic_release if 0 retries
+        if gwjob.number_of_retries > 0:
+            periodic_release = _create_periodic_release_expr(
+                gwjob.request_memory, gwjob.memory_multiplier, memory_max, user_release_expr
+            )
+            if periodic_release:
+                jobcmds["periodic_release"] = periodic_release
+
     jobcmds["periodic_remove"] = _create_periodic_remove_expr(
         gwjob.request_memory, gwjob.memory_multiplier, memory_max
     )
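With this hunk, `max_retries` is now set as an integer (previously an f-string) and a `periodic_release` expression is only attached when the job can actually retry. A hedged sketch of the submit commands this branch might contribute for a hypothetical job with number_of_retries=3, request_memory=2048, and memory_multiplier=2.0:

    jobcmds = {
        "request_memory": "2048",
        "max_retries": 3,  # int now, not a string
        # added because number_of_retries > 0 and the builder returned non-empty:
        "periodic_release": "JobStatus == 5 && NumJobStarts <= JobMaxRetries && ...",
        "periodic_remove": "JobStatus == 5 && (NumJobStarts > JobMaxRetries || ...)",
    }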
@@ -830,7 +882,7 @@ def _translate_dag_cmds(gwjob):

     Returns
     -------
-    dagcmds : `dict` [`str`,
+    dagcmds : `dict` [`str`, `~typing.Any`]
         DAGMan commands for the job.
     """
     # Values in the dag script that just are name mappings.
@@ -1010,6 +1062,77 @@ def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_sha
     return htc_commands


+def _get_status_from_id(
+    wms_workflow_id: str, hist: float, schedds: dict[str, htcondor.Schedd]
+) -> tuple[WmsStates, str]:
+    """Gather run information using workflow id.
+
+    Parameters
+    ----------
+    wms_workflow_id : `str`
+        Limit to specific run based on id.
+    hist : `float`
+        Limit history search to this many days.
+    schedds : `dict` [ `str`, `htcondor.Schedd` ]
+        HTCondor schedulers which to query for job information. If empty
+        dictionary, all queries will be run against the local scheduler only.
+
+    Returns
+    -------
+    state : `lsst.ctrl.bps.WmsStates`
+        Status for the corresponding run.
+    message : `str`
+        Message with extra error information.
+    """
+    _LOG.debug("_get_status_from_id: id=%s, hist=%s, schedds=%s", wms_workflow_id, hist, schedds)
+
+    message = ""
+
+    # Collect information about the job by querying HTCondor schedd and
+    # HTCondor history.
+    schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
+    if len(schedd_dag_info) == 1:
+        schedd_name = next(iter(schedd_dag_info))
+        dag_id = next(iter(schedd_dag_info[schedd_name]))
+        dag_ad = schedd_dag_info[schedd_name][dag_id]
+        state = _htc_status_to_wms_state(dag_ad)
+    else:
+        state = WmsStates.UNKNOWN
+        message = f"DAGMan job {wms_workflow_id} not found in queue or history. Check id or try path."
+    return state, message
+
+
+def _get_status_from_path(wms_path: str | os.PathLike) -> tuple[WmsStates, str]:
+    """Gather run status from a given run directory.
+
+    Parameters
+    ----------
+    wms_path : `str` | `os.PathLike`
+        The directory containing the submit side files (e.g., HTCondor files).
+
+    Returns
+    -------
+    state : `lsst.ctrl.bps.WmsStates`
+        Status for the run.
+    message : `str`
+        Message to be printed.
+    """
+    wms_path = Path(wms_path).resolve()
+    message = ""
+    try:
+        wms_workflow_id, dag_ad = read_dag_log(wms_path)
+    except FileNotFoundError:
+        wms_workflow_id = MISSING_ID
+        message = f"DAGMan log not found in {wms_path}. Check path."
+
+    if wms_workflow_id == MISSING_ID:
+        state = WmsStates.UNKNOWN
+    else:
+        state = _htc_status_to_wms_state(dag_ad[wms_workflow_id])
+
+    return state, message
+
+
 def _report_from_path(wms_path):
     """Gather run information from a given run directory.

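Both new helpers return the same `(WmsStates, str)` pair, so id-based and path-based lookups can be consumed uniformly by `get_status`. A hedged sketch (the submit directory is hypothetical):

    state, message = _get_status_from_path("/submit/u/jdoe/my_pipeline/20250601T120000Z")
    if state == WmsStates.UNKNOWN and message:
        print(message)  # "DAGMan log not found in ... Check path."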
@@ -1135,11 +1258,11 @@ def _get_info_from_schedd(
     ----------
     wms_workflow_id : `str`
         Limit to specific run based on id.
-    hist : `
+    hist : `float`
         Limit history search to this many days.
-    schedds : `dict` [ `str`, `htcondor.Schedd` ]
-        HTCondor schedulers which to query for job information. If
-
+    schedds : `dict` [ `str`, `htcondor.Schedd` ]
+        HTCondor schedulers which to query for job information. If empty
+        dictionary, all queries will be run against the local scheduler only.

     Returns
     -------
@@ -1148,6 +1271,8 @@ def _get_info_from_schedd(
         Scheduler, local HTCondor job ids are mapped to their respective
         classads.
     """
+    _LOG.debug("_get_info_from_schedd: id=%s, hist=%s, schedds=%s", wms_workflow_id, hist, schedds)
+
     dag_constraint = 'regexp("dagman$", Cmd)'
     try:
         cluster_id = int(float(wms_workflow_id))
@@ -1180,7 +1305,7 @@ def _get_info_from_path(wms_path: str | os.PathLike) -> tuple[str, dict[str, dic
     -------
     wms_workflow_id : `str`
         The run id which is a DAGman job id.
-    jobs : `dict` [`str`, `dict` [`str`,
+    jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         Information about jobs read from files in the given directory.
         The key is the HTCondor id and the value is a dictionary of HTCondor
         keys and values.
@@ -1340,7 +1465,7 @@ def _add_service_job_specific_info(job_ad: dict[str, Any], specific_info: WmsSpe

     Parameters
     ----------
-    job_ad : `dict` [`str`,
+    job_ad : `dict` [`str`, `~typing.Any`]
         Provisioning job information.
     specific_info : `lsst.ctrl.bps.WmsSpecificInfo`
         Where to add message.
@@ -1466,7 +1591,7 @@ def _add_run_info(wms_path, job):
     ----------
     wms_path : `str`
         Path to submit files for the run.
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.

     Raises
@@ -1502,7 +1627,7 @@ def _get_owner(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.

     Returns
@@ -1524,7 +1649,7 @@ def _get_run_summary(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.

     Returns
@@ -1600,7 +1725,7 @@ def _get_state_counts_from_jobs(
     ----------
     wms_workflow_id : `str`
         HTCondor job id.
-    jobs : `dict [`dict` [`str`,
+    jobs : `dict [`dict` [`str`, `~typing.Any`]]
         HTCondor dag job information.

     Returns
@@ -1628,7 +1753,7 @@ def _get_state_counts_from_dag_job(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.

     Returns
@@ -1684,7 +1809,7 @@ def _htc_status_to_wms_state(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor job information.

     Returns
@@ -1706,7 +1831,7 @@ def _htc_job_status_to_wms_state(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor job information.

     Returns
@@ -1748,7 +1873,7 @@ def _htc_node_status_to_wms_state(job):

     Parameters
     ----------
-    job : `dict` [`str`,
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor job information.

     Returns
@@ -1795,9 +1920,9 @@ def _update_jobs(jobs1, jobs2):

     Parameters
     ----------
-    jobs1 : `dict` [`str`, `dict` [`str`,
+    jobs1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         HTCondor job information to be updated.
-    jobs2 : `dict` [`str`, `dict` [`str`,
+    jobs2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         Additional HTCondor job information.
     """
     for job_id, job_ad in jobs2.items():
@@ -1937,34 +2062,39 @@ def _wms_id_to_dir(wms_id):
     return wms_path, id_type


-def _create_periodic_release_expr(
+def _create_periodic_release_expr(
+    memory: int, multiplier: float | None, limit: int, additional_expr: str = ""
+) -> str:
     """Construct an HTCondorAd expression for releasing held jobs.

-    The expression instruct HTCondor to release any job which was put on hold
-    due to exceeding memory requirements back to the job queue providing it
-    satisfies all of the conditions below:
-
-    * number of run attempts did not reach allowable number of retries,
-    * the memory requirements in the last failed run attempt did not reach
-      the specified memory limit.
-
     Parameters
     ----------
     memory : `int`
         Requested memory in MB.
-    multiplier : `float`
-        Memory growth rate between
+    multiplier : `float` or None
+        Memory growth rate between retries.
     limit : `int`
         Memory limit.
+    additional_expr : `str`, optional
+        Expression to add to periodic_release. Defaults to empty string.

     Returns
     -------
     expr : `str`
-        A string representing an HTCondor ClassAd expression for releasing
-        which have been held due to exceeding the memory requirements.
+        A string representing an HTCondor ClassAd expression for releasing job.
     """
-
-
+    _LOG.debug(
+        "periodic_release: memory: %s, multiplier: %s, limit: %s, additional_expr: %s",
+        memory,
+        multiplier,
+        limit,
+        additional_expr,
+    )
+
+    # ctrl_bps sets multiplier to None in the GenericWorkflow if
+    # memoryMultiplier <= 1, but checking value just in case.
+    if (not multiplier or multiplier <= 1) and not additional_expr:
+        return ""

     # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
     # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
@@ -1976,63 +2106,74 @@ def _create_periodic_release_expr(memory, multiplier, limit):
     # the entire expression should evaluate to FALSE when the job is not HELD.
     # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
     # but better safe than sorry.
-
-
-
-
-
+    is_held = "JobStatus == 5"
+    is_retry_allowed = "NumJobStarts <= JobMaxRetries"
+
+    mem_expr = ""
+    if memory and multiplier and multiplier > 1 and limit:
+        was_mem_exceeded = (
+            "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+            "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
+        )
+        was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
+        mem_expr = f"{was_mem_exceeded} && {was_below_limit}"
+
+    user_expr = ""
+    if additional_expr:
+        # Never auto release a job held by user.
+        user_expr = f"HoldReasonCode =!= 1 && {additional_expr}"
+
+    expr = f"{is_held} && {is_retry_allowed}"
+    if user_expr and mem_expr:
+        expr += f" && ({mem_expr} || {user_expr})"
+    elif user_expr:
+        expr += f" && {user_expr}"
+    elif mem_expr:
+        expr += f" && {mem_expr}"

-    expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
     return expr


 def _create_periodic_remove_expr(memory, multiplier, limit):
     """Construct an HTCondorAd expression for removing jobs from the queue.

-    The expression instruct HTCondor to remove any job which was put on hold
-    due to exceeding memory requirements from the job queue providing it
-    satisfies any of the conditions below:
-
-    * allowable number of retries was reached,
-    * the memory requirements during the last failed run attempt reached
-      the specified memory limit.
-
     Parameters
     ----------
     memory : `int`
         Requested memory in MB.
     multiplier : `float`
-        Memory growth rate between
+        Memory growth rate between retries.
     limit : `int`
         Memory limit.

     Returns
     -------
     expr : `str`
-        A string representing an HTCondor ClassAd expression for removing jobs
-        which were run at the maximal allowable memory and still exceeded
-        the memory requirements.
+        A string representing an HTCondor ClassAd expression for removing jobs.
     """
-
-
-
-    #
-    # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
-    # The special comparison operators ensure that all comparisons below will
-    # evaluate to FALSE in this case.
+    # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode'
+    # are UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
+    # The special comparison operators ensure that all comparisons below
+    # will evaluate to FALSE in this case.
     #
     # Note:
-    # May not be strictly necessary. Operators '&&' and '||' are not
-    # the entire expression should evaluate to FALSE when the
-    # According to ClassAd evaluation semantics
-    # but better safe than sorry.
-
-
-
-
-
+    # May not be strictly necessary. Operators '&&' and '||' are not
+    # strict so the entire expression should evaluate to FALSE when the
+    # job is not HELD. According to ClassAd evaluation semantics
+    # FALSE && UNDEFINED is FALSE, but better safe than sorry.
+    is_held = "JobStatus == 5"
+    is_retry_disallowed = "NumJobStarts > JobMaxRetries"
+
+    mem_expr = ""
+    if memory and multiplier and multiplier > 1 and limit:
+        mem_limit_expr = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"

+        mem_expr = (  # Add || here so only added if adding memory expr
+            " || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+            f"|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && {mem_limit_expr})"
+        )

-    expr = f"{
+    expr = f"{is_held} && ({is_retry_disallowed}{mem_expr})"
     return expr

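To make the rewritten builders concrete, here is what they would produce for illustrative inputs memory=2048, multiplier=2.0, limit=8192, and no user-supplied releaseExpr (line breaks added for readability):

    # _create_periodic_release_expr(2048, 2.0, 8192) ->
    #   JobStatus == 5 && NumJobStarts <= JobMaxRetries
    #   && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
    #       || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
    #   && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192
    #
    # _create_periodic_remove_expr(2048, 2.0, 8192) ->
    #   JobStatus == 5 && (NumJobStarts > JobMaxRetries
    #   || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0
    #        || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)
    #       && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192))

When a user releaseExpr is present, the release expression instead ORs the memory clause with `HoldReasonCode =!= 1 && <releaseExpr>`, so jobs held by hand (HoldReasonCode 1) are never auto-released.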
@@ -2044,7 +2185,7 @@ def _create_request_memory_expr(memory, multiplier, limit):
     memory : `int`
         Requested memory in MB.
     multiplier : `float`
-        Memory growth rate between
+        Memory growth rate between retries.
     limit : `int`
         Memory limit.

@@ -2119,7 +2260,7 @@ def _gather_site_values(config, compute_site):

     Returns
     -------
-    site_values : `dict` [`str`,
+    site_values : `dict` [`str`, `~typing.Any`]
         Values specific to the given site.
     """
     site_values = {"attrs": {}, "profile": {}}
@@ -2167,6 +2308,50 @@ def _gather_site_values(config, compute_site):
     return site_values


+def _gather_label_values(config: BpsConfig, label: str) -> dict[str, Any]:
+    """Gather values specific to given job label.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        BPS configuration that includes necessary submit/runtime
+        information.
+    label : `str`
+        GenericWorkflowJob label.
+
+    Returns
+    -------
+    values : `dict` [`str`, `~typing.Any`]
+        Values specific to the given job label.
+    """
+    values: dict[str, Any] = {"attrs": {}, "profile": {}}
+
+    search_opts = {}
+    profile_key = ""
+    if label == "finalJob":
+        search_opts["searchobj"] = config["finalJob"]
+        profile_key = ".finalJob.profile.condor"
+    elif label in config["cluster"]:
+        search_opts["curvals"] = {"curr_cluster": label}
+        profile_key = f".cluster.{label}.profile.condor"
+    elif label in config["pipetask"]:
+        search_opts["curvals"] = {"curr_pipetask": label}
+        profile_key = f".pipetask.{label}.profile.condor"
+
+    found, value = config.search("releaseExpr", opt=search_opts)
+    if found:
+        values["releaseExpr"] = value
+
+    if profile_key and profile_key in config:
+        for subkey, val in config[profile_key].items():
+            if subkey.startswith("+"):
+                values["attrs"][subkey[1:]] = val
+            else:
+                values["profile"][subkey] = val
+
+    return values
+
+
 def is_service_job(job_ad: dict[str, Any]) -> bool:
     """Determine if a job is a service one.

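`_gather_label_values` resolves a per-label `releaseExpr` plus a `profile.condor` section under `finalJob`, `.cluster.<label>`, or `.pipetask.<label>`; keys prefixed with `+` become job ClassAd attributes while the rest land in the submit profile. A hedged sketch of the return value for a hypothetical config:

    # Assume the submit config carries, under .pipetask.calibrate:
    #   releaseExpr: HoldReasonCode =?= 46          (hypothetical value)
    #   profile.condor: {"+job_type": "calibration", "request_cpus": "4"}
    # Then _gather_label_values(config, "calibrate") returns roughly:
    {
        "attrs": {"job_type": "calibration"},   # "+" keys -> ClassAd attrs
        "profile": {"request_cpus": "4"},       # other keys -> submit profile
        "releaseExpr": "HoldReasonCode =?= 46",
    }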
@@ -2280,16 +2465,22 @@ def _generic_workflow_to_htcondor_dag(
         subdir_template = tmp_template

     # Create all DAG jobs
-    site_values = {}  #
+    site_values = {}  # Cache compute site specific values to reduce config lookups.
+    cached_values = {}  # Cache label-specific values to reduce config lookups.
+    # Note: Can't use get_job_by_label because those only include payload jobs.
     for job_name in generic_workflow:
         gwjob = generic_workflow.get_job(job_name)
         if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD:
             gwjob = cast(GenericWorkflowJob, gwjob)
             if gwjob.compute_site not in site_values:
                 site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
+            if gwjob.label not in cached_values:
+                cached_values[gwjob.label] = deepcopy(site_values[gwjob.compute_site])
+                cached_values[gwjob.label].update(_gather_label_values(config, gwjob.label))
+            _LOG.debug("cached: %s= %s", gwjob.label, cached_values[gwjob.label])
             htc_job = _create_job(
                 subdir_template[gwjob.label],
-
+                cached_values[gwjob.label],
                 generic_workflow,
                 gwjob,
                 out_prefix,
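Seeding the per-label cache with `deepcopy` rather than a shallow copy matters here: a shallow copy would alias the nested `attrs`/`profile` dicts, so a label-level update could silently mutate the shared per-site values. Illustrative:

    from copy import deepcopy

    site = {"attrs": {"site_attr": "A"}, "profile": {}}
    label = deepcopy(site)
    label["attrs"]["extra"] = "x"  # leaves site["attrs"] untouched
    assert "extra" not in site["attrs"]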
@@ -2351,8 +2542,15 @@ def _generic_workflow_to_htcondor_dag(
     if final and isinstance(final, GenericWorkflowJob):
         if final.compute_site and final.compute_site not in site_values:
             site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
+        if final.label not in cached_values:
+            cached_values[final.label] = deepcopy(site_values[final.compute_site])
+            cached_values[final.label].update(_gather_label_values(config, final.label))
         final_htjob = _create_job(
-            subdir_template[final.label],
+            subdir_template[final.label],
+            cached_values[final.label],
+            generic_workflow,
+            final,
+            out_prefix,
         )
         if "post" not in final_htjob.dagcmds:
             final_htjob.dagcmds["post"] = {
lsst/ctrl/bps/htcondor/lssthtc.py

@@ -1376,6 +1376,7 @@ def condor_search(constraint=None, hist=None, schedds=None):

     job_info = condor_q(constraint=constraint, schedds=schedds)
     if hist is not None:
+        _LOG.debug("Searching history going back %s days", hist)
         epoch = (datetime.now() - timedelta(days=hist)).timestamp()
         constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"
     hist_info = condor_history(constraint, schedds=schedds)
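The new debug line records how far back the history search reaches; the cutoff itself is a day-based epoch appended to the constraint. A small sketch of the resulting history constraint, using the DAGMan base constraint seen elsewhere in this diff:

    from datetime import datetime, timedelta

    hist = 1  # days
    epoch = (datetime.now() - timedelta(days=hist)).timestamp()
    constraint = 'regexp("dagman$", Cmd)'
    constraint += f" && (CompletionDate >= {epoch} || JobFinishedHookDone >= {epoch})"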
lsst/ctrl/bps/htcondor/version.py

@@ -1,2 +1,2 @@
 __all__ = ["__version__"]
-__version__ = "29.2025.
+__version__ = "29.2025.2200"
lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/RECORD

@@ -2,18 +2,18 @@ lsst/ctrl/bps/htcondor/__init__.py,sha256=1gTmOVLJILvBqgqHVECo8uqoX8e4fiTeH_dHBU
 lsst/ctrl/bps/htcondor/final_post.sh,sha256=chfaQV6Q7rGsK-8Hx58ch52m-PofvBanrl7VwCssHec,248
 lsst/ctrl/bps/htcondor/handlers.py,sha256=2gM3Ac00in4ob9ckcP331W1LSEjs9UDKIqt4MULA4bg,11196
 lsst/ctrl/bps/htcondor/htcondor_config.py,sha256=c4lCiYEwEXFdxgbMfEkbDm4LrvkRMF31SqLtQqzqIV4,1523
-lsst/ctrl/bps/htcondor/htcondor_service.py,sha256
-lsst/ctrl/bps/htcondor/lssthtc.py,sha256=
+lsst/ctrl/bps/htcondor/htcondor_service.py,sha256=4_jm0lIZw3mYXyays1IWyg3pFwpODR_-g6CLIepXu7w,95330
+lsst/ctrl/bps/htcondor/lssthtc.py,sha256=pYxcA5jicuJs1RnhusSuMFrOU92Xy1fb-tleZ9m784Y,80485
 lsst/ctrl/bps/htcondor/provisioner.py,sha256=hPN8YJUtwNHQylw68kfskF1S2vCeQvztF8W0d_QKqqM,7851
-lsst/ctrl/bps/htcondor/version.py,sha256=
+lsst/ctrl/bps/htcondor/version.py,sha256=RaoK8ADNKLNvY-bXCQnWhCS6HwZ0pWKOeKdmgzS7d6Y,55
 lsst/ctrl/bps/htcondor/etc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lsst/ctrl/bps/htcondor/etc/htcondor_defaults.yaml,sha256=xDRts4vHKov2PE_JRh-0nF3jfuNJXtKBXZqveASp_iA,1422
-lsst_ctrl_bps_htcondor-29.2025.
-lsst_ctrl_bps_htcondor-29.2025.
-lsst_ctrl_bps_htcondor-29.2025.
-lsst_ctrl_bps_htcondor-29.2025.
-lsst_ctrl_bps_htcondor-29.2025.
-lsst_ctrl_bps_htcondor-29.2025.
-lsst_ctrl_bps_htcondor-29.2025.
-lsst_ctrl_bps_htcondor-29.2025.
-lsst_ctrl_bps_htcondor-29.2025.
+lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/licenses/COPYRIGHT,sha256=Lc6NoAEFQ65v_SmtS9NwfHTOuSUtC2Umbjv5zyowiQM,61
+lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/licenses/LICENSE,sha256=pRExkS03v0MQW-neNfIcaSL6aiAnoLxYgtZoFzQ6zkM,232
+lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/licenses/bsd_license.txt,sha256=7MIcv8QRX9guUtqPSBDMPz2SnZ5swI-xZMqm_VDSfxY,1606
+lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/licenses/gpl-v3.0.txt,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/METADATA,sha256=Ls4Ipu4B4iFbZ4rozJhOoBeuOOR6Jg6E3X31dWa2qU0,2139
+lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/top_level.txt,sha256=eUWiOuVVm9wwTrnAgiJT6tp6HQHXxIhj2QSZ7NYZH80,5
+lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+lsst_ctrl_bps_htcondor-29.2025.2200.dist-info/RECORD,,