lsst-ctrl-bps-htcondor 29.2025.2300__tar.gz → 29.2025.3000__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lsst_ctrl_bps_htcondor-29.2025.2300/python/lsst_ctrl_bps_htcondor.egg-info → lsst_ctrl_bps_htcondor-29.2025.3000}/PKG-INFO +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/doc/lsst.ctrl.bps.htcondor/CHANGES.rst +17 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/doc/lsst.ctrl.bps.htcondor/userguide.rst +70 -10
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/etc/htcondor_defaults.yaml +5 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/htcondor_service.py +72 -30
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/lssthtc.py +1 -0
- lsst_ctrl_bps_htcondor-29.2025.3000/python/lsst/ctrl/bps/htcondor/version.py +2 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000/python/lsst_ctrl_bps_htcondor.egg-info}/PKG-INFO +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/tests/test_htcondor_service.py +167 -6
- lsst_ctrl_bps_htcondor-29.2025.2300/python/lsst/ctrl/bps/htcondor/version.py +0 -2
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/COPYRIGHT +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/LICENSE +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/MANIFEST.in +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/README.rst +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/bsd_license.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/doc/lsst.ctrl.bps.htcondor/index.rst +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/gpl-v3.0.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/pyproject.toml +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/__init__.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/etc/__init__.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/final_post.sh +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/handlers.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/htcondor_config.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/provisioner.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst_ctrl_bps_htcondor.egg-info/SOURCES.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst_ctrl_bps_htcondor.egg-info/dependency_links.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst_ctrl_bps_htcondor.egg-info/requires.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst_ctrl_bps_htcondor.egg-info/top_level.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst_ctrl_bps_htcondor.egg-info/zip-safe +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/setup.cfg +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/tests/test_handlers.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/tests/test_lssthtc.py +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/tests/test_provisioner.py +0 -0

{lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/doc/lsst.ctrl.bps.htcondor/CHANGES.rst
@@ -1,3 +1,20 @@
+lsst-ctrl-bps-htcondor v29.1.0 (2025-06-13)
+===========================================
+
+New Features
+------------
+
+- Added capability for ``NOOP`` and ``EXTERNAL SUBDAG`` DAG nodes. (`DM-46294 <https://rubinobs.atlassian.net/browse/DM-46294>`_)
+- Added ability to add job release expression. (`DM-50614 <https://rubinobs.atlassian.net/browse/DM-50614>`_)
+- Added get_status method to ``HTCondorService`` class for quick checking of run status. (`DM-50619 <https://rubinobs.atlassian.net/browse/DM-50619>`_)
+
+
+Other Changes and Additions
+---------------------------
+
+- Explicitly define ``MaxIdle`` to workaround bug where HTCondor overrides config and environment variables when it is responsible for making DAGMan submit file (affects at least certain 24.0.x versions). (`DM-50212 <https://rubinobs.atlassian.net/browse/DM-50212>`_)
+
+
 lsst-ctrl-bps-htcondor v29.0.0 (2025-03-25)
 ===========================================
 

{lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/doc/lsst.ctrl.bps.htcondor/userguide.rst
@@ -109,6 +109,24 @@ environment variables can be used. Some examples:
    per cluster, per pipeline task)
 
 
+Overwriting Job Output/Error Files
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When jobs are automatically retried, HTCondor keeps the same job id.
+Any existing job's output and error files are overwritten with the new
+ones. This is not always ideal, for example, when successful parts of
+the failed job are skipped in the retry. The ``overwriteJobFiles`` value
+(True or False) in the submit yaml controls whether to overwrite job files
+on retry. When not overwriting job files, an extra counter corresponding
+to the retry number appears in the output and error filenames even for
+successful jobs (e.g., cluster1_96908.163.0.err, cluster1_96908.163.1.err,
+cluster1_96908.163.2.err). ``overwriteJobFiles`` defaults to True for
+payload jobs, but defaults to False for ``finalJob`` because the retries
+for it are always partial. ``overwriteJobFiles`` can be specified in
+``pipetask`` and ``cluster`` sections as well as the ``finalJob`` section
+or yaml root.
+
+
 Glideins
 ^^^^^^^^
 
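
For illustration, the per-label resolution of ``overwriteJobFiles`` can be
exercised directly against the plugin internals changed in this release. A
minimal sketch, assuming a hypothetical ``isr`` pipetask label
(``BpsConfig`` and ``_gather_label_values`` appear in the hunks and tests
further below):

    # Sketch only: a per-label overwriteJobFiles setting wins over the
    # yaml root value. The "isr" label is hypothetical.
    from lsst.ctrl.bps import BPS_SEARCH_ORDER, BpsConfig
    from lsst.ctrl.bps.htcondor import htcondor_service

    config = BpsConfig(
        {
            "overwriteJobFiles": True,  # yaml root: default for payload jobs
            "pipetask": {"isr": {"overwriteJobFiles": False}},  # per-label override
        },
        search_order=BPS_SEARCH_ORDER,
        defaults={},
        wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
    )
    values = htcondor_service._gather_label_values(config, "isr")
    print(values["overwriteJobFiles"])  # False: the pipetask section wins
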
@@ -181,7 +199,7 @@ DAG, this status can lag behind by a few minutes. Also, DAGMan tracks
 deletion of individual jobs as failures (no separate counts for
 deleted jobs). So the summary report flag column will show ``F`` when
 there are either failed or deleted jobs. If getting a detailed report
-(``bps report --id <
+(``bps report --id <ID>``), the plugin reads detailed job information
 from files. So, the detailed report can distinguish between failed and
 deleted jobs, and thus will show ``D`` in the flag column for a running
 workflow if there is a deleted job.
@@ -202,7 +220,7 @@ jobs are being held, use
 
 .. code-block:: bash
 
-   condor_q -hold <
+   condor_q -hold <ID>    # to see a specific job being held
    condor-q -hold <user>  # to see all held jobs owned by the user
 
 .. _htc-plugin-cancel:
@@ -231,18 +249,18 @@ See `bps restart`_.
 .. Describe any plugin specific aspects of restarting failed jobs below
    if any.
 
-A valid run
+A valid run ID is one of the following:
 
-* job
+* job ID, e.g., ``1234.0`` (using just the cluster ID, ``1234``, will also
   work),
-* global job
+* global job ID (e.g.,
   ``sdfrome002.sdf.slac.stanford.edu#165725.0#1699393748``),
 * run's submit directory (e.g.,
   ``/sdf/home/m/mxk/lsst/bps/submit/u/mxk/pipelines_check/20230713T135346Z``).
 
 .. note::
 
-   If you don't remember any of the run's
+   If you don't remember any of the run's ID you may try running
 
    .. code::
 
@@ -299,7 +317,7 @@ alongside the other payload jobs in the workflow that should automatically
 create and maintain glideins required for the payload jobs to run.
 
 If you enable automatic provisioning of resources, you will see the status of
-the provisioning job in the output of the ``bps report --id <
+the provisioning job in the output of the ``bps report --id <ID>`` command.
 Look for the line starting with "Provisioning job status". For example
 
 .. code-block:: bash
@@ -446,7 +464,7 @@ If any of your jobs are being held, it will display something similar to::
 
 The job that is in the hold state can be released from it with
 `condor_release`_ providing the issue that made HTCondor put it in this state
-has been resolved. For example, if your job with
+has been resolved. For example, if your job with ID 1234.0 was placed in the
 hold state because during the execution it exceeded 2048 MiB you requested for
 it during the submission, you can double the amount of memory it should request with
 
@@ -538,7 +556,49 @@ Troubleshooting
 Where is stdout/stderr from pipeline tasks?
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-For now, stdout/stderr can be found in files in the run submit directory
+For now, stdout/stderr can be found in files in the run submit directory
+after the job is done. Python logging goes to stderr, so the majority
+of the pipetask output will be in the \*.err file. One exception is
+``finalJob``, which does print some information to stdout (the \*.out file).
+
+While the job is running, the owner of the job can use the ``condor_tail``
+command to peek at the stdout/stderr of a job. ``bps`` uses the ID for
+the entire workflow. But for the HTCondor command ``condor_tail``
+you will need the ID for the individual job. Run the following command
+and look for the ID for the job (``undefined`` values are normal and usually
+correspond to the DAGMan jobs).
+
+.. code-block::
+
+   condor_q -run -nobatch -af:hj bps_job_name bps_run
+
+Once you have the HTCondor ID of the particular job whose output you want
+to peek at, run this command:
+
+.. code-block::
+
+   condor_tail -stderr -f <ID>
+
+If you want to see the stdout instead, leave off the ``-stderr``.
+If you need to see more of the contents, specify ``-maxbytes <numbytes>``
+(the default is 1024 bytes).
+
+I need to look around on the compute node where my job is running.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If using glideins, you might be able to just ``ssh`` to the compute
+node from the submit node. First, you need to find out on which node the
+job is running.
+
+.. code-block::
+
+   condor_q -run -nobatch -af:hj RemoteHost bps_job_name bps_run
+
+Alternatively, HTCondor has the command ``condor_ssh_to_job``, where you
+just need the job ID. This is not the workflow ID (the ID that ``bps``
+commands use), but an individual job ID. The command above also prints
+the job IDs.
+
 
 Why did my submission fail?
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
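
While the job is running, the same lookup can also be scripted with the
HTCondor Python bindings instead of ``condor_q``; a minimal sketch (not part
of this package):

    # Sketch only: Python-bindings analogue of
    # "condor_q -run -nobatch -af:hj RemoteHost bps_job_name bps_run".
    import htcondor

    schedd = htcondor.Schedd()
    ads = schedd.query(
        constraint="JobStatus == 2",  # 2 == RUNNING
        projection=["ClusterId", "ProcId", "RemoteHost", "bps_job_name", "bps_run"],
    )
    for ad in ads:
        # DAGMan jobs usually lack the bps_* attributes; condor_q shows them
        # as "undefined".
        print(f"{ad['ClusterId']}.{ad['ProcId']}", ad.get("RemoteHost"), ad.get("bps_job_name"))
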
@@ -556,7 +616,7 @@ will continue normally until the existing gliedins expire. As a result,
 payload jobs may get stuck in the job queue if the glideins were not created
 or expired before the execution of the workflow could be completed.
 
-Firstly, use ``bps report --id <run
+Firstly, use ``bps report --id <run ID>`` to display the run report and look
 for the line
 
 .. code-block::

{lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/htcondor_service.py
@@ -706,14 +706,25 @@ def _create_job(subdir_template, cached_values, generic_workflow, gwjob, out_prefix):
     htc_job_cmds.update(_translate_job_cmds(cached_values, generic_workflow, gwjob))
 
     # job stdout, stderr, htcondor user log.
-    for key in ("output", "error"
-
+    for key in ("output", "error"):
+        if cached_values["overwriteJobFiles"]:
+            htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key[:3]}"
+        else:
+            htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).$$([NumJobStarts ?: 0]).{key[:3]}"
         _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
 
+    key = "log"
+    htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key[:3]}"
+    _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
+
     htc_job_cmds.update(
         _handle_job_inputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
     )
 
+    htc_job_cmds.update(
+        _handle_job_outputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
+    )
+
     # Add the job cmds dict to the job object.
     htc_job.add_job_cmds(htc_job_cmds)
 
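
To make the two naming branches in ``_create_job`` concrete: with
``overwriteJobFiles`` every retry reuses the same output/error files, while
the ``$$([NumJobStarts ?: 0])`` form bakes the attempt number into the name.
An illustrative sketch (not code from the package), reproducing the
``cluster1_96908.163.N.err`` example from the user guide:

    # Illustrative only: filenames produced by the two schemes above.
    def job_file(job_name: str, cluster: int, num_job_starts: int, overwrite: bool) -> str:
        if overwrite:
            return f"{job_name}.{cluster}.err"  # retries overwrite this file
        return f"{job_name}.{cluster}.{num_job_starts}.err"  # one file per attempt

    for attempt in range(3):
        print(job_file("cluster1_96908", 163, attempt, overwrite=False))
    # cluster1_96908.163.0.err
    # cluster1_96908.163.1.err
    # cluster1_96908.163.2.err
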
@@ -946,13 +957,7 @@ def _replace_file_vars(use_shared, arguments, workflow, gwjob):
             # Have shared filesystems and jobs can share file.
             uri = gwfile.src_uri
         else:
-
-            # Temporary fix until have job wrapper that pulls files
-            # within job.
-            if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
-                uri = "butler.yaml"
-            else:
-                uri = os.path.basename(gwfile.src_uri)
+            uri = os.path.basename(gwfile.src_uri)
     else:  # Using push transfer
         uri = os.path.basename(gwfile.src_uri)
     arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)
@@ -1001,7 +1006,9 @@ def _replace_cmd_vars(arguments, gwjob):
     return arguments
 
 
-def _handle_job_inputs(
+def _handle_job_inputs(
+    generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str
+) -> dict[str, str]:
     """Add job input files from generic workflow to job.
 
     Parameters
@@ -1020,7 +1027,6 @@ def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str) -> dict[str, str]:
     htc_commands : `dict` [`str`, `str`]
         HTCondor commands for the job submission script.
     """
-    htc_commands = {}
     inputs = []
     for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
         _LOG.debug("src_uri=%s", gwf_file.src_uri)
@@ -1030,38 +1036,68 @@ def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str) -> dict[str, str]:
         # Note if use_shared and job_shared, don't need to transfer file.
 
         if not use_shared:  # Copy file using push to job
-            inputs.append(str(uri
+            inputs.append(str(uri))
         elif not gwf_file.job_shared:  # Jobs require own copy
             # if using shared filesystem, but still need copy in job. Use
             # HTCondor's curl plugin for a local copy.
-
-            # Execution butler is represented as a directory which the
-            # curl plugin does not handle. Taking advantage of inside
-            # knowledge for temporary fix until have job wrapper that pulls
-            # files within job.
-            if gwf_file.name == "butlerConfig":
-                # The execution butler directory doesn't normally exist until
-                # the submit phase so checking for suffix instead of using
-                # is_dir(). If other non-yaml file exists they would have a
-                # different gwf_file.name.
-                if uri.suffix == ".yaml":  # Single file, so just copy.
-                    inputs.append(f"file://{uri}")
-                else:
-                    inputs.append(f"file://{uri / 'butler.yaml'}")
-                    inputs.append(f"file://{uri / 'gen3.sqlite3'}")
-            elif uri.is_dir():
+            if uri.is_dir():
                 raise RuntimeError(
                     f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}"
                 )
-
-            inputs.append(f"file://{uri}")
+            inputs.append(f"file://{uri}")
 
+    htc_commands = {}
     if inputs:
         htc_commands["transfer_input_files"] = ",".join(inputs)
         _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
     return htc_commands
 
 
+def _handle_job_outputs(
+    generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str
+) -> dict[str, str]:
+    """Add job output files from generic workflow to the job if any.
+
+    Parameters
+    ----------
+    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
+        The generic workflow (e.g., has executable name and arguments).
+    job_name : `str`
+        Unique name for the job.
+    use_shared : `bool`
+        Whether job has access to files via shared filesystem.
+    out_prefix : `str`
+        The root directory into which all WMS-specific files are written.
+
+    Returns
+    -------
+    htc_commands : `dict` [`str`, `str`]
+        HTCondor commands for the job submission script.
+    """
+    outputs = []
+    output_remaps = []
+    for gwf_file in generic_workflow.get_job_outputs(job_name, data=True, transfer_only=True):
+        _LOG.debug("src_uri=%s", gwf_file.src_uri)
+
+        uri = Path(gwf_file.src_uri)
+        if not use_shared:
+            outputs.append(uri.name)
+            output_remaps.append(f"{uri.name}={str(uri)}")
+
+    # Set to an empty string to disable and only update if there are output
+    # files to transfer. Otherwise, HTCondor will transfer back all files in
+    # the job's temporary working directory that have been modified or created
+    # by the job.
+    htc_commands = {"transfer_output_files": '""'}
+    if outputs:
+        htc_commands["transfer_output_files"] = ",".join(outputs)
+        _LOG.debug("transfer_output_files=%s", htc_commands["transfer_output_files"])
+
+        htc_commands["transfer_output_remaps"] = f'"{";".join(output_remaps)}"'
+        _LOG.debug("transfer_output_remaps=%s", htc_commands["transfer_output_remaps"])
+    return htc_commands
+
+
 def _get_status_from_id(
     wms_workflow_id: str, hist: float, schedds: dict[str, htcondor.Schedd]
 ) -> tuple[WmsStates, str]:
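
A note on the ``transfer_output_files`` default in ``_handle_job_outputs``:
when the key is left unset, HTCondor transfers back every file the job
created or modified in its sandbox, so the helper presets it to a quoted
empty string to disable that. A standalone sketch of the decision logic (not
the helper itself):

    # Sketch only: mirrors how _handle_job_outputs builds its commands dict
    # from a basename -> destination-path mapping derived from each src_uri.
    def sketch_output_commands(outputs: dict[str, str]) -> dict[str, str]:
        cmds = {"transfer_output_files": '""'}  # explicit "transfer nothing"
        if outputs:
            cmds["transfer_output_files"] = ",".join(outputs)
            cmds["transfer_output_remaps"] = '"{}"'.format(
                ";".join(f"{name}={dest}" for name, dest in outputs.items())
            )
        return cmds

    print(sketch_output_commands({}))
    # {'transfer_output_files': '""'}
    print(sketch_output_commands({"output.txt": "/path/to/output.txt"}))
    # {'transfer_output_files': 'output.txt',
    #  'transfer_output_remaps': '"output.txt=/path/to/output.txt"'}
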
@@ -2342,6 +2378,12 @@ def _gather_label_values(config: BpsConfig, label: str) -> dict[str, Any]:
     if found:
         values["releaseExpr"] = value
 
+    found, value = config.search("overwriteJobFiles", opt=search_opts)
+    if found:
+        values["overwriteJobFiles"] = value
+    else:
+        values["overwriteJobFiles"] = True
+
     if profile_key and profile_key in config:
         for subkey, val in config[profile_key].items():
             if subkey.startswith("+"):

{lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/tests/test_htcondor_service.py
@@ -40,13 +40,14 @@ from lsst.ctrl.bps import (
     BPS_SEARCH_ORDER,
     BpsConfig,
     GenericWorkflowExec,
+    GenericWorkflowFile,
     GenericWorkflowJob,
     WmsSpecificInfo,
     WmsStates,
 )
 from lsst.ctrl.bps.htcondor import htcondor_service, lssthtc
 from lsst.ctrl.bps.htcondor.htcondor_config import HTC_DEFAULTS_URI
-from lsst.ctrl.bps.tests.gw_test_utils import make_3_label_workflow_groups_sort
+from lsst.ctrl.bps.tests.gw_test_utils import make_3_label_workflow, make_3_label_workflow_groups_sort
 from lsst.utils.tests import temporaryDirectory
 
 logger = logging.getLogger("lsst.ctrl.bps.htcondor")
@@ -1233,7 +1234,11 @@ class GatherLabelValuesTestCase(unittest.TestCase):
         config = BpsConfig(
             {
                 "cluster": {
-                    "label1": {
+                    "label1": {
+                        "releaseExpr": "cluster_val",
+                        "overwriteJobFiles": False,
+                        "profile": {"condor": {"prof_val1": 3}},
+                    }
                 },
                 "pipetask": {"label1": {"releaseExpr": "pipetask_val"}},
             },
@@ -1242,14 +1247,26 @@ class GatherLabelValuesTestCase(unittest.TestCase):
             wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
         )
         results = htcondor_service._gather_label_values(config, label)
-        self.assertEqual(
+        self.assertEqual(
+            results,
+            {
+                "attrs": {},
+                "profile": {"prof_val1": 3},
+                "releaseExpr": "cluster_val",
+                "overwriteJobFiles": False,
+            },
+        )
 
     def testPipetaskLabel(self):
         label = "label1"
         config = BpsConfig(
             {
                 "pipetask": {
-                    "label1": {
+                    "label1": {
+                        "releaseExpr": "pipetask_val",
+                        "overwriteJobFiles": False,
+                        "profile": {"condor": {"prof_val1": 3}},
+                    }
                 }
             },
             search_order=BPS_SEARCH_ORDER,
@@ -1257,7 +1274,15 @@ class GatherLabelValuesTestCase(unittest.TestCase):
             wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
         )
         results = htcondor_service._gather_label_values(config, label)
-        self.assertEqual(
+        self.assertEqual(
+            results,
+            {
+                "attrs": {},
+                "profile": {"prof_val1": 3},
+                "releaseExpr": "pipetask_val",
+                "overwriteJobFiles": False,
+            },
+        )
 
     def testNoSection(self):
         label = "notThere"
@@ -1268,7 +1293,31 @@ class GatherLabelValuesTestCase(unittest.TestCase):
             wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
         )
         results = htcondor_service._gather_label_values(config, label)
-        self.assertEqual(results, {"attrs": {}, "profile": {}})
+        self.assertEqual(results, {"attrs": {}, "profile": {}, "overwriteJobFiles": True})
+
+    def testNoOverwriteSpecified(self):
+        label = "notthere"
+        config = BpsConfig(
+            {},
+            search_order=BPS_SEARCH_ORDER,
+            defaults={},
+            wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
+        )
+        results = htcondor_service._gather_label_values(config, label)
+        self.assertEqual(results, {"attrs": {}, "profile": {}, "overwriteJobFiles": True})
+
+    def testFinalJob(self):
+        label = "finalJob"
+        config = BpsConfig(
+            {"finalJob": {"profile": {"condor": {"prof_val2": 6, "+attr_val1": 5}}}},
+            search_order=BPS_SEARCH_ORDER,
+            defaults=BPS_DEFAULTS,
+            wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
+        )
+        results = htcondor_service._gather_label_values(config, label)
+        self.assertEqual(
+            results, {"attrs": {"attr_val1": 5}, "profile": {"prof_val2": 6}, "overwriteJobFiles": False}
+        )
 
 
 class CreateCheckJobTestCase(unittest.TestCase):
@@ -1418,5 +1467,117 @@ class GetStatusFromPathTestCase(unittest.TestCase):
         self.assertEqual(message, "")
 
 
+class HandleJobOutputsTestCase(unittest.TestCase):
+    """Test _handle_job_outputs function."""
+
+    def setUp(self):
+        self.job_name = "test_job"
+        self.out_prefix = "/test/prefix"
+
+    def tearDown(self):
+        pass
+
+    def testNoOutputsSharedFilesystem(self):
+        """Test with shared filesystem and no outputs."""
+        mock_workflow = unittest.mock.Mock()
+        mock_workflow.get_job_outputs.return_value = []
+
+        result = htcondor_service._handle_job_outputs(mock_workflow, self.job_name, True, self.out_prefix)
+
+        self.assertEqual(result, {"transfer_output_files": '""'})
+
+    def testWithOutputsSharedFilesystem(self):
+        """Test with shared filesystem and outputs present (still empty)."""
+        mock_workflow = unittest.mock.Mock()
+        mock_workflow.get_job_outputs.return_value = [
+            GenericWorkflowFile(name="output.txt", src_uri="/path/to/output.txt")
+        ]
+
+        result = htcondor_service._handle_job_outputs(mock_workflow, self.job_name, True, self.out_prefix)
+
+        self.assertEqual(result, {"transfer_output_files": '""'})
+
+    def testNoOutputsNoSharedFilesystem(self):
+        """Test without shared filesystem and no outputs."""
+        mock_workflow = unittest.mock.Mock()
+        mock_workflow.get_job_outputs.return_value = []
+
+        result = htcondor_service._handle_job_outputs(mock_workflow, self.job_name, False, self.out_prefix)
+
+        self.assertEqual(result, {"transfer_output_files": '""'})
+
+    def testWithAnOutputNoSharedFilesystem(self):
+        """Test without shared filesystem and single output file."""
+        mock_workflow = unittest.mock.Mock()
+        mock_workflow.get_job_outputs.return_value = [
+            GenericWorkflowFile(name="output.txt", src_uri="/path/to/output.txt")
+        ]
+
+        result = htcondor_service._handle_job_outputs(mock_workflow, self.job_name, False, self.out_prefix)
+
+        expected = {
+            "transfer_output_files": "output.txt",
+            "transfer_output_remaps": '"output.txt=/path/to/output.txt"',
+        }
+        self.assertEqual(result, expected)
+
+    def testWithOutputsNoSharedFilesystem(self):
+        """Test without shared filesystem and multiple output files."""
+        mock_workflow = unittest.mock.Mock()
+        mock_workflow.get_job_outputs.return_value = [
+            GenericWorkflowFile(name="output1.txt", src_uri="/path/output1.txt"),
+            GenericWorkflowFile(name="output2.txt", src_uri="/another/path/output2.txt"),
+        ]
+
+        result = htcondor_service._handle_job_outputs(mock_workflow, self.job_name, False, self.out_prefix)
+
+        expected = {
+            "transfer_output_files": "output1.txt,output2.txt",
+            "transfer_output_remaps": '"output1.txt=/path/output1.txt;output2.txt=/another/path/output2.txt"',
+        }
+        self.assertEqual(result, expected)
+
+    @unittest.mock.patch("lsst.ctrl.bps.htcondor.htcondor_service._LOG")
+    def testLogging(self, mock_log):
+        mock_workflow = unittest.mock.Mock()
+        mock_workflow.get_job_outputs.return_value = [
+            GenericWorkflowFile(name="output.txt", src_uri="/path/to/output.txt")
+        ]
+
+        htcondor_service._handle_job_outputs(mock_workflow, self.job_name, False, self.out_prefix)
+
+        self.assertTrue(mock_log.debug.called)
+        debug_calls = mock_log.debug.call_args_list
+        self.assertTrue(any("src_uri=" in str(call) for call in debug_calls))
+        self.assertTrue(any("transfer_output_files=" in str(call) for call in debug_calls))
+        self.assertTrue(any("transfer_output_remaps=" in str(call) for call in debug_calls))
+
+
+class CreateJobTestCase(unittest.TestCase):
+    """Test _create_job function."""
+
+    def setUp(self):
+        self.generic_workflow = make_3_label_workflow("test1", True)
+
+    def testNoOverwrite(self):
+        template = "{label}/{tract}/{patch}/{band}/{subfilter}/{physical_filter}/{visit}/{exposure}"
+        cached_values = {
+            "bpsUseShared": True,
+            "overwriteJobFiles": False,
+            "memoryLimit": 491520,
+            "profile": {},
+            "attrs": {},
+        }
+        gwjob = self.generic_workflow.get_final()
+        out_prefix = "submit"
+        htc_job = htcondor_service._create_job(
+            template, cached_values, self.generic_workflow, gwjob, out_prefix
+        )
+        self.assertEqual(htc_job.name, gwjob.name)
+        self.assertEqual(htc_job.label, gwjob.label)
+        self.assertIn("NumJobStarts", htc_job.cmds["output"])
+        self.assertIn("NumJobStarts", htc_job.cmds["error"])
+
+
 if __name__ == "__main__":
     unittest.main()