lsst-ctrl-bps-htcondor 29.2025.2300.tar.gz → 29.2025.3000.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {lsst_ctrl_bps_htcondor-29.2025.2300/python/lsst_ctrl_bps_htcondor.egg-info → lsst_ctrl_bps_htcondor-29.2025.3000}/PKG-INFO +1 -1
  2. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/doc/lsst.ctrl.bps.htcondor/CHANGES.rst +17 -0
  3. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/doc/lsst.ctrl.bps.htcondor/userguide.rst +70 -10
  4. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/etc/htcondor_defaults.yaml +5 -0
  5. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/htcondor_service.py +72 -30
  6. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/lssthtc.py +1 -0
  7. lsst_ctrl_bps_htcondor-29.2025.3000/python/lsst/ctrl/bps/htcondor/version.py +2 -0
  8. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000/python/lsst_ctrl_bps_htcondor.egg-info}/PKG-INFO +1 -1
  9. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/tests/test_htcondor_service.py +167 -6
  10. lsst_ctrl_bps_htcondor-29.2025.2300/python/lsst/ctrl/bps/htcondor/version.py +0 -2
  11. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/COPYRIGHT +0 -0
  12. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/LICENSE +0 -0
  13. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/MANIFEST.in +0 -0
  14. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/README.rst +0 -0
  15. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/bsd_license.txt +0 -0
  16. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/doc/lsst.ctrl.bps.htcondor/index.rst +0 -0
  17. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/gpl-v3.0.txt +0 -0
  18. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/pyproject.toml +0 -0
  19. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/__init__.py +0 -0
  20. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/etc/__init__.py +0 -0
  21. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/final_post.sh +0 -0
  22. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/handlers.py +0 -0
  23. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/htcondor_config.py +0 -0
  24. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst/ctrl/bps/htcondor/provisioner.py +0 -0
  25. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst_ctrl_bps_htcondor.egg-info/SOURCES.txt +0 -0
  26. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst_ctrl_bps_htcondor.egg-info/dependency_links.txt +0 -0
  27. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst_ctrl_bps_htcondor.egg-info/requires.txt +0 -0
  28. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst_ctrl_bps_htcondor.egg-info/top_level.txt +0 -0
  29. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/python/lsst_ctrl_bps_htcondor.egg-info/zip-safe +0 -0
  30. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/setup.cfg +0 -0
  31. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/tests/test_handlers.py +0 -0
  32. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/tests/test_lssthtc.py +0 -0
  33. {lsst_ctrl_bps_htcondor-29.2025.2300 → lsst_ctrl_bps_htcondor-29.2025.3000}/tests/test_provisioner.py +0 -0

1. PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lsst-ctrl-bps-htcondor
- Version: 29.2025.2300
+ Version: 29.2025.3000
  Summary: HTCondor plugin for lsst-ctrl-bps.
  Author-email: Rubin Observatory Data Management <dm-admin@lists.lsst.org>
  License: BSD 3-Clause License

2. doc/lsst.ctrl.bps.htcondor/CHANGES.rst

@@ -1,3 +1,20 @@
+ lsst-ctrl-bps-htcondor v29.1.0 (2025-06-13)
+ ===========================================
+
+ New Features
+ ------------
+
+ - Added capability for ``NOOP`` and ``EXTERNAL SUBDAG`` DAG nodes. (`DM-46294 <https://rubinobs.atlassian.net/browse/DM-46294>`_)
+ - Added ability to add job release expression. (`DM-50614 <https://rubinobs.atlassian.net/browse/DM-50614>`_)
+ - Added get_status method to ``HTCondorService`` class for quick checking of run status. (`DM-50619 <https://rubinobs.atlassian.net/browse/DM-50619>`_)
+
+
+ Other Changes and Additions
+ ---------------------------
+
+ - Explicitly define ``MaxIdle`` to workaround bug where HTCondor overrides config and environment variables when it is responsible for making DAGMan submit file (affects at least certain 24.0.x versions). (`DM-50212 <https://rubinobs.atlassian.net/browse/DM-50212>`_)
+
+
  lsst-ctrl-bps-htcondor v29.0.0 (2025-03-25)
  ===========================================
 

3. doc/lsst.ctrl.bps.htcondor/userguide.rst

@@ -109,6 +109,24 @@ environment variables can be used. Some examples:
  per cluster, per pipeline task)
 
 
+ Overwriting Job Output/Error Files
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ When jobs are automatically retried, HTCondor keeps the same job id.
+ Any existing job's output and error files are overwritten with the new
+ ones. This is not always ideal, for example, when successful parts of
+ the failed job is skipped in the retry. The ``overwriteJobFiles`` value
+ (True or False) in the submit yaml controls whether to overwrite job files
+ on retry. When not overwriting job files an extra counter corresponding
+ to the retry number appears in the output and error filenames even for
+ successful jobs (e.g., cluster1_96908.163.0.err, cluster1_96908.163.1.err,
+ cluster1_96908.163.2.err). ``overwriteJobFiles`` defaults to True for
+ payload jobs, but defaults to False for ``finalJob`` because the retries
+ for it are always partial. ``overwriteJobFiles`` can be specified in
+ ``pipetask`` and ``cluster`` sections as well as the ``finalJob`` section
+ or yaml root.
+
+
  Glideins
  ^^^^^^^^
 
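
Note: as a quick illustration of the last sentence of the new section above, here is a minimal sketch of where the setting can live, written as the Python mapping equivalent of a submit YAML. The label names ("isr", "cluster1") are hypothetical and not taken from the package.

    # Hypothetical submit configuration; in practice these keys live in the
    # BPS submit YAML rather than a Python dict.
    submit_config = {
        "overwriteJobFiles": True,  # yaml root: applies to payload jobs by default
        "pipetask": {
            "isr": {"overwriteJobFiles": False},  # per-pipetask override
        },
        "cluster": {
            "cluster1": {"overwriteJobFiles": False},  # per-cluster override
        },
        "finalJob": {"overwriteJobFiles": False},  # matches the shipped default for finalJob
    }
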

@@ -181,7 +199,7 @@ DAG, this status can lag behind by a few minutes. Also, DAGMan tracks
  deletion of individual jobs as failures (no separate counts for
  deleted jobs). So the summary report flag column will show ``F`` when
  there are either failed or deleted jobs. If getting a detailed report
- (``bps report --id <id>``), the plugin reads detailed job information
+ (``bps report --id <ID>``), the plugin reads detailed job information
  from files. So, the detailed report can distinguish between failed and
  deleted jobs, and thus will show ``D`` in the flag column for a running
  workflow if there is a deleted job.

@@ -202,7 +220,7 @@ jobs are being held, use
 
  .. code-block:: bash
 
- condor_q -hold <id> # to see a specific job being held
+ condor_q -hold <ID> # to see a specific job being held
  condor-q -hold <user> # to see all held jobs owned by the user
 
  .. _htc-plugin-cancel:

@@ -231,18 +249,18 @@ See `bps restart`_.
  .. Describe any plugin specific aspects of restarting failed jobs below
  if any.
 
- A valid run id is one of the following:
+ A valid run ID is one of the following:
 
- * job id, e.g., ``1234.0`` (using just the cluster id, ``1234``, will also
+ * job ID, e.g., ``1234.0`` (using just the cluster ID, ``1234``, will also
  work),
- * global job id (e.g.,
+ * global job ID (e.g.,
  ``sdfrome002.sdf.slac.stanford.edu#165725.0#1699393748``),
  * run's submit directory (e.g.,
  ``/sdf/home/m/mxk/lsst/bps/submit/u/mxk/pipelines_check/20230713T135346Z``).
 
  .. note::
 
- If you don't remember any of the run's id you may try running
+ If you don't remember any of the run's ID you may try running
 
  .. code::
 

@@ -299,7 +317,7 @@ alongside the other payload jobs in the workflow that should automatically
  create and maintain glideins required for the payload jobs to run.
 
  If you enable automatic provisioning of resources, you will see the status of
- the provisioning job in the output of the ``bps report --id <id>`` command.
+ the provisioning job in the output of the ``bps report --id <ID>`` command.
  Look for the line starting with "Provisioning job status". For example
 
  .. code-block:: bash

@@ -446,7 +464,7 @@ If any of your jobs are being held, it will display something similar to::
 
  The job that is in the hold state can be released from it with
  `condor_release`_ providing the issue that made HTCondor put it in this state
- has been resolved. For example, if your job with id 1234.0 was placed in the
+ has been resolved. For example, if your job with ID 1234.0 was placed in the
  hold state because during the execution it exceeded 2048 MiB you requested for
  it during the submission, you can double the amount of memory it should request with
 

@@ -538,7 +556,49 @@ Troubleshooting
  Where is stdout/stderr from pipeline tasks?
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
- For now, stdout/stderr can be found in files in the run submit directory.
+ For now, stdout/stderr can be found in files in the run submit directory
+ after the job is done. Python logging goes to stderr so the majority
+ of the pipetask output will be in the \*.err file. One exception is
+ ``finalJob`` which does print some information to stdout (\*.out file)
+
+ While the job is running, the owner of the job can use ``condor_tail``
+ command to peek at the stdout/stderr of a job. ``bps`` uses the ID for
+ the entire workflow. But for the HTCondor command ``condor_tail``
+ you will need the ID for the individual job. Run the following command
+ and look for the ID for the job (undefined's are normal and normally
+ correspond to the DAGMan jobs).
+
+ .. code-block::
+
+ condor_q -run -nobatch -af:hj bps_job_name bps_run
+
+ Once you have the HTCondor ID for the particular job you want to peek
+ at the output, run this command:
+
+ .. code-block::
+
+ condor_tail -stderr -f <ID>
+
+ If you want to instead see the stdout, leave off the ``-stderr``.
+ If you need to see more of the contents specify ``-maxbytes <numbytes>``
+ (defaults to 1024 bytes).
+
+ I need to look around on the compute node where my job is running.
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ If using glideins, you might be able to just ``ssh`` to the compute
+ node from the submit node. First, need to find out on which node the
+ job is running.
+
+ .. code-block::
+
+ condor_q -run -nobatch -af:hj RemoteHost bps_job_name bps_run
+
+ Alternatively, HTCondor has the command ``condor_ssh_to_job`` where you
+ just need the job ID. This is not the workflow ID (the ID that ``bps``
+ commands use), but an individual job ID. The command above also prints
+ the job IDs.
+
 
  Why did my submission fail?
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -556,7 +616,7 @@ will continue normally until the existing gliedins expire. As a result,
  payload jobs may get stuck in the job queue if the glideins were not created
  or expired before the execution of the workflow could be completed.
 
- Firstly, use ``bps report --id <run id>`` to display the run report and look
+ Firstly, use ``bps report --id <run ID>`` to display the run report and look
  for the line
 
  .. code-block::

4. python/lsst/ctrl/bps/htcondor/etc/htcondor_defaults.yaml

@@ -38,3 +38,8 @@ provisioning:
 
  # By default, disable automatic provisioning of resources.
  provisionResources: false
+
+ # Whether automatic job retries overwrite stdout/stderr of previous attempt.
+ overwriteJobFiles: true
+ finalJob:
+   overwriteJobFiles: false

5. python/lsst/ctrl/bps/htcondor/htcondor_service.py

@@ -706,14 +706,25 @@ def _create_job(subdir_template, cached_values, generic_workflow, gwjob, out_prefix
      htc_job_cmds.update(_translate_job_cmds(cached_values, generic_workflow, gwjob))
 
      # job stdout, stderr, htcondor user log.
-     for key in ("output", "error", "log"):
-         htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key[:3]}"
+     for key in ("output", "error"):
+         if cached_values["overwriteJobFiles"]:
+             htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key[:3]}"
+         else:
+             htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).$$([NumJobStarts ?: 0]).{key[:3]}"
          _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
 
+     key = "log"
+     htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key[:3]}"
+     _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
+
      htc_job_cmds.update(
          _handle_job_inputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
      )
 
+     htc_job_cmds.update(
+         _handle_job_outputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
+     )
+
      # Add the job cmds dict to the job object.
      htc_job.add_job_cmds(htc_job_cmds)
 
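
Note: the $$([NumJobStarts ?: 0]) piece is an HTCondor deferred-evaluation macro; it is expanded when the job is placed, and the ClassAd ?: operator falls back to 0 while NumJobStarts is still undefined, which is what produces the per-retry counter described in the user guide section above. A rough sketch (not code from the package) of the commands this branch yields for a hypothetical job when overwriteJobFiles is False:

    # Sketch only: htc_job_cmds entries for a hypothetical payload job.
    gwjob_name = "cluster1_96908"  # hypothetical job name
    htc_job_cmds = {
        # Expanded by HTCondor at job start; retries produce *.0.err, *.1.err, ...
        "output": f"{gwjob_name}.$(Cluster).$$([NumJobStarts ?: 0]).out",
        "error": f"{gwjob_name}.$(Cluster).$$([NumJobStarts ?: 0]).err",
        # The HTCondor user log keeps a single file across retries.
        "log": f"{gwjob_name}.$(Cluster).log",
    }
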

@@ -946,13 +957,7 @@ def _replace_file_vars(use_shared, arguments, workflow, gwjob):
                  # Have shared filesystems and jobs can share file.
                  uri = gwfile.src_uri
              else:
-                 # Taking advantage of inside knowledge. Not future-proof.
-                 # Temporary fix until have job wrapper that pulls files
-                 # within job.
-                 if gwfile.name == "butlerConfig" and Path(gwfile.src_uri).suffix != ".yaml":
-                     uri = "butler.yaml"
-                 else:
-                     uri = os.path.basename(gwfile.src_uri)
+                 uri = os.path.basename(gwfile.src_uri)
          else: # Using push transfer
              uri = os.path.basename(gwfile.src_uri)
          arguments = arguments.replace(f"<FILE:{gwfile.name}>", uri)

@@ -1001,7 +1006,9 @@ def _replace_cmd_vars(arguments, gwjob):
      return arguments
 
 
- def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str):
+ def _handle_job_inputs(
+     generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str
+ ) -> dict[str, str]:
      """Add job input files from generic workflow to job.
 
      Parameters

@@ -1020,7 +1027,6 @@ def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_sha
      htc_commands : `dict` [`str`, `str`]
          HTCondor commands for the job submission script.
      """
-     htc_commands = {}
      inputs = []
      for gwf_file in generic_workflow.get_job_inputs(job_name, data=True, transfer_only=True):
          _LOG.debug("src_uri=%s", gwf_file.src_uri)

@@ -1030,38 +1036,68 @@ def _handle_job_inputs(generic_workflow: GenericWorkflow, job_name: str, use_sha
          # Note if use_shared and job_shared, don't need to transfer file.
 
          if not use_shared: # Copy file using push to job
-             inputs.append(str(uri.relative_to(out_prefix)))
+             inputs.append(str(uri))
          elif not gwf_file.job_shared: # Jobs require own copy
              # if using shared filesystem, but still need copy in job. Use
              # HTCondor's curl plugin for a local copy.
-
-             # Execution butler is represented as a directory which the
-             # curl plugin does not handle. Taking advantage of inside
-             # knowledge for temporary fix until have job wrapper that pulls
-             # files within job.
-             if gwf_file.name == "butlerConfig":
-                 # The execution butler directory doesn't normally exist until
-                 # the submit phase so checking for suffix instead of using
-                 # is_dir(). If other non-yaml file exists they would have a
-                 # different gwf_file.name.
-                 if uri.suffix == ".yaml": # Single file, so just copy.
-                     inputs.append(f"file://{uri}")
-                 else:
-                     inputs.append(f"file://{uri / 'butler.yaml'}")
-                     inputs.append(f"file://{uri / 'gen3.sqlite3'}")
-             elif uri.is_dir():
+             if uri.is_dir():
                  raise RuntimeError(
                      f"HTCondor plugin cannot transfer directories locally within job {gwf_file.src_uri}"
                  )
-             else:
-                 inputs.append(f"file://{uri}")
+             inputs.append(f"file://{uri}")
 
+     htc_commands = {}
      if inputs:
          htc_commands["transfer_input_files"] = ",".join(inputs)
          _LOG.debug("transfer_input_files=%s", htc_commands["transfer_input_files"])
      return htc_commands
 
 
+ def _handle_job_outputs(
+     generic_workflow: GenericWorkflow, job_name: str, use_shared: bool, out_prefix: str
+ ) -> dict[str, str]:
+     """Add job output files from generic workflow to the job if any.
+
+     Parameters
+     ----------
+     generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
+         The generic workflow (e.g., has executable name and arguments).
+     job_name : `str`
+         Unique name for the job.
+     use_shared : `bool`
+         Whether job has access to files via shared filesystem.
+     out_prefix : `str`
+         The root directory into which all WMS-specific files are written.
+
+     Returns
+     -------
+     htc_commands : `dict` [`str`, `str`]
+         HTCondor commands for the job submission script.
+     """
+     outputs = []
+     output_remaps = []
+     for gwf_file in generic_workflow.get_job_outputs(job_name, data=True, transfer_only=True):
+         _LOG.debug("src_uri=%s", gwf_file.src_uri)
+
+         uri = Path(gwf_file.src_uri)
+         if not use_shared:
+             outputs.append(uri.name)
+             output_remaps.append(f"{uri.name}={str(uri)}")
+
+     # Set to an empty string to disable and only update if there are output
+     # files to transfer. Otherwise, HTCondor will transfer back all files in
+     # the job’s temporary working directory that have been modified or created
+     # by the job.
+     htc_commands = {"transfer_output_files": '""'}
+     if outputs:
+         htc_commands["transfer_output_files"] = ",".join(outputs)
+         _LOG.debug("transfer_output_files=%s", htc_commands["transfer_output_files"])
+
+         htc_commands["transfer_output_remaps"] = f'"{";".join(output_remaps)}"'
+         _LOG.debug("transfer_output_remaps=%s", htc_commands["transfer_output_remaps"])
+     return htc_commands
+
+
  def _get_status_from_id(
      wms_workflow_id: str, hist: float, schedds: dict[str, htcondor.Schedd]
  ) -> tuple[WmsStates, str]:
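
Note: for a sense of what the new helper returns (the unit tests added further down exercise exactly this case), here is a sketch for a job using push transfer with one hypothetical output file:

    # Sketch only: result of _handle_job_outputs for one output file
    # /path/to/output.txt when use_shared is False.
    htc_commands = {
        # Transfer back only the named file; the '""' default would otherwise
        # be left in place to stop HTCondor from shipping back everything the
        # job created in its scratch directory.
        "transfer_output_files": "output.txt",
        # Remap the returned file from the scratch directory to its intended
        # location on the submit side.
        "transfer_output_remaps": '"output.txt=/path/to/output.txt"',
    }
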

@@ -2342,6 +2378,12 @@ def _gather_label_values(config: BpsConfig, label: str) -> dict[str, Any]:
      if found:
          values["releaseExpr"] = value
 
+     found, value = config.search("overwriteJobFiles", opt=search_opts)
+     if found:
+         values["overwriteJobFiles"] = value
+     else:
+         values["overwriteJobFiles"] = True
+
      if profile_key and profile_key in config:
          for subkey, val in config[profile_key].items():
              if subkey.startswith("+"):

6. python/lsst/ctrl/bps/htcondor/lssthtc.py

@@ -205,6 +205,7 @@ HTC_VALID_JOB_KEYS = {
      "transfer_executable",
      "transfer_input_files",
      "transfer_output_files",
+     "transfer_output_remaps",
      "request_cpus",
      "request_memory",
      "request_disk",

7. python/lsst/ctrl/bps/htcondor/version.py (added)

@@ -0,0 +1,2 @@
+ __all__ = ["__version__"]
+ __version__ = "29.2025.3000"

8. python/lsst_ctrl_bps_htcondor.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lsst-ctrl-bps-htcondor
- Version: 29.2025.2300
+ Version: 29.2025.3000
  Summary: HTCondor plugin for lsst-ctrl-bps.
  Author-email: Rubin Observatory Data Management <dm-admin@lists.lsst.org>
  License: BSD 3-Clause License

9. tests/test_htcondor_service.py

@@ -40,13 +40,14 @@ from lsst.ctrl.bps import (
      BPS_SEARCH_ORDER,
      BpsConfig,
      GenericWorkflowExec,
+     GenericWorkflowFile,
      GenericWorkflowJob,
      WmsSpecificInfo,
      WmsStates,
  )
  from lsst.ctrl.bps.htcondor import htcondor_service, lssthtc
  from lsst.ctrl.bps.htcondor.htcondor_config import HTC_DEFAULTS_URI
- from lsst.ctrl.bps.tests.gw_test_utils import make_3_label_workflow_groups_sort
+ from lsst.ctrl.bps.tests.gw_test_utils import make_3_label_workflow, make_3_label_workflow_groups_sort
  from lsst.utils.tests import temporaryDirectory
 
  logger = logging.getLogger("lsst.ctrl.bps.htcondor")

@@ -1233,7 +1234,11 @@ class GatherLabelValuesTestCase(unittest.TestCase):
          config = BpsConfig(
              {
                  "cluster": {
-                     "label1": {"releaseExpr": "cluster_val", "profile": {"condor": {"prof_val1": 3}}}
+                     "label1": {
+                         "releaseExpr": "cluster_val",
+                         "overwriteJobFiles": False,
+                         "profile": {"condor": {"prof_val1": 3}},
+                     }
                  },
                  "pipetask": {"label1": {"releaseExpr": "pipetask_val"}},
              },

@@ -1242,14 +1247,26 @@ class GatherLabelValuesTestCase(unittest.TestCase):
              wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
          )
          results = htcondor_service._gather_label_values(config, label)
-         self.assertEqual(results, {"attrs": {}, "profile": {"prof_val1": 3}, "releaseExpr": "cluster_val"})
+         self.assertEqual(
+             results,
+             {
+                 "attrs": {},
+                 "profile": {"prof_val1": 3},
+                 "releaseExpr": "cluster_val",
+                 "overwriteJobFiles": False,
+             },
+         )
 
      def testPipetaskLabel(self):
          label = "label1"
          config = BpsConfig(
              {
                  "pipetask": {
-                     "label1": {"releaseExpr": "pipetask_val", "profile": {"condor": {"prof_val1": 3}}}
+                     "label1": {
+                         "releaseExpr": "pipetask_val",
+                         "overwriteJobFiles": False,
+                         "profile": {"condor": {"prof_val1": 3}},
+                     }
                  }
              },
              search_order=BPS_SEARCH_ORDER,

@@ -1257,7 +1274,15 @@ class GatherLabelValuesTestCase(unittest.TestCase):
              wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
          )
          results = htcondor_service._gather_label_values(config, label)
-         self.assertEqual(results, {"attrs": {}, "profile": {"prof_val1": 3}, "releaseExpr": "pipetask_val"})
+         self.assertEqual(
+             results,
+             {
+                 "attrs": {},
+                 "profile": {"prof_val1": 3},
+                 "releaseExpr": "pipetask_val",
+                 "overwriteJobFiles": False,
+             },
+         )
 
      def testNoSection(self):
          label = "notThere"

@@ -1268,7 +1293,31 @@ class GatherLabelValuesTestCase(unittest.TestCase):
              wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
          )
          results = htcondor_service._gather_label_values(config, label)
-         self.assertEqual(results, {"attrs": {}, "profile": {}})
+         self.assertEqual(results, {"attrs": {}, "profile": {}, "overwriteJobFiles": True})
+
+     def testNoOverwriteSpecified(self):
+         label = "notthere"
+         config = BpsConfig(
+             {},
+             search_order=BPS_SEARCH_ORDER,
+             defaults={},
+             wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
+         )
+         results = htcondor_service._gather_label_values(config, label)
+         self.assertEqual(results, {"attrs": {}, "profile": {}, "overwriteJobFiles": True})
+
+     def testFinalJob(self):
+         label = "finalJob"
+         config = BpsConfig(
+             {"finalJob": {"profile": {"condor": {"prof_val2": 6, "+attr_val1": 5}}}},
+             search_order=BPS_SEARCH_ORDER,
+             defaults=BPS_DEFAULTS,
+             wms_service_class_fqn="lsst.ctrl.bps.htcondor.HTCondorService",
+         )
+         results = htcondor_service._gather_label_values(config, label)
+         self.assertEqual(
+             results, {"attrs": {"attr_val1": 5}, "profile": {"prof_val2": 6}, "overwriteJobFiles": False}
+         )
 
 
  class CreateCheckJobTestCase(unittest.TestCase):

@@ -1418,5 +1467,117 @@ class GetStatusFromPathTestCase(unittest.TestCase):
          self.assertEqual(message, "")
 
 
+ class HandleJobOutputsTestCase(unittest.TestCase):
+     """Test _handle_job_outputs function."""
+
+     def setUp(self):
+         self.job_name = "test_job"
+         self.out_prefix = "/test/prefix"
+
+     def tearDown(self):
+         pass
+
+     def testNoOutputsSharedFilesystem(self):
+         """Test with shared filesystem and no outputs."""
+         mock_workflow = unittest.mock.Mock()
+         mock_workflow.get_job_outputs.return_value = []
+
+         result = htcondor_service._handle_job_outputs(mock_workflow, self.job_name, True, self.out_prefix)
+
+         self.assertEqual(result, {"transfer_output_files": '""'})
+
+     def testWithOutputsSharedFilesystem(self):
+         """Test with shared filesystem and outputs present (still empty)."""
+         mock_workflow = unittest.mock.Mock()
+         mock_workflow.get_job_outputs.return_value = [
+             GenericWorkflowFile(name="output.txt", src_uri="/path/to/output.txt")
+         ]
+
+         result = htcondor_service._handle_job_outputs(mock_workflow, self.job_name, True, self.out_prefix)
+
+         self.assertEqual(result, {"transfer_output_files": '""'})
+
+     def testNoOutputsNoSharedFilesystem(self):
+         """Test without shared filesystem and no outputs."""
+         mock_workflow = unittest.mock.Mock()
+         mock_workflow.get_job_outputs.return_value = []
+
+         result = htcondor_service._handle_job_outputs(mock_workflow, self.job_name, False, self.out_prefix)
+
+         self.assertEqual(result, {"transfer_output_files": '""'})
+
+     def testWithAnOutputNoSharedFilesystem(self):
+         """Test without shared filesystem and single output file."""
+         mock_workflow = unittest.mock.Mock()
+         mock_workflow.get_job_outputs.return_value = [
+             GenericWorkflowFile(name="output.txt", src_uri="/path/to/output.txt")
+         ]
+
+         result = htcondor_service._handle_job_outputs(mock_workflow, self.job_name, False, self.out_prefix)
+
+         expected = {
+             "transfer_output_files": "output.txt",
+             "transfer_output_remaps": '"output.txt=/path/to/output.txt"',
+         }
+         self.assertEqual(result, expected)
+
+     def testWithOutputsNoSharedFilesystem(self):
+         """Test without shared filesystem and multiple output files."""
+         mock_workflow = unittest.mock.Mock()
+         mock_workflow.get_job_outputs.return_value = [
+             GenericWorkflowFile(name="output1.txt", src_uri="/path/output1.txt"),
+             GenericWorkflowFile(name="output2.txt", src_uri="/another/path/output2.txt"),
+         ]
+
+         result = htcondor_service._handle_job_outputs(mock_workflow, self.job_name, False, self.out_prefix)
+
+         expected = {
+             "transfer_output_files": "output1.txt,output2.txt",
+             "transfer_output_remaps": '"output1.txt=/path/output1.txt;output2.txt=/another/path/output2.txt"',
+         }
+         self.assertEqual(result, expected)
+
+     @unittest.mock.patch("lsst.ctrl.bps.htcondor.htcondor_service._LOG")
+     def testLogging(self, mock_log):
+         mock_workflow = unittest.mock.Mock()
+         mock_workflow.get_job_outputs.return_value = [
+             GenericWorkflowFile(name="output.txt", src_uri="/path/to/output.txt")
+         ]
+
+         htcondor_service._handle_job_outputs(mock_workflow, self.job_name, False, self.out_prefix)
+
+         self.assertTrue(mock_log.debug.called)
+         debug_calls = mock_log.debug.call_args_list
+         self.assertTrue(any("src_uri=" in str(call) for call in debug_calls))
+         self.assertTrue(any("transfer_output_files=" in str(call) for call in debug_calls))
+         self.assertTrue(any("transfer_output_remaps=" in str(call) for call in debug_calls))
+
+
+ class CreateJobTestCase(unittest.TestCase):
+     """Test _create_job function."""
+
+     def setUp(self):
+         self.generic_workflow = make_3_label_workflow("test1", True)
+
+     def testNoOverwrite(self):
+         template = "{label}/{tract}/{patch}/{band}/{subfilter}/{physical_filter}/{visit}/{exposure}"
+         cached_values = {
+             "bpsUseShared": True,
+             "overwriteJobFiles": False,
+             "memoryLimit": 491520,
+             "profile": {},
+             "attrs": {},
+         }
+         gwjob = self.generic_workflow.get_final()
+         out_prefix = "submit"
+         htc_job = htcondor_service._create_job(
+             template, cached_values, self.generic_workflow, gwjob, out_prefix
+         )
+         self.assertEqual(htc_job.name, gwjob.name)
+         self.assertEqual(htc_job.label, gwjob.label)
+         self.assertIn("NumJobStarts", htc_job.cmds["output"])
+         self.assertIn("NumJobStarts", htc_job.cmds["error"])
+
+
  if __name__ == "__main__":
      unittest.main()

10. python/lsst/ctrl/bps/htcondor/version.py (removed)

@@ -1,2 +0,0 @@
- __all__ = ["__version__"]
- __version__ = "29.2025.2300"