lsst-ctrl-bps-htcondor 29.0.1__py3-none-any.whl → 29.1.0rc1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -34,9 +34,10 @@ import logging
 import os
 import re
 from collections import defaultdict
+from copy import deepcopy
 from enum import IntEnum, auto
 from pathlib import Path
-from typing import Any
+from typing import Any, cast
 
 import htcondor
 from packaging import version
@@ -44,8 +45,12 @@ from packaging import version
 from lsst.ctrl.bps import (
     BaseWmsService,
     BaseWmsWorkflow,
+    BpsConfig,
     GenericWorkflow,
+    GenericWorkflowGroup,
     GenericWorkflowJob,
+    GenericWorkflowNodeType,
+    GenericWorkflowNoopJob,
     WmsJobReport,
     WmsRunReport,
     WmsSpecificInfo,
@@ -60,8 +65,9 @@ from .lssthtc import (
     MISSING_ID,
     HTCDag,
     HTCJob,
-    JobStatus,
     NodeStatus,
+    WmsNodeType,
+    _update_rescue_file,
     condor_history,
     condor_q,
     condor_search,
@@ -175,17 +181,23 @@ class HTCondorService(BaseWmsService):
             Keyword arguments for the options.
         """
         dag = workflow.dag
-
         ver = version.parse(htc_version())
-        if ver >= version.parse("8.9.3"):
-            sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {})
-        else:
-            sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {})
 
         # For workflow portability, internal paths are all relative. Hence
         # the DAG needs to be submitted to HTCondor from inside the submit
         # directory.
         with chdir(workflow.submit_path):
+            try:
+                if ver >= version.parse("8.9.3"):
+                    sub = htc_create_submit_from_dag(dag.graph["dag_filename"], dag.graph["submit_options"])
+                else:
+                    sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], dag.graph["submit_options"])
+            except Exception:
+                _LOG.error(
+                    "Problems creating HTCondor submit object from filename: %s", dag.graph["dag_filename"]
+                )
+                raise
+
             _LOG.info("Submitting from directory: %s", os.getcwd())
             schedd_dag_info = htc_submit_dag(sub)
             if schedd_dag_info:
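
The submit-creation calls above moved inside the with chdir(...) block and now pass the DAG-wide submit options stored on the graph instead of an empty dict. The 8.9.3 version gate matches the HTCondor release that introduced htcondor.Submit.from_dag; a minimal sketch of the gated call, assuming (the diff itself does not show this) that htc_create_submit_from_dag wraps that API:

    import htcondor
    from packaging import version

    def create_submit(dag_filename: str, submit_options: dict) -> htcondor.Submit:
        """Sketch: build a Submit object for a DAG file, new or old API."""
        # htcondor.version() returns e.g. "$CondorVersion: 10.0.3 ... $".
        ver = version.parse(htcondor.version().split()[1])
        if ver >= version.parse("8.9.3"):
            # Native API: builds the condor_dagman submit description in-process.
            return htcondor.Submit.from_dag(dag_filename, submit_options)
        # Older HTCondor would need a condor_submit_dag fallback (presumably
        # what the *_from_cmd helper does); omitted from this sketch.
        raise NotImplementedError("requires condor_submit_dag fallback")
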
@@ -226,7 +238,7 @@
             None,
             (
                 f"workflow with run id '{wms_workflow_id}' not found. "
-                f"Hint: use run's submit directory as the id instead"
+                "Hint: use run's submit directory as the id instead"
             ),
         )
 
@@ -266,7 +278,9 @@
         )
 
         _LOG.info("Backing up select HTCondor files from previous run attempt")
-        htc_backup_files(wms_path, subdir="backups")
+        rescue_file = htc_backup_files(wms_path, subdir="backups")
+        if (wms_path / "subdags").exists():
+            _update_rescue_file(rescue_file)
 
         # For workflow portability, internal paths are all relative. Hence
         # the DAG needs to be resubmitted to HTCondor from inside the submit
@@ -318,7 +332,7 @@
 
         Returns
         -------
-        job_ids : `list` [`Any`]
+        job_ids : `list` [`~typing.Any`]
             Only job ids to be used by cancel and other functions. Typically
             this means top-level jobs (i.e., not children jobs).
         """
@@ -563,66 +577,17 @@ class HTCondorWorkflow(BaseWmsWorkflow):
     def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
         # Docstring inherited
         htc_workflow = cls(generic_workflow.name, config)
-        htc_workflow.dag = HTCDag(name=generic_workflow.name)
+        htc_workflow.dag = _generic_workflow_to_htcondor_dag(config, generic_workflow, out_prefix)
 
         _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
-        htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
+        # Add extra attributes to top most DAG.
         htc_workflow.dag.add_attribs(
             {
                 "bps_wms_service": service_class,
                 "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
-                "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
-                "bps_job_summary": create_count_summary(generic_workflow.job_counts),
             }
         )
 
-        _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
-        if isinstance(tmp_template, str):
-            subdir_template = defaultdict(lambda: tmp_template)
-        else:
-            subdir_template = tmp_template
-
-        # Create all DAG jobs
-        site_values = {}  # cache compute site specific values to reduce config lookups
-        for job_name in generic_workflow:
-            gwjob = generic_workflow.get_job(job_name)
-            if gwjob.compute_site not in site_values:
-                site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
-            htc_job = _create_job(
-                subdir_template[gwjob.label],
-                site_values[gwjob.compute_site],
-                generic_workflow,
-                gwjob,
-                out_prefix,
-            )
-            htc_workflow.dag.add_job(htc_job)
-
-        # Add job dependencies to the DAG
-        for job_name in generic_workflow:
-            htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
-
-        # If final job exists in generic workflow, create DAG final job
-        final = generic_workflow.get_final()
-        if final and isinstance(final, GenericWorkflowJob):
-            if final.compute_site and final.compute_site not in site_values:
-                site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
-            final_htjob = _create_job(
-                subdir_template[final.label],
-                site_values[final.compute_site],
-                generic_workflow,
-                final,
-                out_prefix,
-            )
-            if "post" not in final_htjob.dagcmds:
-                final_htjob.dagcmds["post"] = (
-                    f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN"
-                )
-            htc_workflow.dag.add_final_job(final_htjob)
-        elif final and isinstance(final, GenericWorkflow):
-            raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
-        elif final:
-            return TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
-
         return htc_workflow
 
     def write(self, out_prefix):
@@ -637,18 +602,18 @@
         os.makedirs(out_prefix, exist_ok=True)
 
         # Write down the workflow in HTCondor format.
-        self.dag.write(out_prefix, "jobs/{self.label}")
+        self.dag.write(out_prefix, job_subdir="jobs/{self.label}")
 
 
-def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix):
+def _create_job(subdir_template, cached_values, generic_workflow, gwjob, out_prefix):
     """Convert GenericWorkflow job nodes to DAG jobs.
 
     Parameters
     ----------
     subdir_template : `str`
         Template for making subdirs.
-    site_values : `dict`
-        Site specific values
+    cached_values : `dict`
+        Site and label specific values.
     generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
         Generic workflow that is being converted.
     gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
@@ -668,8 +633,10 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
     if gwjob.tags:
         curvals.update(gwjob.tags)
 
-    subdir = subdir_template.format_map(curvals)
-    htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"
+    subdir = Path("jobs") / subdir_template.format_map(curvals)
+    htc_job.subdir = subdir
+    htc_job.subfile = f"{gwjob.name}.sub"
+    htc_job.add_dag_cmds({"dir": subdir})
 
     htc_job_cmds = {
         "universe": "vanilla",
@@ -681,20 +648,22 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
         # Exceeding memory sometimes triggering SIGBUS or SIGSEGV error. Tell
         # htcondor to put on hold any jobs which exited by a signal.
         "on_exit_hold": "ExitBySignal == true",
-        "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", '
-        '"Handling signal as if job has gone over memory limit.")',
+        "on_exit_hold_reason": (
+            'strcat("Job raised a signal ", string(ExitSignal), ". ", '
+            '"Handling signal as if job has gone over memory limit.")'
+        ),
         "on_exit_hold_subcode": "34",
     }
 
-    htc_job_cmds.update(_translate_job_cmds(site_values, generic_workflow, gwjob))
+    htc_job_cmds.update(_translate_job_cmds(cached_values, generic_workflow, gwjob))
 
     # job stdout, stderr, htcondor user log.
     for key in ("output", "error", "log"):
-        htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
+        htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key[:3]}"
        _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
 
     htc_job_cmds.update(
-        _handle_job_inputs(generic_workflow, gwjob.name, site_values["bpsUseShared"], out_prefix)
+        _handle_job_inputs(generic_workflow, gwjob.name, cached_values["bpsUseShared"], out_prefix)
     )
 
     # Add the job cmds dict to the job object.
@@ -705,7 +674,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
     # Add job attributes to job.
     _LOG.debug("gwjob.attrs = %s", gwjob.attrs)
     htc_job.add_job_attrs(gwjob.attrs)
-    htc_job.add_job_attrs(site_values["attrs"])
+    htc_job.add_job_attrs(cached_values["attrs"])
     htc_job.add_job_attrs({"bps_job_quanta": create_count_summary(gwjob.quanta_counts)})
     htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
 
@@ -717,8 +686,8 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
 
     Parameters
     ----------
-    cached_vals : `dict` [`str`, `Any`]
-        Config values common to jobs with same label.
+    cached_vals : `dict` [`str`, `~typing.Any`]
+        Config values common to jobs with same site or label.
     generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
         Generic workflow that contains job to being converted.
     gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
@@ -726,7 +695,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
 
     Returns
     -------
-    htc_job_commands : `dict` [`str`, `Any`]
+    htc_job_commands : `dict` [`str`, `~typing.Any`]
        Contains commands which can appear in the HTCondor submit description
        file.
    """
@@ -752,9 +721,6 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
        jobcmds["accounting_group_user"] = cached_vals.get("accountingUser")
 
    # job commands that need modification
-    if gwjob.number_of_retries:
-        jobcmds["max_retries"] = f"{gwjob.number_of_retries}"
-
    if gwjob.retry_unless_exit:
        if isinstance(gwjob.retry_unless_exit, int):
            jobcmds["retry_until"] = f"{gwjob.retry_unless_exit}"
@@ -771,6 +737,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
    if gwjob.request_memory:
        jobcmds["request_memory"] = f"{gwjob.request_memory}"
 
+    memory_max = 0
    if gwjob.memory_multiplier:
        # Do not use try-except! At the moment, BpsConfig returns an empty
        # string if it does not contain the key.
@@ -797,13 +764,18 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
            gwjob.request_memory, gwjob.memory_multiplier, memory_max
        )
 
-        # Periodically release jobs which are being held due to exceeding
-        # memory. Stop doing that (by removing the job from the HTCondor queue)
-        # after the maximal number of retries has been reached or the job was
-        # already run at maximal allowed memory.
-        jobcmds["periodic_release"] = _create_periodic_release_expr(
-            gwjob.request_memory, gwjob.memory_multiplier, memory_max
-        )
+    user_release_expr = cached_vals.get("releaseExpr", "")
+    if gwjob.number_of_retries is not None and gwjob.number_of_retries >= 0:
+        jobcmds["max_retries"] = gwjob.number_of_retries
+
+        # No point in adding periodic_release if 0 retries
+        if gwjob.number_of_retries > 0:
+            periodic_release = _create_periodic_release_expr(
+                gwjob.request_memory, gwjob.memory_multiplier, memory_max, user_release_expr
+            )
+            if periodic_release:
+                jobcmds["periodic_release"] = periodic_release
+
        jobcmds["periodic_remove"] = _create_periodic_remove_expr(
            gwjob.request_memory, gwjob.memory_multiplier, memory_max
        )
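
Two behavior changes are folded into the retry handling above: max_retries is now emitted for any non-negative retry count (including 0, which the old truthiness test skipped), and periodic_release is only attached when at least one retry is allowed and the builder returns a non-empty expression. A small sketch of just that gating, with SimpleNamespace standing in for the GenericWorkflowJob:

    from types import SimpleNamespace

    def retry_cmds(gwjob, periodic_release_expr=""):
        """Sketch: gating logic only; the expression builders are elided."""
        jobcmds = {}
        if gwjob.number_of_retries is not None and gwjob.number_of_retries >= 0:
            jobcmds["max_retries"] = gwjob.number_of_retries
            # No point in periodic_release with 0 retries.
            if gwjob.number_of_retries > 0 and periodic_release_expr:
                jobcmds["periodic_release"] = periodic_release_expr
        return jobcmds

    print(retry_cmds(SimpleNamespace(number_of_retries=0)))
    # {'max_retries': 0} -- no periodic_release
    print(retry_cmds(SimpleNamespace(number_of_retries=3), "JobStatus == 5"))
    # {'max_retries': 3, 'periodic_release': 'JobStatus == 5'}
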
@@ -817,7 +789,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
     # Handle command line
     if gwjob.executable.transfer_executable:
         jobcmds["transfer_executable"] = "True"
-        jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
+        jobcmds["executable"] = gwjob.executable.src_uri
     else:
         jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
 
@@ -862,7 +834,7 @@ def _translate_dag_cmds(gwjob):
 
     Returns
     -------
-    dagcmds : `dict` [`str`, `Any`]
+    dagcmds : `dict` [`str`, `~typing.Any`]
         DAGMan commands for the job.
     """
     # Values in the dag script that just are name mappings.
@@ -974,7 +946,7 @@ def _replace_cmd_vars(arguments, gwjob):
     replacements = gwjob.cmdvals if gwjob.cmdvals is not None else {}
     try:
         arguments = arguments.format(**replacements)
-    except KeyError as exc:
+    except (KeyError, TypeError) as exc:  # TypeError in case None instead of {}
         _LOG.error("Could not replace command variables: replacement for %s not provided", str(exc))
         _LOG.debug("arguments: %s\ncmdvals: %s", arguments, replacements)
         raise
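
For context, both exception types caught above come straight out of str.format; a quick self-contained illustration (the placeholder names are made up):

    template = "pipetask run -b {butlerConfig} -i {inCollection}"
    try:
        template.format(**{})  # missing replacement
    except KeyError as exc:
        print("KeyError:", exc)  # KeyError: 'butlerConfig'
    try:
        template.format(**None)  # None where a mapping is expected
    except TypeError as exc:
        print("TypeError:", exc)  # argument after ** must be a mapping
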
@@ -1200,19 +1172,19 @@ def _get_info_from_schedd(
     return schedd_dag_info
 
 
-def _get_info_from_path(wms_path: str) -> tuple[str, dict[str, dict[str, Any]], str]:
+def _get_info_from_path(wms_path: str | os.PathLike) -> tuple[str, dict[str, dict[str, Any]], str]:
     """Gather run information from a given run directory.
 
     Parameters
     ----------
-    wms_path : `str`
+    wms_path : `str` or `os.PathLike`
         Directory containing HTCondor files.
 
     Returns
     -------
     wms_workflow_id : `str`
         The run id which is a DAGman job id.
-    jobs : `dict` [`str`, `dict` [`str`, `Any`]]
+    jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         Information about jobs read from files in the given directory.
         The key is the HTCondor id and the value is a dictionary of HTCondor
         keys and values.
@@ -1263,9 +1235,9 @@ def _get_info_from_path(wms_path: str) -> tuple[str, dict[str, dict[str, Any]],
         schedd_name = next(iter(job_info))
         job_ad = next(iter(job_info[schedd_name].values()))
         job.update(job_ad)
-    except FileNotFoundError:
-        message = f"Could not find HTCondor files in '{wms_path}'"
-        _LOG.warning(message)
+    except FileNotFoundError as err:
+        message = f"Could not find HTCondor files in '{wms_path}' ({err})"
+        _LOG.debug(message)
         messages.append(message)
     message = htc_check_dagman_output(wms_path)
     if message:
@@ -1298,8 +1270,9 @@ def _create_detailed_report_from_jobs(
         id and the value is a collection of report information for that run.
     """
     _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
-    dag_ad = jobs.pop(wms_workflow_id)
-    total_jobs, state_counts = _get_state_counts_from_dag_job(dag_ad)
+
+    dag_ad = jobs[wms_workflow_id]
+
     report = WmsRunReport(
         wms_id=f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}",
         global_wms_id=dag_ad.get("GlobalJobId", "MISS"),
@@ -1312,28 +1285,34 @@
         operator=_get_owner(dag_ad),
         run_summary=_get_run_summary(dag_ad),
         state=_htc_status_to_wms_state(dag_ad),
+        total_number_jobs=0,
         jobs=[],
-        total_number_jobs=dag_ad.get("total_jobs", total_jobs),
-        job_state_counts=dag_ad.get("state_counts", state_counts),
-        exit_code_summary=_get_exit_code_summary(jobs),
+        job_state_counts=dict.fromkeys(WmsStates, 0),
+        exit_code_summary={},
     )
+
+    payload_jobs = {}  # keep track for later processing
     specific_info = WmsSpecificInfo()
     for job_id, job_ad in jobs.items():
-        if not is_service_job(job_ad):
+        if job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [WmsNodeType.PAYLOAD, WmsNodeType.FINAL]:
             try:
+                name = job_ad.get("DAGNodeName", job_id)
+                wms_state = _htc_status_to_wms_state(job_ad)
                 job_report = WmsJobReport(
                     wms_id=job_id,
-                    name=job_ad.get("DAGNodeName", job_id),
-                    label=job_ad.get("bps_job_label", pegasus_name_to_label(job_ad["DAGNodeName"])),
-                    state=_htc_status_to_wms_state(job_ad),
+                    name=name,
+                    label=job_ad.get("bps_job_label", pegasus_name_to_label(name)),
+                    state=wms_state,
                 )
                 if job_report.label == "init":
                     job_report.label = "pipetaskInit"
+                report.job_state_counts[wms_state] += 1
                 report.jobs.append(job_report)
+                payload_jobs[job_id] = job_ad
             except KeyError as ex:
                 _LOG.error("Job missing key '%s': %s", str(ex), job_ad)
                 raise
-        else:
+        elif is_service_job(job_ad):
             _LOG.debug(
                 "Found service job: id='%s', name='%s', label='%s', NodeStatus='%s', JobStatus='%s'",
                 job_id,
@@ -1344,13 +1323,11 @@
             )
             _add_service_job_specific_info(job_ad, specific_info)
 
+    report.total_number_jobs = len(payload_jobs)
+    report.exit_code_summary = _get_exit_code_summary(payload_jobs)
     if specific_info:
         report.specific_info = specific_info
 
-    # Add the removed entry to restore the original content of the dictionary.
-    # The ordering of keys will be change permanently though.
-    jobs.update({wms_workflow_id: dag_ad})
-
     # Workflow will exit with non-zero DAG_STATUS if problem with
     # any of the wms jobs. So change FAILED to SUCCEEDED if all
     # payload jobs SUCCEEDED.
@@ -1367,7 +1344,7 @@ def _add_service_job_specific_info(job_ad: dict[str, Any], specific_info: WmsSpe
 
     Parameters
     ----------
-    job_ad : `dict` [`str`, `Any`]
+    job_ad : `dict` [`str`, `~typing.Any`]
         Provisioning job information.
     specific_info : `lsst.ctrl.bps.WmsSpecificInfo`
         Where to add message.
@@ -1450,6 +1427,7 @@ def _summary_report(user, hist, pass_thru, schedds=None):
 
     # Have list of DAGMan jobs, need to get run_report info.
     run_reports = {}
+    msg = ""
     for jobs in job_info.values():
         for job_id, job in jobs.items():
             total_jobs, state_counts = _get_state_counts_from_dag_job(job)
@@ -1482,7 +1460,7 @@
         )
         run_reports[report.global_wms_id] = report
 
-    return run_reports, ""
+    return run_reports, msg
 
 
 def _add_run_info(wms_path, job):
@@ -1492,7 +1470,7 @@
     ----------
     wms_path : `str`
         Path to submit files for the run.
-    job : `dict` [`str`, `Any`]
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.
 
     Raises
@@ -1528,7 +1506,7 @@ def _get_owner(job):
 
     Parameters
     ----------
-    job : `dict` [`str`, `Any`]
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.
 
     Returns
@@ -1550,7 +1528,7 @@ def _get_run_summary(job):
 
     Parameters
     ----------
-    job : `dict` [`str`, `Any`]
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.
 
     Returns
@@ -1596,14 +1574,14 @@ def _get_exit_code_summary(jobs):
             exit_code = 0
             job_status = job_ad["JobStatus"]
             match job_status:
-                case JobStatus.COMPLETED | JobStatus.HELD:
+                case htcondor.JobStatus.COMPLETED | htcondor.JobStatus.HELD:
                     exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["ExitCode"]
                 case (
-                    JobStatus.IDLE
-                    | JobStatus.RUNNING
-                    | JobStatus.REMOVED
-                    | JobStatus.TRANSFERRING_OUTPUT
-                    | JobStatus.SUSPENDED
+                    htcondor.JobStatus.IDLE
+                    | htcondor.JobStatus.RUNNING
+                    | htcondor.JobStatus.REMOVED
+                    | htcondor.JobStatus.TRANSFERRING_OUTPUT
+                    | htcondor.JobStatus.SUSPENDED
                 ):
                     pass
                 case _:
@@ -1626,7 +1604,7 @@ def _get_state_counts_from_jobs(
     ----------
     wms_workflow_id : `str`
         HTCondor job id.
-    jobs : `dict [`dict` [`str`, `Any`]]
+    jobs : `dict [`dict` [`str`, `~typing.Any`]]
         HTCondor dag job information.
 
     Returns
@@ -1639,16 +1617,13 @@
     """
     state_counts = dict.fromkeys(WmsStates, 0)
     for job_id, job_ad in jobs.items():
-        if job_id != wms_workflow_id and not is_service_job(job_ad):
+        if job_id != wms_workflow_id and job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [
+            WmsNodeType.PAYLOAD,
+            WmsNodeType.FINAL,
+        ]:
             state_counts[_htc_status_to_wms_state(job_ad)] += 1
-    total_counted = sum(state_counts.values())
-
-    if "NodesTotal" in jobs[wms_workflow_id]:
-        total_count = jobs[wms_workflow_id]["NodesTotal"]
-    else:
-        total_count = total_counted
+    total_count = sum(state_counts.values())
 
-    state_counts[WmsStates.UNREADY] += total_count - total_counted
     return total_count, state_counts
 
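
The rewrite above changes what "total" means in the counts: it is now simply the number of payload and final nodes whose states were tallied, and the old NodesTotal/UNREADY back-fill is gone. A toy run of the new filter, using a hypothetical stand-in for the WmsNodeType enum from lssthtc:

    from enum import Enum, auto

    class WmsNodeType(Enum):  # stand-in for lssthtc.WmsNodeType
        PAYLOAD = auto()
        FINAL = auto()
        SERVICE = auto()
        UNKNOWN = auto()

    wms_workflow_id = "9.0"
    jobs = {
        "9.0": {},  # the DAGMan job itself, always skipped
        "100.0": {"wms_node_type": WmsNodeType.PAYLOAD},  # counted
        "101.0": {"wms_node_type": WmsNodeType.FINAL},    # counted
        "102.0": {"wms_node_type": WmsNodeType.SERVICE},  # skipped
    }
    counted = [
        job_ad
        for job_id, job_ad in jobs.items()
        if job_id != wms_workflow_id
        and job_ad.get("wms_node_type", WmsNodeType.UNKNOWN)
        in [WmsNodeType.PAYLOAD, WmsNodeType.FINAL]
    ]
    print(len(counted))  # 2 == total_count
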
 
@@ -1657,7 +1632,7 @@ def _get_state_counts_from_dag_job(job):
 
     Parameters
     ----------
-    job : `dict` [`str`, `Any`]
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor dag job information.
 
     Returns
@@ -1713,7 +1688,7 @@ def _htc_status_to_wms_state(job):
 
     Parameters
     ----------
-    job : `dict` [`str`, `Any`]
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor job information.
 
     Returns
@@ -1735,7 +1710,7 @@ def _htc_job_status_to_wms_state(job):
 
     Parameters
     ----------
-    job : `dict` [`str`, `Any`]
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor job information.
 
     Returns
@@ -1746,27 +1721,28 @@
     _LOG.debug(
         "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"])
     )
-    job_status = int(job["JobStatus"])
     wms_state = WmsStates.MISFIT
-
-    _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
-    if job_status == JobStatus.IDLE:
-        wms_state = WmsStates.PENDING
-    elif job_status == JobStatus.RUNNING:
-        wms_state = WmsStates.RUNNING
-    elif job_status == JobStatus.REMOVED:
-        wms_state = WmsStates.DELETED
-    elif job_status == JobStatus.COMPLETED:
-        if (
-            (job.get("ExitBySignal", False) and job.get("ExitSignal", 0))
-            or job.get("ExitCode", 0)
-            or job.get("DAG_Status", 0)
-        ):
-            wms_state = WmsStates.FAILED
-        else:
-            wms_state = WmsStates.SUCCEEDED
-    elif job_status == JobStatus.HELD:
-        wms_state = WmsStates.HELD
+    if "JobStatus" in job and job["JobStatus"]:
+        job_status = int(job["JobStatus"])
+
+        _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
+        if job_status == htcondor.JobStatus.IDLE:
+            wms_state = WmsStates.PENDING
+        elif job_status == htcondor.JobStatus.RUNNING:
+            wms_state = WmsStates.RUNNING
+        elif job_status == htcondor.JobStatus.REMOVED:
+            wms_state = WmsStates.DELETED
+        elif job_status == htcondor.JobStatus.COMPLETED:
+            if (
+                (job.get("ExitBySignal", False) and job.get("ExitSignal", 0))
+                or job.get("ExitCode", 0)
+                or job.get("DAG_Status", 0)
+            ):
+                wms_state = WmsStates.FAILED
+            else:
+                wms_state = WmsStates.SUCCEEDED
+        elif job_status == htcondor.JobStatus.HELD:
+            wms_state = WmsStates.HELD
 
     return wms_state
 
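
The new guard above makes the mapping tolerant of job ads whose JobStatus is present but empty or zero (as can happen for ads assembled from files rather than the schedd): those now fall through to WmsStates.MISFIT instead of failing on int(""). A tiny demonstration of the guard condition in isolation:

    def has_usable_status(job: dict) -> bool:
        # Mirrors: if "JobStatus" in job and job["JobStatus"]:
        return "JobStatus" in job and bool(job["JobStatus"])

    print(has_usable_status({"ClusterId": 42, "JobStatus": 2}))   # True
    print(has_usable_status({"ClusterId": 42, "JobStatus": ""}))  # False -> MISFIT
    print(has_usable_status({"ClusterId": 42}))                   # False -> MISFIT
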
 
@@ -1776,7 +1752,7 @@ def _htc_node_status_to_wms_state(job):
 
     Parameters
     ----------
-    job : `dict` [`str`, `Any`]
+    job : `dict` [`str`, `~typing.Any`]
         HTCondor job information.
 
     Returns
@@ -1823,9 +1799,9 @@ def _update_jobs(jobs1, jobs2):
 
     Parameters
     ----------
-    jobs1 : `dict` [`str`, `dict` [`str`, `Any`]]
+    jobs1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         HTCondor job information to be updated.
-    jobs2 : `dict` [`str`, `dict` [`str`, `Any`]]
+    jobs2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         Additional HTCondor job information.
     """
     for job_id, job_ad in jobs2.items():
@@ -1965,34 +1941,39 @@ def _wms_id_to_dir(wms_id):
     return wms_path, id_type
 
 
-def _create_periodic_release_expr(memory, multiplier, limit):
+def _create_periodic_release_expr(
+    memory: int, multiplier: float | None, limit: int, additional_expr: str = ""
+) -> str:
     """Construct an HTCondorAd expression for releasing held jobs.
 
-    The expression instruct HTCondor to release any job which was put on hold
-    due to exceeding memory requirements back to the job queue providing it
-    satisfies all of the conditions below:
-
-    * number of run attempts did not reach allowable number of retries,
-    * the memory requirements in the last failed run attempt did not reach
-      the specified memory limit.
-
     Parameters
     ----------
     memory : `int`
         Requested memory in MB.
-    multiplier : `float`
-        Memory growth rate between retires.
+    multiplier : `float` or None
+        Memory growth rate between retries.
     limit : `int`
         Memory limit.
+    additional_expr : `str`, optional
+        Expression to add to periodic_release. Defaults to empty string.
 
     Returns
     -------
     expr : `str`
-        A string representing an HTCondor ClassAd expression for releasing jobs
-        which have been held due to exceeding the memory requirements.
+        A string representing an HTCondor ClassAd expression for releasing job.
     """
-    is_retry_allowed = "NumJobStarts <= JobMaxRetries"
-    was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
+    _LOG.debug(
+        "periodic_release: memory: %s, multiplier: %s, limit: %s, additional_expr: %s",
+        memory,
+        multiplier,
+        limit,
+        additional_expr,
+    )
+
+    # ctrl_bps sets multiplier to None in the GenericWorkflow if
+    # memoryMultiplier <= 1, but checking value just in case.
+    if (not multiplier or multiplier <= 1) and not additional_expr:
+        return ""
 
     # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
     # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
@@ -2004,63 +1985,74 @@ def _create_periodic_release_expr(memory, multiplier, limit):
     # the entire expression should evaluate to FALSE when the job is not HELD.
     # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
     # but better safe than sorry.
-    was_mem_exceeded = (
-        "JobStatus == 5 "
-        "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
-        "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
-    )
+    is_held = "JobStatus == 5"
+    is_retry_allowed = "NumJobStarts <= JobMaxRetries"
+
+    mem_expr = ""
+    if memory and multiplier and multiplier > 1 and limit:
+        was_mem_exceeded = (
+            "(HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+            "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
+        )
+        was_below_limit = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) < {limit}"
+        mem_expr = f"{was_mem_exceeded} && {was_below_limit}"
+
+    user_expr = ""
+    if additional_expr:
+        # Never auto release a job held by user.
+        user_expr = f"HoldReasonCode =!= 1 && {additional_expr}"
+
+    expr = f"{is_held} && {is_retry_allowed}"
+    if user_expr and mem_expr:
+        expr += f" && ({mem_expr} || {user_expr})"
+    elif user_expr:
+        expr += f" && {user_expr}"
+    elif mem_expr:
+        expr += f" && {mem_expr}"
 
-    expr = f"{was_mem_exceeded} && {is_retry_allowed} && {was_below_limit}"
     return expr
 
 
 def _create_periodic_remove_expr(memory, multiplier, limit):
     """Construct an HTCondorAd expression for removing jobs from the queue.
 
-    The expression instruct HTCondor to remove any job which was put on hold
-    due to exceeding memory requirements from the job queue providing it
-    satisfies any of the conditions below:
-
-    * allowable number of retries was reached,
-    * the memory requirements during the last failed run attempt reached
-      the specified memory limit.
-
     Parameters
     ----------
     memory : `int`
         Requested memory in MB.
     multiplier : `float`
-        Memory growth rate between retires.
+        Memory growth rate between retries.
     limit : `int`
         Memory limit.
 
     Returns
     -------
     expr : `str`
-        A string representing an HTCondor ClassAd expression for removing jobs
-        which were run at the maximal allowable memory and still exceeded
-        the memory requirements.
+        A string representing an HTCondor ClassAd expression for removing jobs.
     """
-    is_retry_disallowed = "NumJobStarts > JobMaxRetries"
-    was_limit_reached = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
-
-    # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode' are
-    # UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
-    # The special comparison operators ensure that all comparisons below will
-    # evaluate to FALSE in this case.
+    # Job ClassAds attributes 'HoldReasonCode' and 'HoldReasonSubCode'
+    # are UNDEFINED if job is not HELD (i.e. when 'JobStatus' is not 5).
+    # The special comparison operators ensure that all comparisons below
+    # will evaluate to FALSE in this case.
     #
     # Note:
-    # May not be strictly necessary. Operators '&&' and '||' are not strict so
-    # the entire expression should evaluate to FALSE when the job is not HELD.
-    # According to ClassAd evaluation semantics FALSE && UNDEFINED is FALSE,
-    # but better safe than sorry.
-    was_mem_exceeded = (
-        "JobStatus == 5 "
-        "&& (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
-        "|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
-    )
+    # May not be strictly necessary. Operators '&&' and '||' are not
+    # strict so the entire expression should evaluate to FALSE when the
+    # job is not HELD. According to ClassAd evaluation semantics
+    # FALSE && UNDEFINED is FALSE, but better safe than sorry.
+    is_held = "JobStatus == 5"
+    is_retry_disallowed = "NumJobStarts > JobMaxRetries"
 
-    expr = f"{was_mem_exceeded} && ({is_retry_disallowed} || {was_limit_reached})"
+    mem_expr = ""
+    if memory and multiplier and multiplier > 1 and limit:
+        mem_limit_expr = f"min({{int({memory} * pow({multiplier}, NumJobStarts - 1)), {limit}}}) == {limit}"
+
+        mem_expr = (  # Add || here so only added if adding memory expr
+            " || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0 "
+            f"|| HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34) && {mem_limit_expr})"
+        )
+
+    expr = f"{is_held} && ({is_retry_disallowed}{mem_expr})"
     return expr
 
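
To make the rewritten builders concrete, tracing them with memory=2048, multiplier=2.0, limit=8192 and no user releaseExpr yields the following strings (line breaks added here for readability only; the fragments concatenate to the exact output):

    release_expr = (
        "JobStatus == 5 && NumJobStarts <= JobMaxRetries"
        " && (HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0"
        " || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
        " && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) < 8192"
    )
    remove_expr = (
        "JobStatus == 5 && (NumJobStarts > JobMaxRetries"
        " || ((HoldReasonCode =?= 34 && HoldReasonSubCode =?= 0"
        " || HoldReasonCode =?= 3 && HoldReasonSubCode =?= 34)"
        " && min({int(2048 * pow(2.0, NumJobStarts - 1)), 8192}) == 8192))"
    )

With a user-supplied releaseExpr R, the release expression instead ends with "&& (<memory clause> || HoldReasonCode =!= 1 && R)", so a job held by a user (HoldReasonCode 1) is never auto-released.
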
 
@@ -2072,7 +2064,7 @@ def _create_request_memory_expr(memory, multiplier, limit):
     memory : `int`
         Requested memory in MB.
     multiplier : `float`
-        Memory growth rate between retires.
+        Memory growth rate between retries.
     limit : `int`
         Memory limit.
 
@@ -2147,7 +2139,7 @@ def _gather_site_values(config, compute_site):
 
     Returns
     -------
-    site_values : `dict` [`str`, `Any`]
+    site_values : `dict` [`str`, `~typing.Any`]
         Values specific to the given site.
     """
     site_values = {"attrs": {}, "profile": {}}
@@ -2195,6 +2187,50 @@
     return site_values
 
 
+def _gather_label_values(config: BpsConfig, label: str) -> dict[str, Any]:
+    """Gather values specific to given job label.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        BPS configuration that includes necessary submit/runtime
+        information.
+    label : `str`
+        GenericWorkflowJob label.
+
+    Returns
+    -------
+    values : `dict` [`str`, `~typing.Any`]
+        Values specific to the given job label.
+    """
+    values: dict[str, Any] = {"attrs": {}, "profile": {}}
+
+    search_opts = {}
+    profile_key = ""
+    if label == "finalJob":
+        search_opts["searchobj"] = config["finalJob"]
+        profile_key = ".finalJob.profile.condor"
+    elif label in config["cluster"]:
+        search_opts["curvals"] = {"curr_cluster": label}
+        profile_key = f".cluster.{label}.profile.condor"
+    elif label in config["pipetask"]:
+        search_opts["curvals"] = {"curr_pipetask": label}
+        profile_key = f".pipetask.{label}.profile.condor"
+
+    found, value = config.search("releaseExpr", opt=search_opts)
+    if found:
+        values["releaseExpr"] = value
+
+    if profile_key and profile_key in config:
+        for subkey, val in config[profile_key].items():
+            if subkey.startswith("+"):
+                values["attrs"][subkey[1:]] = val
+            else:
+                values["profile"][subkey] = val
+
+    return values
+
+
 def is_service_job(job_ad: dict[str, Any]) -> bool:
     """Determine if a job is a service one.
 
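
Since BpsConfig behaves like a nested mapping over the submit YAML, the new label lookup above amounts to reading keys shaped like the following (shown as a plain dict; the "calibrate" label and all values are made-up examples):

    config = {
        "pipetask": {
            "calibrate": {
                # Found by config.search("releaseExpr", opt=search_opts).
                "releaseExpr": "NumHolds < 3",
                "profile": {
                    "condor": {
                        # "+"-prefixed keys become job attributes ("attrs").
                        "+JobPriority": "10",
                        # Everything else becomes a submit command ("profile").
                        "request_cpus": "2",
                    }
                },
            }
        }
    }

Per _gather_label_values, a finalJob label is searched under .finalJob, a cluster label under .cluster.<label>, and a pipetask label under .pipetask.<label>, in that order of precedence.
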
 
@@ -2212,6 +2248,199 @@ def is_service_job(job_ad: dict[str, Any]) -> bool:
     -----
     At the moment, HTCondor does not provide a native way to distinguish
     between payload and service jobs in the workflow. This code depends
-    on read_node_status adding bps_job_type.
+    on read_node_status adding wms_node_type.
+    """
+    return job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) == WmsNodeType.SERVICE
+
+
+def _group_to_subdag(
+    config: BpsConfig, generic_workflow_group: GenericWorkflowGroup, out_prefix: str
+) -> HTCJob:
+    """Convert a generic workflow group to an HTCondor dag.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        Workflow configuration.
+    generic_workflow_group : `lsst.ctrl.bps.GenericWorkflowGroup`
+        The generic workflow group to convert.
+    out_prefix : `str`
+        Location prefix to be used when creating jobs.
+
+    Returns
+    -------
+    htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
+        Job for running the HTCondor dag.
+    """
+    jobname = f"wms_{generic_workflow_group.name}"
+    htc_job = HTCJob(name=jobname, label=generic_workflow_group.label)
+    htc_job.add_dag_cmds({"dir": f"subdags/{jobname}"})
+    htc_job.subdag = _generic_workflow_to_htcondor_dag(config, generic_workflow_group, out_prefix)
+    if not generic_workflow_group.blocking:
+        htc_job.dagcmds["post"] = {
+            "defer": "",
+            "executable": f"{os.path.dirname(__file__)}/subdag_post.sh",
+            "arguments": f"{jobname} $RETURN",
+        }
+    return htc_job
+
+
+def _create_check_job(group_job_name: str, job_label: str) -> HTCJob:
+    """Create a job to check status of a group job.
+
+    Parameters
+    ----------
+    group_job_name : `str`
+        Name of the group job.
+    job_label : `str`
+        Label to use for the check status job.
+
+    Returns
+    -------
+    htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
+        Job description for the job to check group job status.
     """
-    return job_ad.get("bps_job_type", "MISSING") == "service"
+    htc_job = HTCJob(name=f"wms_check_status_{group_job_name}", label=job_label)
+    htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/check_group_status.sub"
+    htc_job.add_dag_cmds({"dir": f"subdags/{group_job_name}", "vars": {"group_job_name": group_job_name}})
+
+    return htc_job
+
+
+def _generic_workflow_to_htcondor_dag(
+    config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str
+) -> HTCDag:
+    """Convert a GenericWorkflow to a HTCDag.
+
+    Parameters
+    ----------
+    config : `lsst.ctrl.bps.BpsConfig`
+        Workflow configuration.
+    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
+        The GenericWorkflow to convert.
+    out_prefix : `str`
+        Location prefix where the HTCondor files will be written.
+
+    Returns
+    -------
+    dag : `lsst.ctrl.bps.htcondor.HTCDag`
+        The HTCDag representation of the given GenericWorkflow.
+    """
+    dag = HTCDag(name=generic_workflow.name)
+
+    _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
+    dag.add_attribs(generic_workflow.run_attrs)
+    dag.add_attribs(
+        {
+            "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
+            "bps_job_summary": create_count_summary(generic_workflow.job_counts),
+        }
+    )
+
+    _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
+    if isinstance(tmp_template, str):
+        subdir_template = defaultdict(lambda: tmp_template)
+    else:
+        subdir_template = tmp_template
+
+    # Create all DAG jobs
+    site_values = {}  # Cache compute site specific values to reduce config lookups.
+    cached_values = {}  # Cache label-specific values to reduce config lookups.
+    # Note: Can't use get_job_by_label because those only include payload jobs.
+    for job_name in generic_workflow:
+        gwjob = generic_workflow.get_job(job_name)
+        if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD:
+            gwjob = cast(GenericWorkflowJob, gwjob)
+            if gwjob.compute_site not in site_values:
+                site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
+            if gwjob.label not in cached_values:
+                cached_values[gwjob.label] = deepcopy(site_values[gwjob.compute_site])
+                cached_values[gwjob.label].update(_gather_label_values(config, gwjob.label))
+                _LOG.debug("cached: %s= %s", gwjob.label, cached_values[gwjob.label])
+            htc_job = _create_job(
+                subdir_template[gwjob.label],
+                cached_values[gwjob.label],
+                generic_workflow,
+                gwjob,
+                out_prefix,
+            )
+        elif gwjob.node_type == GenericWorkflowNodeType.NOOP:
+            gwjob = cast(GenericWorkflowNoopJob, gwjob)
+            htc_job = HTCJob(f"wms_{gwjob.name}", label=gwjob.label)
+            htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/noop.sub"
+            htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
+            htc_job.add_dag_cmds({"noop": True})
+        elif gwjob.node_type == GenericWorkflowNodeType.GROUP:
+            gwjob = cast(GenericWorkflowGroup, gwjob)
+            htc_job = _group_to_subdag(config, gwjob, out_prefix)
+            # In case DAGMAN_GENERATE_SUBDAG_SUBMITS is False,
+            dag.graph["submit_options"]["do_recurse"] = True
+        else:
+            raise RuntimeError(f"Unsupported generic workflow node type {gwjob.node_type} ({gwjob.name})")
+        _LOG.debug("Calling adding job %s %s", htc_job.name, htc_job.label)
+        dag.add_job(htc_job)
+
+    # Add job dependencies to the DAG (be careful with wms_ jobs)
+    for job_name in generic_workflow:
+        gwjob = generic_workflow.get_job(job_name)
+        parent_name = (
+            gwjob.name if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD else f"wms_{gwjob.name}"
+        )
+        successor_jobs = [generic_workflow.get_job(j) for j in generic_workflow.successors(job_name)]
+        children_names = []
+        if gwjob.node_type == GenericWorkflowNodeType.GROUP:
+            gwjob = cast(GenericWorkflowGroup, gwjob)
+            group_children = []  # Dependencies between same group jobs
+            for sjob in successor_jobs:
+                if sjob.node_type == GenericWorkflowNodeType.GROUP and sjob.label == gwjob.label:
+                    group_children.append(f"wms_{sjob.name}")
+                elif sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
+                    children_names.append(sjob.name)
+                else:
+                    children_names.append(f"wms_{sjob.name}")
+            if group_children:
+                dag.add_job_relationships([parent_name], group_children)
+            if not gwjob.blocking:
+                # Since subdag will always succeed, need to add a special
+                # job that fails if group failed to block payload children.
+                check_job = _create_check_job(f"wms_{gwjob.name}", gwjob.label)
+                dag.add_job(check_job)
+                dag.add_job_relationships([f"wms_{gwjob.name}"], [check_job.name])
+                parent_name = check_job.name
+        else:
+            for sjob in successor_jobs:
+                if sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
+                    children_names.append(sjob.name)
+                else:
+                    children_names.append(f"wms_{sjob.name}")
+
+        dag.add_job_relationships([parent_name], children_names)
+
+    # If final job exists in generic workflow, create DAG final job
+    final = generic_workflow.get_final()
+    if final and isinstance(final, GenericWorkflowJob):
+        if final.compute_site and final.compute_site not in site_values:
+            site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
+        if final.label not in cached_values:
+            cached_values[final.label] = deepcopy(site_values[final.compute_site])
+            cached_values[final.label].update(_gather_label_values(config, final.label))
+        final_htjob = _create_job(
+            subdir_template[final.label],
+            cached_values[final.label],
+            generic_workflow,
+            final,
+            out_prefix,
+        )
+        if "post" not in final_htjob.dagcmds:
+            final_htjob.dagcmds["post"] = {
+                "defer": "",
+                "executable": f"{os.path.dirname(__file__)}/final_post.sh",
+                "arguments": f"{final.name} $DAG_STATUS $RETURN",
+            }
+        dag.add_final_job(final_htjob)
+    elif final and isinstance(final, GenericWorkflow):
+        raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
+    elif final:
+        raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
+
+    return dag
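
A structural note on the builder above: payload nodes keep their generic workflow names, while NOOP, GROUP, and check-status nodes get a "wms_" prefix, which is what the dependency pass keys on. For a non-blocking group G with a payload child C, the relationships registered reduce to the following (a sketch derived from the code above):

    # Parent/child edges added via dag.add_job_relationships(...):
    edges = [
        (["wms_G"], ["wms_check_status_wms_G"]),  # subdag -> its check job
        (["wms_check_status_wms_G"], ["C"]),      # check job gates payload child
    ]
    # The check job runs check_group_status.sub inside subdags/wms_G; since the
    # non-blocking subdag itself always "succeeds", the check job is what fails
    # when the group actually had failed payload jobs.
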