lsst-ctrl-bps-htcondor 29.2025.1300__py3-none-any.whl → 29.2025.1500__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,7 +36,7 @@ import re
36
36
  from collections import defaultdict
37
37
  from enum import IntEnum, auto
38
38
  from pathlib import Path
39
- from typing import Any
39
+ from typing import Any, cast
40
40
 
41
41
  import htcondor
42
42
  from packaging import version
@@ -44,8 +44,12 @@ from packaging import version
44
44
  from lsst.ctrl.bps import (
45
45
  BaseWmsService,
46
46
  BaseWmsWorkflow,
47
+ BpsConfig,
47
48
  GenericWorkflow,
49
+ GenericWorkflowGroup,
48
50
  GenericWorkflowJob,
51
+ GenericWorkflowNodeType,
52
+ GenericWorkflowNoopJob,
49
53
  WmsJobReport,
50
54
  WmsRunReport,
51
55
  WmsSpecificInfo,
@@ -60,8 +64,9 @@ from .lssthtc import (
60
64
  MISSING_ID,
61
65
  HTCDag,
62
66
  HTCJob,
63
- JobStatus,
64
67
  NodeStatus,
68
+ WmsNodeType,
69
+ _update_rescue_file,
65
70
  condor_history,
66
71
  condor_q,
67
72
  condor_search,
@@ -175,17 +180,23 @@ class HTCondorService(BaseWmsService):
175
180
  Keyword arguments for the options.
176
181
  """
177
182
  dag = workflow.dag
178
-
179
183
  ver = version.parse(htc_version())
180
- if ver >= version.parse("8.9.3"):
181
- sub = htc_create_submit_from_dag(dag.graph["dag_filename"], {})
182
- else:
183
- sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], {})
184
184
 
185
185
  # For workflow portability, internal paths are all relative. Hence
186
186
  # the DAG needs to be submitted to HTCondor from inside the submit
187
187
  # directory.
188
188
  with chdir(workflow.submit_path):
189
+ try:
190
+ if ver >= version.parse("8.9.3"):
191
+ sub = htc_create_submit_from_dag(dag.graph["dag_filename"], dag.graph["submit_options"])
192
+ else:
193
+ sub = htc_create_submit_from_cmd(dag.graph["dag_filename"], dag.graph["submit_options"])
194
+ except Exception:
195
+ _LOG.error(
196
+ "Problems creating HTCondor submit object from filename: %s", dag.graph["dag_filename"]
197
+ )
198
+ raise
199
+
189
200
  _LOG.info("Submitting from directory: %s", os.getcwd())
190
201
  schedd_dag_info = htc_submit_dag(sub)
191
202
  if schedd_dag_info:
@@ -226,7 +237,7 @@ class HTCondorService(BaseWmsService):
226
237
  None,
227
238
  (
228
239
  f"workflow with run id '{wms_workflow_id}' not found. "
229
- f"Hint: use run's submit directory as the id instead"
240
+ "Hint: use run's submit directory as the id instead"
230
241
  ),
231
242
  )
232
243
 
@@ -266,7 +277,9 @@ class HTCondorService(BaseWmsService):
266
277
  )
267
278
 
268
279
  _LOG.info("Backing up select HTCondor files from previous run attempt")
269
- htc_backup_files(wms_path, subdir="backups")
280
+ rescue_file = htc_backup_files(wms_path, subdir="backups")
281
+ if (wms_path / "subdags").exists():
282
+ _update_rescue_file(rescue_file)
270
283
 
271
284
  # For workflow portability, internal paths are all relative. Hence
272
285
  # the DAG needs to be resubmitted to HTCondor from inside the submit
@@ -563,66 +576,17 @@ class HTCondorWorkflow(BaseWmsWorkflow):
563
576
  def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
564
577
  # Docstring inherited
565
578
  htc_workflow = cls(generic_workflow.name, config)
566
- htc_workflow.dag = HTCDag(name=generic_workflow.name)
579
+ htc_workflow.dag = _generic_workflow_to_htcondor_dag(config, generic_workflow, out_prefix)
567
580
 
568
581
  _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
569
- htc_workflow.dag.add_attribs(generic_workflow.run_attrs)
582
+ # Add extra attributes to top most DAG.
570
583
  htc_workflow.dag.add_attribs(
571
584
  {
572
585
  "bps_wms_service": service_class,
573
586
  "bps_wms_workflow": f"{cls.__module__}.{cls.__name__}",
574
- "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
575
- "bps_job_summary": create_count_summary(generic_workflow.job_counts),
576
587
  }
577
588
  )
578
589
 
579
- _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
580
- if isinstance(tmp_template, str):
581
- subdir_template = defaultdict(lambda: tmp_template)
582
- else:
583
- subdir_template = tmp_template
584
-
585
- # Create all DAG jobs
586
- site_values = {} # cache compute site specific values to reduce config lookups
587
- for job_name in generic_workflow:
588
- gwjob = generic_workflow.get_job(job_name)
589
- if gwjob.compute_site not in site_values:
590
- site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
591
- htc_job = _create_job(
592
- subdir_template[gwjob.label],
593
- site_values[gwjob.compute_site],
594
- generic_workflow,
595
- gwjob,
596
- out_prefix,
597
- )
598
- htc_workflow.dag.add_job(htc_job)
599
-
600
- # Add job dependencies to the DAG
601
- for job_name in generic_workflow:
602
- htc_workflow.dag.add_job_relationships([job_name], generic_workflow.successors(job_name))
603
-
604
- # If final job exists in generic workflow, create DAG final job
605
- final = generic_workflow.get_final()
606
- if final and isinstance(final, GenericWorkflowJob):
607
- if final.compute_site and final.compute_site not in site_values:
608
- site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
609
- final_htjob = _create_job(
610
- subdir_template[final.label],
611
- site_values[final.compute_site],
612
- generic_workflow,
613
- final,
614
- out_prefix,
615
- )
616
- if "post" not in final_htjob.dagcmds:
617
- final_htjob.dagcmds["post"] = (
618
- f"{os.path.dirname(__file__)}/final_post.sh {final.name} $DAG_STATUS $RETURN"
619
- )
620
- htc_workflow.dag.add_final_job(final_htjob)
621
- elif final and isinstance(final, GenericWorkflow):
622
- raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
623
- elif final:
624
- return TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
625
-
626
590
  return htc_workflow
627
591
 
628
592
  def write(self, out_prefix):
@@ -637,7 +601,7 @@ class HTCondorWorkflow(BaseWmsWorkflow):
637
601
  os.makedirs(out_prefix, exist_ok=True)
638
602
 
639
603
  # Write down the workflow in HTCondor format.
640
- self.dag.write(out_prefix, "jobs/{self.label}")
604
+ self.dag.write(out_prefix, job_subdir="jobs/{self.label}")
641
605
 
642
606
 
643
607
  def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefix):
@@ -668,8 +632,10 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
668
632
  if gwjob.tags:
669
633
  curvals.update(gwjob.tags)
670
634
 
671
- subdir = subdir_template.format_map(curvals)
672
- htc_job.subfile = Path("jobs") / subdir / f"{gwjob.name}.sub"
635
+ subdir = Path("jobs") / subdir_template.format_map(curvals)
636
+ htc_job.subdir = subdir
637
+ htc_job.subfile = f"{gwjob.name}.sub"
638
+ htc_job.add_dag_cmds({"dir": subdir})
673
639
 
674
640
  htc_job_cmds = {
675
641
  "universe": "vanilla",
@@ -681,8 +647,10 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
681
647
  # Exceeding memory sometimes triggering SIGBUS or SIGSEGV error. Tell
682
648
  # htcondor to put on hold any jobs which exited by a signal.
683
649
  "on_exit_hold": "ExitBySignal == true",
684
- "on_exit_hold_reason": 'strcat("Job raised a signal ", string(ExitSignal), ". ", '
685
- '"Handling signal as if job has gone over memory limit.")',
650
+ "on_exit_hold_reason": (
651
+ 'strcat("Job raised a signal ", string(ExitSignal), ". ", '
652
+ '"Handling signal as if job has gone over memory limit.")'
653
+ ),
686
654
  "on_exit_hold_subcode": "34",
687
655
  }
688
656
 
@@ -690,7 +658,7 @@ def _create_job(subdir_template, site_values, generic_workflow, gwjob, out_prefi
690
658
 
691
659
  # job stdout, stderr, htcondor user log.
692
660
  for key in ("output", "error", "log"):
693
- htc_job_cmds[key] = htc_job.subfile.with_suffix(f".$(Cluster).{key[:3]}")
661
+ htc_job_cmds[key] = f"{gwjob.name}.$(Cluster).{key[:3]}"
694
662
  _LOG.debug("HTCondor %s = %s", key, htc_job_cmds[key])
695
663
 
696
664
  htc_job_cmds.update(
@@ -817,7 +785,7 @@ def _translate_job_cmds(cached_vals, generic_workflow, gwjob):
817
785
  # Handle command line
818
786
  if gwjob.executable.transfer_executable:
819
787
  jobcmds["transfer_executable"] = "True"
820
- jobcmds["executable"] = os.path.basename(gwjob.executable.src_uri)
788
+ jobcmds["executable"] = gwjob.executable.src_uri
821
789
  else:
822
790
  jobcmds["executable"] = _fix_env_var_syntax(gwjob.executable.src_uri)
823
791
 
@@ -974,7 +942,7 @@ def _replace_cmd_vars(arguments, gwjob):
974
942
  replacements = gwjob.cmdvals if gwjob.cmdvals is not None else {}
975
943
  try:
976
944
  arguments = arguments.format(**replacements)
977
- except KeyError as exc:
945
+ except (KeyError, TypeError) as exc: # TypeError in case None instead of {}
978
946
  _LOG.error("Could not replace command variables: replacement for %s not provided", str(exc))
979
947
  _LOG.debug("arguments: %s\ncmdvals: %s", arguments, replacements)
980
948
  raise
@@ -1200,12 +1168,12 @@ def _get_info_from_schedd(
1200
1168
  return schedd_dag_info
1201
1169
 
1202
1170
 
1203
- def _get_info_from_path(wms_path: str) -> tuple[str, dict[str, dict[str, Any]], str]:
1171
+ def _get_info_from_path(wms_path: str | os.PathLike) -> tuple[str, dict[str, dict[str, Any]], str]:
1204
1172
  """Gather run information from a given run directory.
1205
1173
 
1206
1174
  Parameters
1207
1175
  ----------
1208
- wms_path : `str`
1176
+ wms_path : `str` or `os.PathLike`
1209
1177
  Directory containing HTCondor files.
1210
1178
 
1211
1179
  Returns
@@ -1263,9 +1231,9 @@ def _get_info_from_path(wms_path: str) -> tuple[str, dict[str, dict[str, Any]],
1263
1231
  schedd_name = next(iter(job_info))
1264
1232
  job_ad = next(iter(job_info[schedd_name].values()))
1265
1233
  job.update(job_ad)
1266
- except FileNotFoundError:
1267
- message = f"Could not find HTCondor files in '{wms_path}'"
1268
- _LOG.warning(message)
1234
+ except FileNotFoundError as err:
1235
+ message = f"Could not find HTCondor files in '{wms_path}' ({err})"
1236
+ _LOG.debug(message)
1269
1237
  messages.append(message)
1270
1238
  message = htc_check_dagman_output(wms_path)
1271
1239
  if message:
@@ -1298,8 +1266,9 @@ def _create_detailed_report_from_jobs(
1298
1266
  id and the value is a collection of report information for that run.
1299
1267
  """
1300
1268
  _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
1301
- dag_ad = jobs.pop(wms_workflow_id)
1302
- total_jobs, state_counts = _get_state_counts_from_dag_job(dag_ad)
1269
+
1270
+ dag_ad = jobs[wms_workflow_id]
1271
+
1303
1272
  report = WmsRunReport(
1304
1273
  wms_id=f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}",
1305
1274
  global_wms_id=dag_ad.get("GlobalJobId", "MISS"),
@@ -1312,28 +1281,34 @@ def _create_detailed_report_from_jobs(
1312
1281
  operator=_get_owner(dag_ad),
1313
1282
  run_summary=_get_run_summary(dag_ad),
1314
1283
  state=_htc_status_to_wms_state(dag_ad),
1284
+ total_number_jobs=0,
1315
1285
  jobs=[],
1316
- total_number_jobs=dag_ad.get("total_jobs", total_jobs),
1317
- job_state_counts=dag_ad.get("state_counts", state_counts),
1318
- exit_code_summary=_get_exit_code_summary(jobs),
1286
+ job_state_counts=dict.fromkeys(WmsStates, 0),
1287
+ exit_code_summary={},
1319
1288
  )
1289
+
1290
+ payload_jobs = {} # keep track for later processing
1320
1291
  specific_info = WmsSpecificInfo()
1321
1292
  for job_id, job_ad in jobs.items():
1322
- if not is_service_job(job_ad):
1293
+ if job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [WmsNodeType.PAYLOAD, WmsNodeType.FINAL]:
1323
1294
  try:
1295
+ name = job_ad.get("DAGNodeName", job_id)
1296
+ wms_state = _htc_status_to_wms_state(job_ad)
1324
1297
  job_report = WmsJobReport(
1325
1298
  wms_id=job_id,
1326
- name=job_ad.get("DAGNodeName", job_id),
1327
- label=job_ad.get("bps_job_label", pegasus_name_to_label(job_ad["DAGNodeName"])),
1328
- state=_htc_status_to_wms_state(job_ad),
1299
+ name=name,
1300
+ label=job_ad.get("bps_job_label", pegasus_name_to_label(name)),
1301
+ state=wms_state,
1329
1302
  )
1330
1303
  if job_report.label == "init":
1331
1304
  job_report.label = "pipetaskInit"
1305
+ report.job_state_counts[wms_state] += 1
1332
1306
  report.jobs.append(job_report)
1307
+ payload_jobs[job_id] = job_ad
1333
1308
  except KeyError as ex:
1334
1309
  _LOG.error("Job missing key '%s': %s", str(ex), job_ad)
1335
1310
  raise
1336
- else:
1311
+ elif is_service_job(job_ad):
1337
1312
  _LOG.debug(
1338
1313
  "Found service job: id='%s', name='%s', label='%s', NodeStatus='%s', JobStatus='%s'",
1339
1314
  job_id,
@@ -1344,13 +1319,11 @@ def _create_detailed_report_from_jobs(
1344
1319
  )
1345
1320
  _add_service_job_specific_info(job_ad, specific_info)
1346
1321
 
1322
+ report.total_number_jobs = len(payload_jobs)
1323
+ report.exit_code_summary = _get_exit_code_summary(payload_jobs)
1347
1324
  if specific_info:
1348
1325
  report.specific_info = specific_info
1349
1326
 
1350
- # Add the removed entry to restore the original content of the dictionary.
1351
- # The ordering of keys will be change permanently though.
1352
- jobs.update({wms_workflow_id: dag_ad})
1353
-
1354
1327
  # Workflow will exit with non-zero DAG_STATUS if problem with
1355
1328
  # any of the wms jobs. So change FAILED to SUCCEEDED if all
1356
1329
  # payload jobs SUCCEEDED.
@@ -1450,6 +1423,7 @@ def _summary_report(user, hist, pass_thru, schedds=None):
1450
1423
 
1451
1424
  # Have list of DAGMan jobs, need to get run_report info.
1452
1425
  run_reports = {}
1426
+ msg = ""
1453
1427
  for jobs in job_info.values():
1454
1428
  for job_id, job in jobs.items():
1455
1429
  total_jobs, state_counts = _get_state_counts_from_dag_job(job)
@@ -1482,7 +1456,7 @@ def _summary_report(user, hist, pass_thru, schedds=None):
1482
1456
  )
1483
1457
  run_reports[report.global_wms_id] = report
1484
1458
 
1485
- return run_reports, ""
1459
+ return run_reports, msg
1486
1460
 
1487
1461
 
1488
1462
  def _add_run_info(wms_path, job):
@@ -1596,14 +1570,14 @@ def _get_exit_code_summary(jobs):
1596
1570
  exit_code = 0
1597
1571
  job_status = job_ad["JobStatus"]
1598
1572
  match job_status:
1599
- case JobStatus.COMPLETED | JobStatus.HELD:
1573
+ case htcondor.JobStatus.COMPLETED | htcondor.JobStatus.HELD:
1600
1574
  exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["ExitCode"]
1601
1575
  case (
1602
- JobStatus.IDLE
1603
- | JobStatus.RUNNING
1604
- | JobStatus.REMOVED
1605
- | JobStatus.TRANSFERRING_OUTPUT
1606
- | JobStatus.SUSPENDED
1576
+ htcondor.JobStatus.IDLE
1577
+ | htcondor.JobStatus.RUNNING
1578
+ | htcondor.JobStatus.REMOVED
1579
+ | htcondor.JobStatus.TRANSFERRING_OUTPUT
1580
+ | htcondor.JobStatus.SUSPENDED
1607
1581
  ):
1608
1582
  pass
1609
1583
  case _:
@@ -1639,16 +1613,13 @@ def _get_state_counts_from_jobs(
1639
1613
  """
1640
1614
  state_counts = dict.fromkeys(WmsStates, 0)
1641
1615
  for job_id, job_ad in jobs.items():
1642
- if job_id != wms_workflow_id and not is_service_job(job_ad):
1616
+ if job_id != wms_workflow_id and job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [
1617
+ WmsNodeType.PAYLOAD,
1618
+ WmsNodeType.FINAL,
1619
+ ]:
1643
1620
  state_counts[_htc_status_to_wms_state(job_ad)] += 1
1644
- total_counted = sum(state_counts.values())
1645
-
1646
- if "NodesTotal" in jobs[wms_workflow_id]:
1647
- total_count = jobs[wms_workflow_id]["NodesTotal"]
1648
- else:
1649
- total_count = total_counted
1621
+ total_count = sum(state_counts.values())
1650
1622
 
1651
- state_counts[WmsStates.UNREADY] += total_count - total_counted
1652
1623
  return total_count, state_counts
1653
1624
 
1654
1625
 
@@ -1746,27 +1717,28 @@ def _htc_job_status_to_wms_state(job):
1746
1717
  _LOG.debug(
1747
1718
  "htc_job_status_to_wms_state: %s=%s, %s", job["ClusterId"], job["JobStatus"], type(job["JobStatus"])
1748
1719
  )
1749
- job_status = int(job["JobStatus"])
1750
1720
  wms_state = WmsStates.MISFIT
1751
-
1752
- _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
1753
- if job_status == JobStatus.IDLE:
1754
- wms_state = WmsStates.PENDING
1755
- elif job_status == JobStatus.RUNNING:
1756
- wms_state = WmsStates.RUNNING
1757
- elif job_status == JobStatus.REMOVED:
1758
- wms_state = WmsStates.DELETED
1759
- elif job_status == JobStatus.COMPLETED:
1760
- if (
1761
- (job.get("ExitBySignal", False) and job.get("ExitSignal", 0))
1762
- or job.get("ExitCode", 0)
1763
- or job.get("DAG_Status", 0)
1764
- ):
1765
- wms_state = WmsStates.FAILED
1766
- else:
1767
- wms_state = WmsStates.SUCCEEDED
1768
- elif job_status == JobStatus.HELD:
1769
- wms_state = WmsStates.HELD
1721
+ if "JobStatus" in job and job["JobStatus"]:
1722
+ job_status = int(job["JobStatus"])
1723
+
1724
+ _LOG.debug("htc_job_status_to_wms_state: job_status = %s", job_status)
1725
+ if job_status == htcondor.JobStatus.IDLE:
1726
+ wms_state = WmsStates.PENDING
1727
+ elif job_status == htcondor.JobStatus.RUNNING:
1728
+ wms_state = WmsStates.RUNNING
1729
+ elif job_status == htcondor.JobStatus.REMOVED:
1730
+ wms_state = WmsStates.DELETED
1731
+ elif job_status == htcondor.JobStatus.COMPLETED:
1732
+ if (
1733
+ (job.get("ExitBySignal", False) and job.get("ExitSignal", 0))
1734
+ or job.get("ExitCode", 0)
1735
+ or job.get("DAG_Status", 0)
1736
+ ):
1737
+ wms_state = WmsStates.FAILED
1738
+ else:
1739
+ wms_state = WmsStates.SUCCEEDED
1740
+ elif job_status == htcondor.JobStatus.HELD:
1741
+ wms_state = WmsStates.HELD
1770
1742
 
1771
1743
  return wms_state
1772
1744
 
@@ -2212,6 +2184,186 @@ def is_service_job(job_ad: dict[str, Any]) -> bool:
2212
2184
  -----
2213
2185
  At the moment, HTCondor does not provide a native way to distinguish
2214
2186
  between payload and service jobs in the workflow. This code depends
2215
- on read_node_status adding bps_job_type.
2187
+ on read_node_status adding wms_node_type.
2188
+ """
2189
+ return job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) == WmsNodeType.SERVICE
2190
+
2191
+
2192
+ def _group_to_subdag(
2193
+ config: BpsConfig, generic_workflow_group: GenericWorkflowGroup, out_prefix: str
2194
+ ) -> HTCJob:
2195
+ """Convert a generic workflow group to an HTCondor dag.
2196
+
2197
+ Parameters
2198
+ ----------
2199
+ config : `lsst.ctrl.bps.BpsConfig`
2200
+ Workflow configuration.
2201
+ generic_workflow_group : `lsst.ctrl.bps.GenericWorkflowGroup`
2202
+ The generic workflow group to convert.
2203
+ out_prefix : `str`
2204
+ Location prefix to be used when creating jobs.
2205
+
2206
+ Returns
2207
+ -------
2208
+ htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
2209
+ Job for running the HTCondor dag.
2216
2210
  """
2217
- return job_ad.get("bps_job_type", "MISSING") == "service"
2211
+ jobname = f"wms_{generic_workflow_group.name}"
2212
+ htc_job = HTCJob(name=jobname, label=generic_workflow_group.label)
2213
+ htc_job.add_dag_cmds({"dir": f"subdags/{jobname}"})
2214
+ htc_job.subdag = _generic_workflow_to_htcondor_dag(config, generic_workflow_group, out_prefix)
2215
+ if not generic_workflow_group.blocking:
2216
+ htc_job.dagcmds["post"] = {
2217
+ "defer": "",
2218
+ "executable": f"{os.path.dirname(__file__)}/subdag_post.sh",
2219
+ "arguments": f"{jobname} $RETURN",
2220
+ }
2221
+ return htc_job
2222
+
2223
+
2224
+ def _create_check_job(group_job_name: str, job_label: str) -> HTCJob:
2225
+ """Create a job to check status of a group job.
2226
+
2227
+ Parameters
2228
+ ----------
2229
+ group_job_name : `str`
2230
+ Name of the group job.
2231
+ job_label : `str`
2232
+ Label to use for the check status job.
2233
+
2234
+ Returns
2235
+ -------
2236
+ htc_job : `lsst.ctrl.bps.htcondor.HTCJob`
2237
+ Job description for the job to check group job status.
2238
+ """
2239
+ htc_job = HTCJob(name=f"wms_check_status_{group_job_name}", label=job_label)
2240
+ htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/check_group_status.sub"
2241
+ htc_job.add_dag_cmds({"dir": f"subdags/{group_job_name}", "vars": {"group_job_name": group_job_name}})
2242
+
2243
+ return htc_job
2244
+
2245
+
2246
+ def _generic_workflow_to_htcondor_dag(
2247
+ config: BpsConfig, generic_workflow: GenericWorkflow, out_prefix: str
2248
+ ) -> HTCDag:
2249
+ """Convert a GenericWorkflow to a HTCDag.
2250
+
2251
+ Parameters
2252
+ ----------
2253
+ config : `lsst.ctrl.bps.BpsConfig`
2254
+ Workflow configuration.
2255
+ generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
2256
+ The GenericWorkflow to convert.
2257
+ out_prefix : `str`
2258
+ Location prefix where the HTCondor files will be written.
2259
+
2260
+ Returns
2261
+ -------
2262
+ dag : `lsst.ctrl.bps.htcondor.HTCDag`
2263
+ The HTCDag representation of the given GenericWorkflow.
2264
+ """
2265
+ dag = HTCDag(name=generic_workflow.name)
2266
+
2267
+ _LOG.debug("htcondor dag attribs %s", generic_workflow.run_attrs)
2268
+ dag.add_attribs(generic_workflow.run_attrs)
2269
+ dag.add_attribs(
2270
+ {
2271
+ "bps_run_quanta": create_count_summary(generic_workflow.quanta_counts),
2272
+ "bps_job_summary": create_count_summary(generic_workflow.job_counts),
2273
+ }
2274
+ )
2275
+
2276
+ _, tmp_template = config.search("subDirTemplate", opt={"replaceVars": False, "default": ""})
2277
+ if isinstance(tmp_template, str):
2278
+ subdir_template = defaultdict(lambda: tmp_template)
2279
+ else:
2280
+ subdir_template = tmp_template
2281
+
2282
+ # Create all DAG jobs
2283
+ site_values = {} # cache compute site specific values to reduce config lookups
2284
+ for job_name in generic_workflow:
2285
+ gwjob = generic_workflow.get_job(job_name)
2286
+ if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD:
2287
+ gwjob = cast(GenericWorkflowJob, gwjob)
2288
+ if gwjob.compute_site not in site_values:
2289
+ site_values[gwjob.compute_site] = _gather_site_values(config, gwjob.compute_site)
2290
+ htc_job = _create_job(
2291
+ subdir_template[gwjob.label],
2292
+ site_values[gwjob.compute_site],
2293
+ generic_workflow,
2294
+ gwjob,
2295
+ out_prefix,
2296
+ )
2297
+ elif gwjob.node_type == GenericWorkflowNodeType.NOOP:
2298
+ gwjob = cast(GenericWorkflowNoopJob, gwjob)
2299
+ htc_job = HTCJob(f"wms_{gwjob.name}", label=gwjob.label)
2300
+ htc_job.subfile = "${CTRL_BPS_HTCONDOR_DIR}/python/lsst/ctrl/bps/htcondor/noop.sub"
2301
+ htc_job.add_job_attrs({"bps_job_name": gwjob.name, "bps_job_label": gwjob.label})
2302
+ htc_job.add_dag_cmds({"noop": True})
2303
+ elif gwjob.node_type == GenericWorkflowNodeType.GROUP:
2304
+ gwjob = cast(GenericWorkflowGroup, gwjob)
2305
+ htc_job = _group_to_subdag(config, gwjob, out_prefix)
2306
+ # In case DAGMAN_GENERATE_SUBDAG_SUBMITS is False,
2307
+ dag.graph["submit_options"]["do_recurse"] = True
2308
+ else:
2309
+ raise RuntimeError(f"Unsupported generic workflow node type {gwjob.node_type} ({gwjob.name})")
2310
+ _LOG.debug("Calling adding job %s %s", htc_job.name, htc_job.label)
2311
+ dag.add_job(htc_job)
2312
+
2313
+ # Add job dependencies to the DAG (be careful with wms_ jobs)
2314
+ for job_name in generic_workflow:
2315
+ gwjob = generic_workflow.get_job(job_name)
2316
+ parent_name = (
2317
+ gwjob.name if gwjob.node_type == GenericWorkflowNodeType.PAYLOAD else f"wms_{gwjob.name}"
2318
+ )
2319
+ successor_jobs = [generic_workflow.get_job(j) for j in generic_workflow.successors(job_name)]
2320
+ children_names = []
2321
+ if gwjob.node_type == GenericWorkflowNodeType.GROUP:
2322
+ gwjob = cast(GenericWorkflowGroup, gwjob)
2323
+ group_children = [] # Dependencies between same group jobs
2324
+ for sjob in successor_jobs:
2325
+ if sjob.node_type == GenericWorkflowNodeType.GROUP and sjob.label == gwjob.label:
2326
+ group_children.append(f"wms_{sjob.name}")
2327
+ elif sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
2328
+ children_names.append(sjob.name)
2329
+ else:
2330
+ children_names.append(f"wms_{sjob.name}")
2331
+ if group_children:
2332
+ dag.add_job_relationships([parent_name], group_children)
2333
+ if not gwjob.blocking:
2334
+ # Since subdag will always succeed, need to add a special
2335
+ # job that fails if group failed to block payload children.
2336
+ check_job = _create_check_job(f"wms_{gwjob.name}", gwjob.label)
2337
+ dag.add_job(check_job)
2338
+ dag.add_job_relationships([f"wms_{gwjob.name}"], [check_job.name])
2339
+ parent_name = check_job.name
2340
+ else:
2341
+ for sjob in successor_jobs:
2342
+ if sjob.node_type == GenericWorkflowNodeType.PAYLOAD:
2343
+ children_names.append(sjob.name)
2344
+ else:
2345
+ children_names.append(f"wms_{sjob.name}")
2346
+
2347
+ dag.add_job_relationships([parent_name], children_names)
2348
+
2349
+ # If final job exists in generic workflow, create DAG final job
2350
+ final = generic_workflow.get_final()
2351
+ if final and isinstance(final, GenericWorkflowJob):
2352
+ if final.compute_site and final.compute_site not in site_values:
2353
+ site_values[final.compute_site] = _gather_site_values(config, final.compute_site)
2354
+ final_htjob = _create_job(
2355
+ subdir_template[final.label], site_values[final.compute_site], generic_workflow, final, out_prefix
2356
+ )
2357
+ if "post" not in final_htjob.dagcmds:
2358
+ final_htjob.dagcmds["post"] = {
2359
+ "defer": "",
2360
+ "executable": f"{os.path.dirname(__file__)}/final_post.sh",
2361
+ "arguments": f"{final.name} $DAG_STATUS $RETURN",
2362
+ }
2363
+ dag.add_final_job(final_htjob)
2364
+ elif final and isinstance(final, GenericWorkflow):
2365
+ raise NotImplementedError("HTCondor plugin does not support a workflow as the final job")
2366
+ elif final:
2367
+ raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")
2368
+
2369
+ return dag