lsst-ctrl-bps-htcondor 28.2025.800.tar.gz → 29.0.0rc1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {lsst_ctrl_bps_htcondor-28.2025.800/python/lsst_ctrl_bps_htcondor.egg-info → lsst_ctrl_bps_htcondor-29.0.0rc1}/PKG-INFO +1 -1
  2. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/doc/lsst.ctrl.bps.htcondor/userguide.rst +26 -0
  3. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/pyproject.toml +9 -0
  4. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst/ctrl/bps/htcondor/htcondor_service.py +81 -32
  5. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst/ctrl/bps/htcondor/lssthtc.py +54 -45
  6. lsst_ctrl_bps_htcondor-29.0.0rc1/python/lsst/ctrl/bps/htcondor/version.py +2 -0
  7. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1/python/lsst_ctrl_bps_htcondor.egg-info}/PKG-INFO +1 -1
  8. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/tests/test_htcondor_service.py +494 -2
  9. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/tests/test_lssthtc.py +99 -6
  10. lsst_ctrl_bps_htcondor-28.2025.800/python/lsst/ctrl/bps/htcondor/version.py +0 -2
  11. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/COPYRIGHT +0 -0
  12. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/LICENSE +0 -0
  13. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/MANIFEST.in +0 -0
  14. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/README.rst +0 -0
  15. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/bsd_license.txt +0 -0
  16. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/doc/lsst.ctrl.bps.htcondor/CHANGES.rst +0 -0
  17. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/doc/lsst.ctrl.bps.htcondor/index.rst +0 -0
  18. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/gpl-v3.0.txt +0 -0
  19. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst/ctrl/bps/htcondor/__init__.py +0 -0
  20. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst/ctrl/bps/htcondor/etc/__init__.py +0 -0
  21. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst/ctrl/bps/htcondor/etc/htcondor_defaults.yaml +0 -0
  22. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst/ctrl/bps/htcondor/final_post.sh +0 -0
  23. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst/ctrl/bps/htcondor/handlers.py +0 -0
  24. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst/ctrl/bps/htcondor/htcondor_config.py +0 -0
  25. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst/ctrl/bps/htcondor/provisioner.py +0 -0
  26. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst_ctrl_bps_htcondor.egg-info/SOURCES.txt +0 -0
  27. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst_ctrl_bps_htcondor.egg-info/dependency_links.txt +0 -0
  28. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst_ctrl_bps_htcondor.egg-info/requires.txt +0 -0
  29. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst_ctrl_bps_htcondor.egg-info/top_level.txt +0 -0
  30. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/python/lsst_ctrl_bps_htcondor.egg-info/zip-safe +0 -0
  31. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/setup.cfg +0 -0
  32. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/tests/test_handlers.py +0 -0
  33. {lsst_ctrl_bps_htcondor-28.2025.800 → lsst_ctrl_bps_htcondor-29.0.0rc1}/tests/test_provisioner.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: lsst-ctrl-bps-htcondor
- Version: 28.2025.800
+ Version: 29.0.0rc1
  Summary: HTCondor plugin for lsst-ctrl-bps.
  Author-email: Rubin Observatory Data Management <dm-admin@lists.lsst.org>
  License: BSD 3-Clause License
@@ -148,6 +148,17 @@ from files. So, the detailed report can distinguish between failed and
  deleted jobs, and thus will show ``D`` in the flag column for a running
  workflow if there is a deleted job.

+ Rarely, a detailed report may warn about job submission issues. For example:
+
+ .. code-block:: bash
+
+ Warn: Job submission issues (last: 01/30/25 10:36:57)
+
+ A job submission issue could be intermittent or not. It may cause
+ problems with the status or counts in the reports. To get more information
+ about the submission issue, look in the ``*.dag.dagman.out`` file for
+ errors, in particular lines containing ``submit attempt failed``.
+
  Occasionally, some jobs are put on hold by HTCondor. To see the reason why
  jobs are being held, use

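The check described in the new userguide text can be scripted. A minimal sketch, not part of the package, that scans a run submit directory for the failed submit attempts mentioned above (the directory passed in is whatever was used at submission):

    from pathlib import Path

    def find_submit_failures(submit_dir: str) -> list[str]:
        """Return lines in *.dag.dagman.out files that mention a failed submit attempt."""
        hits: list[str] = []
        for dagman_out in Path(submit_dir).glob("*.dag.dagman.out"):
            with open(dagman_out) as fh:
                hits.extend(line.rstrip() for line in fh if "submit attempt failed" in line)
        return hits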
@@ -276,12 +287,27 @@ Look for the line starting with "Provisioning job status". For example
  calibrate 0 0 1 0 0 0 0 0 0 0 0 1
  finalJob 0 0 1 0 0 0 0 0 0 0 0 1

+ If the provisioning job status is UNREADY, check the end of the report to see
+ if there is a warning about submission issues. There may be a temporary problem.
+ Check the ``*.dag.dagman.out`` in run submit directory for errors, in
+ particular for ``ERROR: submit attempt failed``.
+
+ If the provisioning job status is HELD, the hold reason will appear in parentheses.
+
  The service job managing the glideins will be automatically canceled once the
  workflow is completed. However, the existing glideins will be left for
  HTCondor to shut them down once they remain inactive for the period specified
  by ``provisioningMaxIdleTime`` (default value: 15 min., see below) or maximum
  wall time is reached.

+ The provisioning job is expected to run as long as the workflow. If the job
+ dies, the job status will be `FAILED`. If the job just completed successfully,
+ the job status will be `SUCCEEDED` with a message saying it ended early (which
+ may or may not cause a problem since existing glideins could remain running).
+ To get more information about either of these cases, check the job output
+ and error files in the `jobs/provisioningJob` subdirectory.
+
+
  If the automatic provisioning of the resources is enabled, the script that the
  service job is supposed to run in order to provide the required resources *must
  be* defined by the ``provisioningScript`` setting in the ``provisioning``
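The new paragraphs point at the job output and error files under ``jobs/provisioningJob``. A small sketch, assuming only that submit-directory layout, for collecting those files for inspection:

    from pathlib import Path

    def provisioning_job_logs(submit_dir: str) -> dict[str, str]:
        """Read the provisioning job's .out/.err files, keyed by file name."""
        job_dir = Path(submit_dir) / "jobs" / "provisioningJob"
        return {p.name: p.read_text() for p in job_dir.glob("*") if p.suffix in {".out", ".err"}}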
@@ -113,6 +113,15 @@ convention = "numpy"
  # not fit on one line.
  add-ignore = ["D107", "D105", "D102", "D100", "D200", "D205", "D400", "D104"]

+ [tool.coverage.report]
+ exclude_lines = [
+ "pragma: no cover",
+ "raise AssertionError",
+ "raise NotImplementedError",
+ "if __name__ == .__main__.:",
+ "if TYPE_CHECKING:",
+ ]
+
  [tool.ruff]
  target-version = "py311"
  line-length = 110
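For context on the new ``[tool.coverage.report]`` table: coverage.py treats each ``exclude_lines`` entry as a regular expression and omits matching lines (and the blocks they introduce) from the coverage report. A hypothetical snippet showing code the added patterns would exclude:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:  # matches the "if TYPE_CHECKING:" pattern
        from collections.abc import Iterable

    def first(values):
        if not values:
            raise NotImplementedError  # matches the "raise NotImplementedError" pattern
        return values[0]

    if __name__ == "__main__":  # matches the 'if __name__ == .__main__.:' pattern
        print(first([1, 2, 3]))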
@@ -79,7 +79,7 @@ from .lssthtc import (
  read_dag_log,
  read_dag_status,
  read_node_status,
- summary_from_dag,
+ summarize_dag,
  write_dag_info,
  )
  from .provisioner import Provisioner
@@ -154,7 +154,7 @@ class HTCondorService(BaseWmsService):
  if enable_provisioning:
  provisioner = Provisioner(config)
  provisioner.configure()
- provisioner.prepare("provisioning_job.bash", prefix=out_prefix)
+ provisioner.prepare("provisioningJob.bash", prefix=out_prefix)
  provisioner.provision(workflow.dag)

  with time_this(
@@ -1317,9 +1317,9 @@ def _create_detailed_report_from_jobs(
  job_state_counts=dag_ad.get("state_counts", state_counts),
  exit_code_summary=_get_exit_code_summary(jobs),
  )
-
+ specific_info = WmsSpecificInfo()
  for job_id, job_ad in jobs.items():
- if not is_service_job(job_id):
+ if not is_service_job(job_ad):
  try:
  job_report = WmsJobReport(
  wms_id=job_id,
@@ -1334,33 +1334,85 @@ def _create_detailed_report_from_jobs(
  _LOG.error("Job missing key '%s': %s", str(ex), job_ad)
  raise
  else:
- job_label = job_ad.get("bps_job_label")
- if job_label is None:
- _LOG.warning("Service job with id '%s': missing label, no action taken", job_id)
- elif job_label == dag_ad.get("bps_provisioning_job", "MISS"):
- report.specific_info = WmsSpecificInfo()
- job_status = _htc_status_to_wms_state(job_ad)
- if job_status == WmsStates.DELETED:
- if "Reason" in job_ad and "Removed by DAGMan" in job_ad["Reason"]:
- job_status = WmsStates.SUCCEEDED
- report.specific_info.add_message(
- template="Provisioning job status: {status}",
- context={"status": job_status.name},
- )
- else:
- _LOG.warning(
- "Service job with id '%s' (label '%s'): no handler, no action taken", job_id, job_label
- )
+ _LOG.debug(
+ "Found service job: id='%s', name='%s', label='%s', NodeStatus='%s', JobStatus='%s'",
+ job_id,
+ job_ad["DAGNodeName"],
+ job_ad.get("bps_job_label", "MISS"),
+ job_ad.get("NodeStatus", "MISS"),
+ job_ad.get("JobStatus", "MISS"),
+ )
+ _add_service_job_specific_info(job_ad, specific_info)
+
+ if specific_info:
+ report.specific_info = specific_info

  # Add the removed entry to restore the original content of the dictionary.
  # The ordering of keys will be change permanently though.
  jobs.update({wms_workflow_id: dag_ad})

+ # Workflow will exit with non-zero DAG_STATUS if problem with
+ # any of the wms jobs. So change FAILED to SUCCEEDED if all
+ # payload jobs SUCCEEDED.
+ if report.total_number_jobs == report.job_state_counts[WmsStates.SUCCEEDED]:
+ report.state = WmsStates.SUCCEEDED
+
  run_reports = {report.wms_id: report}
  _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
  return run_reports


+ def _add_service_job_specific_info(job_ad: dict[str, Any], specific_info: WmsSpecificInfo) -> None:
+ """Generate report information for service job.
+
+ Parameters
+ ----------
+ job_ad : `dict` [`str`, `Any`]
+ Provisioning job information.
+ specific_info : `lsst.ctrl.bps.WmsSpecificInfo`
+ Where to add message.
+ """
+ status_details = ""
+ job_status = _htc_status_to_wms_state(job_ad)
+
+ # Service jobs in queue are deleted when DAG is done.
+ # To get accurate status, need to check other info.
+ if (
+ job_status == WmsStates.DELETED
+ and "Reason" in job_ad
+ and (
+ "Removed by DAGMan" in job_ad["Reason"]
+ or "removed because <OtherJobRemoveRequirements = DAGManJobId =?=" in job_ad["Reason"]
+ or "DAG is exiting and writing rescue file." in job_ad["Reason"]
+ )
+ ):
+ if "HoldReason" in job_ad:
+ # HoldReason exists even if released, so check.
+ if "job_released_time" in job_ad and job_ad["job_held_time"] < job_ad["job_released_time"]:
+ # If released, assume running until deleted.
+ job_status = WmsStates.SUCCEEDED
+ status_details = ""
+ else:
+ # If job held when deleted by DAGMan, still want to
+ # report hold reason
+ status_details = f"(Job was held for the following reason: {job_ad['HoldReason']})"
+
+ else:
+ job_status = WmsStates.SUCCEEDED
+ elif job_status == WmsStates.SUCCEEDED:
+ status_details = "(Note: Finished before workflow.)"
+ elif job_status == WmsStates.HELD:
+ status_details = f"({job_ad['HoldReason']})"
+
+ template = "Status of {job_name}: {status} {status_details}"
+ context = {
+ "job_name": job_ad["DAGNodeName"],
+ "status": job_status.name,
+ "status_details": status_details,
+ }
+ specific_info.add_message(template=template, context=context)
+
+
  def _summary_report(user, hist, pass_thru, schedds=None):
  """Gather run information to be used in generating summary reports.

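The call pattern for the new helper is the one exercised by the unit tests later in this diff; a condensed sketch mirroring the held-job case (testUserHeldWhileRunning), with a made-up job ad:

    from lsst.ctrl.bps import WmsSpecificInfo
    from lsst.ctrl.bps.htcondor.htcondor_service import JobStatus, _add_service_job_specific_info

    job_ad = {
        "ClusterId": 8523, "ProcId": 0, "DAGNodeName": "provisioningJob",
        "JobStatus": JobStatus.HELD, "HoldReason": "via condor_hold (by user mgower)",
        "HoldReasonCode": 1, "HoldReasonSubCode": 0,
    }
    info = WmsSpecificInfo()
    _add_service_job_specific_info(job_ad, info)
    print(info.context)  # expected per the unit test: {'job_name': 'provisioningJob', 'status': 'HELD',
                         #  'status_details': '(via condor_hold (by user mgower))'}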
@@ -1509,7 +1561,7 @@ def _get_run_summary(job):
  """
  summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
  if not summary:
- summary, _ = summary_from_dag(job["Iwd"])
+ summary, _, _ = summarize_dag(job["Iwd"])
  if not summary:
  _LOG.warning("Could not get run summary for htcondor job: %s", job)
  _LOG.debug("_get_run_summary: summary=%s", summary)
@@ -1587,7 +1639,7 @@ def _get_state_counts_from_jobs(
  """
  state_counts = dict.fromkeys(WmsStates, 0)
  for job_id, job_ad in jobs.items():
- if job_id != wms_workflow_id and not is_service_job(job_id):
+ if job_id != wms_workflow_id and not is_service_job(job_ad):
  state_counts[_htc_status_to_wms_state(job_ad)] += 1
  total_counted = sum(state_counts.values())

@@ -2143,13 +2195,13 @@ def _gather_site_values(config, compute_site):
  return site_values


- def is_service_job(job_id: str) -> bool:
+ def is_service_job(job_ad: dict[str, Any]) -> bool:
  """Determine if a job is a service one.

  Parameters
  ----------
- job_id : str
- HTCondor job id.
+ job_ad : `dict` [`str`, Any]
+ Information about an HTCondor job.

  Returns
  -------
@@ -2159,10 +2211,7 @@ def is_service_job(job_id: str) -> bool:
  Notes
  -----
  At the moment, HTCondor does not provide a native way to distinguish
- between payload and service jobs in the workflow. As a result, the current
- implementation depends entirely on the logic that is used in
- :py:func:`read_node_status()` (service jobs are given ids with ClusterId=0
- and ProcId=some integer). If it changes, this function needs to be
- updated too.
+ between payload and service jobs in the workflow. This code depends
+ on read_node_status adding bps_job_type.
  """
- return int(float(job_id)) == 0
+ return job_ad.get("bps_job_type", "MISSING") == "service"
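With the dictionary-based signature above, callers filter on the job ad itself rather than parsing ids. A minimal usage sketch with made-up job ads:

    from typing import Any

    def is_service_job(job_ad: dict[str, Any]) -> bool:  # mirrors the new implementation above
        return job_ad.get("bps_job_type", "MISSING") == "service"

    jobs = {
        "9230.0": {"DAGNodeName": "pipetaskInit", "bps_job_type": "payload"},
        "-1.0": {"DAGNodeName": "provisioningJob", "bps_job_type": "service"},
    }
    payload_only = {job_id: ad for job_id, ad in jobs.items() if not is_service_job(ad)}
    print(sorted(payload_only))  # ['9230.0']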
@@ -63,7 +63,8 @@ __all__ = [
  "read_dag_nodes_log",
  "read_dag_status",
  "read_node_status",
- "summary_from_dag",
+ "summarize_dag",
+ "update_job_info",
  "update_job_info",
  "write_dag_info",
  ]
@@ -1245,7 +1246,7 @@ def update_job_info(job_info, other_info):
  return job_info


- def summary_from_dag(dir_name):
+ def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, str]]:
  """Build bps_run_summary string from dag file.

  Parameters
@@ -1256,51 +1257,64 @@ def summary_from_dag(dir_name):
  Returns
  -------
  summary : `str`
- Semi-colon separated list of job labels and counts.
+ Semi-colon separated list of job labels and counts
  (Same format as saved in dag classad).
  job_name_to_label : `dict` [`str`, `str`]
  Mapping of job names to job labels.
+ job_name_to_type : `dict` [`str`, `str`]
+ Mapping of job names to job types
+ (e.g., payload, final, service).
  """
  # Later code depends upon insertion order
- counts = defaultdict(int)
+ counts: defaultdict[str, int] = defaultdict(int)  # counts of payload jobs per label
  job_name_to_label = {}
+ job_name_to_type = {}
  try:
  dag = next(Path(dir_name).glob("*.dag"))
  with open(dag) as fh:
  for line in fh:
+ job_name = ""
  if line.startswith("JOB"):
- m = re.match(r'JOB (\S+) "jobs/([^/]+)/', line)
+ m = re.match(r'JOB (\S+) "?jobs/([^/]+)/', line)
  if m:
+ job_name = m.group(1)
  label = m.group(2)
  if label == "init":
  label = "pipetaskInit"
- job_name_to_label[m.group(1)] = label
  counts[label] += 1
  else:  # Check if Pegasus submission
  m = re.match(r"JOB (\S+) (\S+)", line)
  if m:
+ job_name = m.group(1)
  label = pegasus_name_to_label(m.group(1))
- job_name_to_label[m.group(1)] = label
  counts[label] += 1
  else:
  _LOG.warning("Parse DAG: unmatched job line: %s", line)
+ job_type = "payload"
  elif line.startswith("FINAL"):
  m = re.match(r"FINAL (\S+) jobs/([^/]+)/", line)
  if m:
+ job_name = m.group(1)
  label = m.group(2)
- job_name_to_label[m.group(1)] = label
- counts[label] += 1
+ counts[label] += 1  # final counts a payload job.
+ job_type = "final"
  elif line.startswith("SERVICE"):
  m = re.match(r"SERVICE (\S+) jobs/([^/]+)/", line)
  if m:
+ job_name = m.group(1)
  label = m.group(2)
- job_name_to_label[m.group(1)] = label
+ job_type = "service"
+
+ if job_name:
+ job_name_to_label[job_name] = label
+ job_name_to_type[job_name] = job_type
+
  except (OSError, PermissionError, StopIteration):
  pass

  summary = ";".join([f"{name}:{counts[name]}" for name in counts])
- _LOG.debug("summary_from_dag: %s %s", summary, job_name_to_label)
- return summary, job_name_to_label
+ _LOG.debug("summarize_dag: %s %s %s", summary, job_name_to_label, job_name_to_type)
+ return summary, job_name_to_label, job_name_to_type


  def pegasus_name_to_label(name):
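As the new signature and docstring indicate, summarize_dag now returns a third mapping of job names to job types. A hedged usage sketch (the submit-directory path is a placeholder):

    from lsst.ctrl.bps.htcondor.lssthtc import summarize_dag

    summary, job_name_to_label, job_name_to_type = summarize_dag("/path/to/submit/run")
    print(summary)  # e.g. "pipetaskInit:1;label1:2;label2:2;finalJob:1"
    print([name for name, kind in job_name_to_type.items() if kind == "service"])  # e.g. ["provisioningJob"]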
@@ -1400,7 +1414,7 @@ def read_node_status(wms_path):
  file.
  """
  # Get jobid info from other places to fill in gaps in info from node_status
- _, job_name_to_label = summary_from_dag(wms_path)
+ _, job_name_to_label, job_name_to_type = summarize_dag(wms_path)
  wms_workflow_id, loginfo = read_dag_log(wms_path)
  loginfo = read_dag_nodes_log(wms_path)
  _LOG.debug("loginfo = %s", loginfo)
@@ -1409,17 +1423,17 @@ def read_node_status(wms_path):
  if "LogNotes" in job_info:
  m = re.match(r"DAG Node: (\S+)", job_info["LogNotes"])
  if m:
- job_name_to_id[m.group(1)] = job_id
- job_info["DAGNodeName"] = m.group(1)
+ job_name = m.group(1)
+ job_name_to_id[job_name] = job_id
+ job_info["DAGNodeName"] = job_name
+ job_info["bps_job_type"] = job_name_to_type[job_name]
+ job_info["bps_job_label"] = job_name_to_label[job_name]

+ jobs = loginfo
+ fake_id = -1.0  # For nodes that do not yet have a job id, give fake one
  try:
  node_status = next(Path(wms_path).glob("*.node_status"))
- except StopIteration:
- return loginfo

- jobs = {}
- fake_id = -1.0  # For nodes that do not yet have a job id, give fake one
- try:
  with open(node_status) as fh:
  for ad in classad.parseAds(fh):
  match ad["Type"]:
@@ -1438,22 +1452,19 @@ def read_node_status(wms_path):
  # Make job info as if came from condor_q.
  if job_name in job_name_to_id:
  job_id = str(job_name_to_id[job_name])
+ job = jobs[job_id]
  else:
  job_id = str(fake_id)
+ job_name_to_id[job_name] = job_id
+ job = dict(ad)
+ jobs[job_id] = job
  fake_id -= 1
- job = dict(ad)
  job["ClusterId"] = int(float(job_id))
  job["DAGManJobID"] = wms_workflow_id
  job["DAGNodeName"] = job_name
  job["bps_job_label"] = job_label
+ job["bps_job_type"] = job_name_to_type[job_name]

- # Include information retrieved from the event log
- # if available.
- jobs[job_id] = job
- try:
- jobs[job_id] |= loginfo[job_id]
- except KeyError:
- pass
  case "StatusEnd":
  # Skip node status file "epilog".
  pass
@@ -1463,24 +1474,22 @@ def read_node_status(wms_path):
  ad["Type"],
  wms_path,
  )
- except (OSError, PermissionError):
+ except (StopIteration, OSError, PermissionError):
  pass
- else:
- # Assume that the jobs found in the event log, but *not* in the node
- # status file are the service jobs as HTCondor does not include
- # information about these jobs in the node status file at the moment.
- #
- # Note: To be able to easily identify the service jobs downstream,
- # we reverse the ClusterId and ProcId in their HTCondor ids in internal
- # use. For example, if HTCondor id of a service job is '1.0', we will
- # use '0.1' instead.
- service_jobs = {job_id: loginfo[job_id] for job_id in set(loginfo) - set(jobs)}
- job_id_to_name = {
- job_id: job_name for job_name, job_id in job_name_to_id.items() if job_id in service_jobs
- }
- for job_id, job_info in service_jobs.items():
- job_info["bps_job_label"] = job_name_to_label[job_id_to_name[job_id]]
- jobs[f"{job_info['ProcId']}.{job_info['ClusterId']}"] = job_info
+
+ # Check for missing jobs (e.g., submission failure or not submitted yet)
+ # Use dag info to create job placeholders
+ for name in set(job_name_to_label) - set(job_name_to_id):
+ job = {}
+ job["ClusterId"] = int(float(fake_id))
+ job["ProcId"] = 0
+ job["DAGManJobID"] = wms_workflow_id
+ job["DAGNodeName"] = name
+ job["bps_job_label"] = job_name_to_label[name]
+ job["bps_job_type"] = job_name_to_type[name]
+ job["NodeStatus"] = NodeStatus.NOT_READY
+ jobs[f"{job['ClusterId']}.{job['ProcId']}"] = job
+ fake_id -= 1

  return jobs

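A consequence of the placeholder logic added above: DAG nodes that never reached the queue now appear in the returned mapping with a fake negative ClusterId and NodeStatus.NOT_READY. A small sketch, assuming the jobs dict returned by read_node_status, for listing them:

    def unsubmitted_nodes(jobs: dict[str, dict]) -> list[str]:
        """Names of DAG nodes that only have a placeholder entry (never submitted)."""
        return [
            ad["DAGNodeName"]
            for ad in jobs.values()
            if ad.get("ClusterId", 0) < 0  # placeholders are created with fake negative ids
        ]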
@@ -0,0 +1,2 @@
+ __all__ = ["__version__"]
+ __version__ = "29.0.0rc1"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: lsst-ctrl-bps-htcondor
- Version: 28.2025.800
+ Version: 29.0.0rc1
  Summary: HTCondor plugin for lsst-ctrl-bps.
  Author-email: Rubin Observatory Data Management <dm-admin@lists.lsst.org>
  License: BSD 3-Clause License
@@ -31,24 +31,28 @@ import logging
  import os
  import unittest
  from pathlib import Path
- from shutil import copy2
+ from shutil import copy2, copytree

  import htcondor

- from lsst.ctrl.bps import BpsConfig, GenericWorkflowExec, GenericWorkflowJob, WmsStates
+ from lsst.ctrl.bps import BpsConfig, GenericWorkflowExec, GenericWorkflowJob, WmsSpecificInfo, WmsStates
  from lsst.ctrl.bps.htcondor.htcondor_config import HTC_DEFAULTS_URI
  from lsst.ctrl.bps.htcondor.htcondor_service import (
  HTCondorService,
  JobStatus,
  NodeStatus,
  WmsIdType,
+ _add_service_job_specific_info,
+ _create_detailed_report_from_jobs,
  _get_exit_code_summary,
  _get_info_from_path,
+ _get_run_summary,
  _get_state_counts_from_dag_job,
  _htc_node_status_to_wms_state,
  _htc_status_to_wms_state,
  _translate_job_cmds,
  _wms_id_to_dir,
+ is_service_job,
  )
  from lsst.ctrl.bps.htcondor.lssthtc import MISSING_ID
  from lsst.utils.tests import temporaryDirectory
@@ -532,3 +536,491 @@ class WmsIdToDirTestCase(unittest.TestCase):
  self.assertEqual(id_type, WmsIdType.PATH)
  self.assertEqual(abs_path.resolve(), wms_path)
  os.chdir(orig_dir)
+
+
+ class AddServiceJobSpecificInfoTestCase(unittest.TestCase):
+ """Test _add_service_job_specific_info function.
+
+ Note: The job_ad's are hardcoded in these tests. The
+ values in the dictionaries come from plugin code as
+ well as HTCondor. Changes in either of those codes
+ that produce data for the job_ad can break this
+ function without breaking these unit tests.
+
+ Also, since hold status/messages stick around, testing
+ various cases with and without job being held just to
+ ensure get right status in both cases.
+ """
+
+ def testNotSubmitted(self):
+ # Service job not submitted yet or can't be submitted.
+ # (Typically an plugin bug.)
+ # At this function level, can't tell if not submitted
+ # yet or problem so it never will.
+ job_ad = {
+ "ClusterId": -64,
+ "DAGManJobID": "8997.0",
+ "DAGNodeName": "provisioningJob",
+ "NodeStatus": NodeStatus.NOT_READY,
+ "ProcId": 0,
+ "bps_job_label": "service_provisioningJob",
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context, {"job_name": "provisioningJob", "status": "UNREADY", "status_details": ""}
+ )
+
+ def testRunning(self):
+ # DAG hasn't completed (Running or held),
+ # Service job is running.
+ job_ad = {
+ "ClusterId": 8523,
+ "ProcId": 0,
+ "DAGNodeName": "provisioningJob",
+ "JobStatus": JobStatus.RUNNING,
+ }
+
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context, {"job_name": "provisioningJob", "status": "RUNNING", "status_details": ""}
+ )
+
+ def testDied(self):
+ # DAG hasn't completed (Running or held),
+ # Service job failed (completed non-zero exit code)
+ job_ad = {
+ "ClusterId": 8761,
+ "ProcId": 0,
+ "DAGNodeName": "provisioningJob",
+ "JobStatus": JobStatus.COMPLETED,
+ "ExitCode": 4,
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context, {"job_name": "provisioningJob", "status": "FAILED", "status_details": ""}
+ )
+
+ def testDeleted(self):
+ # Deleted by user (never held)
+ job_ad = {
+ "ClusterId": 9086,
+ "DAGNodeName": "provisioningJob",
+ "JobStatus": JobStatus.REMOVED,
+ "ProcId": 0,
+ "Reason": "via condor_rm (by user mgower)",
+ "job_evicted_time": "2025-02-11T11:35:04",
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context, {"job_name": "provisioningJob", "status": "DELETED", "status_details": ""}
+ )
+
+ def testSucceedEarly(self):
+ # DAG hasn't completed (Running or held),
+ # Service job completed with exit code 0
+ job_ad = {
+ "ClusterId": 8761,
+ "ProcId": 0,
+ "DAGNodeName": "provisioningJob",
+ "JobStatus": JobStatus.COMPLETED,
+ "ExitCode": 0,
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context,
+ {
+ "job_name": "provisioningJob",
+ "status": "SUCCEEDED",
+ "status_details": "(Note: Finished before workflow.)",
+ },
+ )
+
+ def testSucceedOldRemoveMessage(self):
+ # DAG completed, job was in running state when removed.
+ job_ad = {
+ "ClusterId": 8761,
+ "ProcId": 0,
+ "DAGNodeName": "provisioningJob",
+ "JobStatus": JobStatus.REMOVED,
+ "Reason": "Removed by DAGMan (by user mgower)",
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context, {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""}
+ )
+
+ def testSucceed(self):
+ # DAG completed, job was in running state when removed.
+ job_ad = {
+ "ClusterId": 8761,
+ "ProcId": 0,
+ "DAGNodeName": "provisioningJob",
+ "JobStatus": JobStatus.REMOVED,
+ "Reason": (
+ "removed because <OtherJobRemoveRequirements = DAGManJobId =?= 8556>"
+ " fired when job (8556.0) was removed"
+ ),
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context, {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""}
+ )
+
+ def testUserHeldWhileRunning(self):
+ # DAG hasn't completed (Running or held),
+ # user put at least service job on hold
+ job_ad = {
+ "ClusterId": 8523,
+ "ProcId": 0,
+ "DAGNodeName": "provisioningJob",
+ "JobStatus": JobStatus.HELD,
+ "HoldReason": "via condor_hold (by user mgower)",
+ "HoldReasonCode": 1,
+ "HoldReasonSubCode": 0,
+ }
+
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context,
+ {
+ "job_name": "provisioningJob",
+ "status": "HELD",
+ "status_details": "(via condor_hold (by user mgower))",
+ },
+ )
+
+ def testHeldByHTC(self):
+ # Job put on hold by HTCondor, removed when DAG ends
+ job_ad = {
+ "ClusterId": 8693,
+ "DAGNodeName": "provisioningJob",
+ "HoldReason": "Failed to execute",
+ "HoldReasonCode": 6,
+ "HoldReasonSubCode": 2,
+ "JobStatus": JobStatus.REMOVED,
+ "ProcId": 0,
+ "Reason": "Removed by DAGMan (by user mgower)",
+ "job_held_time": "2025-02-07T12:50:07",
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context,
+ {
+ "job_name": "provisioningJob",
+ "status": "DELETED",
+ "status_details": "(Job was held for the following reason: Failed to execute)",
+ },
+ )
+
+ def testHeldReleasedRunning(self):
+ # DAG hasn't completed (Running or held),
+ # Since held info will be in job_ad, make sure knows released.
+ job_ad = {
+ "ClusterId": 8625,
+ "DAGNodeName": "provisioningJob",
+ "HoldReason": "via condor_hold (by user mgower)",
+ "HoldReasonCode": 1,
+ "HoldReasonSubCode": 0,
+ "JobStatus": JobStatus.RUNNING,
+ "LogNotes": "DAG Node: provisioningJob",
+ "ProcId": 0,
+ "job_held_time": "2025-02-07T12:33:34",
+ "job_released_time": "2025-02-07T12:33:47",
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context, {"job_name": "provisioningJob", "status": "RUNNING", "status_details": ""}
+ )
+
+ def testHeldReleasedDied(self):
+ # Since held info will be in job_ad,
+ # make sure knows status after released.
+ job_ad = {
+ "ClusterId": 9120,
+ "DAGNodeName": "provisioningJob",
+ "ExitBySignal": False,
+ "ExitCode": 4,
+ "HoldReason": "via condor_hold (by user mgower)",
+ "HoldReasonCode": 1,
+ "HoldReasonSubCode": 0,
+ "JobStatus": JobStatus.COMPLETED,
+ "ProcId": 0,
+ "Reason": "via condor_release (by user mgower)",
+ "ReturnValue": 4,
+ "TerminatedNormally": True,
+ "job_held_time": "2025-02-11T11:46:40",
+ "job_released_time": "2025-02-11T11:46:47",
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context, {"job_name": "provisioningJob", "status": "FAILED", "status_details": ""}
+ )
+
+ def testHeldReleasedSuccessEarly(self):
+ # Since held info will be in job_ad,
+ # make sure knows status after released.
+ job_ad = {
+ "ClusterId": 9154,
+ "DAGNodeName": "provisioningJob",
+ "ExitBySignal": False,
+ "ExitCode": 0,
+ "HoldReason": "via condor_hold (by user mgower)",
+ "HoldReasonCode": 1,
+ "HoldReasonSubCode": 0,
+ "JobStatus": JobStatus.COMPLETED,
+ "ProcId": 0,
+ "Reason": "via condor_release (by user mgower)",
+ "TerminatedNormally": True,
+ "job_held_time": "2025-02-11T11:55:20",
+ "job_released_time": "2025-02-11T11:55:25",
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context,
+ {
+ "job_name": "provisioningJob",
+ "status": "SUCCEEDED",
+ "status_details": "(Note: Finished before workflow.)",
+ },
+ )
+
+ def testHeldReleasedSuccess(self):
+ # DAG has completed.
+ # Since held info will be in job_ad,
+ # make sure knows status after released.
+ job_ad = {
+ "ClusterId": 8625,
+ "DAGNodeName": "provisioningJob",
+ "HoldReason": "via condor_hold (by user mgower)",
+ "HoldReasonCode": 1,
+ "HoldReasonSubCode": 0,
+ "JobStatus": JobStatus.REMOVED,
+ "ProcId": 0,
+ "Reason": "removed because <OtherJobRemoveRequirements = DAGManJobId =?= "
+ "8624> fired when job (8624.0) was removed",
+ "job_held_time": "2025-02-07T12:33:34",
+ "job_released_time": "2025-02-07T12:33:47",
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context, {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""}
+ )
+
+ def testHeldReleasedDeleted(self):
+ # Since held info will be in job_ad,
+ # make sure knows status after released.
+ job_ad = {
+ "ClusterId": 9086,
+ "DAGNodeName": "provisioningJob",
+ "HoldReason": "via condor_hold (by user mgower)",
+ "HoldReasonCode": 1,
+ "HoldReasonSubCode": 0,
+ "JobStatus": JobStatus.REMOVED,
+ "ProcId": 0,
+ "Reason": "via condor_rm (by user mgower)",
+ "job_evicted_time": "2025-02-11T11:35:04",
+ "job_held_time": "2025-02-11T11:35:04",
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context, {"job_name": "provisioningJob", "status": "DELETED", "status_details": ""}
+ )
+
+ def testHeldReleasedHeld(self):
+ # Since release info will be in job_ad,
+ # make sure knows held after release.
+ job_ad = {
+ "ClusterId": 8659,
+ "DAGNodeName": "provisioningJob",
+ "HoldReason": "via condor_hold (by user mgower)",
+ "HoldReasonCode": 1,
+ "HoldReasonSubCode": 0,
+ "JobStatus": JobStatus.REMOVED,
+ "ProcId": 0,
+ "Reason": "Removed by DAGMan (by user mgower)",
+ "TerminatedNormally": False,
+ "job_held_time": "2025-02-07T12:36:15",
+ "job_released_time": "2025-02-07T12:36:07",
+ }
+ results = WmsSpecificInfo()
+ _add_service_job_specific_info(job_ad, results)
+ self.assertEqual(
+ results.context,
+ {
+ "job_name": "provisioningJob",
+ "status": "DELETED",
+ "status_details": "(Job was held for the following reason: via condor_hold (by user mgower))",
+ },
+ )
+
+
+ class GetRunSummaryTestCase(unittest.TestCase):
+ """Test _get_run_summary function."""
+
+ def testJobSummaryInJobAd(self):
+ summary = "pipetaskInit:1;label1:2;label2:2;finalJob:1"
+ job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_job_summary": summary}
+ results = _get_run_summary(job_ad)
+ self.assertEqual(results, summary)
+
+ def testRunSummaryInJobAd(self):
+ summary = "pipetaskInit:1;label1:2;label2:2;finalJob:1"
+ job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_run_summary": summary}
+ results = _get_run_summary(job_ad)
+ self.assertEqual(results, summary)
+
+ def testSummaryFromDag(self):
+ with temporaryDirectory() as tmp_dir:
+ copy2(f"{TESTDIR}/data/good.dag", tmp_dir)
+ job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "Iwd": tmp_dir}
+ results = _get_run_summary(job_ad)
+ self.assertEqual(results, "pipetaskInit:1;label1:1;label2:1;label3:1;finalJob:1")
+
+ def testSummaryNoDag(self):
+ with self.assertLogs(logger=logger, level="WARNING") as cm:
+ with temporaryDirectory() as tmp_dir:
+ job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "Iwd": tmp_dir}
+ results = _get_run_summary(job_ad)
+ self.assertEqual(results, "")
+ self.assertIn("lsst.ctrl.bps.htcondor", cm.records[0].name)
+ self.assertIn("Could not get run summary for htcondor job", cm.output[0])
+
+
+ class IsServiceJobTestCase(unittest.TestCase):
+ """Test is_service_job function."""
+
+ def testNotServiceJob(self):
+ job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_job_type": "payload"}
+ self.assertFalse(is_service_job(job_ad))
+
+ def testIsServiceJob(self):
+ job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_job_type": "service"}
+ self.assertTrue(is_service_job(job_ad))
+
+ def testMissingBpsType(self):
+ job_ad = {
+ "ClusterId": 8659,
+ "DAGNodeName": "testJob",
+ }
+ self.assertFalse(is_service_job(job_ad))
+
+
+ class CreateDetailedReportFromJobsTestCase(unittest.TestCase):
+ """Test _create_detailed_report_from_jobs function."""
+
+ def testTinySuccess(self):
+ with temporaryDirectory() as tmp_dir:
+ test_submit_dir = os.path.join(tmp_dir, "tiny_success")
+ copytree(f"{TESTDIR}/data/tiny_success", test_submit_dir)
+ wms_workflow_id, jobs, message = _get_info_from_path(test_submit_dir)
+ run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
+ self.assertEqual(len(run_reports), 1)
+ report = run_reports[wms_workflow_id]
+ self.assertEqual(report.wms_id, wms_workflow_id)
+ self.assertEqual(report.state, WmsStates.SUCCEEDED)
+ self.assertTrue(os.path.samefile(report.path, test_submit_dir))
+ self.assertEqual(report.run_summary, "pipetaskInit:1;label1:1;label2:1;finalJob:1")
+ self.assertEqual(
+ report.job_state_counts,
+ {
+ WmsStates.UNKNOWN: 0,
+ WmsStates.MISFIT: 0,
+ WmsStates.UNREADY: 0,
+ WmsStates.READY: 0,
+ WmsStates.PENDING: 0,
+ WmsStates.RUNNING: 0,
+ WmsStates.DELETED: 0,
+ WmsStates.HELD: 0,
+ WmsStates.SUCCEEDED: 4,
+ WmsStates.FAILED: 0,
+ WmsStates.PRUNED: 0,
+ },
+ )
+ self.assertEqual(
+ report.specific_info.context,
+ {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""},
+ )
+
+ def testTinyProblems(self):
+ with temporaryDirectory() as tmp_dir:
+ test_submit_dir = os.path.join(tmp_dir, "tiny_problems")
+ copytree(f"{TESTDIR}/data/tiny_problems", test_submit_dir)
+ wms_workflow_id, jobs, message = _get_info_from_path(test_submit_dir)
+ run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
+ self.assertEqual(len(run_reports), 1)
+ report = run_reports[wms_workflow_id]
+ self.assertEqual(report.wms_id, wms_workflow_id)
+ self.assertEqual(report.state, WmsStates.FAILED)
+ self.assertTrue(os.path.samefile(report.path, test_submit_dir))
+ self.assertEqual(report.run_summary, "pipetaskInit:1;label1:2;label2:2;finalJob:1")
+ self.assertEqual(
+ report.job_state_counts,
+ {
+ WmsStates.UNKNOWN: 0,
+ WmsStates.MISFIT: 0,
+ WmsStates.UNREADY: 0,
+ WmsStates.READY: 0,
+ WmsStates.PENDING: 0,
+ WmsStates.RUNNING: 0,
+ WmsStates.DELETED: 0,
+ WmsStates.HELD: 0,
+ WmsStates.SUCCEEDED: 4,
+ WmsStates.FAILED: 1,
+ WmsStates.PRUNED: 1,
+ },
+ )
+ self.assertEqual(
+ run_reports[wms_workflow_id].specific_info.context,
+ {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""},
+ )
+
+ def testTinyRunning(self):
+ with temporaryDirectory() as tmp_dir:
+ test_submit_dir = os.path.join(tmp_dir, "tiny_running")
+ copytree(f"{TESTDIR}/data/tiny_running", test_submit_dir)
+ wms_workflow_id, jobs, message = _get_info_from_path(test_submit_dir)
+ run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
+ self.assertEqual(len(run_reports), 1)
+ report = run_reports[wms_workflow_id]
+ self.assertEqual(report.wms_id, wms_workflow_id)
+ self.assertEqual(report.state, WmsStates.RUNNING)
+ self.assertTrue(os.path.samefile(report.path, test_submit_dir))
+ self.assertEqual(report.run_summary, "pipetaskInit:1;label1:1;label2:1;finalJob:1")
+ self.assertEqual(
+ report.job_state_counts,
+ {
+ WmsStates.UNKNOWN: 0,
+ WmsStates.MISFIT: 0,
+ WmsStates.UNREADY: 2,
+ WmsStates.READY: 0,
+ WmsStates.PENDING: 0,
+ WmsStates.RUNNING: 1,
+ WmsStates.DELETED: 0,
+ WmsStates.HELD: 0,
+ WmsStates.SUCCEEDED: 1,
+ WmsStates.FAILED: 0,
+ WmsStates.PRUNED: 0,
+ },
+ )
+ self.assertEqual(
+ report.specific_info.context,
+ {"job_name": "provisioningJob", "status": "RUNNING", "status_details": ""},
+ )
+
+
+ if __name__ == "__main__":
+ unittest.main()
@@ -31,7 +31,7 @@ import os
  import pathlib
  import tempfile
  import unittest
- from shutil import copy2
+ from shutil import copy2, rmtree

  import htcondor

@@ -197,22 +197,23 @@ class HtcCheckDagmanOutputTestCase(unittest.TestCase):
  self.assertEqual("", results)


- class SummaryFromDagTestCase(unittest.TestCase):
- """Test summary_from_dag function."""
+ class SummarizeDagTestCase(unittest.TestCase):
+ """Test summarize_dag function."""

  def test_no_dag_file(self):
  with temporaryDirectory() as tmp_dir:
- summary, job_name_to_pipetask = lssthtc.summary_from_dag(tmp_dir)
+ summary, job_name_to_pipetask, job_name_to_type = lssthtc.summarize_dag(tmp_dir)
  self.assertFalse(len(job_name_to_pipetask))
+ self.assertFalse(len(job_name_to_type))
  self.assertFalse(summary)

  def test_success(self):
  with temporaryDirectory() as tmp_dir:
  copy2(f"{TESTDIR}/data/good.dag", tmp_dir)
- summary, job_name_to_pipetask = lssthtc.summary_from_dag(tmp_dir)
+ summary, job_name_to_label, job_name_to_type = lssthtc.summarize_dag(tmp_dir)
  self.assertEqual(summary, "pipetaskInit:1;label1:1;label2:1;label3:1;finalJob:1")
  self.assertEqual(
- job_name_to_pipetask,
+ job_name_to_label,
  {
  "pipetaskInit": "pipetaskInit",
  "0682f8f9-12f0-40a5-971e-8b30c7231e5c_label1_val1_val2": "label1",
@@ -221,6 +222,98 @@ class SummaryFromDagTestCase(unittest.TestCase):
  "finalJob": "finalJob",
  },
  )
+ self.assertEqual(
+ job_name_to_type,
+ {
+ "pipetaskInit": "payload",
+ "0682f8f9-12f0-40a5-971e-8b30c7231e5c_label1_val1_val2": "payload",
+ "d0305e2d-f164-4a85-bd24-06afe6c84ed9_label2_val1_val2": "payload",
+ "2806ecc9-1bba-4362-8fff-ab4e6abb9f83_label3_val1_val2": "payload",
+ "finalJob": "final",
+ },
+ )
+
+ def test_service(self):
+ with temporaryDirectory() as tmp_dir:
+ copy2(f"{TESTDIR}/data/tiny_problems/tiny_problems.dag", tmp_dir)
+ summary, job_name_to_label, job_name_to_type = lssthtc.summarize_dag(tmp_dir)
+ self.assertEqual(summary, "pipetaskInit:1;label1:2;label2:2;finalJob:1")
+ self.assertEqual(
+ job_name_to_label,
+ {
+ "pipetaskInit": "pipetaskInit",
+ "4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a": "label1",
+ "057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b": "label1",
+ "696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a": "label2",
+ "40040b97-606d-4997-98d3-e0493055fe7e_label2_val1_val2b": "label2",
+ "finalJob": "finalJob",
+ "provisioningJob": "provisioningJob",
+ },
+ )
+ self.assertEqual(
+ job_name_to_type,
+ {
+ "pipetaskInit": "payload",
+ "4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a": "payload",
+ "057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b": "payload",
+ "696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a": "payload",
+ "40040b97-606d-4997-98d3-e0493055fe7e_label2_val1_val2b": "payload",
+ "finalJob": "final",
+ "provisioningJob": "service",
+ },
+ )
+
+
+ class ReadDagNodesLogTestCase(unittest.TestCase):
+ """Test read_dag_nodes_log function."""
+
+ def setUp(self):
+ self.tmpdir = tempfile.mkdtemp()
+
+ def tearDown(self):
+ rmtree(self.tmpdir, ignore_errors=True)
+
+ def testFileMissing(self):
+ with self.assertRaisesRegex(FileNotFoundError, "DAGMan node log not found in"):
+ _, _ = lssthtc.read_dag_nodes_log(self.tmpdir)
+
+
+ class ReadNodeStatusTestCase(unittest.TestCase):
+ """Test read_node_status function."""
+
+ def setUp(self):
+ self.tmpdir = tempfile.mkdtemp()
+
+ def tearDown(self):
+ rmtree(self.tmpdir, ignore_errors=True)
+
+ def testServiceJobNotSubmitted(self):
+ # tiny_prov_no_submit files have successful workflow
+ # but provisioningJob could not submit.
+ copy2(f"{TESTDIR}/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.nodes.log", self.tmpdir)
+ copy2(f"{TESTDIR}/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.dagman.log", self.tmpdir)
+ copy2(f"{TESTDIR}/data/tiny_prov_no_submit/tiny_prov_no_submit.node_status", self.tmpdir)
+ copy2(f"{TESTDIR}/data/tiny_prov_no_submit/tiny_prov_no_submit.dag", self.tmpdir)
+
+ jobs = lssthtc.read_node_status(self.tmpdir)
+ found = [id_ for id_ in jobs if jobs[id_].get("bps_job_type", "MISS") == "service"]
+ self.assertEqual(len(found), 1)
+ self.assertEqual(jobs[found[0]]["DAGNodeName"], "provisioningJob")
+ self.assertEqual(jobs[found[0]]["NodeStatus"], lssthtc.NodeStatus.NOT_READY)
+
+ def testMissingStatusFile(self):
+ copy2(f"{TESTDIR}/data/tiny_problems/tiny_problems.dag.nodes.log", self.tmpdir)
+ copy2(f"{TESTDIR}/data/tiny_problems/tiny_problems.dag.dagman.log", self.tmpdir)
+ copy2(f"{TESTDIR}/data/tiny_problems/tiny_problems.dag", self.tmpdir)
+
+ jobs = lssthtc.read_node_status(self.tmpdir)
+ self.assertEqual(len(jobs), 7)
+ self.assertEqual(jobs["9230.0"]["DAGNodeName"], "pipetaskInit")
+ self.assertEqual(jobs["9230.0"]["bps_job_type"], "payload")
+ self.assertEqual(jobs["9230.0"]["JobStatus"], lssthtc.JobStatus.COMPLETED)
+ found = [id_ for id_ in jobs if jobs[id_].get("bps_job_type", "MISS") == "service"]
+ self.assertEqual(len(found), 1)
+ self.assertEqual(jobs[found[0]]["DAGNodeName"], "provisioningJob")


  if __name__ == "__main__":
@@ -1,2 +0,0 @@
- __all__ = ["__version__"]
- __version__ = "28.2025.800"