lsst-ctrl-bps-htcondor 28.2025.700__tar.gz → 28.2025.900__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lsst_ctrl_bps_htcondor-28.2025.700/python/lsst_ctrl_bps_htcondor.egg-info → lsst_ctrl_bps_htcondor-28.2025.900}/PKG-INFO +1 -1
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/doc/lsst.ctrl.bps.htcondor/userguide.rst +26 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/pyproject.toml +9 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst/ctrl/bps/htcondor/htcondor_service.py +81 -32
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst/ctrl/bps/htcondor/lssthtc.py +54 -45
- lsst_ctrl_bps_htcondor-28.2025.900/python/lsst/ctrl/bps/htcondor/version.py +2 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900/python/lsst_ctrl_bps_htcondor.egg-info}/PKG-INFO +1 -1
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/tests/test_htcondor_service.py +494 -2
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/tests/test_lssthtc.py +99 -6
- lsst_ctrl_bps_htcondor-28.2025.700/python/lsst/ctrl/bps/htcondor/version.py +0 -2
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/COPYRIGHT +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/LICENSE +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/MANIFEST.in +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/README.rst +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/bsd_license.txt +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/doc/lsst.ctrl.bps.htcondor/CHANGES.rst +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/doc/lsst.ctrl.bps.htcondor/index.rst +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/gpl-v3.0.txt +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst/ctrl/bps/htcondor/__init__.py +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst/ctrl/bps/htcondor/etc/__init__.py +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst/ctrl/bps/htcondor/etc/htcondor_defaults.yaml +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst/ctrl/bps/htcondor/final_post.sh +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst/ctrl/bps/htcondor/handlers.py +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst/ctrl/bps/htcondor/htcondor_config.py +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst/ctrl/bps/htcondor/provisioner.py +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst_ctrl_bps_htcondor.egg-info/SOURCES.txt +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst_ctrl_bps_htcondor.egg-info/dependency_links.txt +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst_ctrl_bps_htcondor.egg-info/requires.txt +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst_ctrl_bps_htcondor.egg-info/top_level.txt +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst_ctrl_bps_htcondor.egg-info/zip-safe +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/setup.cfg +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/tests/test_handlers.py +0 -0
- {lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/tests/test_provisioner.py +0 -0

{lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/doc/lsst.ctrl.bps.htcondor/userguide.rst
RENAMED

@@ -148,6 +148,17 @@ from files. So, the detailed report can distinguish between failed and
 deleted jobs, and thus will show ``D`` in the flag column for a running
 workflow if there is a deleted job.
 
+Rarely, a detailed report may warn about job submission issues. For example:
+
+.. code-block:: bash
+
+   Warn: Job submission issues (last: 01/30/25 10:36:57)
+
+A job submission issue could be intermittent or not. It may cause
+problems with the status or counts in the reports. To get more information
+about the submission issue, look in the ``*.dag.dagman.out`` file for
+errors, in particular lines containing ``submit attempt failed``.
+
 Occasionally, some jobs are put on hold by HTCondor. To see the reason why
 jobs are being held, use
 
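
The grep described above can also be scripted. The following sketch is not part of the package; it only automates the documented check (the submit-directory path is a placeholder):

    from pathlib import Path

    def find_submit_failures(submit_dir: str) -> list[str]:
        """Return lines from *.dag.dagman.out that mention failed submit attempts."""
        hits = []
        for out_file in Path(submit_dir).glob("*.dag.dagman.out"):
            with open(out_file) as fh:
                hits.extend(line.rstrip() for line in fh if "submit attempt failed" in line)
        return hits

    # Example usage for one run submit directory.
    for line in find_submit_failures("/path/to/submit/run"):
        print(line)
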
@@ -276,12 +287,27 @@ Look for the line starting with "Provisioning job status". For example
 calibrate 0 0 1 0 0 0 0 0 0 0 0 1
 finalJob 0 0 1 0 0 0 0 0 0 0 0 1
 
+If the provisioning job status is UNREADY, check the end of the report to see
+if there is a warning about submission issues. There may be a temporary problem.
+Check the ``*.dag.dagman.out`` in run submit directory for errors, in
+particular for ``ERROR: submit attempt failed``.
+
+If the provisioning job status is HELD, the hold reason will appear in parentheses.
+
 The service job managing the glideins will be automatically canceled once the
 workflow is completed. However, the existing glideins will be left for
 HTCondor to shut them down once they remain inactive for the period specified
 by ``provisioningMaxIdleTime`` (default value: 15 min., see below) or maximum
 wall time is reached.
 
+The provisioning job is expected to run as long as the workflow. If the job
+dies, the job status will be `FAILED`. If the job just completed successfully,
+the job status will be `SUCCEEDED` with a message saying it ended early (which
+may or may not cause a problem since existing glideins could remain running).
+To get more information about either of these cases, check the job output
+and error files in the `jobs/provisioningJob` subdirectory.
+
+
 If the automatic provisioning of the resources is enabled, the script that the
 service job is supposed to run in order to provide the required resources *must
 be* defined by the ``provisioningScript`` setting in the ``provisioning``
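
As a small companion to the guidance above, this illustrative snippet (not from the package) lists the provisioning job's output and error files referred to in the text; the submit-directory path is a placeholder:

    from pathlib import Path

    submit_dir = Path("/path/to/submit/run")
    # The service job's stdout/stderr live under jobs/provisioningJob in the run
    # submit directory; inspect these when its status is FAILED or it ended early.
    for f in sorted((submit_dir / "jobs" / "provisioningJob").glob("*")):
        print(f)
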

{lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/pyproject.toml
RENAMED

@@ -113,6 +113,15 @@ convention = "numpy"
 # not fit on one line.
 add-ignore = ["D107", "D105", "D102", "D100", "D200", "D205", "D400", "D104"]
 
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "raise AssertionError",
+    "raise NotImplementedError",
+    "if __name__ == .__main__.:",
+    "if TYPE_CHECKING:",
+]
+
 [tool.ruff]
 target-version = "py311"
 line-length = 110
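
For context, the new [tool.coverage.report] exclude_lines patterns match code like the following illustrative Python snippet (not taken from the package); such lines are omitted from the coverage report rather than counted as untested:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:  # excluded from the coverage report
        from lsst.ctrl.bps import BpsConfig

    def not_implemented_yet() -> None:
        raise NotImplementedError  # also excluded

    if __name__ == "__main__":  # also excluded
        pass
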

{lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst/ctrl/bps/htcondor/htcondor_service.py
RENAMED

@@ -79,7 +79,7 @@ from .lssthtc import (
     read_dag_log,
     read_dag_status,
     read_node_status,
-    summary_from_dag,
+    summarize_dag,
     write_dag_info,
 )
 from .provisioner import Provisioner
@@ -154,7 +154,7 @@ class HTCondorService(BaseWmsService):
         if enable_provisioning:
             provisioner = Provisioner(config)
             provisioner.configure()
-            provisioner.prepare("
+            provisioner.prepare("provisioningJob.bash", prefix=out_prefix)
             provisioner.provision(workflow.dag)
 
         with time_this(
@@ -1317,9 +1317,9 @@ def _create_detailed_report_from_jobs(
         job_state_counts=dag_ad.get("state_counts", state_counts),
         exit_code_summary=_get_exit_code_summary(jobs),
     )
-
+    specific_info = WmsSpecificInfo()
     for job_id, job_ad in jobs.items():
-        if not is_service_job(job_id):
+        if not is_service_job(job_ad):
             try:
                 job_report = WmsJobReport(
                     wms_id=job_id,
@@ -1334,33 +1334,85 @@ def _create_detailed_report_from_jobs(
                 _LOG.error("Job missing key '%s': %s", str(ex), job_ad)
                 raise
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-                )
-            else:
-                _LOG.warning(
-                    "Service job with id '%s' (label '%s'): no handler, no action taken", job_id, job_label
-                )
+            _LOG.debug(
+                "Found service job: id='%s', name='%s', label='%s', NodeStatus='%s', JobStatus='%s'",
+                job_id,
+                job_ad["DAGNodeName"],
+                job_ad.get("bps_job_label", "MISS"),
+                job_ad.get("NodeStatus", "MISS"),
+                job_ad.get("JobStatus", "MISS"),
+            )
+            _add_service_job_specific_info(job_ad, specific_info)
+
+    if specific_info:
+        report.specific_info = specific_info
 
     # Add the removed entry to restore the original content of the dictionary.
     # The ordering of keys will be change permanently though.
     jobs.update({wms_workflow_id: dag_ad})
 
+    # Workflow will exit with non-zero DAG_STATUS if problem with
+    # any of the wms jobs. So change FAILED to SUCCEEDED if all
+    # payload jobs SUCCEEDED.
+    if report.total_number_jobs == report.job_state_counts[WmsStates.SUCCEEDED]:
+        report.state = WmsStates.SUCCEEDED
+
     run_reports = {report.wms_id: report}
     _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
     return run_reports
 
 
+def _add_service_job_specific_info(job_ad: dict[str, Any], specific_info: WmsSpecificInfo) -> None:
+    """Generate report information for service job.
+
+    Parameters
+    ----------
+    job_ad : `dict` [`str`, `Any`]
+        Provisioning job information.
+    specific_info : `lsst.ctrl.bps.WmsSpecificInfo`
+        Where to add message.
+    """
+    status_details = ""
+    job_status = _htc_status_to_wms_state(job_ad)
+
+    # Service jobs in queue are deleted when DAG is done.
+    # To get accurate status, need to check other info.
+    if (
+        job_status == WmsStates.DELETED
+        and "Reason" in job_ad
+        and (
+            "Removed by DAGMan" in job_ad["Reason"]
+            or "removed because <OtherJobRemoveRequirements = DAGManJobId =?=" in job_ad["Reason"]
+            or "DAG is exiting and writing rescue file." in job_ad["Reason"]
+        )
+    ):
+        if "HoldReason" in job_ad:
+            # HoldReason exists even if released, so check.
+            if "job_released_time" in job_ad and job_ad["job_held_time"] < job_ad["job_released_time"]:
+                # If released, assume running until deleted.
+                job_status = WmsStates.SUCCEEDED
+                status_details = ""
+            else:
+                # If job held when deleted by DAGMan, still want to
+                # report hold reason
+                status_details = f"(Job was held for the following reason: {job_ad['HoldReason']})"
+
+        else:
+            job_status = WmsStates.SUCCEEDED
+    elif job_status == WmsStates.SUCCEEDED:
+        status_details = "(Note: Finished before workflow.)"
+    elif job_status == WmsStates.HELD:
+        status_details = f"({job_ad['HoldReason']})"
+
+    template = "Status of {job_name}: {status} {status_details}"
+    context = {
+        "job_name": job_ad["DAGNodeName"],
+        "status": job_status.name,
+        "status_details": status_details,
+    }
+    specific_info.add_message(template=template, context=context)
+
+
 def _summary_report(user, hist, pass_thru, schedds=None):
     """Gather run information to be used in generating summary reports.
 
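
To make the new helper concrete, here is a usage sketch that mirrors one of the unit tests added in this release (the job ad is hand-built; its keys come from HTCondor and the plugin, as the tests note):

    from lsst.ctrl.bps import WmsSpecificInfo
    from lsst.ctrl.bps.htcondor.htcondor_service import JobStatus, _add_service_job_specific_info

    # A service job that was held by HTCondor and later removed when the DAG ended.
    job_ad = {
        "ClusterId": 8693,
        "ProcId": 0,
        "DAGNodeName": "provisioningJob",
        "JobStatus": JobStatus.REMOVED,
        "Reason": "Removed by DAGMan (by user mgower)",
        "HoldReason": "Failed to execute",
        "job_held_time": "2025-02-07T12:50:07",
    }
    info = WmsSpecificInfo()
    _add_service_job_specific_info(job_ad, info)
    # Per the new tests, the status is DELETED and the hold reason is carried
    # into status_details.
    print(info.context)
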
@@ -1509,7 +1561,7 @@ def _get_run_summary(job):
     """
     summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
     if not summary:
-        summary, _ = summary_from_dag(job["Iwd"])
+        summary, _, _ = summarize_dag(job["Iwd"])
     if not summary:
         _LOG.warning("Could not get run summary for htcondor job: %s", job)
     _LOG.debug("_get_run_summary: summary=%s", summary)
@@ -1587,7 +1639,7 @@ def _get_state_counts_from_jobs(
     """
     state_counts = dict.fromkeys(WmsStates, 0)
     for job_id, job_ad in jobs.items():
-        if job_id != wms_workflow_id and not is_service_job(job_id):
+        if job_id != wms_workflow_id and not is_service_job(job_ad):
             state_counts[_htc_status_to_wms_state(job_ad)] += 1
     total_counted = sum(state_counts.values())
 
@@ -2143,13 +2195,13 @@ def _gather_site_values(config, compute_site):
     return site_values
 
 
-def is_service_job(job_id: str) -> bool:
+def is_service_job(job_ad: dict[str, Any]) -> bool:
     """Determine if a job is a service one.
 
     Parameters
     ----------
-
-        HTCondor job
+    job_ad : `dict` [`str`, Any]
+        Information about an HTCondor job.
 
     Returns
     -------
@@ -2159,10 +2211,7 @@ def is_service_job(job_id: str) -> bool:
     Notes
     -----
     At the moment, HTCondor does not provide a native way to distinguish
-    between payload and service jobs in the workflow.
-
-    :py:func:`read_node_status()` (service jobs are given ids with ClusterId=0
-    and ProcId=some integer). If it changes, this function needs to be
-    updated too.
+    between payload and service jobs in the workflow. This code depends
+    on read_node_status adding bps_job_type.
     """
-    return
+    return job_ad.get("bps_job_type", "MISSING") == "service"
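
A minimal illustration of the new classification rule, mirroring the unit tests added below (the job ads are hand-built, as in the tests):

    from lsst.ctrl.bps.htcondor.htcondor_service import is_service_job

    payload_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_job_type": "payload"}
    service_ad = {"ClusterId": 8659, "DAGNodeName": "provisioningJob", "bps_job_type": "service"}
    legacy_ad = {"ClusterId": 8659, "DAGNodeName": "testJob"}  # no bps_job_type key

    assert not is_service_job(payload_ad)
    assert is_service_job(service_ad)
    assert not is_service_job(legacy_ad)  # missing key defaults to "MISSING"
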

{lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/python/lsst/ctrl/bps/htcondor/lssthtc.py
RENAMED

@@ -63,7 +63,8 @@ __all__ = [
     "read_dag_nodes_log",
     "read_dag_status",
     "read_node_status",
-    "summary_from_dag",
+    "summarize_dag",
+    "update_job_info",
     "update_job_info",
     "write_dag_info",
 ]
@@ -1245,7 +1246,7 @@ def update_job_info(job_info, other_info):
     return job_info
 
 
-def summary_from_dag(dir_name):
+def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, str]]:
     """Build bps_run_summary string from dag file.
 
     Parameters
@@ -1256,51 +1257,64 @@ def summary_from_dag(dir_name):
     Returns
     -------
     summary : `str`
-        Semi-colon separated list of job labels and counts
+        Semi-colon separated list of job labels and counts
         (Same format as saved in dag classad).
     job_name_to_label : `dict` [`str`, `str`]
         Mapping of job names to job labels.
+    job_name_to_type : `dict` [`str`, `str`]
+        Mapping of job names to job types
+        (e.g., payload, final, service).
     """
     # Later code depends upon insertion order
-    counts = defaultdict(int)
+    counts: defaultdict[str, int] = defaultdict(int) # counts of payload jobs per label
     job_name_to_label = {}
+    job_name_to_type = {}
     try:
         dag = next(Path(dir_name).glob("*.dag"))
         with open(dag) as fh:
             for line in fh:
+                job_name = ""
                 if line.startswith("JOB"):
-                    m = re.match(r'JOB (\S+) "jobs/([^/]+)/', line)
+                    m = re.match(r'JOB (\S+) "?jobs/([^/]+)/', line)
                     if m:
+                        job_name = m.group(1)
                         label = m.group(2)
                         if label == "init":
                             label = "pipetaskInit"
-                        job_name_to_label[m.group(1)] = label
                         counts[label] += 1
                     else: # Check if Pegasus submission
                         m = re.match(r"JOB (\S+) (\S+)", line)
                         if m:
+                            job_name = m.group(1)
                            label = pegasus_name_to_label(m.group(1))
-                            job_name_to_label[m.group(1)] = label
                            counts[label] += 1
                        else:
                            _LOG.warning("Parse DAG: unmatched job line: %s", line)
+                    job_type = "payload"
                 elif line.startswith("FINAL"):
                     m = re.match(r"FINAL (\S+) jobs/([^/]+)/", line)
                     if m:
+                        job_name = m.group(1)
                         label = m.group(2)
-
-
+                        counts[label] += 1 # final counts a payload job.
+                    job_type = "final"
                 elif line.startswith("SERVICE"):
                     m = re.match(r"SERVICE (\S+) jobs/([^/]+)/", line)
                     if m:
+                        job_name = m.group(1)
                         label = m.group(2)
-
+                    job_type = "service"
+
+                if job_name:
+                    job_name_to_label[job_name] = label
+                    job_name_to_type[job_name] = job_type
+
     except (OSError, PermissionError, StopIteration):
         pass
 
     summary = ";".join([f"{name}:{counts[name]}" for name in counts])
-    _LOG.debug("
-    return summary, job_name_to_label
+    _LOG.debug("summarize_dag: %s %s %s", summary, job_name_to_label, job_name_to_type)
+    return summary, job_name_to_label, job_name_to_type
 
 
 def pegasus_name_to_label(name):
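
Callers of the renamed helper now unpack three values instead of two. A short usage sketch (the submit-directory path is a placeholder; the return values follow the docstring and tests in this diff):

    from lsst.ctrl.bps.htcondor import lssthtc

    summary, job_name_to_label, job_name_to_type = lssthtc.summarize_dag("/path/to/submit/run")
    print(summary)  # e.g. "pipetaskInit:1;label1:2;label2:2;finalJob:1"
    service_jobs = [name for name, kind in job_name_to_type.items() if kind == "service"]
    print(service_jobs)  # e.g. ["provisioningJob"]
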
@@ -1400,7 +1414,7 @@ def read_node_status(wms_path):
         file.
     """
     # Get jobid info from other places to fill in gaps in info from node_status
-    _, job_name_to_label = summary_from_dag(wms_path)
+    _, job_name_to_label, job_name_to_type = summarize_dag(wms_path)
     wms_workflow_id, loginfo = read_dag_log(wms_path)
     loginfo = read_dag_nodes_log(wms_path)
     _LOG.debug("loginfo = %s", loginfo)
@@ -1409,17 +1423,17 @@ def read_node_status(wms_path):
         if "LogNotes" in job_info:
             m = re.match(r"DAG Node: (\S+)", job_info["LogNotes"])
             if m:
-
-
+                job_name = m.group(1)
+                job_name_to_id[job_name] = job_id
+                job_info["DAGNodeName"] = job_name
+                job_info["bps_job_type"] = job_name_to_type[job_name]
+                job_info["bps_job_label"] = job_name_to_label[job_name]
 
+    jobs = loginfo
+    fake_id = -1.0 # For nodes that do not yet have a job id, give fake one
     try:
         node_status = next(Path(wms_path).glob("*.node_status"))
-    except StopIteration:
-        return loginfo
 
-    jobs = {}
-    fake_id = -1.0 # For nodes that do not yet have a job id, give fake one
-    try:
         with open(node_status) as fh:
             for ad in classad.parseAds(fh):
                 match ad["Type"]:
@@ -1438,22 +1452,19 @@ def read_node_status(wms_path):
                         # Make job info as if came from condor_q.
                         if job_name in job_name_to_id:
                             job_id = str(job_name_to_id[job_name])
+                            job = jobs[job_id]
                         else:
                             job_id = str(fake_id)
+                            job_name_to_id[job_name] = job_id
+                            job = dict(ad)
+                            jobs[job_id] = job
                             fake_id -= 1
-                        job = dict(ad)
                         job["ClusterId"] = int(float(job_id))
                         job["DAGManJobID"] = wms_workflow_id
                         job["DAGNodeName"] = job_name
                         job["bps_job_label"] = job_label
+                        job["bps_job_type"] = job_name_to_type[job_name]
 
-                        # Include information retrieved from the event log
-                        # if available.
-                        jobs[job_id] = job
-                        try:
-                            jobs[job_id] |= loginfo[job_id]
-                        except KeyError:
-                            pass
                     case "StatusEnd":
                         # Skip node status file "epilog".
                         pass
@@ -1463,24 +1474,22 @@ def read_node_status(wms_path):
                             ad["Type"],
                             wms_path,
                         )
-    except (OSError, PermissionError):
+    except (StopIteration, OSError, PermissionError):
         pass
-
-
-
-
-
-
-
-
-
-
-
-
-    }
-
-        job_info["bps_job_label"] = job_name_to_label[job_id_to_name[job_id]]
-        jobs[f"{job_info['ProcId']}.{job_info['ClusterId']}"] = job_info
+
+    # Check for missing jobs (e.g., submission failure or not submitted yet)
+    # Use dag info to create job placeholders
+    for name in set(job_name_to_label) - set(job_name_to_id):
+        job = {}
+        job["ClusterId"] = int(float(fake_id))
+        job["ProcId"] = 0
+        job["DAGManJobID"] = wms_workflow_id
+        job["DAGNodeName"] = name
+        job["bps_job_label"] = job_name_to_label[name]
+        job["bps_job_type"] = job_name_to_type[name]
+        job["NodeStatus"] = NodeStatus.NOT_READY
+        jobs[f"{job['ClusterId']}.{job['ProcId']}"] = job
+        fake_id -= 1
 
     return jobs
 

{lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/tests/test_htcondor_service.py
RENAMED

@@ -31,24 +31,28 @@ import logging
 import os
 import unittest
 from pathlib import Path
-from shutil import copy2
+from shutil import copy2, copytree
 
 import htcondor
 
-from lsst.ctrl.bps import BpsConfig, GenericWorkflowExec, GenericWorkflowJob, WmsStates
+from lsst.ctrl.bps import BpsConfig, GenericWorkflowExec, GenericWorkflowJob, WmsSpecificInfo, WmsStates
 from lsst.ctrl.bps.htcondor.htcondor_config import HTC_DEFAULTS_URI
 from lsst.ctrl.bps.htcondor.htcondor_service import (
     HTCondorService,
     JobStatus,
     NodeStatus,
     WmsIdType,
+    _add_service_job_specific_info,
+    _create_detailed_report_from_jobs,
     _get_exit_code_summary,
     _get_info_from_path,
+    _get_run_summary,
     _get_state_counts_from_dag_job,
     _htc_node_status_to_wms_state,
     _htc_status_to_wms_state,
     _translate_job_cmds,
     _wms_id_to_dir,
+    is_service_job,
 )
 from lsst.ctrl.bps.htcondor.lssthtc import MISSING_ID
 from lsst.utils.tests import temporaryDirectory
@@ -532,3 +536,491 @@ class WmsIdToDirTestCase(unittest.TestCase):
         self.assertEqual(id_type, WmsIdType.PATH)
         self.assertEqual(abs_path.resolve(), wms_path)
         os.chdir(orig_dir)
+
+
+class AddServiceJobSpecificInfoTestCase(unittest.TestCase):
+    """Test _add_service_job_specific_info function.
+
+    Note: The job_ad's are hardcoded in these tests. The
+    values in the dictionaries come from plugin code as
+    well as HTCondor. Changes in either of those codes
+    that produce data for the job_ad can break this
+    function without breaking these unit tests.
+
+    Also, since hold status/messages stick around, testing
+    various cases with and without job being held just to
+    ensure get right status in both cases.
+    """
+
+    def testNotSubmitted(self):
+        # Service job not submitted yet or can't be submitted.
+        # (Typically an plugin bug.)
+        # At this function level, can't tell if not submitted
+        # yet or problem so it never will.
+        job_ad = {
+            "ClusterId": -64,
+            "DAGManJobID": "8997.0",
+            "DAGNodeName": "provisioningJob",
+            "NodeStatus": NodeStatus.NOT_READY,
+            "ProcId": 0,
+            "bps_job_label": "service_provisioningJob",
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context, {"job_name": "provisioningJob", "status": "UNREADY", "status_details": ""}
+        )
+
+    def testRunning(self):
+        # DAG hasn't completed (Running or held),
+        # Service job is running.
+        job_ad = {
+            "ClusterId": 8523,
+            "ProcId": 0,
+            "DAGNodeName": "provisioningJob",
+            "JobStatus": JobStatus.RUNNING,
+        }
+
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context, {"job_name": "provisioningJob", "status": "RUNNING", "status_details": ""}
+        )
+
+    def testDied(self):
+        # DAG hasn't completed (Running or held),
+        # Service job failed (completed non-zero exit code)
+        job_ad = {
+            "ClusterId": 8761,
+            "ProcId": 0,
+            "DAGNodeName": "provisioningJob",
+            "JobStatus": JobStatus.COMPLETED,
+            "ExitCode": 4,
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context, {"job_name": "provisioningJob", "status": "FAILED", "status_details": ""}
+        )
+
+    def testDeleted(self):
+        # Deleted by user (never held)
+        job_ad = {
+            "ClusterId": 9086,
+            "DAGNodeName": "provisioningJob",
+            "JobStatus": JobStatus.REMOVED,
+            "ProcId": 0,
+            "Reason": "via condor_rm (by user mgower)",
+            "job_evicted_time": "2025-02-11T11:35:04",
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context, {"job_name": "provisioningJob", "status": "DELETED", "status_details": ""}
+        )
+
+    def testSucceedEarly(self):
+        # DAG hasn't completed (Running or held),
+        # Service job completed with exit code 0
+        job_ad = {
+            "ClusterId": 8761,
+            "ProcId": 0,
+            "DAGNodeName": "provisioningJob",
+            "JobStatus": JobStatus.COMPLETED,
+            "ExitCode": 0,
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context,
+            {
+                "job_name": "provisioningJob",
+                "status": "SUCCEEDED",
+                "status_details": "(Note: Finished before workflow.)",
+            },
+        )
+
+    def testSucceedOldRemoveMessage(self):
+        # DAG completed, job was in running state when removed.
+        job_ad = {
+            "ClusterId": 8761,
+            "ProcId": 0,
+            "DAGNodeName": "provisioningJob",
+            "JobStatus": JobStatus.REMOVED,
+            "Reason": "Removed by DAGMan (by user mgower)",
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context, {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""}
+        )
+
+    def testSucceed(self):
+        # DAG completed, job was in running state when removed.
+        job_ad = {
+            "ClusterId": 8761,
+            "ProcId": 0,
+            "DAGNodeName": "provisioningJob",
+            "JobStatus": JobStatus.REMOVED,
+            "Reason": (
+                "removed because <OtherJobRemoveRequirements = DAGManJobId =?= 8556>"
+                " fired when job (8556.0) was removed"
+            ),
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context, {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""}
+        )
+
+    def testUserHeldWhileRunning(self):
+        # DAG hasn't completed (Running or held),
+        # user put at least service job on hold
+        job_ad = {
+            "ClusterId": 8523,
+            "ProcId": 0,
+            "DAGNodeName": "provisioningJob",
+            "JobStatus": JobStatus.HELD,
+            "HoldReason": "via condor_hold (by user mgower)",
+            "HoldReasonCode": 1,
+            "HoldReasonSubCode": 0,
+        }
+
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context,
+            {
+                "job_name": "provisioningJob",
+                "status": "HELD",
+                "status_details": "(via condor_hold (by user mgower))",
+            },
+        )
+
+    def testHeldByHTC(self):
+        # Job put on hold by HTCondor, removed when DAG ends
+        job_ad = {
+            "ClusterId": 8693,
+            "DAGNodeName": "provisioningJob",
+            "HoldReason": "Failed to execute",
+            "HoldReasonCode": 6,
+            "HoldReasonSubCode": 2,
+            "JobStatus": JobStatus.REMOVED,
+            "ProcId": 0,
+            "Reason": "Removed by DAGMan (by user mgower)",
+            "job_held_time": "2025-02-07T12:50:07",
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context,
+            {
+                "job_name": "provisioningJob",
+                "status": "DELETED",
+                "status_details": "(Job was held for the following reason: Failed to execute)",
+            },
+        )
+
+    def testHeldReleasedRunning(self):
+        # DAG hasn't completed (Running or held),
+        # Since held info will be in job_ad, make sure knows released.
+        job_ad = {
+            "ClusterId": 8625,
+            "DAGNodeName": "provisioningJob",
+            "HoldReason": "via condor_hold (by user mgower)",
+            "HoldReasonCode": 1,
+            "HoldReasonSubCode": 0,
+            "JobStatus": JobStatus.RUNNING,
+            "LogNotes": "DAG Node: provisioningJob",
+            "ProcId": 0,
+            "job_held_time": "2025-02-07T12:33:34",
+            "job_released_time": "2025-02-07T12:33:47",
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context, {"job_name": "provisioningJob", "status": "RUNNING", "status_details": ""}
+        )
+
+    def testHeldReleasedDied(self):
+        # Since held info will be in job_ad,
+        # make sure knows status after released.
+        job_ad = {
+            "ClusterId": 9120,
+            "DAGNodeName": "provisioningJob",
+            "ExitBySignal": False,
+            "ExitCode": 4,
+            "HoldReason": "via condor_hold (by user mgower)",
+            "HoldReasonCode": 1,
+            "HoldReasonSubCode": 0,
+            "JobStatus": JobStatus.COMPLETED,
+            "ProcId": 0,
+            "Reason": "via condor_release (by user mgower)",
+            "ReturnValue": 4,
+            "TerminatedNormally": True,
+            "job_held_time": "2025-02-11T11:46:40",
+            "job_released_time": "2025-02-11T11:46:47",
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context, {"job_name": "provisioningJob", "status": "FAILED", "status_details": ""}
+        )
+
+    def testHeldReleasedSuccessEarly(self):
+        # Since held info will be in job_ad,
+        # make sure knows status after released.
+        job_ad = {
+            "ClusterId": 9154,
+            "DAGNodeName": "provisioningJob",
+            "ExitBySignal": False,
+            "ExitCode": 0,
+            "HoldReason": "via condor_hold (by user mgower)",
+            "HoldReasonCode": 1,
+            "HoldReasonSubCode": 0,
+            "JobStatus": JobStatus.COMPLETED,
+            "ProcId": 0,
+            "Reason": "via condor_release (by user mgower)",
+            "TerminatedNormally": True,
+            "job_held_time": "2025-02-11T11:55:20",
+            "job_released_time": "2025-02-11T11:55:25",
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context,
+            {
+                "job_name": "provisioningJob",
+                "status": "SUCCEEDED",
+                "status_details": "(Note: Finished before workflow.)",
+            },
+        )
+
+    def testHeldReleasedSuccess(self):
+        # DAG has completed.
+        # Since held info will be in job_ad,
+        # make sure knows status after released.
+        job_ad = {
+            "ClusterId": 8625,
+            "DAGNodeName": "provisioningJob",
+            "HoldReason": "via condor_hold (by user mgower)",
+            "HoldReasonCode": 1,
+            "HoldReasonSubCode": 0,
+            "JobStatus": JobStatus.REMOVED,
+            "ProcId": 0,
+            "Reason": "removed because <OtherJobRemoveRequirements = DAGManJobId =?= "
+            "8624> fired when job (8624.0) was removed",
+            "job_held_time": "2025-02-07T12:33:34",
+            "job_released_time": "2025-02-07T12:33:47",
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context, {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""}
+        )
+
+    def testHeldReleasedDeleted(self):
+        # Since held info will be in job_ad,
+        # make sure knows status after released.
+        job_ad = {
+            "ClusterId": 9086,
+            "DAGNodeName": "provisioningJob",
+            "HoldReason": "via condor_hold (by user mgower)",
+            "HoldReasonCode": 1,
+            "HoldReasonSubCode": 0,
+            "JobStatus": JobStatus.REMOVED,
+            "ProcId": 0,
+            "Reason": "via condor_rm (by user mgower)",
+            "job_evicted_time": "2025-02-11T11:35:04",
+            "job_held_time": "2025-02-11T11:35:04",
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context, {"job_name": "provisioningJob", "status": "DELETED", "status_details": ""}
+        )
+
+    def testHeldReleasedHeld(self):
+        # Since release info will be in job_ad,
+        # make sure knows held after release.
+        job_ad = {
+            "ClusterId": 8659,
+            "DAGNodeName": "provisioningJob",
+            "HoldReason": "via condor_hold (by user mgower)",
+            "HoldReasonCode": 1,
+            "HoldReasonSubCode": 0,
+            "JobStatus": JobStatus.REMOVED,
+            "ProcId": 0,
+            "Reason": "Removed by DAGMan (by user mgower)",
+            "TerminatedNormally": False,
+            "job_held_time": "2025-02-07T12:36:15",
+            "job_released_time": "2025-02-07T12:36:07",
+        }
+        results = WmsSpecificInfo()
+        _add_service_job_specific_info(job_ad, results)
+        self.assertEqual(
+            results.context,
+            {
+                "job_name": "provisioningJob",
+                "status": "DELETED",
+                "status_details": "(Job was held for the following reason: via condor_hold (by user mgower))",
+            },
+        )
+
+
+class GetRunSummaryTestCase(unittest.TestCase):
+    """Test _get_run_summary function."""
+
+    def testJobSummaryInJobAd(self):
+        summary = "pipetaskInit:1;label1:2;label2:2;finalJob:1"
+        job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_job_summary": summary}
+        results = _get_run_summary(job_ad)
+        self.assertEqual(results, summary)
+
+    def testRunSummaryInJobAd(self):
+        summary = "pipetaskInit:1;label1:2;label2:2;finalJob:1"
+        job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_run_summary": summary}
+        results = _get_run_summary(job_ad)
+        self.assertEqual(results, summary)
+
+    def testSummaryFromDag(self):
+        with temporaryDirectory() as tmp_dir:
+            copy2(f"{TESTDIR}/data/good.dag", tmp_dir)
+            job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "Iwd": tmp_dir}
+            results = _get_run_summary(job_ad)
+            self.assertEqual(results, "pipetaskInit:1;label1:1;label2:1;label3:1;finalJob:1")
+
+    def testSummaryNoDag(self):
+        with self.assertLogs(logger=logger, level="WARNING") as cm:
+            with temporaryDirectory() as tmp_dir:
+                job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "Iwd": tmp_dir}
+                results = _get_run_summary(job_ad)
+        self.assertEqual(results, "")
+        self.assertIn("lsst.ctrl.bps.htcondor", cm.records[0].name)
+        self.assertIn("Could not get run summary for htcondor job", cm.output[0])
+
+
+class IsServiceJobTestCase(unittest.TestCase):
+    """Test is_service_job function."""
+
+    def testNotServiceJob(self):
+        job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_job_type": "payload"}
+        self.assertFalse(is_service_job(job_ad))
+
+    def testIsServiceJob(self):
+        job_ad = {"ClusterId": 8659, "DAGNodeName": "testJob", "bps_job_type": "service"}
+        self.assertTrue(is_service_job(job_ad))
+
+    def testMissingBpsType(self):
+        job_ad = {
+            "ClusterId": 8659,
+            "DAGNodeName": "testJob",
+        }
+        self.assertFalse(is_service_job(job_ad))
+
+
+class CreateDetailedReportFromJobsTestCase(unittest.TestCase):
+    """Test _create_detailed_report_from_jobs function."""
+
+    def testTinySuccess(self):
+        with temporaryDirectory() as tmp_dir:
+            test_submit_dir = os.path.join(tmp_dir, "tiny_success")
+            copytree(f"{TESTDIR}/data/tiny_success", test_submit_dir)
+            wms_workflow_id, jobs, message = _get_info_from_path(test_submit_dir)
+            run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
+            self.assertEqual(len(run_reports), 1)
+            report = run_reports[wms_workflow_id]
+            self.assertEqual(report.wms_id, wms_workflow_id)
+            self.assertEqual(report.state, WmsStates.SUCCEEDED)
+            self.assertTrue(os.path.samefile(report.path, test_submit_dir))
+            self.assertEqual(report.run_summary, "pipetaskInit:1;label1:1;label2:1;finalJob:1")
+            self.assertEqual(
+                report.job_state_counts,
+                {
+                    WmsStates.UNKNOWN: 0,
+                    WmsStates.MISFIT: 0,
+                    WmsStates.UNREADY: 0,
+                    WmsStates.READY: 0,
+                    WmsStates.PENDING: 0,
+                    WmsStates.RUNNING: 0,
+                    WmsStates.DELETED: 0,
+                    WmsStates.HELD: 0,
+                    WmsStates.SUCCEEDED: 4,
+                    WmsStates.FAILED: 0,
+                    WmsStates.PRUNED: 0,
+                },
+            )
+            self.assertEqual(
+                report.specific_info.context,
+                {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""},
+            )
+
+    def testTinyProblems(self):
+        with temporaryDirectory() as tmp_dir:
+            test_submit_dir = os.path.join(tmp_dir, "tiny_problems")
+            copytree(f"{TESTDIR}/data/tiny_problems", test_submit_dir)
+            wms_workflow_id, jobs, message = _get_info_from_path(test_submit_dir)
+            run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
+            self.assertEqual(len(run_reports), 1)
+            report = run_reports[wms_workflow_id]
+            self.assertEqual(report.wms_id, wms_workflow_id)
+            self.assertEqual(report.state, WmsStates.FAILED)
+            self.assertTrue(os.path.samefile(report.path, test_submit_dir))
+            self.assertEqual(report.run_summary, "pipetaskInit:1;label1:2;label2:2;finalJob:1")
+            self.assertEqual(
+                report.job_state_counts,
+                {
+                    WmsStates.UNKNOWN: 0,
+                    WmsStates.MISFIT: 0,
+                    WmsStates.UNREADY: 0,
+                    WmsStates.READY: 0,
+                    WmsStates.PENDING: 0,
+                    WmsStates.RUNNING: 0,
+                    WmsStates.DELETED: 0,
+                    WmsStates.HELD: 0,
+                    WmsStates.SUCCEEDED: 4,
+                    WmsStates.FAILED: 1,
+                    WmsStates.PRUNED: 1,
+                },
+            )
+            self.assertEqual(
+                run_reports[wms_workflow_id].specific_info.context,
+                {"job_name": "provisioningJob", "status": "SUCCEEDED", "status_details": ""},
+            )
+
+    def testTinyRunning(self):
+        with temporaryDirectory() as tmp_dir:
+            test_submit_dir = os.path.join(tmp_dir, "tiny_running")
+            copytree(f"{TESTDIR}/data/tiny_running", test_submit_dir)
+            wms_workflow_id, jobs, message = _get_info_from_path(test_submit_dir)
+            run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
+            self.assertEqual(len(run_reports), 1)
+            report = run_reports[wms_workflow_id]
+            self.assertEqual(report.wms_id, wms_workflow_id)
+            self.assertEqual(report.state, WmsStates.RUNNING)
+            self.assertTrue(os.path.samefile(report.path, test_submit_dir))
+            self.assertEqual(report.run_summary, "pipetaskInit:1;label1:1;label2:1;finalJob:1")
+            self.assertEqual(
+                report.job_state_counts,
+                {
+                    WmsStates.UNKNOWN: 0,
+                    WmsStates.MISFIT: 0,
+                    WmsStates.UNREADY: 2,
+                    WmsStates.READY: 0,
+                    WmsStates.PENDING: 0,
+                    WmsStates.RUNNING: 1,
+                    WmsStates.DELETED: 0,
+                    WmsStates.HELD: 0,
+                    WmsStates.SUCCEEDED: 1,
+                    WmsStates.FAILED: 0,
+                    WmsStates.PRUNED: 0,
+                },
+            )
+            self.assertEqual(
+                report.specific_info.context,
+                {"job_name": "provisioningJob", "status": "RUNNING", "status_details": ""},
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
{lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/tests/test_lssthtc.py
RENAMED

@@ -31,7 +31,7 @@ import os
 import pathlib
 import tempfile
 import unittest
-from shutil import copy2
+from shutil import copy2, rmtree
 
 import htcondor
 
@@ -197,22 +197,23 @@ class HtcCheckDagmanOutputTestCase(unittest.TestCase):
         self.assertEqual("", results)
 
 
-class SummaryFromDagTestCase(unittest.TestCase):
-    """Test
+class SummarizeDagTestCase(unittest.TestCase):
+    """Test summarize_dag function."""
 
     def test_no_dag_file(self):
         with temporaryDirectory() as tmp_dir:
-            summary, job_name_to_pipetask = lssthtc.summary_from_dag(tmp_dir)
+            summary, job_name_to_pipetask, job_name_to_type = lssthtc.summarize_dag(tmp_dir)
             self.assertFalse(len(job_name_to_pipetask))
+            self.assertFalse(len(job_name_to_type))
             self.assertFalse(summary)
 
     def test_success(self):
         with temporaryDirectory() as tmp_dir:
             copy2(f"{TESTDIR}/data/good.dag", tmp_dir)
-            summary,
+            summary, job_name_to_label, job_name_to_type = lssthtc.summarize_dag(tmp_dir)
             self.assertEqual(summary, "pipetaskInit:1;label1:1;label2:1;label3:1;finalJob:1")
             self.assertEqual(
-
+                job_name_to_label,
                 {
                     "pipetaskInit": "pipetaskInit",
                     "0682f8f9-12f0-40a5-971e-8b30c7231e5c_label1_val1_val2": "label1",
@@ -221,6 +222,98 @@ class SummaryFromDagTestCase(unittest.TestCase):
                     "finalJob": "finalJob",
                 },
             )
+            self.assertEqual(
+                job_name_to_type,
+                {
+                    "pipetaskInit": "payload",
+                    "0682f8f9-12f0-40a5-971e-8b30c7231e5c_label1_val1_val2": "payload",
+                    "d0305e2d-f164-4a85-bd24-06afe6c84ed9_label2_val1_val2": "payload",
+                    "2806ecc9-1bba-4362-8fff-ab4e6abb9f83_label3_val1_val2": "payload",
+                    "finalJob": "final",
+                },
+            )
+
+    def test_service(self):
+        with temporaryDirectory() as tmp_dir:
+            copy2(f"{TESTDIR}/data/tiny_problems/tiny_problems.dag", tmp_dir)
+            summary, job_name_to_label, job_name_to_type = lssthtc.summarize_dag(tmp_dir)
+            self.assertEqual(summary, "pipetaskInit:1;label1:2;label2:2;finalJob:1")
+            self.assertEqual(
+                job_name_to_label,
+                {
+                    "pipetaskInit": "pipetaskInit",
+                    "4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a": "label1",
+                    "057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b": "label1",
+                    "696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a": "label2",
+                    "40040b97-606d-4997-98d3-e0493055fe7e_label2_val1_val2b": "label2",
+                    "finalJob": "finalJob",
+                    "provisioningJob": "provisioningJob",
+                },
+            )
+            self.assertEqual(
+                job_name_to_type,
+                {
+                    "pipetaskInit": "payload",
+                    "4a7f478b-2e9b-435c-a730-afac3f621658_label1_val1_val2a": "payload",
+                    "057c8caf-66f6-4612-abf7-cdea5b666b1b_label1_val1_val2b": "payload",
+                    "696ee50d-e711-40d6-9caf-ee29ae4a656d_label2_val1_val2a": "payload",
+                    "40040b97-606d-4997-98d3-e0493055fe7e_label2_val1_val2b": "payload",
+                    "finalJob": "final",
+                    "provisioningJob": "service",
+                },
+            )
+
+
+class ReadDagNodesLogTestCase(unittest.TestCase):
+    """Test read_dag_nodes_log function."""
+
+    def setUp(self):
+        self.tmpdir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        rmtree(self.tmpdir, ignore_errors=True)
+
+    def testFileMissing(self):
+        with self.assertRaisesRegex(FileNotFoundError, "DAGMan node log not found in"):
+            _, _ = lssthtc.read_dag_nodes_log(self.tmpdir)
+
+
+class ReadNodeStatusTestCase(unittest.TestCase):
+    """Test read_node_status function."""
+
+    def setUp(self):
+        self.tmpdir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        rmtree(self.tmpdir, ignore_errors=True)
+
+    def testServiceJobNotSubmitted(self):
+        # tiny_prov_no_submit files have successful workflow
+        # but provisioningJob could not submit.
+        copy2(f"{TESTDIR}/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.nodes.log", self.tmpdir)
+        copy2(f"{TESTDIR}/data/tiny_prov_no_submit/tiny_prov_no_submit.dag.dagman.log", self.tmpdir)
+        copy2(f"{TESTDIR}/data/tiny_prov_no_submit/tiny_prov_no_submit.node_status", self.tmpdir)
+        copy2(f"{TESTDIR}/data/tiny_prov_no_submit/tiny_prov_no_submit.dag", self.tmpdir)
+
+        jobs = lssthtc.read_node_status(self.tmpdir)
+        found = [id_ for id_ in jobs if jobs[id_].get("bps_job_type", "MISS") == "service"]
+        self.assertEqual(len(found), 1)
+        self.assertEqual(jobs[found[0]]["DAGNodeName"], "provisioningJob")
+        self.assertEqual(jobs[found[0]]["NodeStatus"], lssthtc.NodeStatus.NOT_READY)
+
+    def testMissingStatusFile(self):
+        copy2(f"{TESTDIR}/data/tiny_problems/tiny_problems.dag.nodes.log", self.tmpdir)
+        copy2(f"{TESTDIR}/data/tiny_problems/tiny_problems.dag.dagman.log", self.tmpdir)
+        copy2(f"{TESTDIR}/data/tiny_problems/tiny_problems.dag", self.tmpdir)
+
+        jobs = lssthtc.read_node_status(self.tmpdir)
+        self.assertEqual(len(jobs), 7)
+        self.assertEqual(jobs["9230.0"]["DAGNodeName"], "pipetaskInit")
+        self.assertEqual(jobs["9230.0"]["bps_job_type"], "payload")
+        self.assertEqual(jobs["9230.0"]["JobStatus"], lssthtc.JobStatus.COMPLETED)
+        found = [id_ for id_ in jobs if jobs[id_].get("bps_job_type", "MISS") == "service"]
+        self.assertEqual(len(found), 1)
+        self.assertEqual(jobs[found[0]]["DAGNodeName"], "provisioningJob")
 
 
 if __name__ == "__main__":

The remaining files listed above with "+0 -0" were renamed between versions without content changes, including:

{lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/tests/test_handlers.py
RENAMED

File without changes

{lsst_ctrl_bps_htcondor-28.2025.700 → lsst_ctrl_bps_htcondor-28.2025.900}/tests/test_provisioner.py
RENAMED

File without changes