lsst-ctrl-bps-htcondor 29.2025.3800__py3-none-any.whl → 29.2025.4100__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,842 @@
1
+ # This file is part of ctrl_bps_htcondor.
2
+ #
3
+ # Developed for the LSST Data Management System.
4
+ # This product includes software developed by the LSST Project
5
+ # (https://www.lsst.org).
6
+ # See the COPYRIGHT file at the top-level directory of this distribution
7
+ # for details of code ownership.
8
+ #
9
+ # This software is dual licensed under the GNU General Public License and also
10
+ # under a 3-clause BSD license. Recipients may choose which of these licenses
11
+ # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12
+ # respectively. If you choose the GPL option then the following text applies
13
+ # (but note that there is still no warranty even if you opt for BSD instead):
14
+ #
15
+ # This program is free software: you can redistribute it and/or modify
16
+ # it under the terms of the GNU General Public License as published by
17
+ # the Free Software Foundation, either version 3 of the License, or
18
+ # (at your option) any later version.
19
+ #
20
+ # This program is distributed in the hope that it will be useful,
21
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23
+ # GNU General Public License for more details.
24
+ #
25
+ # You should have received a copy of the GNU General Public License
26
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
27
+
28
+ """Utility functions used for reporting."""
29
+
30
+ import logging
31
+ import os
32
+ import re
33
+ from pathlib import Path
34
+ from typing import Any
35
+
36
+ import htcondor
37
+
38
+ from lsst.ctrl.bps import (
39
+ WmsJobReport,
40
+ WmsRunReport,
41
+ WmsSpecificInfo,
42
+ WmsStates,
43
+ )
44
+
45
+ from .common_utils import _htc_status_to_wms_state
46
+ from .lssthtc import (
47
+ MISSING_ID,
48
+ WmsNodeType,
49
+ condor_search,
50
+ htc_check_dagman_output,
51
+ pegasus_name_to_label,
52
+ read_dag_info,
53
+ read_dag_log,
54
+ read_dag_status,
55
+ read_node_status,
56
+ summarize_dag,
57
+ )
58
+
59
+ _LOG = logging.getLogger(__name__)
60
+
61
+
62
+ def _get_status_from_id(
63
+ wms_workflow_id: str, hist: float, schedds: dict[str, htcondor.Schedd]
64
+ ) -> tuple[WmsStates, str]:
65
+ """Gather run information using workflow id.
66
+
67
+ Parameters
68
+ ----------
69
+ wms_workflow_id : `str`
70
+ Limit to specific run based on id.
71
+ hist : `float`
72
+ Limit history search to this many days.
73
+ schedds : `dict` [ `str`, `htcondor.Schedd` ]
74
+ HTCondor schedulers to query for job information. If the dictionary is
75
+ empty, all queries will be run against the local scheduler only.
76
+
77
+ Returns
78
+ -------
79
+ state : `lsst.ctrl.bps.WmsStates`
80
+ Status for the corresponding run.
81
+ message : `str`
82
+ Message with extra error information.
83
+ """
84
+ _LOG.debug("_get_status_from_id: id=%s, hist=%s, schedds=%s", wms_workflow_id, hist, schedds)
85
+
86
+ message = ""
87
+
88
+ # Collect information about the job by querying HTCondor schedd and
89
+ # HTCondor history.
90
+ schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
91
+ if len(schedd_dag_info) == 1:
92
+ schedd_name = next(iter(schedd_dag_info))
93
+ dag_id = next(iter(schedd_dag_info[schedd_name]))
94
+ dag_ad = schedd_dag_info[schedd_name][dag_id]
95
+ state = _htc_status_to_wms_state(dag_ad)
96
+ else:
97
+ state = WmsStates.UNKNOWN
98
+ message = f"DAGMan job {wms_workflow_id} not found in queue or history. Check id or try path."
99
+ return state, message
100
+
101
+
102
+ def _get_status_from_path(wms_path: str | os.PathLike) -> tuple[WmsStates, str]:
103
+ """Gather run status from a given run directory.
104
+
105
+ Parameters
106
+ ----------
107
+ wms_path : `str` | `os.PathLike`
108
+ The directory containing the submit side files (e.g., HTCondor files).
109
+
110
+ Returns
111
+ -------
112
+ state : `lsst.ctrl.bps.WmsStates`
113
+ Status for the run.
114
+ message : `str`
115
+ Message to be printed.
116
+ """
117
+ wms_path = Path(wms_path).resolve()
118
+ message = ""
119
+ try:
120
+ wms_workflow_id, dag_ad = read_dag_log(wms_path)
121
+ except FileNotFoundError:
122
+ wms_workflow_id = MISSING_ID
123
+ message = f"DAGMan log not found in {wms_path}. Check path."
124
+
125
+ if wms_workflow_id == MISSING_ID:
126
+ state = WmsStates.UNKNOWN
127
+ else:
128
+ state = _htc_status_to_wms_state(dag_ad[wms_workflow_id])
129
+
130
+ return state, message
131
+
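# A minimal usage sketch (not part of the packaged module; the submit path
# below is made up) showing how the two return values are typically consumed.
def _example_status_from_path() -> None:
    state, message = _get_status_from_path("/scratch/submit/u/someone/run1")
    if state == WmsStates.UNKNOWN and message:
        _LOG.info(message)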
132
+
133
+ def _report_from_path(wms_path):
134
+ """Gather run information from a given run directory.
135
+
136
+ Parameters
137
+ ----------
138
+ wms_path : `str`
139
+ The directory containing the submit side files (e.g., HTCondor files).
140
+
141
+ Returns
142
+ -------
143
+ run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
144
+ Run information for the detailed report. The key is the HTCondor id
145
+ and the value is a collection of report information for that run.
146
+ message : `str`
147
+ Message to be printed with the summary report.
148
+ """
149
+ wms_workflow_id, jobs, message = _get_info_from_path(wms_path)
150
+ if wms_workflow_id == MISSING_ID:
151
+ run_reports = {}
152
+ else:
153
+ run_reports = _create_detailed_report_from_jobs(wms_workflow_id, jobs)
154
+ return run_reports, message
155
+
156
+
157
+ def _report_from_id(wms_workflow_id, hist, schedds=None):
158
+ """Gather run information using workflow id.
159
+
160
+ Parameters
161
+ ----------
162
+ wms_workflow_id : `str`
163
+ Limit to specific run based on id.
164
+ hist : `float`
165
+ Limit history search to this many days.
166
+ schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
167
+ HTCondor schedulers which to query for job information. If None
168
+ (default), all queries will be run against the local scheduler only.
169
+
170
+ Returns
171
+ -------
172
+ run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
173
+ Run information for the detailed report. The key is the HTCondor id
174
+ and the value is a collection of report information for that run.
175
+ message : `str`
176
+ Message to be printed with the summary report.
177
+ """
178
+ messages = []
179
+
180
+ # Collect information about the job by querying HTCondor schedd and
181
+ # HTCondor history.
182
+ schedd_dag_info = _get_info_from_schedd(wms_workflow_id, hist, schedds)
183
+ if len(schedd_dag_info) == 1:
184
+ # Extract the DAG info without altering the results of the query.
185
+ schedd_name = next(iter(schedd_dag_info))
186
+ dag_id = next(iter(schedd_dag_info[schedd_name]))
187
+ dag_ad = schedd_dag_info[schedd_name][dag_id]
188
+
189
+ # If the provided workflow id does not correspond to the one extracted
190
+ # from the DAGMan log file in the submit directory, rerun the query
191
+ # with the id found in the file.
192
+ #
193
+ # This is to cover the situation in which the user provided the old job
194
+ # id of a restarted run.
195
+ try:
196
+ path_dag_id, _ = read_dag_log(dag_ad["Iwd"])
197
+ except FileNotFoundError as exc:
198
+ # At the moment missing DAGMan log is pretty much a fatal error.
199
+ # So empty the DAG info to finish early (see the if statement
200
+ # below).
201
+ schedd_dag_info.clear()
202
+ messages.append(f"Cannot create the report for '{dag_id}': {exc}")
203
+ else:
204
+ if path_dag_id != dag_id:
205
+ schedd_dag_info = _get_info_from_schedd(path_dag_id, hist, schedds)
206
+ messages.append(
207
+ f"WARNING: Found newer workflow executions in same submit directory as id '{dag_id}'. "
208
+ "This normally occurs when a run is restarted. The report shown is for the most "
209
+ f"recent status with run id '{path_dag_id}'"
210
+ )
211
+
212
+ if len(schedd_dag_info) == 0:
213
+ run_reports = {}
214
+ elif len(schedd_dag_info) == 1:
215
+ _, dag_info = schedd_dag_info.popitem()
216
+ dag_id, dag_ad = dag_info.popitem()
217
+
218
+ # Create a mapping between jobs and their classads. The keys will
219
+ # be of format 'ClusterId.ProcId'.
220
+ job_info = {dag_id: dag_ad}
221
+
222
+ # Find jobs (nodes) belonging to that DAGMan job.
223
+ job_constraint = f"DAGManJobId == {int(float(dag_id))}"
224
+ schedd_job_info = condor_search(constraint=job_constraint, hist=hist, schedds=schedds)
225
+ if schedd_job_info:
226
+ _, node_info = schedd_job_info.popitem()
227
+ job_info.update(node_info)
228
+
229
+ # Collect additional pieces of information about jobs using HTCondor
230
+ # files in the submission directory.
231
+ _, path_jobs, message = _get_info_from_path(dag_ad["Iwd"])
232
+ _update_jobs(job_info, path_jobs)
233
+ if message:
234
+ messages.append(message)
235
+ run_reports = _create_detailed_report_from_jobs(dag_id, job_info)
236
+ else:
237
+ ids = [ad["GlobalJobId"] for dag_info in schedd_dag_info.values() for ad in dag_info.values()]
238
+ message = (
239
+ f"More than one job matches id '{wms_workflow_id}', "
240
+ f"their global ids are: {', '.join(ids)}. Rerun with one of the global ids"
241
+ )
242
+ messages.append(message)
243
+ run_reports = {}
244
+
245
+ message = "\n".join(messages)
246
+ return run_reports, message
247
+
248
+
249
+ def _get_info_from_schedd(
250
+ wms_workflow_id: str, hist: float, schedds: dict[str, htcondor.Schedd]
251
+ ) -> dict[str, dict[str, dict[str, Any]]]:
252
+ """Gather run information from HTCondor.
253
+
254
+ Parameters
255
+ ----------
256
+ wms_workflow_id : `str`
257
+ Limit to specific run based on id.
258
+ hist : `float`
259
+ Limit history search to this many days.
260
+ schedds : `dict` [ `str`, `htcondor.Schedd` ]
261
+ HTCondor schedulers which to query for job information. If empty
262
+ dictionary, all queries will be run against the local scheduler only.
263
+
264
+ Returns
265
+ -------
266
+ schedd_dag_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, Any]]]
267
+ Information about jobs satisfying the search criteria. For each
268
+ scheduler, local HTCondor job ids are mapped to their respective
269
+ classads.
270
+ """
271
+ _LOG.debug("_get_info_from_schedd: id=%s, hist=%s, schedds=%s", wms_workflow_id, hist, schedds)
272
+
273
+ dag_constraint = 'regexp("dagman$", Cmd)'
274
+ try:
275
+ cluster_id = int(float(wms_workflow_id))
276
+ except ValueError:
277
+ dag_constraint += f' && GlobalJobId == "{wms_workflow_id}"'
278
+ else:
279
+ dag_constraint += f" && ClusterId == {cluster_id}"
280
+
281
+ # With the current implementation of the condor_* functions the query
282
+ # will always return only one match per Scheduler.
283
+ #
284
+ # Even in the highly unlikely situation where HTCondor history (which
285
+ # condor_search queries too) is long enough to have jobs from before
286
+ the cluster ids were rolled over (and as a result there is more than
287
+ # one job with the same cluster id) they will not show up in
288
+ # the results.
289
+ schedd_dag_info = condor_search(constraint=dag_constraint, hist=hist, schedds=schedds)
290
+ return schedd_dag_info
291
+
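# Illustrative only (not part of the packaged module): the two constraint
# forms built above, for a made-up numeric id and a made-up global job id.
# Both "1234" and "1234.0" normalise to the ClusterId form via int(float(...)).
_EXAMPLE_DAG_CONSTRAINTS = (
    'regexp("dagman$", Cmd) && ClusterId == 1234',
    'regexp("dagman$", Cmd) && GlobalJobId == "sched01#1234.0#1700000000"',
)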
292
+
293
+ def _get_info_from_path(wms_path: str | os.PathLike) -> tuple[str, dict[str, dict[str, Any]], str]:
294
+ """Gather run information from a given run directory.
295
+
296
+ Parameters
297
+ ----------
298
+ wms_path : `str` or `os.PathLike`
299
+ Directory containing HTCondor files.
300
+
301
+ Returns
302
+ -------
303
+ wms_workflow_id : `str`
304
+ The run id, which is a DAGMan job id.
305
+ jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
306
+ Information about jobs read from files in the given directory.
307
+ The key is the HTCondor id and the value is a dictionary of HTCondor
308
+ keys and values.
309
+ message : `str`
310
+ Message to be printed with the summary report.
311
+ """
312
+ # Ensure the path is absolute, in particular for folks helping to
313
+ # debug failures who need to dig around the submit files.
314
+ wms_path = Path(wms_path).resolve()
315
+
316
+ messages = []
317
+ try:
318
+ wms_workflow_id, jobs = read_dag_log(wms_path)
319
+ _LOG.debug("_get_info_from_path: from dag log %s = %s", wms_workflow_id, jobs)
320
+ _update_jobs(jobs, read_node_status(wms_path))
321
+ _LOG.debug("_get_info_from_path: after node status %s = %s", wms_workflow_id, jobs)
322
+
323
+ # Add more info for the DAGMan job.
324
+ job = jobs[wms_workflow_id]
325
+ job.update(read_dag_status(wms_path))
326
+
327
+ job["total_jobs"], job["state_counts"] = _get_state_counts_from_jobs(wms_workflow_id, jobs)
328
+ if "bps_run" not in job:
329
+ _add_run_info(wms_path, job)
330
+
331
+ message = htc_check_dagman_output(wms_path)
332
+ if message:
333
+ messages.append(message)
334
+ _LOG.debug(
335
+ "_get_info: id = %s, total_jobs = %s", wms_workflow_id, jobs[wms_workflow_id]["total_jobs"]
336
+ )
337
+
338
+ # Add extra pieces of information which cannot be found in HTCondor
339
+ # generated files like 'GlobalJobId'.
340
+ #
341
+ # Do not treat the absence of this file as a serious error. Neither runs
342
+ # submitted with earlier versions of the plugin nor runs submitted with
343
+ # the Pegasus plugin will have it at the moment. However, once enough
344
+ # time passes and the Pegasus plugin has its own report() method
345
+ # (instead of sneakily using HTCondor's), the lack of that file should
346
+ # be treated as seriously as the lack of any other file.
347
+ try:
348
+ job_info = read_dag_info(wms_path)
349
+ except FileNotFoundError as exc:
350
+ message = f"Warn: Some information may not be available: {exc}"
351
+ messages.append(message)
352
+ else:
353
+ schedd_name = next(iter(job_info))
354
+ job_ad = next(iter(job_info[schedd_name].values()))
355
+ job.update(job_ad)
356
+ except FileNotFoundError as err:
357
+ message = f"Could not find HTCondor files in '{wms_path}' ({err})"
358
+ _LOG.debug(message)
359
+ messages.append(message)
360
+ message = htc_check_dagman_output(wms_path)
361
+ if message:
362
+ messages.append(message)
363
+ wms_workflow_id = MISSING_ID
364
+ jobs = {}
365
+
366
+ message = "\n".join([msg for msg in messages if msg])
367
+ _LOG.debug("wms_workflow_id = %s, jobs = %s", wms_workflow_id, jobs.keys())
368
+ _LOG.debug("message = %s", message)
369
+ return wms_workflow_id, jobs, message
370
+
371
+
372
+ def _create_detailed_report_from_jobs(
373
+ wms_workflow_id: str, jobs: dict[str, dict[str, Any]]
374
+ ) -> dict[str, WmsRunReport]:
375
+ """Gather run information to be used in generating summary reports.
376
+
377
+ Parameters
378
+ ----------
379
+ wms_workflow_id : `str`
380
+ The run id to create the report for.
381
+ jobs : `dict` [`str`, `dict` [`str`, Any]]
382
+ Mapping HTCondor job id to job information.
383
+
384
+ Returns
385
+ -------
386
+ run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
387
+ Run information for the detailed report. The key is the given HTCondor
388
+ id and the value is a collection of report information for that run.
389
+ """
390
+ _LOG.debug("_create_detailed_report: id = %s, job = %s", wms_workflow_id, jobs[wms_workflow_id])
391
+
392
+ dag_ad = jobs[wms_workflow_id]
393
+
394
+ report = WmsRunReport(
395
+ wms_id=f"{dag_ad['ClusterId']}.{dag_ad['ProcId']}",
396
+ global_wms_id=dag_ad.get("GlobalJobId", "MISS"),
397
+ path=dag_ad["Iwd"],
398
+ label=dag_ad.get("bps_job_label", "MISS"),
399
+ run=dag_ad.get("bps_run", "MISS"),
400
+ project=dag_ad.get("bps_project", "MISS"),
401
+ campaign=dag_ad.get("bps_campaign", "MISS"),
402
+ payload=dag_ad.get("bps_payload", "MISS"),
403
+ operator=_get_owner(dag_ad),
404
+ run_summary=_get_run_summary(dag_ad),
405
+ state=_htc_status_to_wms_state(dag_ad),
406
+ total_number_jobs=0,
407
+ jobs=[],
408
+ job_state_counts=dict.fromkeys(WmsStates, 0),
409
+ exit_code_summary={},
410
+ )
411
+
412
+ payload_jobs = {} # keep track for later processing
413
+ specific_info = WmsSpecificInfo()
414
+ for job_id, job_ad in jobs.items():
415
+ if job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [WmsNodeType.PAYLOAD, WmsNodeType.FINAL]:
416
+ try:
417
+ name = job_ad.get("DAGNodeName", job_id)
418
+ wms_state = _htc_status_to_wms_state(job_ad)
419
+ job_report = WmsJobReport(
420
+ wms_id=job_id,
421
+ name=name,
422
+ label=job_ad.get("bps_job_label", pegasus_name_to_label(name)),
423
+ state=wms_state,
424
+ )
425
+ if job_report.label == "init":
426
+ job_report.label = "pipetaskInit"
427
+ report.job_state_counts[wms_state] += 1
428
+ report.jobs.append(job_report)
429
+ payload_jobs[job_id] = job_ad
430
+ except KeyError as ex:
431
+ _LOG.error("Job missing key '%s': %s", str(ex), job_ad)
432
+ raise
433
+ elif is_service_job(job_ad):
434
+ _LOG.debug(
435
+ "Found service job: id='%s', name='%s', label='%s', NodeStatus='%s', JobStatus='%s'",
436
+ job_id,
437
+ job_ad["DAGNodeName"],
438
+ job_ad.get("bps_job_label", "MISS"),
439
+ job_ad.get("NodeStatus", "MISS"),
440
+ job_ad.get("JobStatus", "MISS"),
441
+ )
442
+ _add_service_job_specific_info(job_ad, specific_info)
443
+
444
+ report.total_number_jobs = len(payload_jobs)
445
+ report.exit_code_summary = _get_exit_code_summary(payload_jobs)
446
+ if specific_info:
447
+ report.specific_info = specific_info
448
+
449
+ # The workflow will exit with a non-zero DAG_STATUS if there is a problem
450
+ # with any of the WMS jobs, so change FAILED to SUCCEEDED if all payload
451
+ # jobs SUCCEEDED.
452
+ if report.total_number_jobs == report.job_state_counts[WmsStates.SUCCEEDED]:
453
+ report.state = WmsStates.SUCCEEDED
454
+
455
+ run_reports = {report.wms_id: report}
456
+ _LOG.debug("_create_detailed_report: run_reports = %s", run_reports)
457
+ return run_reports
458
+
459
+
460
+ def _add_service_job_specific_info(job_ad: dict[str, Any], specific_info: WmsSpecificInfo) -> None:
461
+ """Generate report information for service job.
462
+
463
+ Parameters
464
+ ----------
465
+ job_ad : `dict` [`str`, `~typing.Any`]
466
+ Service job information.
467
+ specific_info : `lsst.ctrl.bps.WmsSpecificInfo`
468
+ Where to add message.
469
+ """
470
+ status_details = ""
471
+ job_status = _htc_status_to_wms_state(job_ad)
472
+
473
+ # Service jobs still in the queue are deleted when the DAG is done.
474
+ # To get an accurate status, other information needs to be checked.
475
+ if (
476
+ job_status == WmsStates.DELETED
477
+ and "Reason" in job_ad
478
+ and (
479
+ "Removed by DAGMan" in job_ad["Reason"]
480
+ or "removed because <OtherJobRemoveRequirements = DAGManJobId =?=" in job_ad["Reason"]
481
+ or "DAG is exiting and writing rescue file." in job_ad["Reason"]
482
+ )
483
+ ):
484
+ if "HoldReason" in job_ad:
485
+ # HoldReason exists even if released, so check.
486
+ if "job_released_time" in job_ad and job_ad["job_held_time"] < job_ad["job_released_time"]:
487
+ # If released, assume running until deleted.
488
+ job_status = WmsStates.SUCCEEDED
489
+ status_details = ""
490
+ else:
491
+ # If the job was held when DAGMan deleted it, still
492
+ # want to report the hold reason.
493
+ status_details = f"(Job was held for the following reason: {job_ad['HoldReason']})"
494
+
495
+ else:
496
+ job_status = WmsStates.SUCCEEDED
497
+ elif job_status == WmsStates.SUCCEEDED:
498
+ status_details = "(Note: Finished before workflow.)"
499
+ elif job_status == WmsStates.HELD:
500
+ status_details = f"({job_ad['HoldReason']})"
501
+
502
+ template = "Status of {job_name}: {status} {status_details}"
503
+ context = {
504
+ "job_name": job_ad["DAGNodeName"],
505
+ "status": job_status.name,
506
+ "status_details": status_details,
507
+ }
508
+ specific_info.add_message(template=template, context=context)
509
+
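# Illustrative only (not part of the packaged module): the kind of message the
# template above produces for a made-up provisioning node that was held and
# then removed by DAGMan while the hold was still in effect.
_EXAMPLE_SERVICE_JOB_MESSAGE = (
    "Status of provisioningJob: DELETED "
    "(Job was held for the following reason: Failed to start provisioning)"
)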
510
+
511
+ def _summary_report(user, hist, pass_thru, schedds=None):
512
+ """Gather run information to be used in generating summary reports.
513
+
514
+ Parameters
515
+ ----------
516
+ user : `str`
517
+ Run lookup restricted to given user.
518
+ hist : `float`
519
+ How many previous days to search for run information.
520
+ pass_thru : `str`
521
+ Advanced users can define the HTCondor constraint to be used
522
+ when searching queue and history.
+ schedds : `dict` [ `str`, `htcondor.Schedd` ], optional
+ HTCondor schedulers to query for job information. If None
+ (default), all queries will be run against the local scheduler only.
523
+
524
+ Returns
525
+ -------
526
+ run_reports : `dict` [`str`, `lsst.ctrl.bps.WmsRunReport`]
527
+ Run information for the summary report. The keys are HTCondor ids and
528
+ the values are collections of report information for each run.
529
+ message : `str`
530
+ Message to be printed with the summary report.
531
+ """
532
+ # Only doing the summary report, so only look for DAGMan jobs.
533
+ if pass_thru:
534
+ constraint = pass_thru
535
+ else:
536
+ # Notes:
537
+ # * bps_isjob == 'True' isn't getting set for DAG jobs that are
538
+ # manually restarted.
539
+ # * Any job with DAGManJobId isn't a top-level DAGMan job.
540
+ constraint = 'bps_isjob == "True" && JobUniverse == 7'
541
+ if user:
542
+ constraint += f' && (Owner == "{user}" || bps_operator == "{user}")'
543
+
544
+ job_info = condor_search(constraint=constraint, hist=hist, schedds=schedds)
545
+
546
+ # Have list of DAGMan jobs, need to get run_report info.
547
+ run_reports = {}
548
+ msg = ""
549
+ for jobs in job_info.values():
550
+ for job_id, job in jobs.items():
551
+ total_jobs, state_counts = _get_state_counts_from_dag_job(job)
552
+ # If the counts weren't available from the queue information
553
+ # (e.g., Kerberos bug), try reading them from a file.
554
+ if total_jobs == 0:
555
+ try:
556
+ job.update(read_dag_status(job["Iwd"]))
557
+ total_jobs, state_counts = _get_state_counts_from_dag_job(job)
558
+ except StopIteration:
559
+ pass # don't kill the report if the HTCondor files can't be found
560
+
561
+ if "bps_run" not in job:
562
+ _add_run_info(job["Iwd"], job)
563
+ report = WmsRunReport(
564
+ wms_id=job_id,
565
+ global_wms_id=job["GlobalJobId"],
566
+ path=job["Iwd"],
567
+ label=job.get("bps_job_label", "MISS"),
568
+ run=job.get("bps_run", "MISS"),
569
+ project=job.get("bps_project", "MISS"),
570
+ campaign=job.get("bps_campaign", "MISS"),
571
+ payload=job.get("bps_payload", "MISS"),
572
+ operator=_get_owner(job),
573
+ run_summary=_get_run_summary(job),
574
+ state=_htc_status_to_wms_state(job),
575
+ jobs=[],
576
+ total_number_jobs=total_jobs,
577
+ job_state_counts=state_counts,
578
+ )
579
+ run_reports[report.global_wms_id] = report
580
+
581
+ return run_reports, msg
582
+
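# Illustrative only (not part of the packaged module): the default constraint
# built above when no pass-through constraint is given, for a made-up user.
_EXAMPLE_SUMMARY_CONSTRAINT = (
    'bps_isjob == "True" && JobUniverse == 7'
    ' && (Owner == "someone" || bps_operator == "someone")'
)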
583
+
584
+ def _add_run_info(wms_path, job):
585
+ """Find BPS run information elsewhere for runs without bps attributes.
586
+
587
+ Parameters
588
+ ----------
589
+ wms_path : `str`
590
+ Path to submit files for the run.
591
+ job : `dict` [`str`, `~typing.Any`]
592
+ HTCondor dag job information.
593
+
594
+ Raises
595
+ ------
596
+ StopIteration
597
+ If the file it is looking for cannot be found. Permission errors
599
+ are caught and the job's run is marked with an error.
599
+ """
600
+ path = Path(wms_path) / "jobs"
601
+ try:
602
+ subfile = next(path.glob("**/*.sub"))
603
+ except (StopIteration, PermissionError):
604
+ job["bps_run"] = "Unavailable"
605
+ else:
606
+ _LOG.debug("_add_run_info: subfile = %s", subfile)
607
+ try:
608
+ with open(subfile, encoding="utf-8") as fh:
609
+ for line in fh:
610
+ if line.startswith("+bps_"):
611
+ m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
612
+ if m:
613
+ _LOG.debug("Matching line: %s", line)
614
+ job[m.group(1)] = m.group(2).replace('"', "")
615
+ else:
616
+ _LOG.debug("Could not parse attribute: %s", line)
617
+ except PermissionError:
618
+ job["bps_run"] = "PermissionError"
619
+ _LOG.debug("After adding job = %s", job)
620
+
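# A minimal sketch (not part of the packaged module) of the submit-file lines
# the regular expression above is meant to pick up; the value is made up.
def _example_parse_bps_attribute(
    line: str = '+bps_run = "u_someone_pipeline_20240101T000000Z"',
) -> tuple[str, str] | None:
    m = re.match(r"\+(bps_[^\s]+)\s*=\s*(.+)$", line)
    if m:
        # Would yield ("bps_run", "u_someone_pipeline_20240101T000000Z").
        return m.group(1), m.group(2).replace('"', "")
    return None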
621
+
622
+ def _get_owner(job):
623
+ """Get the owner of a dag job.
624
+
625
+ Parameters
626
+ ----------
627
+ job : `dict` [`str`, `~typing.Any`]
628
+ HTCondor dag job information.
629
+
630
+ Returns
631
+ -------
632
+ owner : `str`
633
+ Owner of the dag job.
634
+ """
635
+ owner = job.get("bps_operator", None)
636
+ if not owner:
637
+ owner = job.get("Owner", None)
638
+ if not owner:
639
+ _LOG.warning("Could not get Owner from htcondor job: %s", job)
640
+ owner = "MISS"
641
+ return owner
642
+
643
+
644
+ def _get_run_summary(job):
645
+ """Get the run summary for a job.
646
+
647
+ Parameters
648
+ ----------
649
+ job : `dict` [`str`, `~typing.Any`]
650
+ HTCondor dag job information.
651
+
652
+ Returns
653
+ -------
654
+ summary : `str`
655
+ Number of jobs per PipelineTask label in approximate pipeline order.
656
+ Format: <label>:<count>[;<label>:<count>]+
657
+ """
658
+ summary = job.get("bps_job_summary", job.get("bps_run_summary", None))
659
+ if not summary:
660
+ summary, _, _ = summarize_dag(job["Iwd"])
661
+ if not summary:
662
+ _LOG.warning("Could not get run summary for htcondor job: %s", job)
663
+ _LOG.debug("_get_run_summary: summary=%s", summary)
664
+
665
+ # Work around summaries that sometimes use init instead of pipetaskInit.
666
+ summary = summary.replace("init:", "pipetaskInit:")
667
+
668
+ if "pegasus_version" in job and "pegasus" not in summary:
669
+ summary += ";pegasus:0"
670
+
671
+ return summary
672
+
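# Illustrative only (not part of the packaged module): a run summary in the
# format described above, with made-up task labels and counts.
_EXAMPLE_RUN_SUMMARY = "pipetaskInit:1;isr:96;characterizeImage:96;finalJob:1"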
673
+
674
+ def _get_exit_code_summary(jobs):
675
+ """Get the exit code summary for a run.
676
+
677
+ Parameters
678
+ ----------
679
+ jobs : `dict` [`str`, `dict` [`str`, Any]]
680
+ Mapping HTCondor job id to job information.
681
+
682
+ Returns
683
+ -------
684
+ summary : `dict` [`str`, `list` [`int`]]
685
+ Jobs' exit codes per job label.
686
+ """
687
+ summary = {}
688
+ for job_id, job_ad in jobs.items():
689
+ job_label = job_ad["bps_job_label"]
690
+ summary.setdefault(job_label, [])
691
+ try:
692
+ exit_code = 0
693
+ job_status = job_ad["JobStatus"]
694
+ match job_status:
695
+ case htcondor.JobStatus.COMPLETED | htcondor.JobStatus.HELD:
696
+ exit_code = job_ad["ExitSignal"] if job_ad["ExitBySignal"] else job_ad["ExitCode"]
697
+ case (
698
+ htcondor.JobStatus.IDLE
699
+ | htcondor.JobStatus.RUNNING
700
+ | htcondor.JobStatus.REMOVED
701
+ | htcondor.JobStatus.TRANSFERRING_OUTPUT
702
+ | htcondor.JobStatus.SUSPENDED
703
+ ):
704
+ pass
705
+ case _:
706
+ _LOG.debug("Unknown 'JobStatus' value ('%d') in classad for job '%s'", job_status, job_id)
707
+ if exit_code != 0:
708
+ summary[job_label].append(exit_code)
709
+ except KeyError as ex:
710
+ _LOG.debug("Attribute '%s' not found in the classad for job '%s'", ex, job_id)
711
+ return summary
712
+
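# A minimal sketch (not part of the packaged module) of the shape this helper
# returns: non-zero exit codes grouped by bps job label. Labels and codes are
# made up; labels with no failures map to empty lists.
_EXAMPLE_EXIT_CODE_SUMMARY: dict[str, list[int]] = {
    "pipetaskInit": [],
    "isr": [1, 139],
    "finalJob": [],
}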
713
+
714
+ def _get_state_counts_from_jobs(
715
+ wms_workflow_id: str, jobs: dict[str, dict[str, Any]]
716
+ ) -> tuple[int, dict[WmsStates, int]]:
717
+ """Count number of jobs per WMS state.
718
+
719
+ The workflow job and the service jobs are excluded from the count.
720
+
721
+ Parameters
722
+ ----------
723
+ wms_workflow_id : `str`
724
+ HTCondor job id.
725
+ jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
726
+ HTCondor dag job information.
727
+
728
+ Returns
729
+ -------
730
+ total_count : `int`
731
+ Total number of dag nodes.
732
+ state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
733
+ Keys are the different WMS states and values are counts of jobs
734
+ that are in that WMS state.
735
+ """
736
+ state_counts = dict.fromkeys(WmsStates, 0)
737
+ for job_id, job_ad in jobs.items():
738
+ if job_id != wms_workflow_id and job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) in [
739
+ WmsNodeType.PAYLOAD,
740
+ WmsNodeType.FINAL,
741
+ ]:
742
+ state_counts[_htc_status_to_wms_state(job_ad)] += 1
743
+ total_count = sum(state_counts.values())
744
+
745
+ return total_count, state_counts
746
+
747
+
748
+ def _get_state_counts_from_dag_job(job):
749
+ """Count number of jobs per WMS state.
750
+
751
+ Parameters
752
+ ----------
753
+ job : `dict` [`str`, `~typing.Any`]
754
+ HTCondor dag job information.
755
+
756
+ Returns
757
+ -------
758
+ total_count : `int`
759
+ Total number of dag nodes.
760
+ state_counts : `dict` [`lsst.ctrl.bps.WmsStates`, `int`]
761
+ Keys are the different WMS states and values are counts of jobs
762
+ that are in that WMS state.
763
+ """
764
+ _LOG.debug("_get_state_counts_from_dag_job: job = %s %s", type(job), len(job))
765
+ state_counts = dict.fromkeys(WmsStates, 0)
766
+ if "DAG_NodesReady" in job:
767
+ state_counts = {
768
+ WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
769
+ WmsStates.READY: job.get("DAG_NodesReady", 0),
770
+ WmsStates.HELD: job.get("DAG_JobsHeld", 0),
771
+ WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
772
+ WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
773
+ WmsStates.PRUNED: job.get("DAG_NodesFutile", 0),
774
+ WmsStates.MISFIT: job.get("DAG_NodesPre", 0) + job.get("DAG_NodesPost", 0),
775
+ }
776
+ total_jobs = job.get("DAG_NodesTotal")
777
+ _LOG.debug("_get_state_counts_from_dag_job: from DAG_* keys, total_jobs = %s", total_jobs)
778
+ elif "NodesFailed" in job:
779
+ state_counts = {
780
+ WmsStates.UNREADY: job.get("NodesUnready", 0),
781
+ WmsStates.READY: job.get("NodesReady", 0),
782
+ WmsStates.HELD: job.get("JobProcsHeld", 0),
783
+ WmsStates.SUCCEEDED: job.get("NodesDone", 0),
784
+ WmsStates.FAILED: job.get("NodesFailed", 0),
785
+ WmsStates.PRUNED: job.get("NodesFutile", 0),
786
+ WmsStates.MISFIT: job.get("NodesPre", 0) + job.get("NodesPost", 0),
787
+ }
788
+ try:
789
+ total_jobs = job["NodesTotal"]
790
+ except KeyError as ex:
791
+ _LOG.error("Job missing %s. job = %s", str(ex), job)
792
+ raise
793
+ _LOG.debug("_get_state_counts_from_dag_job: from NODES* keys, total_jobs = %s", total_jobs)
794
+ else:
795
+ # With Kerberos job auth and the Kerberos bug, a warning here would be
796
+ # printed for every DAG, so only log at debug level.
797
+ _LOG.debug("Can't get job state counts %s", job["Iwd"])
798
+ total_jobs = 0
799
+
800
+ _LOG.debug("total_jobs = %s, state_counts: %s", total_jobs, state_counts)
801
+ return total_jobs, state_counts
802
+
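# A minimal usage sketch (not part of the packaged module; the DAG_* values
# and path are made up) showing the first branch above, which reads the
# DAG_Nodes* attributes from a queued DAGMan job's classad.
def _example_state_counts_from_dag_job() -> None:
    dag_ad = {
        "Iwd": "/made/up/submit/path",
        "DAG_NodesTotal": 5,
        "DAG_NodesUnready": 0,
        "DAG_NodesReady": 0,
        "DAG_JobsHeld": 0,
        "DAG_NodesDone": 3,
        "DAG_NodesFailed": 1,
        "DAG_NodesFutile": 1,
        "DAG_NodesPre": 0,
        "DAG_NodesPost": 0,
    }
    total_jobs, state_counts = _get_state_counts_from_dag_job(dag_ad)
    # total_jobs == 5; state_counts[WmsStates.SUCCEEDED] == 3,
    # state_counts[WmsStates.FAILED] == 1, state_counts[WmsStates.PRUNED] == 1.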
803
+
804
+ def _update_jobs(jobs1, jobs2):
805
+ """Update jobs1 with info in jobs2.
806
+
807
+ (Basically an update for nested dictionaries.)
808
+
809
+ Parameters
810
+ ----------
811
+ jobs1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
812
+ HTCondor job information to be updated.
813
+ jobs2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
814
+ Additional HTCondor job information.
815
+ """
816
+ for job_id, job_ad in jobs2.items():
817
+ if job_id in jobs1:
818
+ jobs1[job_id].update(job_ad)
819
+ else:
820
+ jobs1[job_id] = job_ad
821
+
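# A minimal usage sketch (not part of the packaged module; ids and classad
# keys are made up) showing the nested-dictionary merge described above.
def _example_update_jobs() -> None:
    jobs1 = {"9157.0": {"JobStatus": 2}}
    jobs2 = {"9157.0": {"HoldReason": "spooling input"}, "9158.0": {"JobStatus": 1}}
    _update_jobs(jobs1, jobs2)
    # jobs1 == {"9157.0": {"JobStatus": 2, "HoldReason": "spooling input"},
    #           "9158.0": {"JobStatus": 1}}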
822
+
823
+ def is_service_job(job_ad: dict[str, Any]) -> bool:
824
+ """Determine if a job is a service one.
825
+
826
+ Parameters
827
+ ----------
828
+ job_ad : `dict` [`str`, Any]
829
+ Information about an HTCondor job.
830
+
831
+ Returns
832
+ -------
833
+ is_service_job : `bool`
834
+ True if the job is a service one, false otherwise.
835
+
836
+ Notes
837
+ -----
838
+ At the moment, HTCondor does not provide a native way to distinguish
839
+ between payload and service jobs in the workflow. This code depends
840
+ on read_node_status adding wms_node_type.
841
+ """
842
+ return job_ad.get("wms_node_type", WmsNodeType.UNKNOWN) == WmsNodeType.SERVICE
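# A minimal usage sketch (not part of the packaged module; the node name is
# made up). The classad must already carry the wms_node_type key added by
# read_node_status for the check to succeed.
def _example_is_service_job() -> bool:
    job_ad = {"DAGNodeName": "provisioningJob", "wms_node_type": WmsNodeType.SERVICE}
    return is_service_job(job_ad)  # True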