lsst-ctrl-bps-htcondor 29.2025.1300__py3-none-any.whl → 29.2025.1500__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/ctrl/bps/htcondor/htcondor_service.py +276 -124
- lsst/ctrl/bps/htcondor/lssthtc.py +848 -260
- lsst/ctrl/bps/htcondor/version.py +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.1300.dist-info → lsst_ctrl_bps_htcondor-29.2025.1500.dist-info}/METADATA +1 -1
- {lsst_ctrl_bps_htcondor-29.2025.1300.dist-info → lsst_ctrl_bps_htcondor-29.2025.1500.dist-info}/RECORD +12 -12
- {lsst_ctrl_bps_htcondor-29.2025.1300.dist-info → lsst_ctrl_bps_htcondor-29.2025.1500.dist-info}/WHEEL +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300.dist-info → lsst_ctrl_bps_htcondor-29.2025.1500.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300.dist-info → lsst_ctrl_bps_htcondor-29.2025.1500.dist-info}/licenses/LICENSE +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300.dist-info → lsst_ctrl_bps_htcondor-29.2025.1500.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300.dist-info → lsst_ctrl_bps_htcondor-29.2025.1500.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300.dist-info → lsst_ctrl_bps_htcondor-29.2025.1500.dist-info}/top_level.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.2025.1300.dist-info → lsst_ctrl_bps_htcondor-29.2025.1500.dist-info}/zip-safe +0 -0
@@ -38,9 +38,9 @@ __all__ = [
     "DagStatus",
     "HTCDag",
     "HTCJob",
-    "JobStatus",
     "NodeStatus",
     "RestrictedDict",
+    "WmsNodeType",
     "condor_history",
     "condor_q",
     "condor_search",
@@ -65,7 +65,6 @@ __all__ = [
     "read_node_status",
     "summarize_dag",
     "update_job_info",
-    "update_job_info",
     "write_dag_info",
 ]

@@ -77,23 +76,24 @@ import os
 import pprint
 import re
 import subprocess
-from collections import defaultdict
+from collections import Counter, defaultdict
 from collections.abc import MutableMapping
 from datetime import datetime, timedelta
-from enum import IntEnum
+from enum import IntEnum, auto
 from pathlib import Path
-from typing import Any
+from typing import Any, TextIO

 import classad
 import htcondor
 import networkx
+from deprecated.sphinx import deprecated
 from packaging import version

 from .handlers import HTC_JOB_AD_HANDLERS

 _LOG = logging.getLogger(__name__)

-MISSING_ID = -99999
+MISSING_ID = "-99999"


 class DagStatus(IntEnum):
@@ -108,6 +108,13 @@ class DagStatus(IntEnum):
     SUSPENDED = 6  # the DAG has been suspended (see section 2.10.8)


+@deprecated(
+    reason="The JobStatus is internally replaced by htcondor.JobStatus. "
+    "External reporting code should be using ctrl_bps.WmsStates. "
+    "This class will be removed after v30.",
+    version="v30.0",
+    category=FutureWarning,
+)
 class JobStatus(IntEnum):
     """HTCondor's statuses for jobs."""

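The hunk above marks `JobStatus` as deprecated using the third-party `Deprecated` package (`deprecated.sphinx.deprecated`), emitting a `FutureWarning` and appending a Sphinx `.. deprecated::` note to the docstring. A minimal standalone sketch of that pattern, assuming only that the `Deprecated` package is installed; `OldHelper` and the message are invented for illustration and are not the plugin's code:

```python
import warnings

from deprecated.sphinx import deprecated  # third-party "Deprecated" package


@deprecated(
    reason="Illustrative class used only to show the decorator behaviour.",
    version="v30.0",
    category=FutureWarning,
)
class OldHelper:
    """Hypothetical helper kept only for backward compatibility."""


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    OldHelper()  # instantiating the deprecated class emits the configured warning

print(OldHelper.__doc__)  # sphinx variant appends a ".. deprecated:: v30.0" directive
print([w.category.__name__ for w in caught])  # expect ['FutureWarning']
```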
@@ -155,6 +162,31 @@ class NodeStatus(IntEnum):
     FUTILE = 7


+class WmsNodeType(IntEnum):
+    """HTCondor plugin node types to help with payload reporting."""
+
+    UNKNOWN = auto()
+    """Dummy value when missing."""
+
+    PAYLOAD = auto()
+    """Payload job."""
+
+    FINAL = auto()
+    """Final job."""
+
+    SERVICE = auto()
+    """Service job."""
+
+    NOOP = auto()
+    """NOOP job used for ordering jobs."""
+
+    SUBDAG = auto()
+    """SUBDAG job used for ordering jobs."""
+
+    SUBDAG_CHECK = auto()
+    """Job used to correctly prune jobs after a subdag."""
+
+
 HTC_QUOTE_KEYS = {"environment"}
 HTC_VALID_JOB_KEYS = {
     "universe",
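The new `WmsNodeType` enum lets reporting code separate payload jobs from bookkeeping nodes (NOOP, SUBDAG, service, final). A small standalone sketch of how such a classification might be consumed; the enum is re-declared here so the snippet runs without the plugin installed, and the sample job table is invented:

```python
from collections import Counter
from enum import IntEnum, auto


class WmsNodeType(IntEnum):
    """Mirror of the node types added in this release (values via auto())."""

    UNKNOWN = auto()
    PAYLOAD = auto()
    FINAL = auto()
    SERVICE = auto()
    NOOP = auto()
    SUBDAG = auto()
    SUBDAG_CHECK = auto()


# Invented mapping, similar in shape to the job_name_to_type dicts built below.
job_name_to_type = {
    "pipetaskInit": WmsNodeType.PAYLOAD,
    "calibrate_903342_10": WmsNodeType.PAYLOAD,
    "wms_noop_order1": WmsNodeType.NOOP,
    "finalJob": WmsNodeType.FINAL,
    "provisioningJob": WmsNodeType.SERVICE,
}

# Only PAYLOAD and FINAL nodes count toward a payload summary.
payload_like = {WmsNodeType.PAYLOAD, WmsNodeType.FINAL}
counts = Counter(t.name for t in job_name_to_type.values() if t in payload_like)
print(counts)  # Counter({'PAYLOAD': 2, 'FINAL': 1})
```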
@@ -189,7 +221,18 @@ HTC_VALID_JOB_KEYS = {
     "accounting_group",
     "accounting_group_user",
 }
-HTC_VALID_JOB_DAG_KEYS = {
+HTC_VALID_JOB_DAG_KEYS = {
+    "dir",
+    "noop",
+    "done",
+    "vars",
+    "pre",
+    "post",
+    "retry",
+    "retry_unless_exit",
+    "abort_dag_on",
+    "abort_exit",
+}
 HTC_VERSION = version.parse(htcondor.__version__)


@@ -224,7 +267,7 @@ class RestrictedDict(MutableMapping):

         Returns
         -------
-        value : `~
+        value : `~typing.Any`
             Value associated with given key.

         Raises
@@ -256,7 +299,7 @@ class RestrictedDict(MutableMapping):
         ----------
         key : `str`
             Identifier to associate with given value.
-        value : `~
+        value : `~typing.Any`
             Value to store.

         Raises
@@ -278,7 +321,9 @@ class RestrictedDict(MutableMapping):
         return str(self.data)


-def htc_backup_files(wms_path, subdir=None, limit=100):
+def htc_backup_files(
+    wms_path: str | os.PathLike, subdir: str | os.PathLike | None = None, limit: int = 100
+) -> Path | None:
     """Backup select HTCondor files in the submit directory.

     Files will be saved in separate subdirectories which will be created in
@@ -293,9 +338,9 @@ def htc_backup_files(wms_path, subdir=None, limit=100):

     Parameters
     ----------
-    wms_path : `str` or `
+    wms_path : `str` or `os.PathLike`
         Path to the submit directory either absolute or relative.
-    subdir : `str` or `
+    subdir : `str` or `os.PathLike`, optional
         A path, relative to the submit directory, where all subdirectories with
         backup files will be kept. Defaults to None which means that the backup
         subdirectories will be placed directly in the submit directory.
@@ -305,6 +350,11 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
         to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in
         version 8.8+.

+    Returns
+    -------
+    last_rescue_file : `pathlib.Path` or None
+        Path to the latest rescue file or None if doesn't exist.
+
     Raises
     ------
     FileNotFoundError
@@ -327,17 +377,18 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
         raise FileNotFoundError(f"Directory {path} not found")

     # Initialize the backup counter.
-    rescue_dags = list(
+    rescue_dags = list(path.glob("*.rescue[0-9][0-9][0-9]"))
     counter = min(len(rescue_dags), limit)

     # Create the backup directory and move select files there.
-    dest =
+    dest = path
     if subdir:
         # PurePath.is_relative_to() is not available before Python 3.9. Hence
         # we need to check is 'subdir' is in the submit directory in some other
         # way if it is an absolute path.
         subdir = Path(subdir)
         if subdir.is_absolute():
+            subdir = subdir.resolve()  # Since resolve was run on path, must run it here
             if dest not in subdir.parents:
                 _LOG.warning(
                     "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.",
@@ -349,21 +400,66 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
     else:
         dest /= subdir
     dest /= f"{counter:0{width}}"
+    _LOG.debug("dest = %s", dest)
     try:
         dest.mkdir(parents=True, exist_ok=False if counter < limit else True)
     except FileExistsError:
         _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest)
     else:
-
-
-
-
-
-
-
-
-
-
+        htc_backup_files_single_path(path, dest)
+
+        # also back up any subdag info
+        for subdag_dir in path.glob("subdags/*"):
+            subdag_dest = dest / subdag_dir.relative_to(path)
+            subdag_dest.mkdir(parents=True, exist_ok=False)
+            htc_backup_files_single_path(subdag_dir, subdag_dest)
+
+    last_rescue_file = rescue_dags[-1] if rescue_dags else None
+    _LOG.debug("last_rescue_file = %s", last_rescue_file)
+    return last_rescue_file
+
+
+def htc_backup_files_single_path(src: str | os.PathLike, dest: str | os.PathLike) -> None:
+    """Move particular htc files to a different directory for later debugging.
+
+    Parameters
+    ----------
+    src : `str` or `os.PathLike`
+        Directory from which to backup particular files.
+    dest : `str` or `os.PathLike`
+        Directory to which particular files are moved.
+
+    Raises
+    ------
+    RuntimeError
+        If given dest directory matches given src directory.
+    OSError
+        If problems moving file.
+    FileNotFoundError
+        Item matching pattern in src directory isn't a file.
+    """
+    src = Path(src)
+    dest = Path(dest)
+    if dest.samefile(src):
+        raise RuntimeError(f"Destination directory is same as the source directory ({src})")
+
+    for patt in [
+        "*.info.*",
+        "*.dag.metrics",
+        "*.dag.nodes.log",
+        "*.node_status",
+        "wms_*.dag.post.out",
+        "wms_*.status.txt",
+    ]:
+        for source in src.glob(patt):
+            if source.is_file():
+                target = dest / source.relative_to(src)
+                try:
+                    source.rename(target)
+                except OSError as exc:
+                    raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None
+            else:
+                raise FileNotFoundError(f"Backing up '{source}' failed: not a file")


 def htc_escape(value):
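The reworked `htc_backup_files` keys each backup directory off the number of existing rescue DAGs and now also returns the newest rescue file. A rough standalone sketch of just that bookkeeping, using a temporary directory, invented file names, and an assumed `width` (in the plugin the padding width is defined elsewhere):

```python
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp)
    # Invented rescue files such as DAGMan would leave after failed attempts.
    for n in (1, 2):
        (path / f"pipeline.dag.rescue{n:03d}").touch()

    limit = 100  # mirrors DAGMAN_MAX_RESCUE_NUM's default
    rescue_dags = sorted(path.glob("*.rescue[0-9][0-9][0-9]"))
    counter = min(len(rescue_dags), limit)

    # Backups land in a zero-padded per-attempt subdirectory, e.g. 002/.
    width = len(str(limit))
    dest = path / f"{counter:0{width}}"
    dest.mkdir(parents=True, exist_ok=False)

    last_rescue_file = rescue_dags[-1] if rescue_dags else None
    print(dest.name, last_rescue_file.name if last_rescue_file else None)
    # -> 002 pipeline.dag.rescue002
```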
@@ -371,12 +467,12 @@ def htc_escape(value):

     Parameters
     ----------
-    value : `~
+    value : `~typing.Any`
         Value that needs to have characters escaped if string.

     Returns
     -------
-    new_value : `~
+    new_value : `~typing.Any`
         Given value with characters escaped appropriate for HTCondor if string.
     """
     if isinstance(value, str):
@@ -407,12 +503,14 @@ def htc_write_attribs(stream, attrs):
         print(f"+{key} = {pval}", file=stream)


-def htc_write_condor_file(
+def htc_write_condor_file(
+    filename: str | os.PathLike, job_name: str, job: RestrictedDict, job_attrs: dict[str, Any]
+) -> None:
     """Write an HTCondor submit file.

     Parameters
     ----------
-    filename : `str`
+    filename : `str` or os.PathLike
         Filename for the HTCondor submit file.
     job_name : `str`
         Job name to use in submit file.
@@ -463,7 +561,7 @@ if HTC_VERSION < version.parse("8.9.8"):

         Returns
         -------
-        kwargs : `dict` [`str`, Any]
+        kwargs : `dict` [`str`, `~typing.Any`]
            Keywords arguments that are guaranteed to work with the Python
            HTCondor API.

@@ -501,7 +599,7 @@ else:

         Returns
         -------
-        kwargs : `dict` [`str`, Any]
+        kwargs : `dict` [`str`, `~typing.Any`]
            Keywords arguments that were passed to the function.
        """
        return kwargs
@@ -521,7 +619,7 @@ def htc_query_history(schedds, **kwargs):
     ------
     schedd_name : `str`
         Name of the HTCondor scheduler managing the job queue.
-    job_ad : `dict` [`str`, Any]
+    job_ad : `dict` [`str`, `~typing.Any`]
         A dictionary representing HTCondor ClassAd describing a job. It maps
         job attributes names to values of the ClassAd expressions they
         represent.
@@ -549,7 +647,7 @@ def htc_query_present(schedds, **kwargs):
     ------
     schedd_name : `str`
         Name of the HTCondor scheduler managing the job queue.
-    job_ad : `dict` [`str`, Any]
+    job_ad : `dict` [`str`, `~typing.Any`]
         A dictionary representing HTCondor ClassAd describing a job. It maps
         job attributes names to values of the ClassAd expressions they
         represent.
@@ -581,7 +679,8 @@ def htc_submit_dag(sub):

     Returns
     -------
-    schedd_job_info : `dict` [`str`, `dict` [`str`,
+    schedd_job_info : `dict` [`str`, `dict` [`str`, \
+        `dict` [`str`, `~typing.Any`]]]
         Information about jobs satisfying the search criteria where for each
         Scheduler, local HTCondor job ids are mapped to their respective
         classads.
@@ -611,7 +710,7 @@ def htc_create_submit_from_dag(dag_filename, submit_options=None):
     ----------
     dag_filename : `str`
         Name of file containing HTCondor DAG commands.
-    submit_options : `dict` [`str`, Any], optional
+    submit_options : `dict` [`str`, `~typing.Any`], optional
         Contains extra options for command line (Value of None means flag).

     Returns
@@ -624,6 +723,19 @@ def htc_create_submit_from_dag(dag_filename, submit_options=None):
     Use with HTCondor versions which support htcondor.Submit.from_dag(),
     i.e., 8.9.3 or newer.
     """
+    # Passing do_recurse as submit_option does not seem to
+    # override DAGMAN_GENERATE_SUBDAG_SUBMITS as manual implies.
+    # So setting it and the other bps required setting here as
+    # environment variables if they don't exist.
+    var_name = "_CONDOR_DAGMAN_MANAGER_JOB_APPEND_GETENV"
+    if var_name not in os.environ:
+        os.environ[var_name] = "True"
+    do_recurse = submit_options.get("do_recurse", None)
+    if do_recurse:
+        var_name = "_CONDOR_DAGMAN_GENERATE_SUBDAG_SUBMITS"
+        if var_name not in os.environ:
+            os.environ[var_name] = str(do_recurse)
+
     return htcondor.Submit.from_dag(dag_filename, submit_options)


@@ -637,7 +749,7 @@ def htc_create_submit_from_cmd(dag_filename, submit_options=None):
     ----------
     dag_filename : `str`
         Name of file containing HTCondor DAG commands.
-    submit_options : `dict` [`str`, Any], optional
+    submit_options : `dict` [`str`, `~typing.Any`], optional
         Contains extra options for command line (Value of None means flag).

     Returns
@@ -702,7 +814,7 @@ def htc_create_submit_from_file(submit_file):
     return htcondor.Submit(descriptors)


-def _htc_write_job_commands(stream, name,
+def _htc_write_job_commands(stream, name, commands):
     """Output the DAGMan job lines for single job in DAG.

     Parameters
@@ -711,40 +823,60 @@ def _htc_write_job_commands(stream, name, jobs):
         Writeable text stream (typically an opened file).
     name : `str`
         Job name.
-
-        DAG
+    commands : `RestrictedDict`
+        DAG commands for a job.
     """
-
-
-
-
-
-
-
-
-
-        f"
-
-
-
-
-
-
+    # Note: optional pieces of commands include a space at the beginning.
+    # also making sure values aren't empty strings as placeholders.
+    if "pre" in commands and commands["pre"]:
+        defer = ""
+        if "defer" in commands["pre"] and commands["pre"]["defer"]:
+            defer = f" DEFER {commands['pre']['defer']['status']} {commands['pre']['defer']['time']}"
+
+        debug = ""
+        if "debug" in commands["pre"] and commands["pre"]["debug"]:
+            debug = f" DEBUG {commands['pre']['debug']['filename']} {commands['pre']['debug']['type']}"
+
+        arguments = ""
+        if "arguments" in commands["pre"] and commands["pre"]["arguments"]:
+            arguments = f" {commands['pre']['arguments']}"
+
+        executable = commands["pre"]["executable"]
+        print(f"SCRIPT{defer}{debug} PRE {name} {executable}{arguments}", file=stream)
+
+    if "post" in commands and commands["post"]:
+        defer = ""
+        if "defer" in commands["post"] and commands["post"]["defer"]:
+            defer = f" DEFER {commands['post']['defer']['status']} {commands['post']['defer']['time']}"
+
+        debug = ""
+        if "debug" in commands["post"] and commands["post"]["debug"]:
+            debug = f" DEBUG {commands['post']['debug']['filename']} {commands['post']['debug']['type']}"
+
+        arguments = ""
+        if "arguments" in commands["post"] and commands["post"]["arguments"]:
+            arguments = f" {commands['post']['arguments']}"
+
+        executable = commands["post"]["executable"]
+        print(f"SCRIPT{defer}{debug} POST {name} {executable}{arguments}", file=stream)
+
+    if "vars" in commands and commands["vars"]:
+        for key, value in commands["vars"].items():
             print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream)

-    if "pre_skip" in
-        print(f"PRE_SKIP {name} {
+    if "pre_skip" in commands and commands["pre_skip"]:
+        print(f"PRE_SKIP {name} {commands['pre_skip']}", file=stream)

-    if "retry" in
-        print(f"RETRY {name} {
-    if "retry_unless_exit" in
-        print(f"UNLESS-EXIT {
-        print("
+    if "retry" in commands and commands["retry"]:
+        print(f"RETRY {name} {commands['retry']}", end="", file=stream)
+        if "retry_unless_exit" in commands:
+            print(f" UNLESS-EXIT {commands['retry_unless_exit']}", end="", file=stream)
+        print("", file=stream)  # Since previous prints don't include new line

-    if "abort_dag_on" in
+    if "abort_dag_on" in commands and commands["abort_dag_on"]:
         print(
-            f"ABORT-DAG-ON {name} {
-            f" RETURN {
+            f"ABORT-DAG-ON {name} {commands['abort_dag_on']['node_exit']}"
+            f" RETURN {commands['abort_dag_on']['abort_exit']}",
             file=stream,
         )

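The rewritten `_htc_write_job_commands` turns a per-job command mapping into DAGMan script lines (SCRIPT PRE/POST, VARS, RETRY/UNLESS-EXIT, ABORT-DAG-ON). A simplified standalone sketch of the same rendering, writing to an in-memory stream; the command dictionary and job name are invented and only a subset of the keys handled above is covered:

```python
import io

commands = {
    "post": {"executable": "finalize.sh", "arguments": "--check"},
    "vars": {"qgraphFile": "group1.qgraph"},
    "retry": 3,
    "retry_unless_exit": 2,
    "abort_dag_on": {"node_exit": 100, "abort_exit": 1},
}
name = "calibrate_903342_10"
stream = io.StringIO()

if commands.get("post"):
    post = commands["post"]
    args = f" {post['arguments']}" if post.get("arguments") else ""
    print(f"SCRIPT POST {name} {post['executable']}{args}", file=stream)

for key, value in commands.get("vars", {}).items():
    print(f'VARS {name} {key}="{value}"', file=stream)

if commands.get("retry"):
    print(f"RETRY {name} {commands['retry']}", end="", file=stream)
    if "retry_unless_exit" in commands:
        print(f" UNLESS-EXIT {commands['retry_unless_exit']}", end="", file=stream)
    print("", file=stream)

if commands.get("abort_dag_on"):
    abort = commands["abort_dag_on"]
    print(f"ABORT-DAG-ON {name} {abort['node_exit']} RETURN {abort['abort_exit']}", file=stream)

print(stream.getvalue(), end="")
# SCRIPT POST calibrate_903342_10 finalize.sh --check
# VARS calibrate_903342_10 qgraphFile="group1.qgraph"
# RETRY calibrate_903342_10 3 UNLESS-EXIT 2
# ABORT-DAG-ON calibrate_903342_10 100 RETURN 1
```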
@@ -773,6 +905,8 @@ class HTCJob:
         self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds)
         self.attrs = initattrs
         self.subfile = None
+        self.subdir = None
+        self.subdag = None

     def __str__(self):
         return self.name
@@ -810,33 +944,54 @@ class HTCJob:
         if new_attrs:
             self.attrs.update(new_attrs)

-    def write_submit_file(self, submit_path
+    def write_submit_file(self, submit_path: str | os.PathLike) -> None:
         """Write job description to submit file.

         Parameters
         ----------
-        submit_path : `str`
+        submit_path : `str` or `os.PathLike`
             Prefix path for the submit file.
-        job_subdir : `str`, optional
-            Template for job subdir.
         """
         if not self.subfile:
             self.subfile = f"{self.name}.sub"
-            job_subdir = job_subdir.format(self=self)
-            if job_subdir:
-                self.subfile = os.path.join(job_subdir, self.subfile)
-        htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs)

-
+        subfile = self.subfile
+        if self.subdir:
+            subfile = Path(self.subdir) / subfile
+
+        subfile = Path(os.path.expandvars(subfile))
+        if not subfile.is_absolute():
+            subfile = Path(submit_path) / subfile
+        if not subfile.exists():
+            htc_write_condor_file(subfile, self.name, self.cmds, self.attrs)
+
+    def write_dag_commands(self, stream, dag_rel_path, command_name="JOB"):
         """Write DAG commands for single job to output stream.

         Parameters
         ----------
         stream : `IO` or `str`
             Output Stream.
+        dag_rel_path : `str`
+            Relative path of dag to submit directory.
+        command_name : `str`
+            Name of the DAG command (e.g., JOB, FINAL).
         """
-
-
+        subfile = os.path.expandvars(self.subfile)
+
+        # JOB NodeName SubmitDescription [DIR directory] [NOOP] [DONE]
+        job_line = f'{command_name} {self.name} "{subfile}"'
+        if "dir" in self.dagcmds:
+            dir_val = self.dagcmds["dir"]
+            if dag_rel_path:
+                dir_val = os.path.join(dag_rel_path, dir_val)
+            job_line += f' DIR "{dir_val}"'
+        if self.dagcmds.get("noop", False):
+            job_line += " NOOP"
+
+        print(job_line, file=stream)
+        if self.dagcmds:
+            _htc_write_job_commands(stream, self.name, self.dagcmds)

     def dump(self, fh):
         """Dump job information to output stream.
@@ -871,6 +1026,7 @@ class HTCDag(networkx.DiGraph):
         self.graph["submit_path"] = None
         self.graph["final_job"] = None
         self.graph["service_job"] = None
+        self.graph["submit_options"] = {}

     def __str__(self):
         """Represent basic DAG info as string.
@@ -906,6 +1062,7 @@ class HTCDag(networkx.DiGraph):
             Names of child jobs.
         """
         assert isinstance(job, HTCJob)
+        _LOG.debug("Adding job %s to dag", job.name)

         # Add dag level attributes to each job
         job.add_job_attrs(self.graph["attr"])
@@ -913,10 +1070,10 @@ class HTCDag(networkx.DiGraph):
         self.add_node(job.name, data=job)

         if parent_names is not None:
-            self.add_job_relationships(parent_names, job.name)
+            self.add_job_relationships(parent_names, [job.name])

         if child_names is not None:
-            self.add_job_relationships(child_names, job.name)
+            self.add_job_relationships(child_names, [job.name])

     def add_job_relationships(self, parents, children):
         """Add DAG edge between parents and children jobs.
@@ -972,24 +1129,48 @@ class HTCDag(networkx.DiGraph):
         # Delete job node (which deletes its edges).
         self.remove_node(job_name)

-    def write(self, submit_path, job_subdir=""):
+    def write(self, submit_path, job_subdir="", dag_subdir="", dag_rel_path=""):
         """Write DAG to a file.

         Parameters
         ----------
         submit_path : `str`
-            Prefix path for
+            Prefix path for all outputs.
         job_subdir : `str`, optional
-            Template for job subdir.
+            Template for job subdir (submit_path + job_subdir).
+        dag_subdir : `str`, optional
+            DAG subdir (submit_path + dag_subdir).
+        dag_rel_path : `str`, optional
+            Prefix to job_subdir for jobs inside subdag.
         """
         self.graph["submit_path"] = submit_path
-        self.graph["dag_filename"] = os.path.join(
-            os.
-
-
-
-
-
+        self.graph["dag_filename"] = os.path.join(dag_subdir, f"{self.graph['name']}.dag")
+        full_filename = os.path.join(submit_path, self.graph["dag_filename"])
+        os.makedirs(os.path.dirname(full_filename), exist_ok=True)
+        with open(full_filename, "w") as fh:
+            for name, nodeval in self.nodes().items():
+                try:
+                    job = nodeval["data"]
+                except KeyError:
+                    _LOG.error("Job %s doesn't have data (keys: %s).", name, nodeval.keys())
+                    raise
+                if job.subdag:
+                    dag_subdir = f"subdags/{job.name}"
+                    if "dir" in job.dagcmds:
+                        subdir = job.dagcmds["dir"]
+                    else:
+                        subdir = job_subdir
+                    job.subdag.write(submit_path, subdir, dag_subdir, "../..")
+                    fh.write(
+                        f"SUBDAG EXTERNAL {job.name} {Path(job.subdag.graph['dag_filename']).name} "
+                        f"DIR {dag_subdir}\n"
+                    )
+                    if job.dagcmds:
+                        _htc_write_job_commands(fh, job.name, job.dagcmds)
+                else:
+                    job.write_submit_file(submit_path)
+                    job.write_dag_commands(fh, dag_rel_path)
+
             for edge in self.edges():
                 print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh)
             print(f"DOT {self.name}.dot", file=fh)
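`HTCDag.write` now recurses into jobs that carry a `subdag`, writing each nested DAG under `subdags/<job name>/` and pointing the parent DAG at it with a `SUBDAG EXTERNAL ... DIR ...` line. A tiny sketch of the resulting layout and line, assuming an invented run name and subdag node; paths are built the same way as in the hunk above but nothing is actually written or submitted:

```python
import io
from pathlib import Path

submit_path = Path("/tmp/submit/u/me/run1")   # invented submit directory
parent_dag_name = "run1"
subdag_job = "wms_group_order1"               # invented subdag node name

# Nested DAG file location relative to the submit directory.
dag_subdir = f"subdags/{subdag_job}"
nested_dag_filename = Path(dag_subdir) / f"{subdag_job}.dag"

# Line written into the parent DAG; DIR makes DAGMan run the subdag in its own dir.
parent_dag = io.StringIO()
parent_dag.write(
    f"SUBDAG EXTERNAL {subdag_job} {nested_dag_filename.name} DIR {dag_subdir}\n"
)

print("parent DAG :", submit_path / f"{parent_dag_name}.dag")
print("nested DAG :", submit_path / nested_dag_filename)
print(parent_dag.getvalue(), end="")
# SUBDAG EXTERNAL wms_group_order1 wms_group_order1.dag DIR subdags/wms_group_order1
```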
@@ -1006,12 +1187,8 @@ class HTCDag(networkx.DiGraph):
         }
         for dagcmd, job in special_jobs.items():
             if job is not None:
-                job.write_submit_file(submit_path
-
-                if "pre" in job.dagcmds:
-                    print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh)
-                if "post" in job.dagcmds:
-                    print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh)
+                job.write_submit_file(submit_path)
+                job.write_dag_commands(fh, dag_rel_path, dagcmd)

     def dump(self, fh):
         """Dump DAG info to output stream.
@@ -1061,7 +1238,7 @@ def condor_q(constraint=None, schedds=None, **kwargs):

     Returns
     -------
-    job_info : `dict` [`str`, `dict` [`str`, `dict` [`str
+    job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
         Information about jobs satisfying the search criteria where for each
         Scheduler, local HTCondor job ids are mapped to their respective
         classads.
@@ -1086,7 +1263,7 @@ def condor_history(constraint=None, schedds=None, **kwargs):

     Returns
     -------
-    job_info : `dict` [`str`, `dict` [`str`, `dict` [`str
+    job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
         Information about jobs satisfying the search criteria where for each
         Scheduler, local HTCondor job ids are mapped to their respective
         classads.
@@ -1117,7 +1294,7 @@ def condor_query(constraint=None, schedds=None, query_func=htc_query_present, **

     Returns
     -------
-    job_info : `dict` [`str`, `dict` [`str`, `dict` [`str
+    job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
         Information about jobs satisfying the search criteria where for each
         Scheduler, local HTCondor job ids are mapped to their respective
         classads.
@@ -1172,7 +1349,7 @@ def condor_search(constraint=None, hist=None, schedds=None):

     Returns
     -------
-    job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
+    job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` `~typing.Any`]]]
         Information about jobs satisfying the search criteria where for each
         Scheduler, local HTCondor job ids are mapped to their respective
         classads.
@@ -1203,7 +1380,7 @@ def condor_status(constraint=None, coll=None):

     Returns
     -------
-    pool_info : `dict` [`str`, `dict` [`str`, Any]]
+    pool_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         Mapping between HTCondor slot names and slot information (classAds).
     """
     if coll is None:
@@ -1225,14 +1402,14 @@ def update_job_info(job_info, other_info):

     Parameters
     ----------
-    job_info : `dict` [`str`, `dict` [`str`, Any]]
+    job_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         Results of the job query that needs to be updated.
-    other_info : `dict` [`str`, `dict` [`str`, Any]]
+    other_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         Results of the other job query.

     Returns
     -------
-    job_info : `dict` [`str`, `dict` [`str`, Any]]
+    job_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         The updated results.
     """
     for schedd_name, others in other_info.items():
@@ -1246,7 +1423,98 @@ def update_job_info(job_info, other_info):
     return job_info


-def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, str]]:
+def count_jobs_in_single_dag(
+    filename: str | os.PathLike,
+) -> tuple[Counter[str], dict[str, str], dict[str, WmsNodeType]]:
+    """Build bps_run_summary string from dag file.
+
+    Parameters
+    ----------
+    filename : `str`
+        Path that includes dag file for a run.
+
+    Returns
+    -------
+    counts : `Counter` [`str`]
+        Semi-colon separated list of job labels and counts.
+        (Same format as saved in dag classad).
+    job_name_to_label : `dict` [`str`, `str`]
+        Mapping of job names to job labels.
+    job_name_to_type : `dict` [`str`, `lsst.ctrl.bps.htcondor.WmsNodeType`]
+        Mapping of job names to job types
+        (e.g., payload, final, service).
+    """
+    # Later code depends upon insertion order
+    counts: Counter = Counter()  # counts of payload jobs per label
+    job_name_to_label: dict[str, str] = {}
+    job_name_to_type: dict[str, WmsNodeType] = {}
+    with open(filename) as fh:
+        for line in fh:
+            job_name = ""
+            m = re.match(
+                r"(?P<command>JOB|FINAL|SERVICE|SUBDAG EXTERNAL)\s+"
+                r'(?P<jobname>(?P<wms>wms_)?\S+)\s+"?(?P<subfile>\S+)"?\s*'
+                r'(DIR "?(?P<dir>[^\s"]+)"?)?\s*(?P<noop>NOOP)?',
+                line,
+            )
+            if m:
+                job_name = m.group("jobname")
+                name_parts = job_name.split("_")
+
+                label = ""
+                if m.group("dir"):
+                    dir_match = re.search(r"jobs/([^\s/]+)", m.group("dir"))
+                    if dir_match:
+                        label = dir_match.group(1)
+                    else:
+                        _LOG.debug("Parse DAG: unparsed dir = %s", line)
+                elif m.group("subfile"):
+                    subfile_match = re.search(r"jobs/([^\s/]+)", m.group("subfile"))
+                    if subfile_match:
+                        label = m.group("subfile").split("/")[1]
+                    else:
+                        label = pegasus_name_to_label(job_name)
+
+                match m.group("command"):
+                    case "JOB":
+                        if m.group("noop"):
+                            job_type = WmsNodeType.NOOP
+                            # wms_noop_label
+                            label = name_parts[2]
+                        elif m.group("wms"):
+                            if name_parts[1] == "check":
+                                job_type = WmsNodeType.SUBDAG_CHECK
+                                # wms_check_status_wms_group_label
+                                label = name_parts[5]
+                            else:
+                                _LOG.warning(
+                                    "Unexpected skipping of dag line due to unknown wms job: %s", line
+                                )
+                        else:
+                            job_type = WmsNodeType.PAYLOAD
+                            if label == "init":
+                                label = "pipetaskInit"
+                            counts[label] += 1
+                    case "FINAL":
+                        job_type = WmsNodeType.FINAL
+                        counts[label] += 1  # final counts a payload job.
+                    case "SERVICE":
+                        job_type = WmsNodeType.SERVICE
+                    case "SUBDAG EXTERNAL":
+                        job_type = WmsNodeType.SUBDAG
+                        label = name_parts[2]
+
+                job_name_to_label[job_name] = label
+                job_name_to_type[job_name] = job_type
+            elif not line.startswith(("VARS", "PARENT", "DOT", "NODE_STATUS_FILE", "SET_JOB_ATTR", "SCRIPT")):
+                # Only print warning if not a line wanting to skip
+                # Probably means problem with regex in above match pattern.
+                _LOG.warning("Unexpected skipping of dag line: %s", line)
+
+    return counts, job_name_to_label, job_name_to_type
+
+
+def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, WmsNodeType]]:
     """Build bps_run_summary string from dag file.

     Parameters
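The new `count_jobs_in_single_dag` recognizes node lines with a single regular expression and derives the bps job label from the `jobs/<label>/...` path. A runnable sketch exercising that regex on a couple of invented DAG lines (the sample lines only resemble what bps writes; the simplified label/count logic here is not the full function above):

```python
import re
from collections import Counter

DAG_NODE_RE = re.compile(
    r"(?P<command>JOB|FINAL|SERVICE|SUBDAG EXTERNAL)\s+"
    r'(?P<jobname>(?P<wms>wms_)?\S+)\s+"?(?P<subfile>\S+)"?\s*'
    r'(DIR "?(?P<dir>[^\s"]+)"?)?\s*(?P<noop>NOOP)?'
)

# Invented lines resembling a bps-written .dag file.
lines = [
    'JOB pipetaskInit "jobs/pipetaskInit/pipetaskInit.sub" DIR "jobs/pipetaskInit"',
    'JOB calibrate_903342_10 "jobs/calibrate/calibrate_903342_10.sub" DIR "jobs/calibrate"',
    'FINAL finalJob "jobs/finalJob/finalJob.sub" DIR "jobs/finalJob"',
    "PARENT pipetaskInit CHILD calibrate_903342_10",
]

counts: Counter[str] = Counter()
for line in lines:
    m = DAG_NODE_RE.match(line)
    if not m:
        continue  # non-node lines (PARENT, VARS, ...) are skipped
    label = ""
    if m.group("dir"):
        dir_match = re.search(r"jobs/([^\s/]+)", m.group("dir"))
        if dir_match:
            label = dir_match.group(1)
    if m.group("command") in ("JOB", "FINAL"):
        counts[label] += 1

print(counts)  # Counter({'pipetaskInit': 1, 'calibrate': 1, 'finalJob': 1})
```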
@@ -1261,56 +1529,25 @@ def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, str]]:
         (Same format as saved in dag classad).
     job_name_to_label : `dict` [`str`, `str`]
         Mapping of job names to job labels.
-    job_name_to_type : `dict` [`str`, `
+    job_name_to_type : `dict` [`str`, `lsst.ctrl.bps.htcondor.WmsNodeType`]
         Mapping of job names to job types
         (e.g., payload, final, service).
     """
     # Later code depends upon insertion order
-    counts:
-    job_name_to_label = {}
-    job_name_to_type = {}
-
-
-
-
-
-
-
-
-
-
-
-                    label = "pipetaskInit"
-                counts[label] += 1
-            else:  # Check if Pegasus submission
-                m = re.match(r"JOB (\S+) (\S+)", line)
-                if m:
-                    job_name = m.group(1)
-                    label = pegasus_name_to_label(m.group(1))
-                    counts[label] += 1
-                else:
-                    _LOG.warning("Parse DAG: unmatched job line: %s", line)
-                job_type = "payload"
-            elif line.startswith("FINAL"):
-                m = re.match(r"FINAL (\S+) jobs/([^/]+)/", line)
-                if m:
-                    job_name = m.group(1)
-                    label = m.group(2)
-                    counts[label] += 1  # final counts a payload job.
-                job_type = "final"
-            elif line.startswith("SERVICE"):
-                m = re.match(r"SERVICE (\S+) jobs/([^/]+)/", line)
-                if m:
-                    job_name = m.group(1)
-                    label = m.group(2)
-                job_type = "service"
-
-            if job_name:
-                job_name_to_label[job_name] = label
-                job_name_to_type[job_name] = job_type
-
-    except (OSError, PermissionError, StopIteration):
-        pass
+    counts: Counter[str] = Counter()  # counts of payload jobs per label
+    job_name_to_label: dict[str, str] = {}
+    job_name_to_type: dict[str, WmsNodeType] = {}
+    for filename in Path(dir_name).glob("*.dag"):
+        single_counts, single_job_name_to_label, single_job_name_to_type = count_jobs_in_single_dag(filename)
+        counts += single_counts
+        _update_dicts(job_name_to_label, single_job_name_to_label)
+        _update_dicts(job_name_to_type, single_job_name_to_type)
+
+    for filename in Path(dir_name).glob("subdags/*/*.dag"):
+        single_counts, single_job_name_to_label, single_job_name_to_type = count_jobs_in_single_dag(filename)
+        counts += single_counts
+        _update_dicts(job_name_to_label, single_job_name_to_label)
+        _update_dicts(job_name_to_type, single_job_name_to_type)

     summary = ";".join([f"{name}:{counts[name]}" for name in counts])
     _LOG.debug("summarize_dag: %s %s %s", summary, job_name_to_label, job_name_to_type)
@@ -1343,69 +1580,100 @@ def pegasus_name_to_label(name):
     return label


-def
+def read_single_dag_status(filename: str | os.PathLike) -> dict[str, Any]:
     """Read the node status file for DAG summary information.

     Parameters
     ----------
-
-
+    filename : `str` or `Path.pathlib`
+        Node status filename.

     Returns
     -------
-    dag_ad : `dict` [`str`, Any]
+    dag_ad : `dict` [`str`, `~typing.Any`]
         DAG summary information.
     """
-    dag_ad = {}
+    dag_ad: dict[str, Any] = {}

     # While this is probably more up to date than dag classad, only read from
     # file if need to.
     try:
-
-
-
-
-        dag_ad = classad.parseNext(infh)  # pylint: disable=E1101
-    except StopIteration:
-        pass
+        node_stat_file = Path(filename)
+        _LOG.debug("Reading Node Status File %s", node_stat_file)
+        with open(node_stat_file) as infh:
+            dag_ad = dict(classad.parseNext(infh))  # pylint: disable=E1101

         if not dag_ad:
             # Pegasus check here
-
-
+            metrics_file = node_stat_file.with_suffix(".dag.metrics")
+            if metrics_file.exists():
                 with open(metrics_file) as infh:
                     metrics = json.load(infh)
                 dag_ad["NodesTotal"] = metrics.get("jobs", 0)
                 dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0)
                 dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0)
-
-
-
-
-            with open(metrics_file) as infh:
-                metrics = json.load(infh)
-                dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"]
-                dag_ad["pegasus_version"] = metrics.get("version", "")
-    except StopIteration:
-        pass
+                metrics_file = node_stat_file.with_suffix(".metrics")
+                with open(metrics_file) as infh:
+                    metrics = json.load(infh)
+                    dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"]
     except (OSError, PermissionError):
         pass

     _LOG.debug("read_dag_status: %s", dag_ad)
-    return
+    return dag_ad


-def
-    """Read
+def read_dag_status(wms_path: str | os.PathLike) -> dict[str, Any]:
+    """Read the node status file for DAG summary information.

     Parameters
     ----------
-    wms_path : `str`
+    wms_path : `str` or `os.PathLike
         Path that includes node status file for a run.

     Returns
     -------
-
+    dag_ad : `dict` [`str`, `~typing.Any`]
+        DAG summary information, counts summed across any subdags.
+    """
+    dag_ads: dict[str, Any] = {}
+    path = Path(wms_path)
+    try:
+        node_stat_file = next(path.glob("*.node_status"))
+    except StopIteration as exc:
+        raise FileNotFoundError(f"DAGMan node status not found in {wms_path}") from exc
+
+    dag_ads = read_single_dag_status(node_stat_file)
+
+    for node_stat_file in path.glob("subdags/*/*.node_status"):
+        dag_ad = read_single_dag_status(node_stat_file)
+        dag_ads["JobProcsHeld"] += dag_ad.get("JobProcsHeld", 0)
+        dag_ads["NodesPost"] += dag_ad.get("NodesPost", 0)
+        dag_ads["JobProcsIdle"] += dag_ad.get("JobProcsIdle", 0)
+        dag_ads["NodesTotal"] += dag_ad.get("NodesTotal", 0)
+        dag_ads["NodesFailed"] += dag_ad.get("NodesFailed", 0)
+        dag_ads["NodesDone"] += dag_ad.get("NodesDone", 0)
+        dag_ads["NodesQueued"] += dag_ad.get("NodesQueued", 0)
+        dag_ads["NodesPre"] += dag_ad.get("NodesReady", 0)
+        dag_ads["NodesFutile"] += dag_ad.get("NodesFutile", 0)
+        dag_ads["NodesUnready"] += dag_ad.get("NodesUnready", 0)
+
+    return dag_ads
+
+
+def read_single_node_status(filename: str | os.PathLike, init_fake_id: int) -> dict[str, Any]:
+    """Read entire node status file.
+
+    Parameters
+    ----------
+    filename : `str` or `pathlib.Path`
+        Node status filename.
+    init_fake_id : `int`
+        Initial fake id value.
+
+    Returns
+    -------
+    jobs : `dict` [`str`, `~typing.Any`]
         DAG summary information compiled from the node status file combined
         with the information found in the node event log.

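`read_dag_status` now folds the node counts of every `subdags/*/*.node_status` file into the top-level DAG summary. A small sketch of that summation over invented per-DAG dictionaries; the real code reads the numbers with `classad.parseNext` and assumes the keys are present on the top-level ad:

```python
# Invented summaries shaped like what read_single_dag_status might return.
top_level = {"NodesTotal": 40, "NodesDone": 30, "NodesFailed": 0, "NodesQueued": 10}
subdag_summaries = [
    {"NodesTotal": 12, "NodesDone": 12},
    {"NodesTotal": 8, "NodesDone": 5, "NodesFailed": 1, "NodesQueued": 2},
]

dag_ads = dict(top_level)
for dag_ad in subdag_summaries:
    for key in ("NodesTotal", "NodesDone", "NodesFailed", "NodesQueued"):
        dag_ads[key] = dag_ads.get(key, 0) + dag_ad.get(key, 0)

print(dag_ads)
# {'NodesTotal': 60, 'NodesDone': 47, 'NodesFailed': 1, 'NodesQueued': 12}
```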
@@ -1413,28 +1681,34 @@ def read_node_status(wms_path):
     from the event log takes precedence over the value from the node status
     file.
     """
+    filename = Path(filename)
+
     # Get jobid info from other places to fill in gaps in info from node_status
-    _, job_name_to_label, job_name_to_type =
-
-
+    _, job_name_to_label, job_name_to_type = count_jobs_in_single_dag(filename.with_suffix(".dag"))
+    loginfo: dict[str, dict[str, Any]] = {}
+    try:
+        wms_workflow_id, loginfo = read_single_dag_log(filename.with_suffix(".dag.dagman.log"))
+        loginfo = read_single_dag_nodes_log(filename.with_suffix(".dag.nodes.log"))
+    except (OSError, PermissionError):
+        pass
+
+    job_name_to_id: dict[str, str] = {}
     _LOG.debug("loginfo = %s", loginfo)
-
+    log_job_name_to_id: dict[str, str] = {}
     for job_id, job_info in loginfo.items():
         if "LogNotes" in job_info:
             m = re.match(r"DAG Node: (\S+)", job_info["LogNotes"])
             if m:
                 job_name = m.group(1)
-
+                log_job_name_to_id[job_name] = job_id
                 job_info["DAGNodeName"] = job_name
-                job_info["
+                job_info["wms_node_type"] = job_name_to_type[job_name]
                 job_info["bps_job_label"] = job_name_to_label[job_name]

-    jobs =
-    fake_id =
+    jobs = {}
+    fake_id = init_fake_id  # For nodes that do not yet have a job id, give fake one
     try:
-
-
-        with open(node_status) as fh:
+        with open(filename) as fh:
             for ad in classad.parseAds(fh):
                 match ad["Type"]:
                     case "DagStatus":
@@ -1449,21 +1723,23 @@ def read_node_status(wms_path):
                        else:
                            job_label = job_name

-
-                        if job_name in
-                            job_id = str(
-                            job
+                        job = dict(ad)
+                        if job_name in log_job_name_to_id:
+                            job_id = str(log_job_name_to_id[job_name])
+                            _update_dicts(job, loginfo[job_id])
                        else:
                            job_id = str(fake_id)
-                            job_name_to_id[job_name] = job_id
                            job = dict(ad)
-                        jobs[job_id] = job
                            fake_id -= 1
+                        jobs[job_id] = job
+                        job_name_to_id[job_name] = job_id
+
+                        # Make job info as if came from condor_q.
                        job["ClusterId"] = int(float(job_id))
                        job["DAGManJobID"] = wms_workflow_id
                        job["DAGNodeName"] = job_name
                        job["bps_job_label"] = job_label
-                        job["
+                        job["wms_node_type"] = job_name_to_type[job_name]

                    case "StatusEnd":
                        # Skip node status file "epilog".
@@ -1472,41 +1748,104 @@ def read_node_status(wms_path):
                         _LOG.debug(
                             "Ignoring unknown classad type '%s' in the node status file '%s'",
                             ad["Type"],
-
+                            filename,
                         )
-    except (
+    except (OSError, PermissionError):
         pass

     # Check for missing jobs (e.g., submission failure or not submitted yet)
     # Use dag info to create job placeholders
     for name in set(job_name_to_label) - set(job_name_to_id):
-        job
-
+        if name in log_job_name_to_id:  # job was in nodes.log, but not node_status
+            job_id = str(log_job_name_to_id[name])
+            job = dict(loginfo[job_id])
+        else:
+            job_id = str(fake_id)
+            fake_id -= 1
+            job = {}
+            job["NodeStatus"] = NodeStatus.NOT_READY
+
         job["ClusterId"] = int(float(job_id))
         job["ProcId"] = 0
         job["DAGManJobID"] = wms_workflow_id
         job["DAGNodeName"] = name
         job["bps_job_label"] = job_name_to_label[name]
-        job["
-        job["NodeStatus"] = NodeStatus.NOT_READY
+        job["wms_node_type"] = job_name_to_type[name]
         jobs[f"{job['ClusterId']}.{job['ProcId']}"] = job
-
+
+    for job_info in jobs.values():
+        job_info["from_dag_job"] = f"wms_{filename.stem}"
+
+    return jobs
+
+
+def read_node_status(wms_path: str | os.PathLike) -> dict[str, dict[str, Any]]:
+    """Read entire node status file.
+
+    Parameters
+    ----------
+    wms_path : `str` or `os.PathLike`
+        Path that includes node status file for a run.
+
+    Returns
+    -------
+    jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
+        DAG summary information compiled from the node status file combined
+        with the information found in the node event log.
+
+        Currently, if the same job attribute is found in both files, its value
+        from the event log takes precedence over the value from the node status
+        file.
+    """
+    jobs: dict[str, dict[str, Any]] = {}
+    init_fake_id = -1
+
+    # subdags may not have run so wouldn't have node_status file
+    # use dag files and let read_single_node_status handle missing
+    # node_status file.
+    for dag_filename in Path(wms_path).glob("*.dag"):
+        filename = dag_filename.with_suffix(".node_status")
+        info = read_single_node_status(filename, init_fake_id)
+        init_fake_id -= len(info)
+        _update_dicts(jobs, info)
+
+    for dag_filename in Path(wms_path).glob("subdags/*/*.dag"):
+        filename = dag_filename.with_suffix(".node_status")
+        info = read_single_node_status(filename, init_fake_id)
+        init_fake_id -= len(info)
+        _update_dicts(jobs, info)
+
+    # Propagate pruned from subdags to jobs
+    name_to_id: dict[str, str] = {}
+    missing_status: dict[str, list[str]] = {}
+    for id_, job in jobs.items():
+        if job["DAGNodeName"].startswith("wms_"):
+            name_to_id[job["DAGNodeName"]] = id_
+        if "NodeStatus" not in job or job["NodeStatus"] == NodeStatus.NOT_READY:
+            missing_status.setdefault(job["from_dag_job"], []).append(id_)
+
+    for name, dag_id in name_to_id.items():
+        dag_status = jobs[dag_id].get("NodeStatus", NodeStatus.NOT_READY)
+        if dag_status in {NodeStatus.NOT_READY, NodeStatus.FUTILE}:
+            for id_ in missing_status.get(name, []):
+                jobs[id_]["NodeStatus"] = dag_status

     return jobs


-def
+def read_single_dag_log(log_filename: str | os.PathLike) -> tuple[str, dict[str, dict[str, Any]]]:
     """Read job information from the DAGMan log file.

     Parameters
     ----------
-
-
+    log_filename : `str` or `os.PathLike`
+        DAGMan log filename.

     Returns
     -------
     wms_workflow_id : `str`
         HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job.
-    dag_info : `dict` [`str`, `~
+    dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         HTCondor job information read from the log file mapped to HTCondor
         job id.

@@ -1515,25 +1854,21 @@ def read_dag_log(wms_path: str) -> tuple[str, dict[str, Any]]:
     FileNotFoundError
         If cannot find DAGMan log in given wms_path.
     """
-    wms_workflow_id = 0
-    dag_info = {}
+    wms_workflow_id = "0"
+    dag_info: dict[str, dict[str, Any]] = {}

-
-    if
-        try:
-            filename = next(path.glob("*.dag.dagman.log"))
-        except StopIteration as exc:
-            raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc
+    filename = Path(log_filename)
+    if filename.exists():
         _LOG.debug("dag node log filename: %s", filename)

-        info = {}
+        info: dict[str, Any] = {}
         job_event_log = htcondor.JobEventLog(str(filename))
         for event in job_event_log.events(stop_after=0):
             id_ = f"{event['Cluster']}.{event['Proc']}"
             if id_ not in info:
                 info[id_] = {}
                 wms_workflow_id = id_  # taking last job id in case of restarts
-            info[id_]
+            _update_dicts(info[id_], event)
             info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]

         # only save latest DAG job
@@ -1544,17 +1879,53 @@ def read_dag_log(wms_path: str) -> tuple[str, dict[str, Any]]:
     return wms_workflow_id, dag_info


-def read_dag_nodes_log(wms_path):
+def read_dag_log(wms_path: str | os.PathLike) -> tuple[str, dict[str, Any]]:
+    """Read job information from the DAGMan log file.
+
+    Parameters
+    ----------
+    wms_path : `str` or `os.PathLike`
+        Path containing the DAGMan log file.
+
+    Returns
+    -------
+    wms_workflow_id : `str`
+        HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job.
+    dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
+        HTCondor job information read from the log file mapped to HTCondor
+        job id.
+
+    Raises
+    ------
+    FileNotFoundError
+        If cannot find DAGMan log in given wms_path.
+    """
+    wms_workflow_id = MISSING_ID
+    dag_info: dict[str, dict[str, Any]] = {}
+
+    path = Path(wms_path)
+    if path.exists():
+        try:
+            filename = next(path.glob("*.dag.dagman.log"))
+        except StopIteration as exc:
+            raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc
+        _LOG.debug("dag node log filename: %s", filename)
+        wms_workflow_id, dag_info = read_single_dag_log(filename)
+
+    return wms_workflow_id, dag_info
+
+
+def read_single_dag_nodes_log(filename: str | os.PathLike) -> dict[str, dict[str, Any]]:
     """Read job information from the DAGMan nodes log file.

     Parameters
     ----------
-
+    filename : `str` or `os.PathLike`
         Path containing the DAGMan nodes log file.

     Returns
     -------
-    info : `dict` [`str`, Any]
+    info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         HTCondor job information read from the log file mapped to HTCondor
         job id.

@@ -1563,20 +1934,46 @@ def read_dag_nodes_log(wms_path):
     FileNotFoundError
         If cannot find DAGMan node log in given wms_path.
     """
-    try:
-        filename = next(Path(wms_path).glob("*.dag.nodes.log"))
-    except StopIteration as exc:
-        raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc
     _LOG.debug("dag node log filename: %s", filename)
+    filename = Path(filename)
+
+    info: dict[str, dict[str, Any]] = {}
+    if not filename.exists():
+        raise FileNotFoundError(f"{filename} does not exist")
+
+    try:
+        job_event_log = htcondor.JobEventLog(str(filename))
+    except htcondor.HTCondorIOError as ex:
+        _LOG.error("Problem reading nodes log file (%s): %s", filename, ex)
+        import traceback

-
-
+        traceback.print_stack()
+        raise
     for event in job_event_log.events(stop_after=0):
-
-
-
-
-
+        _LOG.debug("log event type = %s, keys = %s", event["EventTypeNumber"], event.keys())
+
+        try:
+            id_ = f"{event['Cluster']}.{event['Proc']}"
+        except KeyError:
+            _LOG.warn(
+                "Log event missing ids (DAGNodeName=%s, EventTime=%s, EventTypeNumber=%s)",
+                event.get("DAGNodeName", "UNK"),
+                event.get("EventTime", "UNK"),
+                event.get("EventTypeNumber", "UNK"),
+            )
+        else:
+            if id_ not in info:
+                info[id_] = {}
+            # Workaround: Please check to see if still problem in
+            # future HTCondor versions. Sometimes get a
+            # JobAbortedEvent for a subdag job after it already
+            # terminated normally. Seems to happen when using job
+            # plus subdags.
+            if event["EventTypeNumber"] == 9 and info[id_].get("EventTypeNumber", -1) == 5:
+                _LOG.debug("Skipping spurious JobAbortedEvent: %s", dict(event))
+            else:
+                _update_dicts(info[id_], event)
+                info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]

     # Add more condor_q-like info to info parsed from log file.
     for job in info.values():
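The per-event loop above keys everything by "Cluster.Proc" and skips a JobAbortedEvent (event type 9) that arrives after a normal JobTerminatedEvent (event type 5). A self-contained sketch of that bookkeeping, with plain dicts standing in for htcondor job events (the values are made up):

info: dict[str, dict] = {}
events = [
    {"Cluster": 123, "Proc": 0, "EventTypeNumber": 5, "EventTime": "2025-04-01T12:00:00"},
    {"Cluster": 123, "Proc": 0, "EventTypeNumber": 9, "EventTime": "2025-04-01T12:00:05"},  # spurious abort
]
for event in events:
    id_ = f"{event['Cluster']}.{event['Proc']}"
    entry = info.setdefault(id_, {})
    if event["EventTypeNumber"] == 9 and entry.get("EventTypeNumber", -1) == 5:
        continue  # keep the terminal "terminated" state, as the workaround above does
    entry.update(event)
print(info["123.0"]["EventTypeNumber"])  # prints 5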
@@ -1585,17 +1982,54 @@ def read_dag_nodes_log(wms_path):
     return info
 
 
-def
+def read_dag_nodes_log(wms_path: str | os.PathLike) -> dict[str, dict[str, Any]]:
+    """Read job information from the DAGMan nodes log file.
+
+    Parameters
+    ----------
+    wms_path : `str` or `os.PathLike`
+        Path containing the DAGMan nodes log file.
+
+    Returns
+    -------
+    info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
+        HTCondor job information read from the log file mapped to HTCondor
+        job id.
+
+    Raises
+    ------
+    FileNotFoundError
+        If cannot find DAGMan node log in given wms_path.
+    """
+    info: dict[str, dict[str, Any]] = {}
+    for filename in Path(wms_path).glob("*.dag.nodes.log"):
+        _LOG.debug("dag node log filename: %s", filename)
+        _update_dicts(info, read_single_dag_nodes_log(filename))
+
+    # If submitted, the main nodes log file should exist
+    if not info:
+        raise FileNotFoundError(f"DAGMan node log not found in {wms_path}")
+
+    # Subdags will not have dag nodes log files if they haven't
+    # started running yet (so missing is not an error).
+    for filename in Path(wms_path).glob("subdags/*/*.dag.nodes.log"):
+        _LOG.debug("dag node log filename: %s", filename)
+        _update_dicts(info, read_single_dag_nodes_log(filename))
+
+    return info
+
+
+def read_dag_info(wms_path: str | os.PathLike) -> dict[str, dict[str, Any]]:
     """Read custom DAGMan job information from the file.
 
     Parameters
     ----------
-    wms_path : `str`
+    wms_path : `str` or `os.PathLike`
         Path containing the file with the DAGMan job info.
 
     Returns
     -------
-    dag_info : `dict` [`str`, `dict` [`str`, Any]]
+    dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
         HTCondor job information.
 
     Raises
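A usage sketch for the new read_dag_nodes_log shown above: it now merges the main nodes log with any subdag nodes logs it finds under the submit directory. The directory path is illustrative; the import path follows the module this diff covers:

from lsst.ctrl.bps.htcondor.lssthtc import read_dag_nodes_log

# <wms_path>/<name>.dag.nodes.log                   must exist (else FileNotFoundError)
# <wms_path>/subdags/<group>/<name>.dag.nodes.log   merged in when present
info = read_dag_nodes_log("/path/to/submit/u/user/run1")
for job_id, job_ad in info.items():
    print(job_id, sorted(job_ad))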
@@ -1603,6 +2037,7 @@ def read_dag_info(wms_path):
     FileNotFoundError
         If cannot find DAGMan job info file in the given location.
     """
+    dag_info: dict[str, dict[str, Any]] = {}
     try:
         filename = next(Path(wms_path).glob("*.info.json"))
     except StopIteration as exc:
@@ -1613,7 +2048,6 @@ def read_dag_info(wms_path):
             dag_info = json.load(fh)
     except (OSError, PermissionError) as exc:
         _LOG.debug("Retrieving DAGMan job information failed: %s", exc)
-        dag_info = {}
     return dag_info
 
 
@@ -1624,7 +2058,7 @@ def write_dag_info(filename, dag_info):
     ----------
     filename : `str`
         Name of the file where the information will be stored.
-    dag_info : `dict` [`str` `dict` [`str`, Any]]
+    dag_info : `dict` [`str` `dict` [`str`, `~typing.Any`]]
         Information about the DAGMan job.
     """
     schedd_name = next(iter(dag_info))
@@ -1647,7 +2081,7 @@ def _tweak_log_info(filename, job):
     ----------
     filename : `pathlib.Path`
         Name of the DAGMan log.
-    job : `dict` [ `str`, Any ]
+    job : `dict` [ `str`, `~typing.Any` ]
         A mapping between HTCondor job id and job information read from
         the log.
     """
@@ -1661,37 +2095,47 @@ def _tweak_log_info(filename, job):

         match job["MyType"]:
             case "ExecuteEvent":
-                job["JobStatus"] = JobStatus.RUNNING
+                job["JobStatus"] = htcondor.JobStatus.RUNNING
             case "JobTerminatedEvent" | "PostScriptTerminatedEvent":
-                job["JobStatus"] = JobStatus.COMPLETED
+                job["JobStatus"] = htcondor.JobStatus.COMPLETED
             case "SubmitEvent":
-                job["JobStatus"] = JobStatus.IDLE
+                job["JobStatus"] = htcondor.JobStatus.IDLE
             case "JobAbortedEvent":
-                job["JobStatus"] = JobStatus.REMOVED
+                job["JobStatus"] = htcondor.JobStatus.REMOVED
             case "JobHeldEvent":
-                job["JobStatus"] = JobStatus.HELD
+                job["JobStatus"] = htcondor.JobStatus.HELD
+            case "JobReleaseEvent":
+                # Shows up as last event if a DAG job was held and released
+                # so assume job is running. If regular job is released, there
+                # will be other events so JobReleaseEvent won't be the last
+                job["JobStatus"] = htcondor.JobStatus.RUNNING
             case _:
                 _LOG.debug("Unknown log event type: %s", job["MyType"])
-                job["JobStatus"] =
+                job["JobStatus"] = None

-        if job["JobStatus"] in {JobStatus.COMPLETED, JobStatus.HELD}:
+        if job["JobStatus"] in {htcondor.JobStatus.COMPLETED, htcondor.JobStatus.HELD}:
             new_job = HTC_JOB_AD_HANDLERS.handle(job)
             if new_job is not None:
                 job = new_job
             else:
                 _LOG.error("Could not determine exit status for job '%s.%s'", job["ClusterId"], job["ProcId"])

+        if "LogNotes" in job:
+            m = re.match(r"DAG Node: (\S+)", job["LogNotes"])
+            if m:
+                job["DAGNodeName"] = m.group(1)
+
     except KeyError as e:
         _LOG.error("Missing key %s in job: %s", str(e), job)
         raise
 
 
-def htc_check_dagman_output(wms_path):
+def htc_check_dagman_output(wms_path: str | os.PathLike) -> str:
     """Check the DAGMan output for error messages.
 
     Parameters
     ----------
-    wms_path : `str`
+    wms_path : `str` or `os.PathLike`
         Directory containing the DAGman output file.
 
     Returns
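The new LogNotes handling above recovers the DAG node name from a job event's notes. A quick standalone check of that pattern; the note text is a made-up example:

import re

log_notes = "DAG Node: pipetaskInit"  # illustrative value only
m = re.match(r"DAG Node: (\S+)", log_notes)
if m:
    print(m.group(1))  # pipetaskInit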
@@ -1711,32 +2155,176 @@ def htc_check_dagman_output(wms_path):
         raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc
     _LOG.debug("dag output filename: %s", filename)

+    p = re.compile(r"^(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) (Job submit try \d+/\d+ failed|Warning:.*$|ERROR:.*$)")
+
     message = ""
     try:
         with open(filename) as fh:
-            last_submit_failed = ""
+            last_submit_failed = ""  # Since submit retries multiple times only report last one
             for line in fh:
-                m =
+                m = p.match(line)
                 if m:
-
-
-                    m
-
-
+                    if m.group(2).startswith("Job submit try"):
+                        last_submit_failed = m.group(1)
+                    elif m.group(2).startswith("ERROR: submit attempt failed"):
+                        pass  # Should be handled by Job submit try
+                    elif m.group(2).startswith("Warning"):
+                        if ".dag.nodes.log is in /tmp" in m.group(2):
                             last_warning = "Cannot submit from /tmp."
                         else:
-                            last_warning = m.group(
+                            last_warning = m.group(2)
+                    elif m.group(2) == "ERROR: Warning is fatal error because of DAGMAN_USE_STRICT setting":
+                        message += "ERROR: "
+                        message += last_warning
+                        message += "\n"
+                    elif m.group(2) in [
+                        "ERROR: the following job(s) failed:",
+                        "ERROR: the following Node(s) failed:",
+                    ]:
+                        pass
                     else:
-
-
-
-                        m.group(1)
-                        == "ERROR: Warning is fatal error because of DAGMAN_USE_STRICT setting"
-                    ):
-                        message += f"ERROR: {last_warning}"
+                        message += m.group(2)
+                        message += "\n"
+
             if last_submit_failed:
                 message += f"Warn: Job submission issues (last: {last_submit_failed})"
     except (OSError, PermissionError):
         message = f"Warn: Could not read dagman output file from {wms_path}."
     _LOG.debug("dag output file message: %s", message)
     return message
+
+
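A quick standalone check of the scanning pattern compiled above against a sample dagman.out line; the line itself is illustrative:

import re

p = re.compile(r"^(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) (Job submit try \d+/\d+ failed|Warning:.*$|ERROR:.*$)")
line = "04/01/25 12:34:56 Job submit try 1/5 failed"
m = p.match(line)
if m:
    print(m.group(1), "->", m.group(2))  # 04/01/25 12:34:56 -> Job submit try 1/5 failed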
+def _read_rescue_headers(infh: TextIO) -> tuple[list[str], list[str]]:
+    """Read header lines from a rescue file.
+
+    Parameters
+    ----------
+    infh : `TextIO`
+        The rescue file from which to read the header lines.
+
+    Returns
+    -------
+    header_lines : `list` [`str`]
+        Header lines read from the rescue file.
+    failed_subdags : `list` [`str`]
+        Names of failed subdag jobs.
+    """
+    header_lines: list[str] = []
+    failed = False
+    failed_subdags: list[str] = []
+
+    for line in infh:
+        line = line.strip()
+        if line.startswith("#"):
+            if line.startswith("# Nodes that failed:"):
+                failed = True
+                header_lines.append(line)
+            elif failed:
+                orig_failed_nodes = line[1:].strip().split(",")
+                new_failed_nodes = []
+                for node in orig_failed_nodes:
+                    if node.startswith("wms_check_status"):
+                        group_node = node[17:]
+                        failed_subdags.append(group_node)
+                        new_failed_nodes.append(group_node)
+                    else:
+                        new_failed_nodes.append(node)
+                header_lines.append(f"# {','.join(new_failed_nodes)}")
+                if orig_failed_nodes[-1] == "<ENDLIST>":
+                    failed = False
+            else:
+                header_lines.append(line)
+        elif line.strip() == "":  # end of headers
+            break
+    return header_lines, failed_subdags
+
+
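A behavior sketch for _read_rescue_headers above, feeding it an illustrative rescue-file fragment via io.StringIO. The header layout is inferred from the parsing code (real DAGMan rescue files contain additional headers), and the private helper is assumed to be in scope, e.g., when run inside this module:

import io

rescue_text = (
    "# Nodes premarked DONE: 2\n"
    "# Nodes that failed: 1\n"
    "#   wms_check_status_group1,<ENDLIST>\n"
    "\n"
    "DONE pipetaskInit\n"
)
header_lines, failed_subdags = _read_rescue_headers(io.StringIO(rescue_text))
# failed_subdags == ["group1"]; the failed-nodes line is rewritten to "# group1,<ENDLIST>"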
+def _write_rescue_headers(header_lines: list[str], failed_subdags: list[str], outfh: TextIO) -> None:
+    """Write the header lines to the new rescue file.
+
+    Parameters
+    ----------
+    header_lines : `list` [`str`]
+        Header lines to write to the new rescue file.
+    failed_subdags : `list` [`str`]
+        Job names of the failed subdags.
+    outfh : `TextIO`
+        New rescue file.
+    """
+    done_str = "# Nodes premarked DONE"
+    pattern = f"^{done_str}:\\s+(\\d+)"
+    for header_line in header_lines:
+        m = re.match(pattern, header_line)
+        if m:
+            print(f"{done_str}: {int(m.group(1)) - len(failed_subdags)}", file=outfh)
+        else:
+            print(header_line, file=outfh)
+
+    print("", file=outfh)
+
+
+def _copy_done_lines(failed_subdags: list[str], infh: TextIO, outfh: TextIO) -> None:
+    """Copy the DONE lines from the original rescue file skipping
+    the failed group jobs.
+
+    Parameters
+    ----------
+    failed_subdags : `list` [`str`]
+        List of job names for the failed subdags
+    infh : `TextIO`
+        Original rescue file to copy from.
+    outfh : `TextIO`
+        New rescue file to copy to.
+    """
+    for line in infh:
+        line = line.strip()
+        try:
+            _, node_name = line.split()
+        except ValueError:
+            _LOG.error(f"Unexpected line in rescue file = '{line}'")
+            raise
+        if node_name not in failed_subdags:
+            print(line, file=outfh)
+
+
+def _update_rescue_file(rescue_file: Path) -> None:
+    """Update the subdag failures in the main rescue file
+    and backup the failed subdag dirs.
+
+    Parameters
+    ----------
+    rescue_file : `pathlib.Path`
+        The main rescue file that needs to be updated.
+    """
+    # To reduce memory requirements, not reading entire file into memory.
+    rescue_tmp = rescue_file.with_suffix(rescue_file.suffix + ".tmp")
+    with open(rescue_file) as infh:
+        header_lines, failed_subdags = _read_rescue_headers(infh)
+        with open(rescue_tmp, "w") as outfh:
+            _write_rescue_headers(header_lines, failed_subdags, outfh)
+            _copy_done_lines(failed_subdags, infh, outfh)
+    rescue_file.unlink()
+    rescue_tmp.rename(rescue_file)
+    for failed_subdag in failed_subdags:
+        htc_backup_files(
+            rescue_file.parent / "subdags" / failed_subdag, subdir=f"backups/subdags/{failed_subdag}"
+        )
+
+
+def _update_dicts(dict1, dict2):
+    """Update dict1 with info in dict2.
+
+    (Basically an update for nested dictionaries.)
+
+    Parameters
+    ----------
+    dict1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
+        HTCondor job information to be updated.
+    dict2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
+        Additional HTCondor job information.
+    """
+    for key, value in dict2.items():
+        if key in dict1 and isinstance(dict1[key], dict) and isinstance(value, dict):
+            _update_dicts(dict1[key], value)
+        else:
+            dict1[key] = value
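A behavior sketch for _update_dicts above (the helper is private, so assume it is in scope, e.g., within the module itself): nested dictionaries are merged recursively rather than replaced wholesale.

d1 = {"123.0": {"JobStatus": 1, "ClusterId": 123}}
d2 = {"123.0": {"JobStatus": 2}, "124.0": {"ClusterId": 124}}
_update_dicts(d1, d2)
# d1 == {"123.0": {"JobStatus": 2, "ClusterId": 123}, "124.0": {"ClusterId": 124}}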