lsst-ctrl-bps-htcondor 29.0.1rc1__py3-none-any.whl → 29.1.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/ctrl/bps/htcondor/htcondor_service.py +438 -209
- lsst/ctrl/bps/htcondor/lssthtc.py +864 -261
- lsst/ctrl/bps/htcondor/version.py +1 -1
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc2.dist-info}/METADATA +1 -1
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc2.dist-info}/RECORD +12 -12
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc2.dist-info}/WHEEL +1 -1
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc2.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc2.dist-info}/licenses/LICENSE +0 -0
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc2.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc2.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc2.dist-info}/top_level.txt +0 -0
- {lsst_ctrl_bps_htcondor-29.0.1rc1.dist-info → lsst_ctrl_bps_htcondor-29.1.0rc2.dist-info}/zip-safe +0 -0
|
@@ -38,9 +38,9 @@ __all__ = [
|
|
|
38
38
|
"DagStatus",
|
|
39
39
|
"HTCDag",
|
|
40
40
|
"HTCJob",
|
|
41
|
-
"JobStatus",
|
|
42
41
|
"NodeStatus",
|
|
43
42
|
"RestrictedDict",
|
|
43
|
+
"WmsNodeType",
|
|
44
44
|
"condor_history",
|
|
45
45
|
"condor_q",
|
|
46
46
|
"condor_search",
|
|
@@ -65,7 +65,6 @@ __all__ = [
|
|
|
65
65
|
"read_node_status",
|
|
66
66
|
"summarize_dag",
|
|
67
67
|
"update_job_info",
|
|
68
|
-
"update_job_info",
|
|
69
68
|
"write_dag_info",
|
|
70
69
|
]
|
|
71
70
|
|
|
@@ -77,23 +76,24 @@ import os
|
|
|
77
76
|
import pprint
|
|
78
77
|
import re
|
|
79
78
|
import subprocess
|
|
80
|
-
from collections import defaultdict
|
|
79
|
+
from collections import Counter, defaultdict
|
|
81
80
|
from collections.abc import MutableMapping
|
|
82
81
|
from datetime import datetime, timedelta
|
|
83
|
-
from enum import IntEnum
|
|
82
|
+
from enum import IntEnum, auto
|
|
84
83
|
from pathlib import Path
|
|
85
|
-
from typing import Any
|
|
84
|
+
from typing import Any, TextIO
|
|
86
85
|
|
|
87
86
|
import classad
|
|
88
87
|
import htcondor
|
|
89
88
|
import networkx
|
|
89
|
+
from deprecated.sphinx import deprecated
|
|
90
90
|
from packaging import version
|
|
91
91
|
|
|
92
92
|
from .handlers import HTC_JOB_AD_HANDLERS
|
|
93
93
|
|
|
94
94
|
_LOG = logging.getLogger(__name__)
|
|
95
95
|
|
|
96
|
-
MISSING_ID = -99999
|
|
96
|
+
MISSING_ID = "-99999"
|
|
97
97
|
|
|
98
98
|
|
|
99
99
|
class DagStatus(IntEnum):
|
|
@@ -108,6 +108,13 @@ class DagStatus(IntEnum):
|
|
|
108
108
|
SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8)
|
|
109
109
|
|
|
110
110
|
|
|
111
|
+
@deprecated(
|
|
112
|
+
reason="The JobStatus is internally replaced by htcondor.JobStatus. "
|
|
113
|
+
"External reporting code should be using ctrl_bps.WmsStates. "
|
|
114
|
+
"This class will be removed after v30.",
|
|
115
|
+
version="v30.0",
|
|
116
|
+
category=FutureWarning,
|
|
117
|
+
)
|
|
111
118
|
class JobStatus(IntEnum):
|
|
112
119
|
"""HTCondor's statuses for jobs."""
|
|
113
120
|
|
|
@@ -155,6 +162,31 @@ class NodeStatus(IntEnum):
|
|
|
155
162
|
FUTILE = 7
|
|
156
163
|
|
|
157
164
|
|
|
165
|
+
class WmsNodeType(IntEnum):
|
|
166
|
+
"""HTCondor plugin node types to help with payload reporting."""
|
|
167
|
+
|
|
168
|
+
UNKNOWN = auto()
|
|
169
|
+
"""Dummy value when missing."""
|
|
170
|
+
|
|
171
|
+
PAYLOAD = auto()
|
|
172
|
+
"""Payload job."""
|
|
173
|
+
|
|
174
|
+
FINAL = auto()
|
|
175
|
+
"""Final job."""
|
|
176
|
+
|
|
177
|
+
SERVICE = auto()
|
|
178
|
+
"""Service job."""
|
|
179
|
+
|
|
180
|
+
NOOP = auto()
|
|
181
|
+
"""NOOP job used for ordering jobs."""
|
|
182
|
+
|
|
183
|
+
SUBDAG = auto()
|
|
184
|
+
"""SUBDAG job used for ordering jobs."""
|
|
185
|
+
|
|
186
|
+
SUBDAG_CHECK = auto()
|
|
187
|
+
"""Job used to correctly prune jobs after a subdag."""
|
|
188
|
+
|
|
189
|
+
|
|
158
190
|
HTC_QUOTE_KEYS = {"environment"}
|
|
159
191
|
HTC_VALID_JOB_KEYS = {
|
|
160
192
|
"universe",
|
|
@@ -189,7 +221,18 @@ HTC_VALID_JOB_KEYS = {
|
|
|
189
221
|
"accounting_group",
|
|
190
222
|
"accounting_group_user",
|
|
191
223
|
}
|
|
192
|
-
HTC_VALID_JOB_DAG_KEYS = {
|
|
224
|
+
HTC_VALID_JOB_DAG_KEYS = {
|
|
225
|
+
"dir",
|
|
226
|
+
"noop",
|
|
227
|
+
"done",
|
|
228
|
+
"vars",
|
|
229
|
+
"pre",
|
|
230
|
+
"post",
|
|
231
|
+
"retry",
|
|
232
|
+
"retry_unless_exit",
|
|
233
|
+
"abort_dag_on",
|
|
234
|
+
"abort_exit",
|
|
235
|
+
}
|
|
193
236
|
HTC_VERSION = version.parse(htcondor.__version__)
|
|
194
237
|
|
|
195
238
|
|
|
@@ -224,7 +267,7 @@ class RestrictedDict(MutableMapping):
|
|
|
224
267
|
|
|
225
268
|
Returns
|
|
226
269
|
-------
|
|
227
|
-
value : `~
|
|
270
|
+
value : `~typing.Any`
|
|
228
271
|
Value associated with given key.
|
|
229
272
|
|
|
230
273
|
Raises
|
|
@@ -256,7 +299,7 @@ class RestrictedDict(MutableMapping):
|
|
|
256
299
|
----------
|
|
257
300
|
key : `str`
|
|
258
301
|
Identifier to associate with given value.
|
|
259
|
-
value : `~
|
|
302
|
+
value : `~typing.Any`
|
|
260
303
|
Value to store.
|
|
261
304
|
|
|
262
305
|
Raises
|
|
@@ -278,7 +321,9 @@ class RestrictedDict(MutableMapping):
|
|
|
278
321
|
return str(self.data)
|
|
279
322
|
|
|
280
323
|
|
|
281
|
-
def htc_backup_files(
|
|
324
|
+
def htc_backup_files(
|
|
325
|
+
wms_path: str | os.PathLike, subdir: str | os.PathLike | None = None, limit: int = 100
|
|
326
|
+
) -> Path | None:
|
|
282
327
|
"""Backup select HTCondor files in the submit directory.
|
|
283
328
|
|
|
284
329
|
Files will be saved in separate subdirectories which will be created in
|
|
@@ -293,9 +338,9 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
|
|
|
293
338
|
|
|
294
339
|
Parameters
|
|
295
340
|
----------
|
|
296
|
-
wms_path : `str` or `
|
|
341
|
+
wms_path : `str` or `os.PathLike`
|
|
297
342
|
Path to the submit directory either absolute or relative.
|
|
298
|
-
subdir : `str` or `
|
|
343
|
+
subdir : `str` or `os.PathLike`, optional
|
|
299
344
|
A path, relative to the submit directory, where all subdirectories with
|
|
300
345
|
backup files will be kept. Defaults to None which means that the backup
|
|
301
346
|
subdirectories will be placed directly in the submit directory.
|
|
@@ -305,6 +350,11 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
|
|
|
305
350
|
to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in
|
|
306
351
|
version 8.8+.
|
|
307
352
|
|
|
353
|
+
Returns
|
|
354
|
+
-------
|
|
355
|
+
last_rescue_file : `pathlib.Path` or None
|
|
356
|
+
Path to the latest rescue file or None if doesn't exist.
|
|
357
|
+
|
|
308
358
|
Raises
|
|
309
359
|
------
|
|
310
360
|
FileNotFoundError
|
|
@@ -327,17 +377,18 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
|
|
|
327
377
|
raise FileNotFoundError(f"Directory {path} not found")
|
|
328
378
|
|
|
329
379
|
# Initialize the backup counter.
|
|
330
|
-
rescue_dags = list(
|
|
380
|
+
rescue_dags = list(path.glob("*.rescue[0-9][0-9][0-9]"))
|
|
331
381
|
counter = min(len(rescue_dags), limit)
|
|
332
382
|
|
|
333
383
|
# Create the backup directory and move select files there.
|
|
334
|
-
dest =
|
|
384
|
+
dest = path
|
|
335
385
|
if subdir:
|
|
336
386
|
# PurePath.is_relative_to() is not available before Python 3.9. Hence
|
|
337
387
|
# we need to check is 'subdir' is in the submit directory in some other
|
|
338
388
|
# way if it is an absolute path.
|
|
339
389
|
subdir = Path(subdir)
|
|
340
390
|
if subdir.is_absolute():
|
|
391
|
+
subdir = subdir.resolve() # Since resolve was run on path, must run it here
|
|
341
392
|
if dest not in subdir.parents:
|
|
342
393
|
_LOG.warning(
|
|
343
394
|
"Invalid backup location: '%s' not in the submit directory, will use '%s' instead.",
|
|
@@ -349,21 +400,66 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
|
|
|
349
400
|
else:
|
|
350
401
|
dest /= subdir
|
|
351
402
|
dest /= f"{counter:0{width}}"
|
|
403
|
+
_LOG.debug("dest = %s", dest)
|
|
352
404
|
try:
|
|
353
405
|
dest.mkdir(parents=True, exist_ok=False if counter < limit else True)
|
|
354
406
|
except FileExistsError:
|
|
355
407
|
_LOG.warning("Refusing to do backups: target directory '%s' already exists", dest)
|
|
356
408
|
else:
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
409
|
+
htc_backup_files_single_path(path, dest)
|
|
410
|
+
|
|
411
|
+
# also back up any subdag info
|
|
412
|
+
for subdag_dir in path.glob("subdags/*"):
|
|
413
|
+
subdag_dest = dest / subdag_dir.relative_to(path)
|
|
414
|
+
subdag_dest.mkdir(parents=True, exist_ok=False)
|
|
415
|
+
htc_backup_files_single_path(subdag_dir, subdag_dest)
|
|
416
|
+
|
|
417
|
+
last_rescue_file = rescue_dags[-1] if rescue_dags else None
|
|
418
|
+
_LOG.debug("last_rescue_file = %s", last_rescue_file)
|
|
419
|
+
return last_rescue_file
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def htc_backup_files_single_path(src: str | os.PathLike, dest: str | os.PathLike) -> None:
|
|
423
|
+
"""Move particular htc files to a different directory for later debugging.
|
|
424
|
+
|
|
425
|
+
Parameters
|
|
426
|
+
----------
|
|
427
|
+
src : `str` or `os.PathLike`
|
|
428
|
+
Directory from which to backup particular files.
|
|
429
|
+
dest : `str` or `os.PathLike`
|
|
430
|
+
Directory to which particular files are moved.
|
|
431
|
+
|
|
432
|
+
Raises
|
|
433
|
+
------
|
|
434
|
+
RuntimeError
|
|
435
|
+
If given dest directory matches given src directory.
|
|
436
|
+
OSError
|
|
437
|
+
If problems moving file.
|
|
438
|
+
FileNotFoundError
|
|
439
|
+
Item matching pattern in src directory isn't a file.
|
|
440
|
+
"""
|
|
441
|
+
src = Path(src)
|
|
442
|
+
dest = Path(dest)
|
|
443
|
+
if dest.samefile(src):
|
|
444
|
+
raise RuntimeError(f"Destination directory is same as the source directory ({src})")
|
|
445
|
+
|
|
446
|
+
for patt in [
|
|
447
|
+
"*.info.*",
|
|
448
|
+
"*.dag.metrics",
|
|
449
|
+
"*.dag.nodes.log",
|
|
450
|
+
"*.node_status",
|
|
451
|
+
"wms_*.dag.post.out",
|
|
452
|
+
"wms_*.status.txt",
|
|
453
|
+
]:
|
|
454
|
+
for source in src.glob(patt):
|
|
455
|
+
if source.is_file():
|
|
456
|
+
target = dest / source.relative_to(src)
|
|
457
|
+
try:
|
|
458
|
+
source.rename(target)
|
|
459
|
+
except OSError as exc:
|
|
460
|
+
raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None
|
|
461
|
+
else:
|
|
462
|
+
raise FileNotFoundError(f"Backing up '{source}' failed: not a file")
|
|
367
463
|
|
|
368
464
|
|
|
369
465
|
def htc_escape(value):
|
|
@@ -371,12 +467,12 @@ def htc_escape(value):
|
|
|
371
467
|
|
|
372
468
|
Parameters
|
|
373
469
|
----------
|
|
374
|
-
value : `~
|
|
470
|
+
value : `~typing.Any`
|
|
375
471
|
Value that needs to have characters escaped if string.
|
|
376
472
|
|
|
377
473
|
Returns
|
|
378
474
|
-------
|
|
379
|
-
new_value : `~
|
|
475
|
+
new_value : `~typing.Any`
|
|
380
476
|
Given value with characters escaped appropriate for HTCondor if string.
|
|
381
477
|
"""
|
|
382
478
|
if isinstance(value, str):
|
|
@@ -407,12 +503,14 @@ def htc_write_attribs(stream, attrs):
|
|
|
407
503
|
print(f"+{key} = {pval}", file=stream)
|
|
408
504
|
|
|
409
505
|
|
|
410
|
-
def htc_write_condor_file(
|
|
506
|
+
def htc_write_condor_file(
|
|
507
|
+
filename: str | os.PathLike, job_name: str, job: RestrictedDict, job_attrs: dict[str, Any]
|
|
508
|
+
) -> None:
|
|
411
509
|
"""Write an HTCondor submit file.
|
|
412
510
|
|
|
413
511
|
Parameters
|
|
414
512
|
----------
|
|
415
|
-
filename : `str`
|
|
513
|
+
filename : `str` or os.PathLike
|
|
416
514
|
Filename for the HTCondor submit file.
|
|
417
515
|
job_name : `str`
|
|
418
516
|
Job name to use in submit file.
|
|
@@ -463,7 +561,7 @@ if HTC_VERSION < version.parse("8.9.8"):
|
|
|
463
561
|
|
|
464
562
|
Returns
|
|
465
563
|
-------
|
|
466
|
-
kwargs : `dict` [`str`, Any]
|
|
564
|
+
kwargs : `dict` [`str`, `~typing.Any`]
|
|
467
565
|
Keywords arguments that are guaranteed to work with the Python
|
|
468
566
|
HTCondor API.
|
|
469
567
|
|
|
@@ -501,7 +599,7 @@ else:
|
|
|
501
599
|
|
|
502
600
|
Returns
|
|
503
601
|
-------
|
|
504
|
-
kwargs : `dict` [`str`, Any]
|
|
602
|
+
kwargs : `dict` [`str`, `~typing.Any`]
|
|
505
603
|
Keywords arguments that were passed to the function.
|
|
506
604
|
"""
|
|
507
605
|
return kwargs
|
|
@@ -521,7 +619,7 @@ def htc_query_history(schedds, **kwargs):
|
|
|
521
619
|
------
|
|
522
620
|
schedd_name : `str`
|
|
523
621
|
Name of the HTCondor scheduler managing the job queue.
|
|
524
|
-
job_ad : `dict` [`str`, Any]
|
|
622
|
+
job_ad : `dict` [`str`, `~typing.Any`]
|
|
525
623
|
A dictionary representing HTCondor ClassAd describing a job. It maps
|
|
526
624
|
job attributes names to values of the ClassAd expressions they
|
|
527
625
|
represent.
|
|
@@ -549,7 +647,7 @@ def htc_query_present(schedds, **kwargs):
|
|
|
549
647
|
------
|
|
550
648
|
schedd_name : `str`
|
|
551
649
|
Name of the HTCondor scheduler managing the job queue.
|
|
552
|
-
job_ad : `dict` [`str`, Any]
|
|
650
|
+
job_ad : `dict` [`str`, `~typing.Any`]
|
|
553
651
|
A dictionary representing HTCondor ClassAd describing a job. It maps
|
|
554
652
|
job attributes names to values of the ClassAd expressions they
|
|
555
653
|
represent.
|
|
@@ -581,7 +679,8 @@ def htc_submit_dag(sub):
|
|
|
581
679
|
|
|
582
680
|
Returns
|
|
583
681
|
-------
|
|
584
|
-
schedd_job_info : `dict` [`str`, `dict` [`str`,
|
|
682
|
+
schedd_job_info : `dict` [`str`, `dict` [`str`, \
|
|
683
|
+
`dict` [`str`, `~typing.Any`]]]
|
|
585
684
|
Information about jobs satisfying the search criteria where for each
|
|
586
685
|
Scheduler, local HTCondor job ids are mapped to their respective
|
|
587
686
|
classads.
|
|
@@ -604,14 +703,14 @@ def htc_submit_dag(sub):
|
|
|
604
703
|
return schedd_dag_info
|
|
605
704
|
|
|
606
705
|
|
|
607
|
-
def htc_create_submit_from_dag(dag_filename, submit_options
|
|
706
|
+
def htc_create_submit_from_dag(dag_filename: str, submit_options: dict[str, Any]) -> htcondor.Submit:
|
|
608
707
|
"""Create a DAGMan job submit description.
|
|
609
708
|
|
|
610
709
|
Parameters
|
|
611
710
|
----------
|
|
612
711
|
dag_filename : `str`
|
|
613
712
|
Name of file containing HTCondor DAG commands.
|
|
614
|
-
submit_options : `dict` [`str`, Any], optional
|
|
713
|
+
submit_options : `dict` [`str`, `~typing.Any`], optional
|
|
615
714
|
Contains extra options for command line (Value of None means flag).
|
|
616
715
|
|
|
617
716
|
Returns
|
|
@@ -624,6 +723,34 @@ def htc_create_submit_from_dag(dag_filename, submit_options=None):
|
|
|
624
723
|
Use with HTCondor versions which support htcondor.Submit.from_dag(),
|
|
625
724
|
i.e., 8.9.3 or newer.
|
|
626
725
|
"""
|
|
726
|
+
# Passing do_recurse as submit_option does not seem to
|
|
727
|
+
# override DAGMAN_GENERATE_SUBDAG_SUBMITS as manual implies.
|
|
728
|
+
# So setting it and the other bps required setting here as
|
|
729
|
+
# environment variables if they don't exist.
|
|
730
|
+
var_name = "_CONDOR_DAGMAN_MANAGER_JOB_APPEND_GETENV"
|
|
731
|
+
if var_name not in os.environ:
|
|
732
|
+
os.environ[var_name] = "True"
|
|
733
|
+
|
|
734
|
+
if "do_recurse" in submit_options:
|
|
735
|
+
var_name = "_CONDOR_DAGMAN_GENERATE_SUBDAG_SUBMITS"
|
|
736
|
+
if var_name not in os.environ:
|
|
737
|
+
os.environ[var_name] = str(submit_options["do_recurse"])
|
|
738
|
+
|
|
739
|
+
# Config and environment variables do not seem to override -MaxIdle
|
|
740
|
+
# on the .dag.condor.sub's command line (broken in some 24.0.x versions).
|
|
741
|
+
# Explicitly forward them as a submit_option if either exists.
|
|
742
|
+
# Note: auto generated subdag submit files are still the -MaxIdle=1000
|
|
743
|
+
# in the broken versions.
|
|
744
|
+
if "MaxIdle" not in submit_options:
|
|
745
|
+
max_jobs_idle: int | None = None
|
|
746
|
+
config_var_name = "DAGMAN_MAX_JOBS_IDLE"
|
|
747
|
+
if f"_CONDOR_{config_var_name}" in os.environ:
|
|
748
|
+
max_jobs_idle = int(os.environ[f"_CONDOR_{config_var_name}"])
|
|
749
|
+
elif config_var_name in htcondor.param:
|
|
750
|
+
max_jobs_idle = htcondor.param[config_var_name]
|
|
751
|
+
if max_jobs_idle:
|
|
752
|
+
submit_options["MaxIdle"] = max_jobs_idle
|
|
753
|
+
|
|
627
754
|
return htcondor.Submit.from_dag(dag_filename, submit_options)
|
|
628
755
|
|
|
629
756
|
|
|
@@ -637,7 +764,7 @@ def htc_create_submit_from_cmd(dag_filename, submit_options=None):
|
|
|
637
764
|
----------
|
|
638
765
|
dag_filename : `str`
|
|
639
766
|
Name of file containing HTCondor DAG commands.
|
|
640
|
-
submit_options : `dict` [`str`, Any], optional
|
|
767
|
+
submit_options : `dict` [`str`, `~typing.Any`], optional
|
|
641
768
|
Contains extra options for command line (Value of None means flag).
|
|
642
769
|
|
|
643
770
|
Returns
|
|
@@ -702,7 +829,7 @@ def htc_create_submit_from_file(submit_file):
|
|
|
702
829
|
return htcondor.Submit(descriptors)
|
|
703
830
|
|
|
704
831
|
|
|
705
|
-
def _htc_write_job_commands(stream, name,
|
|
832
|
+
def _htc_write_job_commands(stream, name, commands):
|
|
706
833
|
"""Output the DAGMan job lines for single job in DAG.
|
|
707
834
|
|
|
708
835
|
Parameters
|
|
@@ -711,40 +838,60 @@ def _htc_write_job_commands(stream, name, jobs):
|
|
|
711
838
|
Writeable text stream (typically an opened file).
|
|
712
839
|
name : `str`
|
|
713
840
|
Job name.
|
|
714
|
-
|
|
715
|
-
DAG
|
|
841
|
+
commands : `RestrictedDict`
|
|
842
|
+
DAG commands for a job.
|
|
716
843
|
"""
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
f"
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
844
|
+
# Note: optional pieces of commands include a space at the beginning.
|
|
845
|
+
# also making sure values aren't empty strings as placeholders.
|
|
846
|
+
if "pre" in commands and commands["pre"]:
|
|
847
|
+
defer = ""
|
|
848
|
+
if "defer" in commands["pre"] and commands["pre"]["defer"]:
|
|
849
|
+
defer = f" DEFER {commands['pre']['defer']['status']} {commands['pre']['defer']['time']}"
|
|
850
|
+
|
|
851
|
+
debug = ""
|
|
852
|
+
if "debug" in commands["pre"] and commands["pre"]["debug"]:
|
|
853
|
+
debug = f" DEBUG {commands['pre']['debug']['filename']} {commands['pre']['debug']['type']}"
|
|
854
|
+
|
|
855
|
+
arguments = ""
|
|
856
|
+
if "arguments" in commands["pre"] and commands["pre"]["arguments"]:
|
|
857
|
+
arguments = f" {commands['pre']['arguments']}"
|
|
858
|
+
|
|
859
|
+
executable = commands["pre"]["executable"]
|
|
860
|
+
print(f"SCRIPT{defer}{debug} PRE {name} {executable}{arguments}", file=stream)
|
|
861
|
+
|
|
862
|
+
if "post" in commands and commands["post"]:
|
|
863
|
+
defer = ""
|
|
864
|
+
if "defer" in commands["post"] and commands["post"]["defer"]:
|
|
865
|
+
defer = f" DEFER {commands['post']['defer']['status']} {commands['post']['defer']['time']}"
|
|
866
|
+
|
|
867
|
+
debug = ""
|
|
868
|
+
if "debug" in commands["post"] and commands["post"]["debug"]:
|
|
869
|
+
debug = f" DEBUG {commands['post']['debug']['filename']} {commands['post']['debug']['type']}"
|
|
870
|
+
|
|
871
|
+
arguments = ""
|
|
872
|
+
if "arguments" in commands["post"] and commands["post"]["arguments"]:
|
|
873
|
+
arguments = f" {commands['post']['arguments']}"
|
|
874
|
+
|
|
875
|
+
executable = commands["post"]["executable"]
|
|
876
|
+
print(f"SCRIPT{defer}{debug} POST {name} {executable}{arguments}", file=stream)
|
|
877
|
+
|
|
878
|
+
if "vars" in commands and commands["vars"]:
|
|
879
|
+
for key, value in commands["vars"].items():
|
|
733
880
|
print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream)
|
|
734
881
|
|
|
735
|
-
if "pre_skip" in
|
|
736
|
-
print(f"PRE_SKIP {name} {
|
|
882
|
+
if "pre_skip" in commands and commands["pre_skip"]:
|
|
883
|
+
print(f"PRE_SKIP {name} {commands['pre_skip']}", file=stream)
|
|
737
884
|
|
|
738
|
-
if "retry" in
|
|
739
|
-
print(f"RETRY {name} {
|
|
740
|
-
if "retry_unless_exit" in
|
|
741
|
-
print(f"UNLESS-EXIT {
|
|
742
|
-
print("
|
|
885
|
+
if "retry" in commands and commands["retry"]:
|
|
886
|
+
print(f"RETRY {name} {commands['retry']}", end="", file=stream)
|
|
887
|
+
if "retry_unless_exit" in commands:
|
|
888
|
+
print(f" UNLESS-EXIT {commands['retry_unless_exit']}", end="", file=stream)
|
|
889
|
+
print("", file=stream) # Since previous prints don't include new line
|
|
743
890
|
|
|
744
|
-
if "abort_dag_on" in
|
|
891
|
+
if "abort_dag_on" in commands and commands["abort_dag_on"]:
|
|
745
892
|
print(
|
|
746
|
-
f"ABORT-DAG-ON {name} {
|
|
747
|
-
f" RETURN {
|
|
893
|
+
f"ABORT-DAG-ON {name} {commands['abort_dag_on']['node_exit']}"
|
|
894
|
+
f" RETURN {commands['abort_dag_on']['abort_exit']}",
|
|
748
895
|
file=stream,
|
|
749
896
|
)
|
|
750
897
|
|
|
@@ -773,6 +920,8 @@ class HTCJob:
|
|
|
773
920
|
self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds)
|
|
774
921
|
self.attrs = initattrs
|
|
775
922
|
self.subfile = None
|
|
923
|
+
self.subdir = None
|
|
924
|
+
self.subdag = None
|
|
776
925
|
|
|
777
926
|
def __str__(self):
|
|
778
927
|
return self.name
|
|
@@ -810,33 +959,54 @@ class HTCJob:
|
|
|
810
959
|
if new_attrs:
|
|
811
960
|
self.attrs.update(new_attrs)
|
|
812
961
|
|
|
813
|
-
def write_submit_file(self, submit_path
|
|
962
|
+
def write_submit_file(self, submit_path: str | os.PathLike) -> None:
|
|
814
963
|
"""Write job description to submit file.
|
|
815
964
|
|
|
816
965
|
Parameters
|
|
817
966
|
----------
|
|
818
|
-
submit_path : `str`
|
|
967
|
+
submit_path : `str` or `os.PathLike`
|
|
819
968
|
Prefix path for the submit file.
|
|
820
|
-
job_subdir : `str`, optional
|
|
821
|
-
Template for job subdir.
|
|
822
969
|
"""
|
|
823
970
|
if not self.subfile:
|
|
824
971
|
self.subfile = f"{self.name}.sub"
|
|
825
|
-
job_subdir = job_subdir.format(self=self)
|
|
826
|
-
if job_subdir:
|
|
827
|
-
self.subfile = os.path.join(job_subdir, self.subfile)
|
|
828
|
-
htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs)
|
|
829
972
|
|
|
830
|
-
|
|
973
|
+
subfile = self.subfile
|
|
974
|
+
if self.subdir:
|
|
975
|
+
subfile = Path(self.subdir) / subfile
|
|
976
|
+
|
|
977
|
+
subfile = Path(os.path.expandvars(subfile))
|
|
978
|
+
if not subfile.is_absolute():
|
|
979
|
+
subfile = Path(submit_path) / subfile
|
|
980
|
+
if not subfile.exists():
|
|
981
|
+
htc_write_condor_file(subfile, self.name, self.cmds, self.attrs)
|
|
982
|
+
|
|
983
|
+
def write_dag_commands(self, stream, dag_rel_path, command_name="JOB"):
|
|
831
984
|
"""Write DAG commands for single job to output stream.
|
|
832
985
|
|
|
833
986
|
Parameters
|
|
834
987
|
----------
|
|
835
988
|
stream : `IO` or `str`
|
|
836
989
|
Output Stream.
|
|
990
|
+
dag_rel_path : `str`
|
|
991
|
+
Relative path of dag to submit directory.
|
|
992
|
+
command_name : `str`
|
|
993
|
+
Name of the DAG command (e.g., JOB, FINAL).
|
|
837
994
|
"""
|
|
838
|
-
|
|
839
|
-
|
|
995
|
+
subfile = os.path.expandvars(self.subfile)
|
|
996
|
+
|
|
997
|
+
# JOB NodeName SubmitDescription [DIR directory] [NOOP] [DONE]
|
|
998
|
+
job_line = f'{command_name} {self.name} "{subfile}"'
|
|
999
|
+
if "dir" in self.dagcmds:
|
|
1000
|
+
dir_val = self.dagcmds["dir"]
|
|
1001
|
+
if dag_rel_path:
|
|
1002
|
+
dir_val = os.path.join(dag_rel_path, dir_val)
|
|
1003
|
+
job_line += f' DIR "{dir_val}"'
|
|
1004
|
+
if self.dagcmds.get("noop", False):
|
|
1005
|
+
job_line += " NOOP"
|
|
1006
|
+
|
|
1007
|
+
print(job_line, file=stream)
|
|
1008
|
+
if self.dagcmds:
|
|
1009
|
+
_htc_write_job_commands(stream, self.name, self.dagcmds)
|
|
840
1010
|
|
|
841
1011
|
def dump(self, fh):
|
|
842
1012
|
"""Dump job information to output stream.
|
|
@@ -871,6 +1041,7 @@ class HTCDag(networkx.DiGraph):
|
|
|
871
1041
|
self.graph["submit_path"] = None
|
|
872
1042
|
self.graph["final_job"] = None
|
|
873
1043
|
self.graph["service_job"] = None
|
|
1044
|
+
self.graph["submit_options"] = {}
|
|
874
1045
|
|
|
875
1046
|
def __str__(self):
|
|
876
1047
|
"""Represent basic DAG info as string.
|
|
@@ -906,6 +1077,7 @@ class HTCDag(networkx.DiGraph):
|
|
|
906
1077
|
Names of child jobs.
|
|
907
1078
|
"""
|
|
908
1079
|
assert isinstance(job, HTCJob)
|
|
1080
|
+
_LOG.debug("Adding job %s to dag", job.name)
|
|
909
1081
|
|
|
910
1082
|
# Add dag level attributes to each job
|
|
911
1083
|
job.add_job_attrs(self.graph["attr"])
|
|
@@ -913,10 +1085,10 @@ class HTCDag(networkx.DiGraph):
|
|
|
913
1085
|
self.add_node(job.name, data=job)
|
|
914
1086
|
|
|
915
1087
|
if parent_names is not None:
|
|
916
|
-
self.add_job_relationships(parent_names, job.name)
|
|
1088
|
+
self.add_job_relationships(parent_names, [job.name])
|
|
917
1089
|
|
|
918
1090
|
if child_names is not None:
|
|
919
|
-
self.add_job_relationships(child_names, job.name)
|
|
1091
|
+
self.add_job_relationships(child_names, [job.name])
|
|
920
1092
|
|
|
921
1093
|
def add_job_relationships(self, parents, children):
|
|
922
1094
|
"""Add DAG edge between parents and children jobs.
|
|
@@ -972,24 +1144,48 @@ class HTCDag(networkx.DiGraph):
|
|
|
972
1144
|
# Delete job node (which deletes its edges).
|
|
973
1145
|
self.remove_node(job_name)
|
|
974
1146
|
|
|
975
|
-
def write(self, submit_path, job_subdir=""):
|
|
1147
|
+
def write(self, submit_path, job_subdir="", dag_subdir="", dag_rel_path=""):
|
|
976
1148
|
"""Write DAG to a file.
|
|
977
1149
|
|
|
978
1150
|
Parameters
|
|
979
1151
|
----------
|
|
980
1152
|
submit_path : `str`
|
|
981
|
-
Prefix path for
|
|
1153
|
+
Prefix path for all outputs.
|
|
982
1154
|
job_subdir : `str`, optional
|
|
983
|
-
Template for job subdir.
|
|
1155
|
+
Template for job subdir (submit_path + job_subdir).
|
|
1156
|
+
dag_subdir : `str`, optional
|
|
1157
|
+
DAG subdir (submit_path + dag_subdir).
|
|
1158
|
+
dag_rel_path : `str`, optional
|
|
1159
|
+
Prefix to job_subdir for jobs inside subdag.
|
|
984
1160
|
"""
|
|
985
1161
|
self.graph["submit_path"] = submit_path
|
|
986
|
-
self.graph["dag_filename"] = os.path.join(
|
|
987
|
-
os.
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
1162
|
+
self.graph["dag_filename"] = os.path.join(dag_subdir, f"{self.graph['name']}.dag")
|
|
1163
|
+
full_filename = os.path.join(submit_path, self.graph["dag_filename"])
|
|
1164
|
+
os.makedirs(os.path.dirname(full_filename), exist_ok=True)
|
|
1165
|
+
with open(full_filename, "w") as fh:
|
|
1166
|
+
for name, nodeval in self.nodes().items():
|
|
1167
|
+
try:
|
|
1168
|
+
job = nodeval["data"]
|
|
1169
|
+
except KeyError:
|
|
1170
|
+
_LOG.error("Job %s doesn't have data (keys: %s).", name, nodeval.keys())
|
|
1171
|
+
raise
|
|
1172
|
+
if job.subdag:
|
|
1173
|
+
dag_subdir = f"subdags/{job.name}"
|
|
1174
|
+
if "dir" in job.dagcmds:
|
|
1175
|
+
subdir = job.dagcmds["dir"]
|
|
1176
|
+
else:
|
|
1177
|
+
subdir = job_subdir
|
|
1178
|
+
job.subdag.write(submit_path, subdir, dag_subdir, "../..")
|
|
1179
|
+
fh.write(
|
|
1180
|
+
f"SUBDAG EXTERNAL {job.name} {Path(job.subdag.graph['dag_filename']).name} "
|
|
1181
|
+
f"DIR {dag_subdir}\n"
|
|
1182
|
+
)
|
|
1183
|
+
if job.dagcmds:
|
|
1184
|
+
_htc_write_job_commands(fh, job.name, job.dagcmds)
|
|
1185
|
+
else:
|
|
1186
|
+
job.write_submit_file(submit_path)
|
|
1187
|
+
job.write_dag_commands(fh, dag_rel_path)
|
|
1188
|
+
|
|
993
1189
|
for edge in self.edges():
|
|
994
1190
|
print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh)
|
|
995
1191
|
print(f"DOT {self.name}.dot", file=fh)
|
|
@@ -1006,12 +1202,8 @@ class HTCDag(networkx.DiGraph):
|
|
|
1006
1202
|
}
|
|
1007
1203
|
for dagcmd, job in special_jobs.items():
|
|
1008
1204
|
if job is not None:
|
|
1009
|
-
job.write_submit_file(submit_path
|
|
1010
|
-
|
|
1011
|
-
if "pre" in job.dagcmds:
|
|
1012
|
-
print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh)
|
|
1013
|
-
if "post" in job.dagcmds:
|
|
1014
|
-
print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh)
|
|
1205
|
+
job.write_submit_file(submit_path)
|
|
1206
|
+
job.write_dag_commands(fh, dag_rel_path, dagcmd)
|
|
1015
1207
|
|
|
1016
1208
|
def dump(self, fh):
|
|
1017
1209
|
"""Dump DAG info to output stream.
|
|
@@ -1061,7 +1253,7 @@ def condor_q(constraint=None, schedds=None, **kwargs):
|
|
|
1061
1253
|
|
|
1062
1254
|
Returns
|
|
1063
1255
|
-------
|
|
1064
|
-
job_info : `dict` [`str`, `dict` [`str`, `dict` [`str
|
|
1256
|
+
job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
|
|
1065
1257
|
Information about jobs satisfying the search criteria where for each
|
|
1066
1258
|
Scheduler, local HTCondor job ids are mapped to their respective
|
|
1067
1259
|
classads.
|
|
@@ -1086,7 +1278,7 @@ def condor_history(constraint=None, schedds=None, **kwargs):
|
|
|
1086
1278
|
|
|
1087
1279
|
Returns
|
|
1088
1280
|
-------
|
|
1089
|
-
job_info : `dict` [`str`, `dict` [`str`, `dict` [`str
|
|
1281
|
+
job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
|
|
1090
1282
|
Information about jobs satisfying the search criteria where for each
|
|
1091
1283
|
Scheduler, local HTCondor job ids are mapped to their respective
|
|
1092
1284
|
classads.
|
|
@@ -1117,7 +1309,7 @@ def condor_query(constraint=None, schedds=None, query_func=htc_query_present, **
|
|
|
1117
1309
|
|
|
1118
1310
|
Returns
|
|
1119
1311
|
-------
|
|
1120
|
-
job_info : `dict` [`str`, `dict` [`str`, `dict` [`str
|
|
1312
|
+
job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
|
|
1121
1313
|
Information about jobs satisfying the search criteria where for each
|
|
1122
1314
|
Scheduler, local HTCondor job ids are mapped to their respective
|
|
1123
1315
|
classads.
|
|
@@ -1172,7 +1364,7 @@ def condor_search(constraint=None, hist=None, schedds=None):
|
|
|
1172
1364
|
|
|
1173
1365
|
Returns
|
|
1174
1366
|
-------
|
|
1175
|
-
job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
|
|
1367
|
+
job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` `~typing.Any`]]]
|
|
1176
1368
|
Information about jobs satisfying the search criteria where for each
|
|
1177
1369
|
Scheduler, local HTCondor job ids are mapped to their respective
|
|
1178
1370
|
classads.
|
|
@@ -1203,7 +1395,7 @@ def condor_status(constraint=None, coll=None):
|
|
|
1203
1395
|
|
|
1204
1396
|
Returns
|
|
1205
1397
|
-------
|
|
1206
|
-
pool_info : `dict` [`str`, `dict` [`str`, Any]]
|
|
1398
|
+
pool_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
|
|
1207
1399
|
Mapping between HTCondor slot names and slot information (classAds).
|
|
1208
1400
|
"""
|
|
1209
1401
|
if coll is None:
|
|
@@ -1225,14 +1417,14 @@ def update_job_info(job_info, other_info):
|
|
|
1225
1417
|
|
|
1226
1418
|
Parameters
|
|
1227
1419
|
----------
|
|
1228
|
-
job_info : `dict` [`str`, `dict` [`str`, Any]]
|
|
1420
|
+
job_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
|
|
1229
1421
|
Results of the job query that needs to be updated.
|
|
1230
|
-
other_info : `dict` [`str`, `dict` [`str`, Any]]
|
|
1422
|
+
other_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
|
|
1231
1423
|
Results of the other job query.
|
|
1232
1424
|
|
|
1233
1425
|
Returns
|
|
1234
1426
|
-------
|
|
1235
|
-
job_info : `dict` [`str`, `dict` [`str`, Any]]
|
|
1427
|
+
job_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
|
|
1236
1428
|
The updated results.
|
|
1237
1429
|
"""
|
|
1238
1430
|
for schedd_name, others in other_info.items():
|
|
@@ -1246,7 +1438,98 @@ def update_job_info(job_info, other_info):
|
|
|
1246
1438
|
return job_info
|
|
1247
1439
|
|
|
1248
1440
|
|
|
1249
|
-
def
|
|
1441
|
+
def count_jobs_in_single_dag(
|
|
1442
|
+
filename: str | os.PathLike,
|
|
1443
|
+
) -> tuple[Counter[str], dict[str, str], dict[str, WmsNodeType]]:
|
|
1444
|
+
"""Build bps_run_summary string from dag file.
|
|
1445
|
+
|
|
1446
|
+
Parameters
|
|
1447
|
+
----------
|
|
1448
|
+
filename : `str`
|
|
1449
|
+
Path that includes dag file for a run.
|
|
1450
|
+
|
|
1451
|
+
Returns
|
|
1452
|
+
-------
|
|
1453
|
+
counts : `Counter` [`str`]
|
|
1454
|
+
Semi-colon separated list of job labels and counts.
|
|
1455
|
+
(Same format as saved in dag classad).
|
|
1456
|
+
job_name_to_label : `dict` [`str`, `str`]
|
|
1457
|
+
Mapping of job names to job labels.
|
|
1458
|
+
job_name_to_type : `dict` [`str`, `lsst.ctrl.bps.htcondor.WmsNodeType`]
|
|
1459
|
+
Mapping of job names to job types
|
|
1460
|
+
(e.g., payload, final, service).
|
|
1461
|
+
"""
|
|
1462
|
+
# Later code depends upon insertion order
|
|
1463
|
+
counts: Counter = Counter() # counts of payload jobs per label
|
|
1464
|
+
job_name_to_label: dict[str, str] = {}
|
|
1465
|
+
job_name_to_type: dict[str, WmsNodeType] = {}
|
|
1466
|
+
with open(filename) as fh:
|
|
1467
|
+
for line in fh:
|
|
1468
|
+
job_name = ""
|
|
1469
|
+
m = re.match(
|
|
1470
|
+
r"(?P<command>JOB|FINAL|SERVICE|SUBDAG EXTERNAL)\s+"
|
|
1471
|
+
r'(?P<jobname>(?P<wms>wms_)?\S+)\s+"?(?P<subfile>\S+)"?\s*'
|
|
1472
|
+
r'(DIR "?(?P<dir>[^\s"]+)"?)?\s*(?P<noop>NOOP)?',
|
|
1473
|
+
line,
|
|
1474
|
+
)
|
|
1475
|
+
if m:
|
|
1476
|
+
job_name = m.group("jobname")
|
|
1477
|
+
name_parts = job_name.split("_")
|
|
1478
|
+
|
|
1479
|
+
label = ""
|
|
1480
|
+
if m.group("dir"):
|
|
1481
|
+
dir_match = re.search(r"jobs/([^\s/]+)", m.group("dir"))
|
|
1482
|
+
if dir_match:
|
|
1483
|
+
label = dir_match.group(1)
|
|
1484
|
+
else:
|
|
1485
|
+
_LOG.debug("Parse DAG: unparsed dir = %s", line)
|
|
1486
|
+
elif m.group("subfile"):
|
|
1487
|
+
subfile_match = re.search(r"jobs/([^\s/]+)", m.group("subfile"))
|
|
1488
|
+
if subfile_match:
|
|
1489
|
+
label = m.group("subfile").split("/")[1]
|
|
1490
|
+
else:
|
|
1491
|
+
label = pegasus_name_to_label(job_name)
|
|
1492
|
+
|
|
1493
|
+
match m.group("command"):
|
|
1494
|
+
case "JOB":
|
|
1495
|
+
if m.group("noop"):
|
|
1496
|
+
job_type = WmsNodeType.NOOP
|
|
1497
|
+
# wms_noop_label
|
|
1498
|
+
label = name_parts[2]
|
|
1499
|
+
elif m.group("wms"):
|
|
1500
|
+
if name_parts[1] == "check":
|
|
1501
|
+
job_type = WmsNodeType.SUBDAG_CHECK
|
|
1502
|
+
# wms_check_status_wms_group_label
|
|
1503
|
+
label = name_parts[5]
|
|
1504
|
+
else:
|
|
1505
|
+
_LOG.warning(
|
|
1506
|
+
"Unexpected skipping of dag line due to unknown wms job: %s", line
|
|
1507
|
+
)
|
|
1508
|
+
else:
|
|
1509
|
+
job_type = WmsNodeType.PAYLOAD
|
|
1510
|
+
if label == "init":
|
|
1511
|
+
label = "pipetaskInit"
|
|
1512
|
+
counts[label] += 1
|
|
1513
|
+
case "FINAL":
|
|
1514
|
+
job_type = WmsNodeType.FINAL
|
|
1515
|
+
counts[label] += 1 # final counts a payload job.
|
|
1516
|
+
case "SERVICE":
|
|
1517
|
+
job_type = WmsNodeType.SERVICE
|
|
1518
|
+
case "SUBDAG EXTERNAL":
|
|
1519
|
+
job_type = WmsNodeType.SUBDAG
|
|
1520
|
+
label = name_parts[2]
|
|
1521
|
+
|
|
1522
|
+
job_name_to_label[job_name] = label
|
|
1523
|
+
job_name_to_type[job_name] = job_type
|
|
1524
|
+
elif not line.startswith(("VARS", "PARENT", "DOT", "NODE_STATUS_FILE", "SET_JOB_ATTR", "SCRIPT")):
|
|
1525
|
+
# Only print warning if not a line wanting to skip
|
|
1526
|
+
# Probably means problem with regex in above match pattern.
|
|
1527
|
+
_LOG.warning("Unexpected skipping of dag line: %s", line)
|
|
1528
|
+
|
|
1529
|
+
return counts, job_name_to_label, job_name_to_type
|
|
1530
|
+
|
|
1531
|
+
|
|
1532
|
+
def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, WmsNodeType]]:
|
|
1250
1533
|
"""Build bps_run_summary string from dag file.
|
|
1251
1534
|
|
|
1252
1535
|
Parameters
|
|
@@ -1261,56 +1544,25 @@ def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, str]]:
|
|
|
1261
1544
|
(Same format as saved in dag classad).
|
|
1262
1545
|
job_name_to_label : `dict` [`str`, `str`]
|
|
1263
1546
|
Mapping of job names to job labels.
|
|
1264
|
-
job_name_to_type : `dict` [`str`, `
|
|
1547
|
+
job_name_to_type : `dict` [`str`, `lsst.ctrl.bps.htcondor.WmsNodeType`]
|
|
1265
1548
|
Mapping of job names to job types
|
|
1266
1549
|
(e.g., payload, final, service).
|
|
1267
1550
|
"""
|
|
1268
1551
|
# Later code depends upon insertion order
|
|
1269
|
-
counts:
|
|
1270
|
-
job_name_to_label = {}
|
|
1271
|
-
job_name_to_type = {}
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
label = "pipetaskInit"
|
|
1284
|
-
counts[label] += 1
|
|
1285
|
-
else: # Check if Pegasus submission
|
|
1286
|
-
m = re.match(r"JOB (\S+) (\S+)", line)
|
|
1287
|
-
if m:
|
|
1288
|
-
job_name = m.group(1)
|
|
1289
|
-
label = pegasus_name_to_label(m.group(1))
|
|
1290
|
-
counts[label] += 1
|
|
1291
|
-
else:
|
|
1292
|
-
_LOG.warning("Parse DAG: unmatched job line: %s", line)
|
|
1293
|
-
job_type = "payload"
|
|
1294
|
-
elif line.startswith("FINAL"):
|
|
1295
|
-
m = re.match(r"FINAL (\S+) jobs/([^/]+)/", line)
|
|
1296
|
-
if m:
|
|
1297
|
-
job_name = m.group(1)
|
|
1298
|
-
label = m.group(2)
|
|
1299
|
-
counts[label] += 1 # final counts a payload job.
|
|
1300
|
-
job_type = "final"
|
|
1301
|
-
elif line.startswith("SERVICE"):
|
|
1302
|
-
m = re.match(r"SERVICE (\S+) jobs/([^/]+)/", line)
|
|
1303
|
-
if m:
|
|
1304
|
-
job_name = m.group(1)
|
|
1305
|
-
label = m.group(2)
|
|
1306
|
-
job_type = "service"
|
|
1307
|
-
|
|
1308
|
-
if job_name:
|
|
1309
|
-
job_name_to_label[job_name] = label
|
|
1310
|
-
job_name_to_type[job_name] = job_type
|
|
1311
|
-
|
|
1312
|
-
except (OSError, PermissionError, StopIteration):
|
|
1313
|
-
pass
|
|
1552
|
+
counts: Counter[str] = Counter() # counts of payload jobs per label
|
|
1553
|
+
job_name_to_label: dict[str, str] = {}
|
|
1554
|
+
job_name_to_type: dict[str, WmsNodeType] = {}
|
|
1555
|
+
for filename in Path(dir_name).glob("*.dag"):
|
|
1556
|
+
single_counts, single_job_name_to_label, single_job_name_to_type = count_jobs_in_single_dag(filename)
|
|
1557
|
+
counts += single_counts
|
|
1558
|
+
_update_dicts(job_name_to_label, single_job_name_to_label)
|
|
1559
|
+
_update_dicts(job_name_to_type, single_job_name_to_type)
|
|
1560
|
+
|
|
1561
|
+
for filename in Path(dir_name).glob("subdags/*/*.dag"):
|
|
1562
|
+
single_counts, single_job_name_to_label, single_job_name_to_type = count_jobs_in_single_dag(filename)
|
|
1563
|
+
counts += single_counts
|
|
1564
|
+
_update_dicts(job_name_to_label, single_job_name_to_label)
|
|
1565
|
+
_update_dicts(job_name_to_type, single_job_name_to_type)
|
|
1314
1566
|
|
|
1315
1567
|
summary = ";".join([f"{name}:{counts[name]}" for name in counts])
|
|
1316
1568
|
_LOG.debug("summarize_dag: %s %s %s", summary, job_name_to_label, job_name_to_type)
|
|
@@ -1343,69 +1595,100 @@ def pegasus_name_to_label(name):
|
|
|
1343
1595
|
return label
|
|
1344
1596
|
|
|
1345
1597
|
|
|
1346
|
-
def
|
|
1598
|
+
def read_single_dag_status(filename: str | os.PathLike) -> dict[str, Any]:
    """Read the node status file for DAG summary information.

    Parameters
    ----------
    filename : `str` or `os.PathLike`
        Node status filename.

    Returns
    -------
    dag_ad : `dict` [`str`, `~typing.Any`]
        DAG summary information.  Empty if the node status file could not
        be opened, or if it held no summary and no metrics file was found.
    """
    dag_ad: dict[str, Any] = {}

    # While this is probably more up to date than dag classad, only read from
    # file if need to.
    try:
        node_stat_file = Path(filename)
        _LOG.debug("Reading Node Status File %s", node_stat_file)
        with open(node_stat_file) as infh:
            # Only the next (i.e., first) classad of the file is read here.
            dag_ad = dict(classad.parseNext(infh))  # pylint: disable=E1101

        if not dag_ad:
            # Pegasus check here
            # Fall back to the JSON metrics files sitting next to the node
            # status file (same name, different suffix).
            metrics_file = node_stat_file.with_suffix(".dag.metrics")
            if metrics_file.exists():
                with open(metrics_file) as infh:
                    metrics = json.load(infh)
                dag_ad["NodesTotal"] = metrics.get("jobs", 0)
                dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0)
                dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0)
                # The workflow-level metrics file, when readable, overrides
                # the total number of nodes set above.
                metrics_file = node_stat_file.with_suffix(".metrics")
                with open(metrics_file) as infh:
                    metrics = json.load(infh)
                dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"]
    except (OSError, PermissionError):
        # Best effort: keep whatever was gathered before the failure.
        pass

    _LOG.debug("read_dag_status: %s", dag_ad)
    return dag_ad
|
|
1396
1639
|
|
|
1397
1640
|
|
|
1398
|
-
def
|
|
1399
|
-
"""Read
|
|
1641
|
+
def read_dag_status(wms_path: str | os.PathLike) -> dict[str, Any]:
    """Read the node status file(s) for DAG summary information.

    Parameters
    ----------
    wms_path : `str` or `os.PathLike`
        Path that includes node status file for a run.

    Returns
    -------
    dag_ad : `dict` [`str`, `~typing.Any`]
        DAG summary information, counts summed across any subdags.

    Raises
    ------
    FileNotFoundError
        If no node status file is found directly in ``wms_path``.
    """
    # Counters accumulated over the node status files of the subdags.
    summed_keys = (
        "JobProcsHeld",
        "JobProcsIdle",
        "NodesTotal",
        "NodesDone",
        "NodesFailed",
        "NodesPre",
        "NodesPost",
        "NodesQueued",
        "NodesReady",
        "NodesUnready",
        "NodesFutile",
    )

    path = Path(wms_path)
    try:
        node_stat_file = next(path.glob("*.node_status"))
    except StopIteration as exc:
        raise FileNotFoundError(f"DAGMan node status not found in {wms_path}") from exc

    dag_ads = read_single_dag_status(node_stat_file)

    for subdag_stat_file in path.glob("subdags/*/*.node_status"):
        dag_ad = read_single_dag_status(subdag_stat_file)
        if not dag_ad:
            # Nothing could be read for this subdag; nothing to add.
            continue
        for key in summed_keys:
            # Each counter is added to the counter of the same name.  The
            # top-level summary may lack a counter (e.g., its status file
            # was unreadable), so start from zero instead of failing with
            # a KeyError.
            dag_ads[key] = dag_ads.get(key, 0) + dag_ad.get(key, 0)

    return dag_ads
|
|
1677
|
+
|
|
1678
|
+
|
|
1679
|
+
def read_single_node_status(filename: str | os.PathLike, init_fake_id: int) -> dict[str, Any]:
|
|
1680
|
+
"""Read entire node status file.
|
|
1681
|
+
|
|
1682
|
+
Parameters
|
|
1683
|
+
----------
|
|
1684
|
+
filename : `str` or `pathlib.Path`
|
|
1685
|
+
Node status filename.
|
|
1686
|
+
init_fake_id : `int`
|
|
1687
|
+
Initial fake id value.
|
|
1688
|
+
|
|
1689
|
+
Returns
|
|
1690
|
+
-------
|
|
1691
|
+
jobs : `dict` [`str`, `~typing.Any`]
|
|
1409
1692
|
DAG summary information compiled from the node status file combined
|
|
1410
1693
|
with the information found in the node event log.
|
|
1411
1694
|
|
|
@@ -1413,28 +1696,34 @@ def read_node_status(wms_path):
|
|
|
1413
1696
|
from the event log takes precedence over the value from the node status
|
|
1414
1697
|
file.
|
|
1415
1698
|
"""
|
|
1699
|
+
filename = Path(filename)
|
|
1700
|
+
|
|
1416
1701
|
# Get jobid info from other places to fill in gaps in info from node_status
|
|
1417
|
-
_, job_name_to_label, job_name_to_type =
|
|
1418
|
-
|
|
1419
|
-
|
|
1702
|
+
_, job_name_to_label, job_name_to_type = count_jobs_in_single_dag(filename.with_suffix(".dag"))
|
|
1703
|
+
loginfo: dict[str, dict[str, Any]] = {}
|
|
1704
|
+
try:
|
|
1705
|
+
wms_workflow_id, loginfo = read_single_dag_log(filename.with_suffix(".dag.dagman.log"))
|
|
1706
|
+
loginfo = read_single_dag_nodes_log(filename.with_suffix(".dag.nodes.log"))
|
|
1707
|
+
except (OSError, PermissionError):
|
|
1708
|
+
pass
|
|
1709
|
+
|
|
1710
|
+
job_name_to_id: dict[str, str] = {}
|
|
1420
1711
|
_LOG.debug("loginfo = %s", loginfo)
|
|
1421
|
-
|
|
1712
|
+
log_job_name_to_id: dict[str, str] = {}
|
|
1422
1713
|
for job_id, job_info in loginfo.items():
|
|
1423
1714
|
if "LogNotes" in job_info:
|
|
1424
1715
|
m = re.match(r"DAG Node: (\S+)", job_info["LogNotes"])
|
|
1425
1716
|
if m:
|
|
1426
1717
|
job_name = m.group(1)
|
|
1427
|
-
|
|
1718
|
+
log_job_name_to_id[job_name] = job_id
|
|
1428
1719
|
job_info["DAGNodeName"] = job_name
|
|
1429
|
-
job_info["
|
|
1720
|
+
job_info["wms_node_type"] = job_name_to_type[job_name]
|
|
1430
1721
|
job_info["bps_job_label"] = job_name_to_label[job_name]
|
|
1431
1722
|
|
|
1432
|
-
jobs =
|
|
1433
|
-
fake_id =
|
|
1723
|
+
jobs = {}
|
|
1724
|
+
fake_id = init_fake_id # For nodes that do not yet have a job id, give fake one
|
|
1434
1725
|
try:
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
with open(node_status) as fh:
|
|
1726
|
+
with open(filename) as fh:
|
|
1438
1727
|
for ad in classad.parseAds(fh):
|
|
1439
1728
|
match ad["Type"]:
|
|
1440
1729
|
case "DagStatus":
|
|
@@ -1449,21 +1738,23 @@ def read_node_status(wms_path):
|
|
|
1449
1738
|
else:
|
|
1450
1739
|
job_label = job_name
|
|
1451
1740
|
|
|
1452
|
-
|
|
1453
|
-
if job_name in
|
|
1454
|
-
job_id = str(
|
|
1455
|
-
job
|
|
1741
|
+
job = dict(ad)
|
|
1742
|
+
if job_name in log_job_name_to_id:
|
|
1743
|
+
job_id = str(log_job_name_to_id[job_name])
|
|
1744
|
+
_update_dicts(job, loginfo[job_id])
|
|
1456
1745
|
else:
|
|
1457
1746
|
job_id = str(fake_id)
|
|
1458
|
-
job_name_to_id[job_name] = job_id
|
|
1459
1747
|
job = dict(ad)
|
|
1460
|
-
jobs[job_id] = job
|
|
1461
1748
|
fake_id -= 1
|
|
1749
|
+
jobs[job_id] = job
|
|
1750
|
+
job_name_to_id[job_name] = job_id
|
|
1751
|
+
|
|
1752
|
+
# Make job info as if came from condor_q.
|
|
1462
1753
|
job["ClusterId"] = int(float(job_id))
|
|
1463
1754
|
job["DAGManJobID"] = wms_workflow_id
|
|
1464
1755
|
job["DAGNodeName"] = job_name
|
|
1465
1756
|
job["bps_job_label"] = job_label
|
|
1466
|
-
job["
|
|
1757
|
+
job["wms_node_type"] = job_name_to_type[job_name]
|
|
1467
1758
|
|
|
1468
1759
|
case "StatusEnd":
|
|
1469
1760
|
# Skip node status file "epilog".
|
|
@@ -1472,41 +1763,104 @@ def read_node_status(wms_path):
|
|
|
1472
1763
|
_LOG.debug(
|
|
1473
1764
|
"Ignoring unknown classad type '%s' in the node status file '%s'",
|
|
1474
1765
|
ad["Type"],
|
|
1475
|
-
|
|
1766
|
+
filename,
|
|
1476
1767
|
)
|
|
1477
|
-
except (
|
|
1768
|
+
except (OSError, PermissionError):
|
|
1478
1769
|
pass
|
|
1479
1770
|
|
|
1480
1771
|
# Check for missing jobs (e.g., submission failure or not submitted yet)
|
|
1481
1772
|
# Use dag info to create job placeholders
|
|
1482
1773
|
for name in set(job_name_to_label) - set(job_name_to_id):
|
|
1483
|
-
job
|
|
1484
|
-
|
|
1774
|
+
if name in log_job_name_to_id: # job was in nodes.log, but not node_status
|
|
1775
|
+
job_id = str(log_job_name_to_id[name])
|
|
1776
|
+
job = dict(loginfo[job_id])
|
|
1777
|
+
else:
|
|
1778
|
+
job_id = str(fake_id)
|
|
1779
|
+
fake_id -= 1
|
|
1780
|
+
job = {}
|
|
1781
|
+
job["NodeStatus"] = NodeStatus.NOT_READY
|
|
1782
|
+
|
|
1783
|
+
job["ClusterId"] = int(float(job_id))
|
|
1485
1784
|
job["ProcId"] = 0
|
|
1486
1785
|
job["DAGManJobID"] = wms_workflow_id
|
|
1487
1786
|
job["DAGNodeName"] = name
|
|
1488
1787
|
job["bps_job_label"] = job_name_to_label[name]
|
|
1489
|
-
job["
|
|
1490
|
-
job["NodeStatus"] = NodeStatus.NOT_READY
|
|
1788
|
+
job["wms_node_type"] = job_name_to_type[name]
|
|
1491
1789
|
jobs[f"{job['ClusterId']}.{job['ProcId']}"] = job
|
|
1492
|
-
|
|
1790
|
+
|
|
1791
|
+
for job_info in jobs.values():
|
|
1792
|
+
job_info["from_dag_job"] = f"wms_{filename.stem}"
|
|
1793
|
+
|
|
1794
|
+
return jobs
|
|
1795
|
+
|
|
1796
|
+
|
|
1797
|
+
def read_node_status(wms_path: str | os.PathLike) -> dict[str, dict[str, Any]]:
    """Collect job information for a run from all of its node status files.

    Parameters
    ----------
    wms_path : `str` or `os.PathLike`
        Path that includes node status file for a run.

    Returns
    -------
    jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
        DAG summary information compiled from the node status file combined
        with the information found in the node event log.

        Currently, if the same job attribute is found in both files, its value
        from the event log takes precedence over the value from the node status
        file.
    """
    run_dir = Path(wms_path)
    all_jobs: dict[str, dict[str, Any]] = {}
    next_fake_id = -1

    # A subdag that has not run yet has no node_status file, so the search is
    # driven by the DAG files (top-level first, then subdags);
    # read_single_node_status copes with a missing node_status file.
    for pattern in ("*.dag", "subdags/*/*.dag"):
        for dag_file in run_dir.glob(pattern):
            dag_jobs = read_single_node_status(dag_file.with_suffix(".node_status"), next_fake_id)
            next_fake_id -= len(dag_jobs)
            _update_dicts(all_jobs, dag_jobs)

    # Propagate pruned from subdags to jobs
    wms_node_ids: dict[str, str] = {}
    unresolved: dict[str, list[str]] = {}
    for job_id, job in all_jobs.items():
        node_name = job["DAGNodeName"]
        if node_name.startswith("wms_"):
            wms_node_ids[node_name] = job_id
        if job.get("NodeStatus", NodeStatus.NOT_READY) == NodeStatus.NOT_READY:
            unresolved.setdefault(job["from_dag_job"], []).append(job_id)

    for node_name, node_id in wms_node_ids.items():
        node_status = all_jobs[node_id].get("NodeStatus", NodeStatus.NOT_READY)
        if node_status not in {NodeStatus.NOT_READY, NodeStatus.FUTILE}:
            continue
        # Jobs recorded under this wms node inherit its status.
        for job_id in unresolved.get(node_name, []):
            all_jobs[job_id]["NodeStatus"] = node_status

    return all_jobs
|
|
1495
1849
|
|
|
1496
1850
|
|
|
1497
|
-
def
|
|
1851
|
+
def read_single_dag_log(log_filename: str | os.PathLike) -> tuple[str, dict[str, dict[str, Any]]]:
|
|
1498
1852
|
"""Read job information from the DAGMan log file.
|
|
1499
1853
|
|
|
1500
1854
|
Parameters
|
|
1501
1855
|
----------
|
|
1502
|
-
|
|
1503
|
-
|
|
1856
|
+
log_filename : `str` or `os.PathLike`
|
|
1857
|
+
DAGMan log filename.
|
|
1504
1858
|
|
|
1505
1859
|
Returns
|
|
1506
1860
|
-------
|
|
1507
1861
|
wms_workflow_id : `str`
|
|
1508
1862
|
HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job.
|
|
1509
|
-
dag_info : `dict` [`str`, `~
|
|
1863
|
+
dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
|
|
1510
1864
|
HTCondor job information read from the log file mapped to HTCondor
|
|
1511
1865
|
job id.
|
|
1512
1866
|
|
|
@@ -1515,25 +1869,21 @@ def read_dag_log(wms_path: str) -> tuple[str, dict[str, Any]]:
|
|
|
1515
1869
|
FileNotFoundError
|
|
1516
1870
|
If cannot find DAGMan log in given wms_path.
|
|
1517
1871
|
"""
|
|
1518
|
-
wms_workflow_id = 0
|
|
1519
|
-
dag_info = {}
|
|
1872
|
+
wms_workflow_id = "0"
|
|
1873
|
+
dag_info: dict[str, dict[str, Any]] = {}
|
|
1520
1874
|
|
|
1521
|
-
|
|
1522
|
-
if
|
|
1523
|
-
try:
|
|
1524
|
-
filename = next(path.glob("*.dag.dagman.log"))
|
|
1525
|
-
except StopIteration as exc:
|
|
1526
|
-
raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc
|
|
1875
|
+
filename = Path(log_filename)
|
|
1876
|
+
if filename.exists():
|
|
1527
1877
|
_LOG.debug("dag node log filename: %s", filename)
|
|
1528
1878
|
|
|
1529
|
-
info = {}
|
|
1879
|
+
info: dict[str, Any] = {}
|
|
1530
1880
|
job_event_log = htcondor.JobEventLog(str(filename))
|
|
1531
1881
|
for event in job_event_log.events(stop_after=0):
|
|
1532
1882
|
id_ = f"{event['Cluster']}.{event['Proc']}"
|
|
1533
1883
|
if id_ not in info:
|
|
1534
1884
|
info[id_] = {}
|
|
1535
1885
|
wms_workflow_id = id_ # taking last job id in case of restarts
|
|
1536
|
-
info[id_]
|
|
1886
|
+
_update_dicts(info[id_], event)
|
|
1537
1887
|
info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
|
|
1538
1888
|
|
|
1539
1889
|
# only save latest DAG job
|
|
@@ -1544,17 +1894,53 @@ def read_dag_log(wms_path: str) -> tuple[str, dict[str, Any]]:
|
|
|
1544
1894
|
return wms_workflow_id, dag_info
|
|
1545
1895
|
|
|
1546
1896
|
|
|
1547
|
-
def
|
|
1897
|
+
def read_dag_log(wms_path: str | os.PathLike) -> tuple[str, dict[str, Any]]:
    """Locate the DAGMan log of a run and read job information from it.

    Parameters
    ----------
    wms_path : `str` or `os.PathLike`
        Path containing the DAGMan log file.

    Returns
    -------
    wms_workflow_id : `str`
        HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job.
    dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
        HTCondor job information read from the log file mapped to HTCondor
        job id.

    Raises
    ------
    FileNotFoundError
        If cannot find DAGMan log in given wms_path.
    """
    run_dir = Path(wms_path)
    if not run_dir.exists():
        # No such path: report the placeholder id and no job information.
        return MISSING_ID, {}

    try:
        log_file = next(run_dir.glob("*.dag.dagman.log"))
    except StopIteration as exc:
        raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc

    _LOG.debug("dag node log filename: %s", log_file)
    workflow_id, log_info = read_single_dag_log(log_file)
    return workflow_id, log_info
|
|
1931
|
+
|
|
1932
|
+
|
|
1933
|
+
def read_single_dag_nodes_log(filename: str | os.PathLike) -> dict[str, dict[str, Any]]:
|
|
1548
1934
|
"""Read job information from the DAGMan nodes log file.
|
|
1549
1935
|
|
|
1550
1936
|
Parameters
|
|
1551
1937
|
----------
|
|
1552
|
-
|
|
1938
|
+
filename : `str` or `os.PathLike`
|
|
1553
1939
|
Path containing the DAGMan nodes log file.
|
|
1554
1940
|
|
|
1555
1941
|
Returns
|
|
1556
1942
|
-------
|
|
1557
|
-
info : `dict` [`str`, Any]
|
|
1943
|
+
info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
|
|
1558
1944
|
HTCondor job information read from the log file mapped to HTCondor
|
|
1559
1945
|
job id.
|
|
1560
1946
|
|
|
@@ -1563,20 +1949,46 @@ def read_dag_nodes_log(wms_path):
|
|
|
1563
1949
|
FileNotFoundError
|
|
1564
1950
|
If cannot find DAGMan node log in given wms_path.
|
|
1565
1951
|
"""
|
|
1566
|
-
try:
|
|
1567
|
-
filename = next(Path(wms_path).glob("*.dag.nodes.log"))
|
|
1568
|
-
except StopIteration as exc:
|
|
1569
|
-
raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc
|
|
1570
1952
|
_LOG.debug("dag node log filename: %s", filename)
|
|
1953
|
+
filename = Path(filename)
|
|
1954
|
+
|
|
1955
|
+
info: dict[str, dict[str, Any]] = {}
|
|
1956
|
+
if not filename.exists():
|
|
1957
|
+
raise FileNotFoundError(f"{filename} does not exist")
|
|
1958
|
+
|
|
1959
|
+
try:
|
|
1960
|
+
job_event_log = htcondor.JobEventLog(str(filename))
|
|
1961
|
+
except htcondor.HTCondorIOError as ex:
|
|
1962
|
+
_LOG.error("Problem reading nodes log file (%s): %s", filename, ex)
|
|
1963
|
+
import traceback
|
|
1571
1964
|
|
|
1572
|
-
|
|
1573
|
-
|
|
1965
|
+
traceback.print_stack()
|
|
1966
|
+
raise
|
|
1574
1967
|
for event in job_event_log.events(stop_after=0):
|
|
1575
|
-
|
|
1576
|
-
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
|
|
1968
|
+
_LOG.debug("log event type = %s, keys = %s", event["EventTypeNumber"], event.keys())
|
|
1969
|
+
|
|
1970
|
+
try:
|
|
1971
|
+
id_ = f"{event['Cluster']}.{event['Proc']}"
|
|
1972
|
+
except KeyError:
|
|
1973
|
+
_LOG.warn(
|
|
1974
|
+
"Log event missing ids (DAGNodeName=%s, EventTime=%s, EventTypeNumber=%s)",
|
|
1975
|
+
event.get("DAGNodeName", "UNK"),
|
|
1976
|
+
event.get("EventTime", "UNK"),
|
|
1977
|
+
event.get("EventTypeNumber", "UNK"),
|
|
1978
|
+
)
|
|
1979
|
+
else:
|
|
1980
|
+
if id_ not in info:
|
|
1981
|
+
info[id_] = {}
|
|
1982
|
+
# Workaround: Please check to see if still problem in
|
|
1983
|
+
# future HTCondor versions. Sometimes get a
|
|
1984
|
+
# JobAbortedEvent for a subdag job after it already
|
|
1985
|
+
# terminated normally. Seems to happen when using job
|
|
1986
|
+
# plus subdags.
|
|
1987
|
+
if event["EventTypeNumber"] == 9 and info[id_].get("EventTypeNumber", -1) == 5:
|
|
1988
|
+
_LOG.debug("Skipping spurious JobAbortedEvent: %s", dict(event))
|
|
1989
|
+
else:
|
|
1990
|
+
_update_dicts(info[id_], event)
|
|
1991
|
+
info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
|
|
1580
1992
|
|
|
1581
1993
|
# Add more condor_q-like info to info parsed from log file.
|
|
1582
1994
|
for job in info.values():
|
|
@@ -1585,17 +1997,54 @@ def read_dag_nodes_log(wms_path):
|
|
|
1585
1997
|
return info
|
|
1586
1998
|
|
|
1587
1999
|
|
|
1588
|
-
def
|
|
2000
|
+
def read_dag_nodes_log(wms_path: str | os.PathLike) -> dict[str, dict[str, Any]]:
    """Merge job information from the DAGMan nodes logs of a run.

    Parameters
    ----------
    wms_path : `str` or `os.PathLike`
        Path containing the DAGMan nodes log file.

    Returns
    -------
    info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
        HTCondor job information read from the log file mapped to HTCondor
        job id.

    Raises
    ------
    FileNotFoundError
        If cannot find DAGMan node log in given wms_path.
    """
    run_dir = Path(wms_path)
    merged: dict[str, dict[str, Any]] = {}

    def _merge_logs(pattern: str) -> None:
        # Fold every nodes log matching the pattern into the result.
        for log_file in run_dir.glob(pattern):
            _LOG.debug("dag node log filename: %s", log_file)
            _update_dicts(merged, read_single_dag_nodes_log(log_file))

    _merge_logs("*.dag.nodes.log")

    # If submitted, the main nodes log file should exist
    if not merged:
        raise FileNotFoundError(f"DAGMan node log not found in {wms_path}")

    # Subdags will not have dag nodes log files if they haven't
    # started running yet (so missing is not an error).
    _merge_logs("subdags/*/*.dag.nodes.log")

    return merged
|
|
2035
|
+
|
|
2036
|
+
|
|
2037
|
+
def read_dag_info(wms_path: str | os.PathLike) -> dict[str, dict[str, Any]]:
|
|
1589
2038
|
"""Read custom DAGMan job information from the file.
|
|
1590
2039
|
|
|
1591
2040
|
Parameters
|
|
1592
2041
|
----------
|
|
1593
|
-
wms_path : `str`
|
|
2042
|
+
wms_path : `str` or `os.PathLike`
|
|
1594
2043
|
Path containing the file with the DAGMan job info.
|
|
1595
2044
|
|
|
1596
2045
|
Returns
|
|
1597
2046
|
-------
|
|
1598
|
-
dag_info : `dict` [`str`, `dict` [`str`, Any]]
|
|
2047
|
+
dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
|
|
1599
2048
|
HTCondor job information.
|
|
1600
2049
|
|
|
1601
2050
|
Raises
|
|
@@ -1603,6 +2052,7 @@ def read_dag_info(wms_path):
|
|
|
1603
2052
|
FileNotFoundError
|
|
1604
2053
|
If cannot find DAGMan job info file in the given location.
|
|
1605
2054
|
"""
|
|
2055
|
+
dag_info: dict[str, dict[str, Any]] = {}
|
|
1606
2056
|
try:
|
|
1607
2057
|
filename = next(Path(wms_path).glob("*.info.json"))
|
|
1608
2058
|
except StopIteration as exc:
|
|
@@ -1613,7 +2063,6 @@ def read_dag_info(wms_path):
|
|
|
1613
2063
|
dag_info = json.load(fh)
|
|
1614
2064
|
except (OSError, PermissionError) as exc:
|
|
1615
2065
|
_LOG.debug("Retrieving DAGMan job information failed: %s", exc)
|
|
1616
|
-
dag_info = {}
|
|
1617
2066
|
return dag_info
|
|
1618
2067
|
|
|
1619
2068
|
|
|
@@ -1624,7 +2073,7 @@ def write_dag_info(filename, dag_info):
|
|
|
1624
2073
|
----------
|
|
1625
2074
|
filename : `str`
|
|
1626
2075
|
Name of the file where the information will be stored.
|
|
1627
|
-
dag_info : `dict` [`str` `dict` [`str`, Any]]
|
|
2076
|
+
dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
|
|
1628
2077
|
Information about the DAGMan job.
|
|
1629
2078
|
"""
|
|
1630
2079
|
schedd_name = next(iter(dag_info))
|
|
@@ -1647,7 +2096,7 @@ def _tweak_log_info(filename, job):
|
|
|
1647
2096
|
----------
|
|
1648
2097
|
filename : `pathlib.Path`
|
|
1649
2098
|
Name of the DAGMan log.
|
|
1650
|
-
job : `dict` [ `str`, Any ]
|
|
2099
|
+
job : `dict` [ `str`, `~typing.Any` ]
|
|
1651
2100
|
A mapping between HTCondor job id and job information read from
|
|
1652
2101
|
the log.
|
|
1653
2102
|
"""
|
|
@@ -1661,37 +2110,47 @@ def _tweak_log_info(filename, job):
|
|
|
1661
2110
|
|
|
1662
2111
|
match job["MyType"]:
|
|
1663
2112
|
case "ExecuteEvent":
|
|
1664
|
-
job["JobStatus"] = JobStatus.RUNNING
|
|
2113
|
+
job["JobStatus"] = htcondor.JobStatus.RUNNING
|
|
1665
2114
|
case "JobTerminatedEvent" | "PostScriptTerminatedEvent":
|
|
1666
|
-
job["JobStatus"] = JobStatus.COMPLETED
|
|
2115
|
+
job["JobStatus"] = htcondor.JobStatus.COMPLETED
|
|
1667
2116
|
case "SubmitEvent":
|
|
1668
|
-
job["JobStatus"] = JobStatus.IDLE
|
|
2117
|
+
job["JobStatus"] = htcondor.JobStatus.IDLE
|
|
1669
2118
|
case "JobAbortedEvent":
|
|
1670
|
-
job["JobStatus"] = JobStatus.REMOVED
|
|
2119
|
+
job["JobStatus"] = htcondor.JobStatus.REMOVED
|
|
1671
2120
|
case "JobHeldEvent":
|
|
1672
|
-
job["JobStatus"] = JobStatus.HELD
|
|
2121
|
+
job["JobStatus"] = htcondor.JobStatus.HELD
|
|
2122
|
+
case "JobReleaseEvent":
|
|
2123
|
+
# Shows up as last event if a DAG job was held and released
|
|
2124
|
+
# so assume job is running. If regular job is released, there
|
|
2125
|
+
# will be other events so JobReleaseEvent won't be the last
|
|
2126
|
+
job["JobStatus"] = htcondor.JobStatus.RUNNING
|
|
1673
2127
|
case _:
|
|
1674
2128
|
_LOG.debug("Unknown log event type: %s", job["MyType"])
|
|
1675
|
-
job["JobStatus"] =
|
|
2129
|
+
job["JobStatus"] = None
|
|
1676
2130
|
|
|
1677
|
-
if job["JobStatus"] in {JobStatus.COMPLETED, JobStatus.HELD}:
|
|
2131
|
+
if job["JobStatus"] in {htcondor.JobStatus.COMPLETED, htcondor.JobStatus.HELD}:
|
|
1678
2132
|
new_job = HTC_JOB_AD_HANDLERS.handle(job)
|
|
1679
2133
|
if new_job is not None:
|
|
1680
2134
|
job = new_job
|
|
1681
2135
|
else:
|
|
1682
2136
|
_LOG.error("Could not determine exit status for job '%s.%s'", job["ClusterId"], job["ProcId"])
|
|
1683
2137
|
|
|
2138
|
+
if "LogNotes" in job:
|
|
2139
|
+
m = re.match(r"DAG Node: (\S+)", job["LogNotes"])
|
|
2140
|
+
if m:
|
|
2141
|
+
job["DAGNodeName"] = m.group(1)
|
|
2142
|
+
|
|
1684
2143
|
except KeyError as e:
|
|
1685
2144
|
_LOG.error("Missing key %s in job: %s", str(e), job)
|
|
1686
2145
|
raise
|
|
1687
2146
|
|
|
1688
2147
|
|
|
1689
|
-
def htc_check_dagman_output(wms_path):
|
|
2148
|
+
def htc_check_dagman_output(wms_path: str | os.PathLike) -> str:
|
|
1690
2149
|
"""Check the DAGMan output for error messages.
|
|
1691
2150
|
|
|
1692
2151
|
Parameters
|
|
1693
2152
|
----------
|
|
1694
|
-
wms_path : `str`
|
|
2153
|
+
wms_path : `str` or `os.PathLike`
|
|
1695
2154
|
Directory containing the DAGman output file.
|
|
1696
2155
|
|
|
1697
2156
|
Returns
|
|
@@ -1711,32 +2170,176 @@ def htc_check_dagman_output(wms_path):
|
|
|
1711
2170
|
raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc
|
|
1712
2171
|
_LOG.debug("dag output filename: %s", filename)
|
|
1713
2172
|
|
|
2173
|
+
p = re.compile(r"^(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) (Job submit try \d+/\d+ failed|Warning:.*$|ERROR:.*$)")
|
|
2174
|
+
|
|
1714
2175
|
message = ""
|
|
1715
2176
|
try:
|
|
1716
2177
|
with open(filename) as fh:
|
|
1717
|
-
last_submit_failed = ""
|
|
2178
|
+
last_submit_failed = "" # Since submit retries multiple times only report last one
|
|
1718
2179
|
for line in fh:
|
|
1719
|
-
m =
|
|
2180
|
+
m = p.match(line)
|
|
1720
2181
|
if m:
|
|
1721
|
-
|
|
1722
|
-
|
|
1723
|
-
m
|
|
1724
|
-
|
|
1725
|
-
|
|
2182
|
+
if m.group(2).startswith("Job submit try"):
|
|
2183
|
+
last_submit_failed = m.group(1)
|
|
2184
|
+
elif m.group(2).startswith("ERROR: submit attempt failed"):
|
|
2185
|
+
pass # Should be handled by Job submit try
|
|
2186
|
+
elif m.group(2).startswith("Warning"):
|
|
2187
|
+
if ".dag.nodes.log is in /tmp" in m.group(2):
|
|
1726
2188
|
last_warning = "Cannot submit from /tmp."
|
|
1727
2189
|
else:
|
|
1728
|
-
last_warning = m.group(
|
|
2190
|
+
last_warning = m.group(2)
|
|
2191
|
+
elif m.group(2) == "ERROR: Warning is fatal error because of DAGMAN_USE_STRICT setting":
|
|
2192
|
+
message += "ERROR: "
|
|
2193
|
+
message += last_warning
|
|
2194
|
+
message += "\n"
|
|
2195
|
+
elif m.group(2) in [
|
|
2196
|
+
"ERROR: the following job(s) failed:",
|
|
2197
|
+
"ERROR: the following Node(s) failed:",
|
|
2198
|
+
]:
|
|
2199
|
+
pass
|
|
1729
2200
|
else:
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
m.group(1)
|
|
1734
|
-
== "ERROR: Warning is fatal error because of DAGMAN_USE_STRICT setting"
|
|
1735
|
-
):
|
|
1736
|
-
message += f"ERROR: {last_warning}"
|
|
2201
|
+
message += m.group(2)
|
|
2202
|
+
message += "\n"
|
|
2203
|
+
|
|
1737
2204
|
if last_submit_failed:
|
|
1738
2205
|
message += f"Warn: Job submission issues (last: {last_submit_failed})"
|
|
1739
2206
|
except (OSError, PermissionError):
|
|
1740
2207
|
message = f"Warn: Could not read dagman output file from {wms_path}."
|
|
1741
2208
|
_LOG.debug("dag output file message: %s", message)
|
|
1742
2209
|
return message
|
|
2210
|
+
|
|
2211
|
+
|
|
2212
|
+
def _read_rescue_headers(infh: TextIO) -> tuple[list[str], list[str]]:
|
|
2213
|
+
"""Read header lines from a rescue file.
|
|
2214
|
+
|
|
2215
|
+
Parameters
|
|
2216
|
+
----------
|
|
2217
|
+
infh : `TextIO`
|
|
2218
|
+
The rescue file from which to read the header lines.
|
|
2219
|
+
|
|
2220
|
+
Returns
|
|
2221
|
+
-------
|
|
2222
|
+
header_lines : `list` [`str`]
|
|
2223
|
+
Header lines read from the rescue file.
|
|
2224
|
+
failed_subdags : `list` [`str`]
|
|
2225
|
+
Names of failed subdag jobs.
|
|
2226
|
+
"""
|
|
2227
|
+
header_lines: list[str] = []
|
|
2228
|
+
failed = False
|
|
2229
|
+
failed_subdags: list[str] = []
|
|
2230
|
+
|
|
2231
|
+
for line in infh:
|
|
2232
|
+
line = line.strip()
|
|
2233
|
+
if line.startswith("#"):
|
|
2234
|
+
if line.startswith("# Nodes that failed:"):
|
|
2235
|
+
failed = True
|
|
2236
|
+
header_lines.append(line)
|
|
2237
|
+
elif failed:
|
|
2238
|
+
orig_failed_nodes = line[1:].strip().split(",")
|
|
2239
|
+
new_failed_nodes = []
|
|
2240
|
+
for node in orig_failed_nodes:
|
|
2241
|
+
if node.startswith("wms_check_status"):
|
|
2242
|
+
group_node = node[17:]
|
|
2243
|
+
failed_subdags.append(group_node)
|
|
2244
|
+
new_failed_nodes.append(group_node)
|
|
2245
|
+
else:
|
|
2246
|
+
new_failed_nodes.append(node)
|
|
2247
|
+
header_lines.append(f"# {','.join(new_failed_nodes)}")
|
|
2248
|
+
if orig_failed_nodes[-1] == "<ENDLIST>":
|
|
2249
|
+
failed = False
|
|
2250
|
+
else:
|
|
2251
|
+
header_lines.append(line)
|
|
2252
|
+
elif line.strip() == "": # end of headers
|
|
2253
|
+
break
|
|
2254
|
+
return header_lines, failed_subdags
|
|
2255
|
+
|
|
2256
|
+
|
|
2257
|
+
def _write_rescue_headers(header_lines: list[str], failed_subdags: list[str], outfh: TextIO) -> None:
|
|
2258
|
+
"""Write the header lines to the new rescue file.
|
|
2259
|
+
|
|
2260
|
+
Parameters
|
|
2261
|
+
----------
|
|
2262
|
+
header_lines : `list` [`str`]
|
|
2263
|
+
Header lines to write to the new rescue file.
|
|
2264
|
+
failed_subdags : `list` [`str`]
|
|
2265
|
+
Job names of the failed subdags.
|
|
2266
|
+
outfh : `TextIO`
|
|
2267
|
+
New rescue file.
|
|
2268
|
+
"""
|
|
2269
|
+
done_str = "# Nodes premarked DONE"
|
|
2270
|
+
pattern = f"^{done_str}:\\s+(\\d+)"
|
|
2271
|
+
for header_line in header_lines:
|
|
2272
|
+
m = re.match(pattern, header_line)
|
|
2273
|
+
if m:
|
|
2274
|
+
print(f"{done_str}: {int(m.group(1)) - len(failed_subdags)}", file=outfh)
|
|
2275
|
+
else:
|
|
2276
|
+
print(header_line, file=outfh)
|
|
2277
|
+
|
|
2278
|
+
print("", file=outfh)
|
|
2279
|
+
|
|
2280
|
+
|
|
2281
|
+
def _copy_done_lines(failed_subdags: list[str], infh: TextIO, outfh: TextIO) -> None:
|
|
2282
|
+
"""Copy the DONE lines from the original rescue file skipping
|
|
2283
|
+
the failed group jobs.
|
|
2284
|
+
|
|
2285
|
+
Parameters
|
|
2286
|
+
----------
|
|
2287
|
+
failed_subdags : `list` [`str`]
|
|
2288
|
+
List of job names for the failed subdags
|
|
2289
|
+
infh : `TextIO`
|
|
2290
|
+
Original rescue file to copy from.
|
|
2291
|
+
outfh : `TextIO`
|
|
2292
|
+
New rescue file to copy to.
|
|
2293
|
+
"""
|
|
2294
|
+
for line in infh:
|
|
2295
|
+
line = line.strip()
|
|
2296
|
+
try:
|
|
2297
|
+
_, node_name = line.split()
|
|
2298
|
+
except ValueError:
|
|
2299
|
+
_LOG.error(f"Unexpected line in rescue file = '{line}'")
|
|
2300
|
+
raise
|
|
2301
|
+
if node_name not in failed_subdags:
|
|
2302
|
+
print(line, file=outfh)
|
|
2303
|
+
|
|
2304
|
+
|
|
2305
|
+
def _update_rescue_file(rescue_file: Path) -> None:
    """Update the subdag failures in the main rescue file
    and backup the failed subdag dirs.

    The rewritten contents are first written to a sibling file whose name
    is the rescue file name plus ``.tmp``, which then replaces the
    original.

    Parameters
    ----------
    rescue_file : `pathlib.Path`
        The main rescue file that needs to be updated.
    """
    # To reduce memory requirements, not reading entire file into memory.
    rescue_tmp = rescue_file.with_suffix(rescue_file.suffix + ".tmp")
    with open(rescue_file) as infh:
        header_lines, failed_subdags = _read_rescue_headers(infh)
        with open(rescue_tmp, "w") as outfh:
            _write_rescue_headers(header_lines, failed_subdags, outfh)
            # infh is now positioned just past the header section.
            _copy_done_lines(failed_subdags, infh, outfh)

    # Swap the new file in with a single replace instead of unlink + rename,
    # so there is never a moment when no rescue file exists on disk.
    rescue_tmp.replace(rescue_file)

    # Each failed subdag directory lives under "subdags/<name>" next to the
    # rescue file and is handed to htc_backup_files with a matching subdir.
    for failed_subdag in failed_subdags:
        htc_backup_files(
            rescue_file.parent / "subdags" / failed_subdag, subdir=f"backups/subdags/{failed_subdag}"
        )
|
|
2327
|
+
|
|
2328
|
+
|
|
2329
|
+
def _update_dicts(dict1, dict2):
|
|
2330
|
+
"""Update dict1 with info in dict2.
|
|
2331
|
+
|
|
2332
|
+
(Basically an update for nested dictionaries.)
|
|
2333
|
+
|
|
2334
|
+
Parameters
|
|
2335
|
+
----------
|
|
2336
|
+
dict1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
|
|
2337
|
+
HTCondor job information to be updated.
|
|
2338
|
+
dict2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
|
|
2339
|
+
Additional HTCondor job information.
|
|
2340
|
+
"""
|
|
2341
|
+
for key, value in dict2.items():
|
|
2342
|
+
if key in dict1 and isinstance(dict1[key], dict) and isinstance(value, dict):
|
|
2343
|
+
_update_dicts(dict1[key], value)
|
|
2344
|
+
else:
|
|
2345
|
+
dict1[key] = value
|