lsst-ctrl-bps-htcondor 29.2025.1300-py3-none-any.whl → 29.2025.1500-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -38,9 +38,9 @@ __all__ = [
38
38
  "DagStatus",
39
39
  "HTCDag",
40
40
  "HTCJob",
41
- "JobStatus",
42
41
  "NodeStatus",
43
42
  "RestrictedDict",
43
+ "WmsNodeType",
44
44
  "condor_history",
45
45
  "condor_q",
46
46
  "condor_search",
@@ -65,7 +65,6 @@ __all__ = [
65
65
  "read_node_status",
66
66
  "summarize_dag",
67
67
  "update_job_info",
68
- "update_job_info",
69
68
  "write_dag_info",
70
69
  ]
71
70
 
@@ -77,23 +76,24 @@ import os
77
76
  import pprint
78
77
  import re
79
78
  import subprocess
80
- from collections import defaultdict
79
+ from collections import Counter, defaultdict
81
80
  from collections.abc import MutableMapping
82
81
  from datetime import datetime, timedelta
83
- from enum import IntEnum
82
+ from enum import IntEnum, auto
84
83
  from pathlib import Path
85
- from typing import Any
84
+ from typing import Any, TextIO
86
85
 
87
86
  import classad
88
87
  import htcondor
89
88
  import networkx
89
+ from deprecated.sphinx import deprecated
90
90
  from packaging import version
91
91
 
92
92
  from .handlers import HTC_JOB_AD_HANDLERS
93
93
 
94
94
  _LOG = logging.getLogger(__name__)
95
95
 
96
- MISSING_ID = -99999
96
+ MISSING_ID = "-99999"
97
97
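The sentinel changes from an integer to a string because job ids elsewhere in this module are handled as "<ClusterId>.<ProcId>" strings (for example, read_dag_log below now returns MISSING_ID when no DAGMan log is found). A minimal sketch, with a hypothetical helper, of how a string sentinel composes with real ids:

    MISSING_ID = "-99999"

    def describe_run_id(wms_workflow_id: str) -> str:
        # Hypothetical helper for illustration only.
        return "no id yet" if wms_workflow_id == MISSING_ID else wms_workflow_id

    print(describe_run_id("1234.0"))    # 1234.0
    print(describe_run_id(MISSING_ID))  # no id yet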
 
98
98
 
99
99
  class DagStatus(IntEnum):
@@ -108,6 +108,13 @@ class DagStatus(IntEnum):
108
108
  SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8)
109
109
 
110
110
 
111
+ @deprecated(
112
+ reason="The JobStatus is internally replaced by htcondor.JobStatus. "
113
+ "External reporting code should be using ctrl_bps.WmsStates. "
114
+ "This class will be removed after v30.",
115
+ version="v30.0",
116
+ category=FutureWarning,
117
+ )
111
118
  class JobStatus(IntEnum):
112
119
  """HTCondor's statuses for jobs."""
113
120
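JobStatus stays importable but now emits a FutureWarning through the Deprecated package's decorator. External code is expected to move to ctrl_bps WmsStates for reporting or, for raw HTCondor status codes, to the enum shipped with the bindings, which this module now uses internally. A minimal sketch, assuming the htcondor bindings are installed:

    import htcondor

    # The bindings' enum replaces the module-level JobStatus for raw status checks.
    terminal_states = {htcondor.JobStatus.COMPLETED, htcondor.JobStatus.REMOVED}

    def is_terminal(status: htcondor.JobStatus) -> bool:
        # Hypothetical helper for illustration only.
        return status in terminal_states

    print(is_terminal(htcondor.JobStatus.COMPLETED))  # True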
 
@@ -155,6 +162,31 @@ class NodeStatus(IntEnum):
155
162
  FUTILE = 7
156
163
 
157
164
 
165
+ class WmsNodeType(IntEnum):
166
+ """HTCondor plugin node types to help with payload reporting."""
167
+
168
+ UNKNOWN = auto()
169
+ """Dummy value when missing."""
170
+
171
+ PAYLOAD = auto()
172
+ """Payload job."""
173
+
174
+ FINAL = auto()
175
+ """Final job."""
176
+
177
+ SERVICE = auto()
178
+ """Service job."""
179
+
180
+ NOOP = auto()
181
+ """NOOP job used for ordering jobs."""
182
+
183
+ SUBDAG = auto()
184
+ """SUBDAG job used for ordering jobs."""
185
+
186
+ SUBDAG_CHECK = auto()
187
+ """Job used to correctly prune jobs after a subdag."""
188
+
189
+
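A minimal sketch of how reporting code might use the new enum to separate payload-carrying nodes from bookkeeping nodes (the job list below is illustrative; the import path follows the docstrings in this module):

    from collections import Counter

    from lsst.ctrl.bps.htcondor import WmsNodeType  # import path assumed

    node_types = [WmsNodeType.PAYLOAD, WmsNodeType.PAYLOAD, WmsNodeType.FINAL,
                  WmsNodeType.SERVICE, WmsNodeType.NOOP]
    payload_like = Counter(t.name for t in node_types
                           if t in {WmsNodeType.PAYLOAD, WmsNodeType.FINAL})
    print(payload_like)  # Counter({'PAYLOAD': 2, 'FINAL': 1})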
158
190
  HTC_QUOTE_KEYS = {"environment"}
159
191
  HTC_VALID_JOB_KEYS = {
160
192
  "universe",
@@ -189,7 +221,18 @@ HTC_VALID_JOB_KEYS = {
189
221
  "accounting_group",
190
222
  "accounting_group_user",
191
223
  }
192
- HTC_VALID_JOB_DAG_KEYS = {"vars", "pre", "post", "retry", "retry_unless_exit", "abort_dag_on", "abort_exit"}
224
+ HTC_VALID_JOB_DAG_KEYS = {
225
+ "dir",
226
+ "noop",
227
+ "done",
228
+ "vars",
229
+ "pre",
230
+ "post",
231
+ "retry",
232
+ "retry_unless_exit",
233
+ "abort_dag_on",
234
+ "abort_exit",
235
+ }
193
236
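The per-job DAG command dictionary now also accepts the DAGMan JOB modifiers dir, noop, and done. A minimal sketch of a dagcmds mapping using the expanded key set (values illustrative; write_dag_commands below turns dir and noop into the DIR and NOOP clauses of the JOB line):

    dagcmds = {
        "dir": "jobs/isr",                # emitted as ... DIR "jobs/isr"
        "noop": False,                    # True would append NOOP to the JOB line
        "retry": 3,
        "retry_unless_exit": 2,
        "vars": {"qgraph_id": "12345"},   # emitted as VARS <name> qgraph_id="12345"
    }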
  HTC_VERSION = version.parse(htcondor.__version__)
194
237
 
195
238
 
@@ -224,7 +267,7 @@ class RestrictedDict(MutableMapping):
224
267
 
225
268
  Returns
226
269
  -------
227
- value : `~collections.abc.Any`
270
+ value : `~typing.Any`
228
271
  Value associated with given key.
229
272
 
230
273
  Raises
@@ -256,7 +299,7 @@ class RestrictedDict(MutableMapping):
256
299
  ----------
257
300
  key : `str`
258
301
  Identifier to associate with given value.
259
- value : `~collections.abc.Any`
302
+ value : `~typing.Any`
260
303
  Value to store.
261
304
 
262
305
  Raises
@@ -278,7 +321,9 @@ class RestrictedDict(MutableMapping):
278
321
  return str(self.data)
279
322
 
280
323
 
281
- def htc_backup_files(wms_path, subdir=None, limit=100):
324
+ def htc_backup_files(
325
+ wms_path: str | os.PathLike, subdir: str | os.PathLike | None = None, limit: int = 100
326
+ ) -> Path | None:
282
327
  """Backup select HTCondor files in the submit directory.
283
328
 
284
329
  Files will be saved in separate subdirectories which will be created in
@@ -293,9 +338,9 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
293
338
 
294
339
  Parameters
295
340
  ----------
296
- wms_path : `str` or `pathlib.Path`
341
+ wms_path : `str` or `os.PathLike`
297
342
  Path to the submit directory either absolute or relative.
298
- subdir : `str` or `pathlib.Path`, optional
343
+ subdir : `str` or `os.PathLike`, optional
299
344
  A path, relative to the submit directory, where all subdirectories with
300
345
  backup files will be kept. Defaults to None which means that the backup
301
346
  subdirectories will be placed directly in the submit directory.
@@ -305,6 +350,11 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
305
350
  to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in
306
351
  version 8.8+.
307
352
 
353
+ Returns
354
+ -------
355
+ last_rescue_file : `pathlib.Path` or None
356
+ Path to the latest rescue file or `None` if none exists.
357
+
308
358
  Raises
309
359
  ------
310
360
  FileNotFoundError
@@ -327,17 +377,18 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
327
377
  raise FileNotFoundError(f"Directory {path} not found")
328
378
 
329
379
  # Initialize the backup counter.
330
- rescue_dags = list(Path(wms_path).glob("*.rescue*"))
380
+ rescue_dags = list(path.glob("*.rescue[0-9][0-9][0-9]"))
331
381
  counter = min(len(rescue_dags), limit)
332
382
 
333
383
  # Create the backup directory and move select files there.
334
- dest = Path(wms_path)
384
+ dest = path
335
385
  if subdir:
336
386
  # PurePath.is_relative_to() is not available before Python 3.9. Hence
337
387
  # we need to check is 'subdir' is in the submit directory in some other
338
388
  # way if it is an absolute path.
339
389
  subdir = Path(subdir)
340
390
  if subdir.is_absolute():
391
+ subdir = subdir.resolve() # Since resolve was run on path, must run it here
341
392
  if dest not in subdir.parents:
342
393
  _LOG.warning(
343
394
  "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.",
@@ -349,21 +400,66 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
349
400
  else:
350
401
  dest /= subdir
351
402
  dest /= f"{counter:0{width}}"
403
+ _LOG.debug("dest = %s", dest)
352
404
  try:
353
405
  dest.mkdir(parents=True, exist_ok=False if counter < limit else True)
354
406
  except FileExistsError:
355
407
  _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest)
356
408
  else:
357
- for patt in ["*.info.*", "*.dag.metrics", "*.dag.nodes.log", "*.node_status"]:
358
- for source in path.glob(patt):
359
- if source.is_file():
360
- target = dest / source.relative_to(path)
361
- try:
362
- source.rename(target)
363
- except OSError as exc:
364
- raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None
365
- else:
366
- raise FileNotFoundError(f"Backing up '{source}' failed: not a file")
409
+ htc_backup_files_single_path(path, dest)
410
+
411
+ # also back up any subdag info
412
+ for subdag_dir in path.glob("subdags/*"):
413
+ subdag_dest = dest / subdag_dir.relative_to(path)
414
+ subdag_dest.mkdir(parents=True, exist_ok=False)
415
+ htc_backup_files_single_path(subdag_dir, subdag_dest)
416
+
417
+ last_rescue_file = rescue_dags[-1] if rescue_dags else None
418
+ _LOG.debug("last_rescue_file = %s", last_rescue_file)
419
+ return last_rescue_file
420
+
421
+
422
+ def htc_backup_files_single_path(src: str | os.PathLike, dest: str | os.PathLike) -> None:
423
+ """Move particular htc files to a different directory for later debugging.
424
+
425
+ Parameters
426
+ ----------
427
+ src : `str` or `os.PathLike`
428
+ Directory from which to backup particular files.
429
+ dest : `str` or `os.PathLike`
430
+ Directory to which particular files are moved.
431
+
432
+ Raises
433
+ ------
434
+ RuntimeError
435
+ If given dest directory matches given src directory.
436
+ OSError
437
+ If a problem occurs while moving a file.
438
+ FileNotFoundError
439
+ If an item matching a pattern in the src directory isn't a file.
440
+ """
441
+ src = Path(src)
442
+ dest = Path(dest)
443
+ if dest.samefile(src):
444
+ raise RuntimeError(f"Destination directory is same as the source directory ({src})")
445
+
446
+ for patt in [
447
+ "*.info.*",
448
+ "*.dag.metrics",
449
+ "*.dag.nodes.log",
450
+ "*.node_status",
451
+ "wms_*.dag.post.out",
452
+ "wms_*.status.txt",
453
+ ]:
454
+ for source in src.glob(patt):
455
+ if source.is_file():
456
+ target = dest / source.relative_to(src)
457
+ try:
458
+ source.rename(target)
459
+ except OSError as exc:
460
+ raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None
461
+ else:
462
+ raise FileNotFoundError(f"Backing up '{source}' failed: not a file")
367
463
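htc_backup_files now also backs up any subdags/* directories and returns the newest rescue DAG (or None). A minimal usage sketch, with an illustrative submit path and an assumed import path:

    from lsst.ctrl.bps.htcondor.lssthtc import htc_backup_files  # module path assumed

    last_rescue = htc_backup_files("submit/u/me/run1", subdir="backups")
    if last_rescue is None:
        print("no rescue DAG found; treating this as a fresh submission")
    else:
        print(f"latest rescue DAG: {last_rescue}")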
 
368
464
 
369
465
  def htc_escape(value):
@@ -371,12 +467,12 @@ def htc_escape(value):
371
467
 
372
468
  Parameters
373
469
  ----------
374
- value : `~collections.abc.Any`
470
+ value : `~typing.Any`
375
471
  Value that needs to have characters escaped if string.
376
472
 
377
473
  Returns
378
474
  -------
379
- new_value : `~collections.abc.Any`
475
+ new_value : `~typing.Any`
380
476
  Given value with characters escaped appropriate for HTCondor if string.
381
477
  """
382
478
  if isinstance(value, str):
@@ -407,12 +503,14 @@ def htc_write_attribs(stream, attrs):
407
503
  print(f"+{key} = {pval}", file=stream)
408
504
 
409
505
 
410
- def htc_write_condor_file(filename, job_name, job, job_attrs):
506
+ def htc_write_condor_file(
507
+ filename: str | os.PathLike, job_name: str, job: RestrictedDict, job_attrs: dict[str, Any]
508
+ ) -> None:
411
509
  """Write an HTCondor submit file.
412
510
 
413
511
  Parameters
414
512
  ----------
415
- filename : `str`
513
+ filename : `str` or `os.PathLike`
416
514
  Filename for the HTCondor submit file.
417
515
  job_name : `str`
418
516
  Job name to use in submit file.
@@ -463,7 +561,7 @@ if HTC_VERSION < version.parse("8.9.8"):
463
561
 
464
562
  Returns
465
563
  -------
466
- kwargs : `dict` [`str`, Any]
564
+ kwargs : `dict` [`str`, `~typing.Any`]
467
565
  Keywords arguments that are guaranteed to work with the Python
468
566
  HTCondor API.
469
567
 
@@ -501,7 +599,7 @@ else:
501
599
 
502
600
  Returns
503
601
  -------
504
- kwargs : `dict` [`str`, Any]
602
+ kwargs : `dict` [`str`, `~typing.Any`]
505
603
  Keywords arguments that were passed to the function.
506
604
  """
507
605
  return kwargs
@@ -521,7 +619,7 @@ def htc_query_history(schedds, **kwargs):
521
619
  ------
522
620
  schedd_name : `str`
523
621
  Name of the HTCondor scheduler managing the job queue.
524
- job_ad : `dict` [`str`, Any]
622
+ job_ad : `dict` [`str`, `~typing.Any`]
525
623
  A dictionary representing HTCondor ClassAd describing a job. It maps
526
624
  job attributes names to values of the ClassAd expressions they
527
625
  represent.
@@ -549,7 +647,7 @@ def htc_query_present(schedds, **kwargs):
549
647
  ------
550
648
  schedd_name : `str`
551
649
  Name of the HTCondor scheduler managing the job queue.
552
- job_ad : `dict` [`str`, Any]
650
+ job_ad : `dict` [`str`, `~typing.Any`]
553
651
  A dictionary representing HTCondor ClassAd describing a job. It maps
554
652
  job attributes names to values of the ClassAd expressions they
555
653
  represent.
@@ -581,7 +679,8 @@ def htc_submit_dag(sub):
581
679
 
582
680
  Returns
583
681
  -------
584
- schedd_job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
682
+ schedd_job_info : `dict` [`str`, `dict` [`str`, \
683
+ `dict` [`str`, `~typing.Any`]]]
585
684
  Information about jobs satisfying the search criteria where for each
586
685
  Scheduler, local HTCondor job ids are mapped to their respective
587
686
  classads.
@@ -611,7 +710,7 @@ def htc_create_submit_from_dag(dag_filename, submit_options=None):
611
710
  ----------
612
711
  dag_filename : `str`
613
712
  Name of file containing HTCondor DAG commands.
614
- submit_options : `dict` [`str`, Any], optional
713
+ submit_options : `dict` [`str`, `~typing.Any`], optional
615
714
  Contains extra options for command line (Value of None means flag).
616
715
 
617
716
  Returns
@@ -624,6 +723,19 @@ def htc_create_submit_from_dag(dag_filename, submit_options=None):
624
723
  Use with HTCondor versions which support htcondor.Submit.from_dag(),
625
724
  i.e., 8.9.3 or newer.
626
725
  """
726
+ # Passing do_recurse as a submit option does not seem to
727
+ # override DAGMAN_GENERATE_SUBDAG_SUBMITS as the manual implies.
728
+ # So set it and the other bps-required setting here as
729
+ # environment variables if they do not already exist.
730
+ var_name = "_CONDOR_DAGMAN_MANAGER_JOB_APPEND_GETENV"
731
+ if var_name not in os.environ:
732
+ os.environ[var_name] = "True"
733
+ do_recurse = submit_options.get("do_recurse", None)
734
+ if do_recurse:
735
+ var_name = "_CONDOR_DAGMAN_GENERATE_SUBDAG_SUBMITS"
736
+ if var_name not in os.environ:
737
+ os.environ[var_name] = str(do_recurse)
738
+
627
739
  return htcondor.Submit.from_dag(dag_filename, submit_options)
628
740
 
629
741
 
@@ -637,7 +749,7 @@ def htc_create_submit_from_cmd(dag_filename, submit_options=None):
637
749
  ----------
638
750
  dag_filename : `str`
639
751
  Name of file containing HTCondor DAG commands.
640
- submit_options : `dict` [`str`, Any], optional
752
+ submit_options : `dict` [`str`, `~typing.Any`], optional
641
753
  Contains extra options for command line (Value of None means flag).
642
754
 
643
755
  Returns
@@ -702,7 +814,7 @@ def htc_create_submit_from_file(submit_file):
702
814
  return htcondor.Submit(descriptors)
703
815
 
704
816
 
705
- def _htc_write_job_commands(stream, name, jobs):
817
+ def _htc_write_job_commands(stream, name, commands):
706
818
  """Output the DAGMan job lines for single job in DAG.
707
819
 
708
820
  Parameters
@@ -711,40 +823,60 @@ def _htc_write_job_commands(stream, name, jobs):
711
823
  Writeable text stream (typically an opened file).
712
824
  name : `str`
713
825
  Job name.
714
- jobs : `RestrictedDict`
715
- DAG job keys and values.
826
+ commands : `RestrictedDict`
827
+ DAG commands for a job.
716
828
  """
717
- if "pre" in jobs:
718
- print(
719
- f"SCRIPT {jobs['pre'].get('defer', '')} PRE {name}"
720
- f"{jobs['pre']['executable']} {jobs['pre'].get('arguments', '')}",
721
- file=stream,
722
- )
723
-
724
- if "post" in jobs:
725
- print(
726
- f"SCRIPT {jobs['post'].get('defer', '')} PRE {name}"
727
- f"{jobs['post']['executable']} {jobs['post'].get('arguments', '')}",
728
- file=stream,
729
- )
730
-
731
- if "vars" in jobs:
732
- for key, value in jobs["vars"]:
829
+ # Note: optional pieces of commands include a space at the beginning.
830
+ # also making sure values aren't empty strings as placeholders.
831
+ if "pre" in commands and commands["pre"]:
832
+ defer = ""
833
+ if "defer" in commands["pre"] and commands["pre"]["defer"]:
834
+ defer = f" DEFER {commands['pre']['defer']['status']} {commands['pre']['defer']['time']}"
835
+
836
+ debug = ""
837
+ if "debug" in commands["pre"] and commands["pre"]["debug"]:
838
+ debug = f" DEBUG {commands['pre']['debug']['filename']} {commands['pre']['debug']['type']}"
839
+
840
+ arguments = ""
841
+ if "arguments" in commands["pre"] and commands["pre"]["arguments"]:
842
+ arguments = f" {commands['pre']['arguments']}"
843
+
844
+ executable = commands["pre"]["executable"]
845
+ print(f"SCRIPT{defer}{debug} PRE {name} {executable}{arguments}", file=stream)
846
+
847
+ if "post" in commands and commands["post"]:
848
+ defer = ""
849
+ if "defer" in commands["post"] and commands["post"]["defer"]:
850
+ defer = f" DEFER {commands['post']['defer']['status']} {commands['post']['defer']['time']}"
851
+
852
+ debug = ""
853
+ if "debug" in commands["post"] and commands["post"]["debug"]:
854
+ debug = f" DEBUG {commands['post']['debug']['filename']} {commands['post']['debug']['type']}"
855
+
856
+ arguments = ""
857
+ if "arguments" in commands["post"] and commands["post"]["arguments"]:
858
+ arguments = f" {commands['post']['arguments']}"
859
+
860
+ executable = commands["post"]["executable"]
861
+ print(f"SCRIPT{defer}{debug} POST {name} {executable}{arguments}", file=stream)
862
+
863
+ if "vars" in commands and commands["vars"]:
864
+ for key, value in commands["vars"].items():
733
865
  print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream)
734
866
 
735
- if "pre_skip" in jobs:
736
- print(f"PRE_SKIP {name} {jobs['pre_skip']}", file=stream)
867
+ if "pre_skip" in commands and commands["pre_skip"]:
868
+ print(f"PRE_SKIP {name} {commands['pre_skip']}", file=stream)
737
869
 
738
- if "retry" in jobs and jobs["retry"]:
739
- print(f"RETRY {name} {jobs['retry']} ", end="", file=stream)
740
- if "retry_unless_exit" in jobs:
741
- print(f"UNLESS-EXIT {jobs['retry_unless_exit']}", end="", file=stream)
742
- print("\n", file=stream)
870
+ if "retry" in commands and commands["retry"]:
871
+ print(f"RETRY {name} {commands['retry']}", end="", file=stream)
872
+ if "retry_unless_exit" in commands:
873
+ print(f" UNLESS-EXIT {commands['retry_unless_exit']}", end="", file=stream)
874
+ print("", file=stream) # Since previous prints don't include new line
743
875
 
744
- if "abort_dag_on" in jobs and jobs["abort_dag_on"]:
876
+ if "abort_dag_on" in commands and commands["abort_dag_on"]:
745
877
  print(
746
- f"ABORT-DAG-ON {name} {jobs['abort_dag_on']['node_exit']}"
747
- f" RETURN {jobs['abort_dag_on']['abort_exit']}",
878
+ f"ABORT-DAG-ON {name} {commands['abort_dag_on']['node_exit']}"
879
+ f" RETURN {commands['abort_dag_on']['abort_exit']}",
748
880
  file=stream,
749
881
  )
750
882
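The rewritten helper emits optional DEFER and DEBUG clauses and fixes the earlier copy-paste error that wrote the post script with PRE. A minimal sketch of the lines produced for one job, using the dictionary layout inferred from the code above (values illustrative; the function is private, so the call is for illustration only):

    import io

    from lsst.ctrl.bps.htcondor.lssthtc import _htc_write_job_commands  # module path assumed

    commands = {
        "post": {
            "executable": "finalize.sh",
            "arguments": "--run 1234",
            "defer": {"status": 1, "time": 60},
        },
        "retry": 3,
        "retry_unless_exit": 2,
    }
    out = io.StringIO()
    _htc_write_job_commands(out, "job_label_000", commands)
    print(out.getvalue())
    # SCRIPT DEFER 1 60 POST job_label_000 finalize.sh --run 1234
    # RETRY job_label_000 3 UNLESS-EXIT 2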
 
@@ -773,6 +905,8 @@ class HTCJob:
773
905
  self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds)
774
906
  self.attrs = initattrs
775
907
  self.subfile = None
908
+ self.subdir = None
909
+ self.subdag = None
776
910
 
777
911
  def __str__(self):
778
912
  return self.name
@@ -810,33 +944,54 @@ class HTCJob:
810
944
  if new_attrs:
811
945
  self.attrs.update(new_attrs)
812
946
 
813
- def write_submit_file(self, submit_path, job_subdir=""):
947
+ def write_submit_file(self, submit_path: str | os.PathLike) -> None:
814
948
  """Write job description to submit file.
815
949
 
816
950
  Parameters
817
951
  ----------
818
- submit_path : `str`
952
+ submit_path : `str` or `os.PathLike`
819
953
  Prefix path for the submit file.
820
- job_subdir : `str`, optional
821
- Template for job subdir.
822
954
  """
823
955
  if not self.subfile:
824
956
  self.subfile = f"{self.name}.sub"
825
- job_subdir = job_subdir.format(self=self)
826
- if job_subdir:
827
- self.subfile = os.path.join(job_subdir, self.subfile)
828
- htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs)
829
957
 
830
- def write_dag_commands(self, stream):
958
+ subfile = self.subfile
959
+ if self.subdir:
960
+ subfile = Path(self.subdir) / subfile
961
+
962
+ subfile = Path(os.path.expandvars(subfile))
963
+ if not subfile.is_absolute():
964
+ subfile = Path(submit_path) / subfile
965
+ if not subfile.exists():
966
+ htc_write_condor_file(subfile, self.name, self.cmds, self.attrs)
967
+
968
+ def write_dag_commands(self, stream, dag_rel_path, command_name="JOB"):
831
969
  """Write DAG commands for single job to output stream.
832
970
 
833
971
  Parameters
834
972
  ----------
835
973
  stream : `IO` or `str`
836
974
  Output Stream.
975
+ dag_rel_path : `str`
976
+ Relative path of dag to submit directory.
977
+ command_name : `str`
978
+ Name of the DAG command (e.g., JOB, FINAL).
837
979
  """
838
- print(f'JOB {self.name} "{self.subfile}"', file=stream)
839
- _htc_write_job_commands(stream, self.name, self.dagcmds)
980
+ subfile = os.path.expandvars(self.subfile)
981
+
982
+ # JOB NodeName SubmitDescription [DIR directory] [NOOP] [DONE]
983
+ job_line = f'{command_name} {self.name} "{subfile}"'
984
+ if "dir" in self.dagcmds:
985
+ dir_val = self.dagcmds["dir"]
986
+ if dag_rel_path:
987
+ dir_val = os.path.join(dag_rel_path, dir_val)
988
+ job_line += f' DIR "{dir_val}"'
989
+ if self.dagcmds.get("noop", False):
990
+ job_line += " NOOP"
991
+
992
+ print(job_line, file=stream)
993
+ if self.dagcmds:
994
+ _htc_write_job_commands(stream, self.name, self.dagcmds)
840
995
 
841
996
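A minimal sketch of the JOB line assembled above for a job whose dagcmds include a DIR entry, as written from inside a sub-DAG (names and paths illustrative; the snippet mirrors the assembly logic rather than calling the method):

    import os

    name, subfile, dag_rel_path = "isr_visit_123", "isr_visit_123.sub", "../.."
    dagcmds = {"dir": "jobs/isr", "noop": False}

    job_line = f'JOB {name} "{subfile}"'
    if "dir" in dagcmds:
        job_line += f' DIR "{os.path.join(dag_rel_path, dagcmds["dir"])}"'
    if dagcmds.get("noop", False):
        job_line += " NOOP"
    print(job_line)  # JOB isr_visit_123 "isr_visit_123.sub" DIR "../../jobs/isr"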
  def dump(self, fh):
842
997
  """Dump job information to output stream.
@@ -871,6 +1026,7 @@ class HTCDag(networkx.DiGraph):
871
1026
  self.graph["submit_path"] = None
872
1027
  self.graph["final_job"] = None
873
1028
  self.graph["service_job"] = None
1029
+ self.graph["submit_options"] = {}
874
1030
 
875
1031
  def __str__(self):
876
1032
  """Represent basic DAG info as string.
@@ -906,6 +1062,7 @@ class HTCDag(networkx.DiGraph):
906
1062
  Names of child jobs.
907
1063
  """
908
1064
  assert isinstance(job, HTCJob)
1065
+ _LOG.debug("Adding job %s to dag", job.name)
909
1066
 
910
1067
  # Add dag level attributes to each job
911
1068
  job.add_job_attrs(self.graph["attr"])
@@ -913,10 +1070,10 @@ class HTCDag(networkx.DiGraph):
913
1070
  self.add_node(job.name, data=job)
914
1071
 
915
1072
  if parent_names is not None:
916
- self.add_job_relationships(parent_names, job.name)
1073
+ self.add_job_relationships(parent_names, [job.name])
917
1074
 
918
1075
  if child_names is not None:
919
- self.add_job_relationships(child_names, job.name)
1076
+ self.add_job_relationships(child_names, [job.name])
920
1077
 
921
1078
  def add_job_relationships(self, parents, children):
922
1079
  """Add DAG edge between parents and children jobs.
@@ -972,24 +1129,48 @@ class HTCDag(networkx.DiGraph):
972
1129
  # Delete job node (which deletes its edges).
973
1130
  self.remove_node(job_name)
974
1131
 
975
- def write(self, submit_path, job_subdir=""):
1132
+ def write(self, submit_path, job_subdir="", dag_subdir="", dag_rel_path=""):
976
1133
  """Write DAG to a file.
977
1134
 
978
1135
  Parameters
979
1136
  ----------
980
1137
  submit_path : `str`
981
- Prefix path for dag filename to be combined with DAG name.
1138
+ Prefix path for all outputs.
982
1139
  job_subdir : `str`, optional
983
- Template for job subdir.
1140
+ Template for job subdir (submit_path + job_subdir).
1141
+ dag_subdir : `str`, optional
1142
+ DAG subdir (submit_path + dag_subdir).
1143
+ dag_rel_path : `str`, optional
1144
+ Prefix to job_subdir for jobs inside subdag.
984
1145
  """
985
1146
  self.graph["submit_path"] = submit_path
986
- self.graph["dag_filename"] = os.path.join(submit_path, f"{self.graph['name']}.dag")
987
- os.makedirs(submit_path, exist_ok=True)
988
- with open(self.graph["dag_filename"], "w") as fh:
989
- for _, nodeval in self.nodes().items():
990
- job = nodeval["data"]
991
- job.write_submit_file(submit_path, job_subdir)
992
- job.write_dag_commands(fh)
1147
+ self.graph["dag_filename"] = os.path.join(dag_subdir, f"{self.graph['name']}.dag")
1148
+ full_filename = os.path.join(submit_path, self.graph["dag_filename"])
1149
+ os.makedirs(os.path.dirname(full_filename), exist_ok=True)
1150
+ with open(full_filename, "w") as fh:
1151
+ for name, nodeval in self.nodes().items():
1152
+ try:
1153
+ job = nodeval["data"]
1154
+ except KeyError:
1155
+ _LOG.error("Job %s doesn't have data (keys: %s).", name, nodeval.keys())
1156
+ raise
1157
+ if job.subdag:
1158
+ dag_subdir = f"subdags/{job.name}"
1159
+ if "dir" in job.dagcmds:
1160
+ subdir = job.dagcmds["dir"]
1161
+ else:
1162
+ subdir = job_subdir
1163
+ job.subdag.write(submit_path, subdir, dag_subdir, "../..")
1164
+ fh.write(
1165
+ f"SUBDAG EXTERNAL {job.name} {Path(job.subdag.graph['dag_filename']).name} "
1166
+ f"DIR {dag_subdir}\n"
1167
+ )
1168
+ if job.dagcmds:
1169
+ _htc_write_job_commands(fh, job.name, job.dagcmds)
1170
+ else:
1171
+ job.write_submit_file(submit_path)
1172
+ job.write_dag_commands(fh, dag_rel_path)
1173
+
993
1174
  for edge in self.edges():
994
1175
  print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh)
995
1176
  print(f"DOT {self.name}.dot", file=fh)
@@ -1006,12 +1187,8 @@ class HTCDag(networkx.DiGraph):
1006
1187
  }
1007
1188
  for dagcmd, job in special_jobs.items():
1008
1189
  if job is not None:
1009
- job.write_submit_file(submit_path, job_subdir)
1010
- print(f"{dagcmd} {job.name} {job.subfile}", file=fh)
1011
- if "pre" in job.dagcmds:
1012
- print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh)
1013
- if "post" in job.dagcmds:
1014
- print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh)
1190
+ job.write_submit_file(submit_path)
1191
+ job.write_dag_commands(fh, dag_rel_path, dagcmd)
1015
1192
 
1016
1193
  def dump(self, fh):
1017
1194
  """Dump DAG info to output stream.
@@ -1061,7 +1238,7 @@ def condor_q(constraint=None, schedds=None, **kwargs):
1061
1238
 
1062
1239
  Returns
1063
1240
  -------
1064
- job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
1241
+ job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
1065
1242
  Information about jobs satisfying the search criteria where for each
1066
1243
  Scheduler, local HTCondor job ids are mapped to their respective
1067
1244
  classads.
@@ -1086,7 +1263,7 @@ def condor_history(constraint=None, schedds=None, **kwargs):
1086
1263
 
1087
1264
  Returns
1088
1265
  -------
1089
- job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
1266
+ job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
1090
1267
  Information about jobs satisfying the search criteria where for each
1091
1268
  Scheduler, local HTCondor job ids are mapped to their respective
1092
1269
  classads.
@@ -1117,7 +1294,7 @@ def condor_query(constraint=None, schedds=None, query_func=htc_query_present, **
1117
1294
 
1118
1295
  Returns
1119
1296
  -------
1120
- job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
1297
+ job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
1121
1298
  Information about jobs satisfying the search criteria where for each
1122
1299
  Scheduler, local HTCondor job ids are mapped to their respective
1123
1300
  classads.
@@ -1172,7 +1349,7 @@ def condor_search(constraint=None, hist=None, schedds=None):
1172
1349
 
1173
1350
  Returns
1174
1351
  -------
1175
- job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
1352
+ job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
1176
1353
  Information about jobs satisfying the search criteria where for each
1177
1354
  Scheduler, local HTCondor job ids are mapped to their respective
1178
1355
  classads.
@@ -1203,7 +1380,7 @@ def condor_status(constraint=None, coll=None):
1203
1380
 
1204
1381
  Returns
1205
1382
  -------
1206
- pool_info : `dict` [`str`, `dict` [`str`, Any]]
1383
+ pool_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1207
1384
  Mapping between HTCondor slot names and slot information (classAds).
1208
1385
  """
1209
1386
  if coll is None:
@@ -1225,14 +1402,14 @@ def update_job_info(job_info, other_info):
1225
1402
 
1226
1403
  Parameters
1227
1404
  ----------
1228
- job_info : `dict` [`str`, `dict` [`str`, Any]]
1405
+ job_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1229
1406
  Results of the job query that needs to be updated.
1230
- other_info : `dict` [`str`, `dict` [`str`, Any]]
1407
+ other_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1231
1408
  Results of the other job query.
1232
1409
 
1233
1410
  Returns
1234
1411
  -------
1235
- job_info : `dict` [`str`, `dict` [`str`, Any]]
1412
+ job_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1236
1413
  The updated results.
1237
1414
  """
1238
1415
  for schedd_name, others in other_info.items():
@@ -1246,7 +1423,98 @@ def update_job_info(job_info, other_info):
1246
1423
  return job_info
1247
1424
 
1248
1425
 
1249
- def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, str]]:
1426
+ def count_jobs_in_single_dag(
1427
+ filename: str | os.PathLike,
1428
+ ) -> tuple[Counter[str], dict[str, str], dict[str, WmsNodeType]]:
1429
+ """Build bps_run_summary string from dag file.
1430
+
1431
+ Parameters
1432
+ ----------
1433
+ filename : `str` or `os.PathLike`
1434
+ Path to the DAG file to parse.
1435
+
1436
+ Returns
1437
+ -------
1438
+ counts : `Counter` [`str`]
1439
+ Counts of payload jobs per job label.
1440
+ (Later joined into the summary saved in the dag classad.)
1441
+ job_name_to_label : `dict` [`str`, `str`]
1442
+ Mapping of job names to job labels.
1443
+ job_name_to_type : `dict` [`str`, `lsst.ctrl.bps.htcondor.WmsNodeType`]
1444
+ Mapping of job names to job types
1445
+ (e.g., payload, final, service).
1446
+ """
1447
+ # Later code depends upon insertion order
1448
+ counts: Counter = Counter() # counts of payload jobs per label
1449
+ job_name_to_label: dict[str, str] = {}
1450
+ job_name_to_type: dict[str, WmsNodeType] = {}
1451
+ with open(filename) as fh:
1452
+ for line in fh:
1453
+ job_name = ""
1454
+ m = re.match(
1455
+ r"(?P<command>JOB|FINAL|SERVICE|SUBDAG EXTERNAL)\s+"
1456
+ r'(?P<jobname>(?P<wms>wms_)?\S+)\s+"?(?P<subfile>\S+)"?\s*'
1457
+ r'(DIR "?(?P<dir>[^\s"]+)"?)?\s*(?P<noop>NOOP)?',
1458
+ line,
1459
+ )
1460
+ if m:
1461
+ job_name = m.group("jobname")
1462
+ name_parts = job_name.split("_")
1463
+
1464
+ label = ""
1465
+ if m.group("dir"):
1466
+ dir_match = re.search(r"jobs/([^\s/]+)", m.group("dir"))
1467
+ if dir_match:
1468
+ label = dir_match.group(1)
1469
+ else:
1470
+ _LOG.debug("Parse DAG: unparsed dir = %s", line)
1471
+ elif m.group("subfile"):
1472
+ subfile_match = re.search(r"jobs/([^\s/]+)", m.group("subfile"))
1473
+ if subfile_match:
1474
+ label = m.group("subfile").split("/")[1]
1475
+ else:
1476
+ label = pegasus_name_to_label(job_name)
1477
+
1478
+ match m.group("command"):
1479
+ case "JOB":
1480
+ if m.group("noop"):
1481
+ job_type = WmsNodeType.NOOP
1482
+ # wms_noop_label
1483
+ label = name_parts[2]
1484
+ elif m.group("wms"):
1485
+ if name_parts[1] == "check":
1486
+ job_type = WmsNodeType.SUBDAG_CHECK
1487
+ # wms_check_status_wms_group_label
1488
+ label = name_parts[5]
1489
+ else:
1490
+ _LOG.warning(
1491
+ "Unexpected skipping of dag line due to unknown wms job: %s", line
1492
+ )
1493
+ else:
1494
+ job_type = WmsNodeType.PAYLOAD
1495
+ if label == "init":
1496
+ label = "pipetaskInit"
1497
+ counts[label] += 1
1498
+ case "FINAL":
1499
+ job_type = WmsNodeType.FINAL
1500
+ counts[label] += 1 # final counts a payload job.
1501
+ case "SERVICE":
1502
+ job_type = WmsNodeType.SERVICE
1503
+ case "SUBDAG EXTERNAL":
1504
+ job_type = WmsNodeType.SUBDAG
1505
+ label = name_parts[2]
1506
+
1507
+ job_name_to_label[job_name] = label
1508
+ job_name_to_type[job_name] = job_type
1509
+ elif not line.startswith(("VARS", "PARENT", "DOT", "NODE_STATUS_FILE", "SET_JOB_ATTR", "SCRIPT")):
1510
+ # Only print a warning if the line is not one we expect to skip;
1511
+ # it probably means a problem with the regex in the match pattern above.
1512
+ _LOG.warning("Unexpected skipping of dag line: %s", line)
1513
+
1514
+ return counts, job_name_to_label, job_name_to_type
1515
+
1516
+
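A minimal sketch of the kinds of DAG lines the new parser recognizes (JOB, FINAL, SERVICE, and SUBDAG EXTERNAL, with optional DIR and NOOP), fed through the function on a throwaway file. The DAG contents and the import path are illustrative:

    import pathlib
    import tempfile

    from lsst.ctrl.bps.htcondor.lssthtc import count_jobs_in_single_dag  # module path assumed

    dag_lines = [
        'JOB pipetaskInit "jobs/pipetaskInit/pipetaskInit.sub"',
        'JOB isr_visit_1 "jobs/isr/isr_visit_1.sub"',
        'SUBDAG EXTERNAL wms_group_visit1 group_visit1.dag DIR subdags/wms_group_visit1',
        'FINAL finalJob "jobs/finalJob/finalJob.sub"',
        'PARENT pipetaskInit CHILD isr_visit_1',
    ]
    with tempfile.TemporaryDirectory() as tmp:
        dag_file = pathlib.Path(tmp, "run.dag")
        dag_file.write_text("\n".join(dag_lines) + "\n")
        counts, name_to_label, name_to_type = count_jobs_in_single_dag(dag_file)
    print(dict(counts))  # {'pipetaskInit': 1, 'isr': 1, 'finalJob': 1}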
1517
+ def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, WmsNodeType]]:
1250
1518
  """Build bps_run_summary string from dag file.
1251
1519
 
1252
1520
  Parameters
@@ -1261,56 +1529,25 @@ def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, str]]:
1261
1529
  (Same format as saved in dag classad).
1262
1530
  job_name_to_label : `dict` [`str`, `str`]
1263
1531
  Mapping of job names to job labels.
1264
- job_name_to_type : `dict` [`str`, `str`]
1532
+ job_name_to_type : `dict` [`str`, `lsst.ctrl.bps.htcondor.WmsNodeType`]
1265
1533
  Mapping of job names to job types
1266
1534
  (e.g., payload, final, service).
1267
1535
  """
1268
1536
  # Later code depends upon insertion order
1269
- counts: defaultdict[str, int] = defaultdict(int) # counts of payload jobs per label
1270
- job_name_to_label = {}
1271
- job_name_to_type = {}
1272
- try:
1273
- dag = next(Path(dir_name).glob("*.dag"))
1274
- with open(dag) as fh:
1275
- for line in fh:
1276
- job_name = ""
1277
- if line.startswith("JOB"):
1278
- m = re.match(r'JOB (\S+) "?jobs/([^/]+)/', line)
1279
- if m:
1280
- job_name = m.group(1)
1281
- label = m.group(2)
1282
- if label == "init":
1283
- label = "pipetaskInit"
1284
- counts[label] += 1
1285
- else: # Check if Pegasus submission
1286
- m = re.match(r"JOB (\S+) (\S+)", line)
1287
- if m:
1288
- job_name = m.group(1)
1289
- label = pegasus_name_to_label(m.group(1))
1290
- counts[label] += 1
1291
- else:
1292
- _LOG.warning("Parse DAG: unmatched job line: %s", line)
1293
- job_type = "payload"
1294
- elif line.startswith("FINAL"):
1295
- m = re.match(r"FINAL (\S+) jobs/([^/]+)/", line)
1296
- if m:
1297
- job_name = m.group(1)
1298
- label = m.group(2)
1299
- counts[label] += 1 # final counts a payload job.
1300
- job_type = "final"
1301
- elif line.startswith("SERVICE"):
1302
- m = re.match(r"SERVICE (\S+) jobs/([^/]+)/", line)
1303
- if m:
1304
- job_name = m.group(1)
1305
- label = m.group(2)
1306
- job_type = "service"
1307
-
1308
- if job_name:
1309
- job_name_to_label[job_name] = label
1310
- job_name_to_type[job_name] = job_type
1311
-
1312
- except (OSError, PermissionError, StopIteration):
1313
- pass
1537
+ counts: Counter[str] = Counter() # counts of payload jobs per label
1538
+ job_name_to_label: dict[str, str] = {}
1539
+ job_name_to_type: dict[str, WmsNodeType] = {}
1540
+ for filename in Path(dir_name).glob("*.dag"):
1541
+ single_counts, single_job_name_to_label, single_job_name_to_type = count_jobs_in_single_dag(filename)
1542
+ counts += single_counts
1543
+ _update_dicts(job_name_to_label, single_job_name_to_label)
1544
+ _update_dicts(job_name_to_type, single_job_name_to_type)
1545
+
1546
+ for filename in Path(dir_name).glob("subdags/*/*.dag"):
1547
+ single_counts, single_job_name_to_label, single_job_name_to_type = count_jobs_in_single_dag(filename)
1548
+ counts += single_counts
1549
+ _update_dicts(job_name_to_label, single_job_name_to_label)
1550
+ _update_dicts(job_name_to_type, single_job_name_to_type)
1314
1551
 
1315
1552
  summary = ";".join([f"{name}:{counts[name]}" for name in counts])
1316
1553
  _LOG.debug("summarize_dag: %s %s %s", summary, job_name_to_label, job_name_to_type)
@@ -1343,69 +1580,100 @@ def pegasus_name_to_label(name):
1343
1580
  return label
1344
1581
 
1345
1582
 
1346
- def read_dag_status(wms_path):
1583
+ def read_single_dag_status(filename: str | os.PathLike) -> dict[str, Any]:
1347
1584
  """Read the node status file for DAG summary information.
1348
1585
 
1349
1586
  Parameters
1350
1587
  ----------
1351
- wms_path : `str`
1352
- Path that includes node status file for a run.
1588
+ filename : `str` or `pathlib.Path`
1589
+ Node status filename.
1353
1590
 
1354
1591
  Returns
1355
1592
  -------
1356
- dag_ad : `dict` [`str`, Any]
1593
+ dag_ad : `dict` [`str`, `~typing.Any`]
1357
1594
  DAG summary information.
1358
1595
  """
1359
- dag_ad = {}
1596
+ dag_ad: dict[str, Any] = {}
1360
1597
 
1361
1598
  # While this is probably more up to date than dag classad, only read from
1362
1599
  # file if need to.
1363
1600
  try:
1364
- try:
1365
- node_stat_file = next(Path(wms_path).glob("*.node_status"))
1366
- _LOG.debug("Reading Node Status File %s", node_stat_file)
1367
- with open(node_stat_file) as infh:
1368
- dag_ad = classad.parseNext(infh) # pylint: disable=E1101
1369
- except StopIteration:
1370
- pass
1601
+ node_stat_file = Path(filename)
1602
+ _LOG.debug("Reading Node Status File %s", node_stat_file)
1603
+ with open(node_stat_file) as infh:
1604
+ dag_ad = dict(classad.parseNext(infh)) # pylint: disable=E1101
1371
1605
 
1372
1606
  if not dag_ad:
1373
1607
  # Pegasus check here
1374
- try:
1375
- metrics_file = next(Path(wms_path).glob("*.dag.metrics"))
1608
+ metrics_file = node_stat_file.with_suffix(".dag.metrics")
1609
+ if metrics_file.exists():
1376
1610
  with open(metrics_file) as infh:
1377
1611
  metrics = json.load(infh)
1378
1612
  dag_ad["NodesTotal"] = metrics.get("jobs", 0)
1379
1613
  dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0)
1380
1614
  dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0)
1381
- dag_ad["pegasus_version"] = metrics.get("planner_version", "")
1382
- except StopIteration:
1383
- try:
1384
- metrics_file = next(Path(wms_path).glob("*.metrics"))
1385
- with open(metrics_file) as infh:
1386
- metrics = json.load(infh)
1387
- dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"]
1388
- dag_ad["pegasus_version"] = metrics.get("version", "")
1389
- except StopIteration:
1390
- pass
1615
+ metrics_file = node_stat_file.with_suffix(".metrics")
1616
+ with open(metrics_file) as infh:
1617
+ metrics = json.load(infh)
1618
+ dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"]
1391
1619
  except (OSError, PermissionError):
1392
1620
  pass
1393
1621
 
1394
1622
  _LOG.debug("read_dag_status: %s", dag_ad)
1395
- return dict(dag_ad)
1623
+ return dag_ad
1396
1624
 
1397
1625
 
1398
- def read_node_status(wms_path):
1399
- """Read entire node status file.
1626
+ def read_dag_status(wms_path: str | os.PathLike) -> dict[str, Any]:
1627
+ """Read the node status file for DAG summary information.
1400
1628
 
1401
1629
  Parameters
1402
1630
  ----------
1403
- wms_path : `str`
1631
+ wms_path : `str` or `os.PathLike`
1404
1632
  Path that includes node status file for a run.
1405
1633
 
1406
1634
  Returns
1407
1635
  -------
1408
- jobs : `dict` [`str`, Any]
1636
+ dag_ad : `dict` [`str`, `~typing.Any`]
1637
+ DAG summary information, counts summed across any subdags.
1638
+ """
1639
+ dag_ads: dict[str, Any] = {}
1640
+ path = Path(wms_path)
1641
+ try:
1642
+ node_stat_file = next(path.glob("*.node_status"))
1643
+ except StopIteration as exc:
1644
+ raise FileNotFoundError(f"DAGMan node status not found in {wms_path}") from exc
1645
+
1646
+ dag_ads = read_single_dag_status(node_stat_file)
1647
+
1648
+ for node_stat_file in path.glob("subdags/*/*.node_status"):
1649
+ dag_ad = read_single_dag_status(node_stat_file)
1650
+ dag_ads["JobProcsHeld"] += dag_ad.get("JobProcsHeld", 0)
1651
+ dag_ads["NodesPost"] += dag_ad.get("NodesPost", 0)
1652
+ dag_ads["JobProcsIdle"] += dag_ad.get("JobProcsIdle", 0)
1653
+ dag_ads["NodesTotal"] += dag_ad.get("NodesTotal", 0)
1654
+ dag_ads["NodesFailed"] += dag_ad.get("NodesFailed", 0)
1655
+ dag_ads["NodesDone"] += dag_ad.get("NodesDone", 0)
1656
+ dag_ads["NodesQueued"] += dag_ad.get("NodesQueued", 0)
1657
+ dag_ads["NodesPre"] += dag_ad.get("NodesReady", 0)
1658
+ dag_ads["NodesFutile"] += dag_ad.get("NodesFutile", 0)
1659
+ dag_ads["NodesUnready"] += dag_ad.get("NodesUnready", 0)
1660
+
1661
+ return dag_ads
1662
+
1663
+
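A minimal sketch of using the combined summary; the attribute names are the standard DAGMan node-status ClassAd fields seen above, and the path and import are illustrative:

    from lsst.ctrl.bps.htcondor.lssthtc import read_dag_status  # module path assumed

    dag_ad = read_dag_status("submit/u/me/run1")
    total = dag_ad.get("NodesTotal", 0)
    done = dag_ad.get("NodesDone", 0)
    failed = dag_ad.get("NodesFailed", 0)
    if total:
        print(f"{done}/{total} nodes done, {failed} failed ({100 * done / total:.1f}%)")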
1664
+ def read_single_node_status(filename: str | os.PathLike, init_fake_id: int) -> dict[str, Any]:
1665
+ """Read entire node status file.
1666
+
1667
+ Parameters
1668
+ ----------
1669
+ filename : `str` or `pathlib.Path`
1670
+ Node status filename.
1671
+ init_fake_id : `int`
1672
+ Initial fake id value.
1673
+
1674
+ Returns
1675
+ -------
1676
+ jobs : `dict` [`str`, `~typing.Any`]
1409
1677
  DAG summary information compiled from the node status file combined
1410
1678
  with the information found in the node event log.
1411
1679
 
@@ -1413,28 +1681,34 @@ def read_node_status(wms_path):
1413
1681
  from the event log takes precedence over the value from the node status
1414
1682
  file.
1415
1683
  """
1684
+ filename = Path(filename)
1685
+
1416
1686
  # Get jobid info from other places to fill in gaps in info from node_status
1417
- _, job_name_to_label, job_name_to_type = summarize_dag(wms_path)
1418
- wms_workflow_id, loginfo = read_dag_log(wms_path)
1419
- loginfo = read_dag_nodes_log(wms_path)
1687
+ _, job_name_to_label, job_name_to_type = count_jobs_in_single_dag(filename.with_suffix(".dag"))
1688
+ loginfo: dict[str, dict[str, Any]] = {}
1689
+ try:
1690
+ wms_workflow_id, loginfo = read_single_dag_log(filename.with_suffix(".dag.dagman.log"))
1691
+ loginfo = read_single_dag_nodes_log(filename.with_suffix(".dag.nodes.log"))
1692
+ except (OSError, PermissionError):
1693
+ pass
1694
+
1695
+ job_name_to_id: dict[str, str] = {}
1420
1696
  _LOG.debug("loginfo = %s", loginfo)
1421
- job_name_to_id = {}
1697
+ log_job_name_to_id: dict[str, str] = {}
1422
1698
  for job_id, job_info in loginfo.items():
1423
1699
  if "LogNotes" in job_info:
1424
1700
  m = re.match(r"DAG Node: (\S+)", job_info["LogNotes"])
1425
1701
  if m:
1426
1702
  job_name = m.group(1)
1427
- job_name_to_id[job_name] = job_id
1703
+ log_job_name_to_id[job_name] = job_id
1428
1704
  job_info["DAGNodeName"] = job_name
1429
- job_info["bps_job_type"] = job_name_to_type[job_name]
1705
+ job_info["wms_node_type"] = job_name_to_type[job_name]
1430
1706
  job_info["bps_job_label"] = job_name_to_label[job_name]
1431
1707
 
1432
- jobs = loginfo
1433
- fake_id = -1.0 # For nodes that do not yet have a job id, give fake one
1708
+ jobs = {}
1709
+ fake_id = init_fake_id # For nodes that do not yet have a job id, give fake one
1434
1710
  try:
1435
- node_status = next(Path(wms_path).glob("*.node_status"))
1436
-
1437
- with open(node_status) as fh:
1711
+ with open(filename) as fh:
1438
1712
  for ad in classad.parseAds(fh):
1439
1713
  match ad["Type"]:
1440
1714
  case "DagStatus":
@@ -1449,21 +1723,23 @@ def read_node_status(wms_path):
1449
1723
  else:
1450
1724
  job_label = job_name
1451
1725
 
1452
- # Make job info as if came from condor_q.
1453
- if job_name in job_name_to_id:
1454
- job_id = str(job_name_to_id[job_name])
1455
- job = jobs[job_id]
1726
+ job = dict(ad)
1727
+ if job_name in log_job_name_to_id:
1728
+ job_id = str(log_job_name_to_id[job_name])
1729
+ _update_dicts(job, loginfo[job_id])
1456
1730
  else:
1457
1731
  job_id = str(fake_id)
1458
- job_name_to_id[job_name] = job_id
1459
1732
  job = dict(ad)
1460
- jobs[job_id] = job
1461
1733
  fake_id -= 1
1734
+ jobs[job_id] = job
1735
+ job_name_to_id[job_name] = job_id
1736
+
1737
+ # Make job info as if came from condor_q.
1462
1738
  job["ClusterId"] = int(float(job_id))
1463
1739
  job["DAGManJobID"] = wms_workflow_id
1464
1740
  job["DAGNodeName"] = job_name
1465
1741
  job["bps_job_label"] = job_label
1466
- job["bps_job_type"] = job_name_to_type[job_name]
1742
+ job["wms_node_type"] = job_name_to_type[job_name]
1467
1743
 
1468
1744
  case "StatusEnd":
1469
1745
  # Skip node status file "epilog".
@@ -1472,41 +1748,104 @@ def read_node_status(wms_path):
1472
1748
  _LOG.debug(
1473
1749
  "Ignoring unknown classad type '%s' in the node status file '%s'",
1474
1750
  ad["Type"],
1475
- wms_path,
1751
+ filename,
1476
1752
  )
1477
- except (StopIteration, OSError, PermissionError):
1753
+ except (OSError, PermissionError):
1478
1754
  pass
1479
1755
 
1480
1756
  # Check for missing jobs (e.g., submission failure or not submitted yet)
1481
1757
  # Use dag info to create job placeholders
1482
1758
  for name in set(job_name_to_label) - set(job_name_to_id):
1483
- job = {}
1484
- job["ClusterId"] = int(float(fake_id))
1759
+ if name in log_job_name_to_id: # job was in nodes.log, but not node_status
1760
+ job_id = str(log_job_name_to_id[name])
1761
+ job = dict(loginfo[job_id])
1762
+ else:
1763
+ job_id = str(fake_id)
1764
+ fake_id -= 1
1765
+ job = {}
1766
+ job["NodeStatus"] = NodeStatus.NOT_READY
1767
+
1768
+ job["ClusterId"] = int(float(job_id))
1485
1769
  job["ProcId"] = 0
1486
1770
  job["DAGManJobID"] = wms_workflow_id
1487
1771
  job["DAGNodeName"] = name
1488
1772
  job["bps_job_label"] = job_name_to_label[name]
1489
- job["bps_job_type"] = job_name_to_type[name]
1490
- job["NodeStatus"] = NodeStatus.NOT_READY
1773
+ job["wms_node_type"] = job_name_to_type[name]
1491
1774
  jobs[f"{job['ClusterId']}.{job['ProcId']}"] = job
1492
- fake_id -= 1
1775
+
1776
+ for job_info in jobs.values():
1777
+ job_info["from_dag_job"] = f"wms_{filename.stem}"
1778
+
1779
+ return jobs
1780
+
1781
+
1782
+ def read_node_status(wms_path: str | os.PathLike) -> dict[str, dict[str, Any]]:
1783
+ """Read entire node status file.
1784
+
1785
+ Parameters
1786
+ ----------
1787
+ wms_path : `str` or `os.PathLike`
1788
+ Path that includes node status file for a run.
1789
+
1790
+ Returns
1791
+ -------
1792
+ jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1793
+ DAG summary information compiled from the node status file combined
1794
+ with the information found in the node event log.
1795
+
1796
+ Currently, if the same job attribute is found in both files, its value
1797
+ from the event log takes precedence over the value from the node status
1798
+ file.
1799
+ """
1800
+ jobs: dict[str, dict[str, Any]] = {}
1801
+ init_fake_id = -1
1802
+
1803
+ # subdags may not have run so wouldn't have node_status file
1804
+ # use dag files and let read_single_node_status handle missing
1805
+ # node_status file.
1806
+ for dag_filename in Path(wms_path).glob("*.dag"):
1807
+ filename = dag_filename.with_suffix(".node_status")
1808
+ info = read_single_node_status(filename, init_fake_id)
1809
+ init_fake_id -= len(info)
1810
+ _update_dicts(jobs, info)
1811
+
1812
+ for dag_filename in Path(wms_path).glob("subdags/*/*.dag"):
1813
+ filename = dag_filename.with_suffix(".node_status")
1814
+ info = read_single_node_status(filename, init_fake_id)
1815
+ init_fake_id -= len(info)
1816
+ _update_dicts(jobs, info)
1817
+
1818
+ # Propagate pruned status (NOT_READY/FUTILE) from subdag jobs to the jobs inside them
1819
+ name_to_id: dict[str, str] = {}
1820
+ missing_status: dict[str, list[str]] = {}
1821
+ for id_, job in jobs.items():
1822
+ if job["DAGNodeName"].startswith("wms_"):
1823
+ name_to_id[job["DAGNodeName"]] = id_
1824
+ if "NodeStatus" not in job or job["NodeStatus"] == NodeStatus.NOT_READY:
1825
+ missing_status.setdefault(job["from_dag_job"], []).append(id_)
1826
+
1827
+ for name, dag_id in name_to_id.items():
1828
+ dag_status = jobs[dag_id].get("NodeStatus", NodeStatus.NOT_READY)
1829
+ if dag_status in {NodeStatus.NOT_READY, NodeStatus.FUTILE}:
1830
+ for id_ in missing_status.get(name, []):
1831
+ jobs[id_]["NodeStatus"] = dag_status
1493
1832
 
1494
1833
  return jobs
1495
1834
 
1496
1835
 
1497
- def read_dag_log(wms_path: str) -> tuple[str, dict[str, Any]]:
1836
+ def read_single_dag_log(log_filename: str | os.PathLike) -> tuple[str, dict[str, dict[str, Any]]]:
1498
1837
  """Read job information from the DAGMan log file.
1499
1838
 
1500
1839
  Parameters
1501
1840
  ----------
1502
- wms_path : `str`
1503
- Path containing the DAGMan log file.
1841
+ log_filename : `str` or `os.PathLike`
1842
+ DAGMan log filename.
1504
1843
 
1505
1844
  Returns
1506
1845
  -------
1507
1846
  wms_workflow_id : `str`
1508
1847
  HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job.
1509
- dag_info : `dict` [`str`, `~collections.abc.Any`]
1848
+ dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1510
1849
  HTCondor job information read from the log file mapped to HTCondor
1511
1850
  job id.
1512
1851
 
@@ -1515,25 +1854,21 @@ def read_dag_log(wms_path: str) -> tuple[str, dict[str, Any]]:
1515
1854
  FileNotFoundError
1516
1855
  If cannot find DAGMan log in given wms_path.
1517
1856
  """
1518
- wms_workflow_id = 0
1519
- dag_info = {}
1857
+ wms_workflow_id = "0"
1858
+ dag_info: dict[str, dict[str, Any]] = {}
1520
1859
 
1521
- path = Path(wms_path)
1522
- if path.exists():
1523
- try:
1524
- filename = next(path.glob("*.dag.dagman.log"))
1525
- except StopIteration as exc:
1526
- raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc
1860
+ filename = Path(log_filename)
1861
+ if filename.exists():
1527
1862
  _LOG.debug("dag node log filename: %s", filename)
1528
1863
 
1529
- info = {}
1864
+ info: dict[str, Any] = {}
1530
1865
  job_event_log = htcondor.JobEventLog(str(filename))
1531
1866
  for event in job_event_log.events(stop_after=0):
1532
1867
  id_ = f"{event['Cluster']}.{event['Proc']}"
1533
1868
  if id_ not in info:
1534
1869
  info[id_] = {}
1535
1870
  wms_workflow_id = id_ # taking last job id in case of restarts
1536
- info[id_].update(event)
1871
+ _update_dicts(info[id_], event)
1537
1872
  info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1538
1873
 
1539
1874
  # only save latest DAG job
@@ -1544,17 +1879,53 @@ def read_dag_log(wms_path: str) -> tuple[str, dict[str, Any]]:
1544
1879
  return wms_workflow_id, dag_info
1545
1880
 
1546
1881
 
1547
- def read_dag_nodes_log(wms_path):
1882
+ def read_dag_log(wms_path: str | os.PathLike) -> tuple[str, dict[str, Any]]:
1883
+ """Read job information from the DAGMan log file.
1884
+
1885
+ Parameters
1886
+ ----------
1887
+ wms_path : `str` or `os.PathLike`
1888
+ Path containing the DAGMan log file.
1889
+
1890
+ Returns
1891
+ -------
1892
+ wms_workflow_id : `str`
1893
+ HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job.
1894
+ dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1895
+ HTCondor job information read from the log file mapped to HTCondor
1896
+ job id.
1897
+
1898
+ Raises
1899
+ ------
1900
+ FileNotFoundError
1901
+ If cannot find DAGMan log in given wms_path.
1902
+ """
1903
+ wms_workflow_id = MISSING_ID
1904
+ dag_info: dict[str, dict[str, Any]] = {}
1905
+
1906
+ path = Path(wms_path)
1907
+ if path.exists():
1908
+ try:
1909
+ filename = next(path.glob("*.dag.dagman.log"))
1910
+ except StopIteration as exc:
1911
+ raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc
1912
+ _LOG.debug("dag node log filename: %s", filename)
1913
+ wms_workflow_id, dag_info = read_single_dag_log(filename)
1914
+
1915
+ return wms_workflow_id, dag_info
1916
+
1917
+
1918
+ def read_single_dag_nodes_log(filename: str | os.PathLike) -> dict[str, dict[str, Any]]:
1548
1919
  """Read job information from the DAGMan nodes log file.
1549
1920
 
1550
1921
  Parameters
1551
1922
  ----------
1552
- wms_path : `str`
1923
+ filename : `str` or `os.PathLike`
1553
1924
  Path containing the DAGMan nodes log file.
1554
1925
 
1555
1926
  Returns
1556
1927
  -------
1557
- info : `dict` [`str`, Any]
1928
+ info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1558
1929
  HTCondor job information read from the log file mapped to HTCondor
1559
1930
  job id.
1560
1931
 
@@ -1563,20 +1934,46 @@ def read_dag_nodes_log(wms_path):
1563
1934
  FileNotFoundError
1564
1935
  If cannot find DAGMan node log in given wms_path.
1565
1936
  """
1566
- try:
1567
- filename = next(Path(wms_path).glob("*.dag.nodes.log"))
1568
- except StopIteration as exc:
1569
- raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc
1570
1937
  _LOG.debug("dag node log filename: %s", filename)
1938
+ filename = Path(filename)
1939
+
1940
+ info: dict[str, dict[str, Any]] = {}
1941
+ if not filename.exists():
1942
+ raise FileNotFoundError(f"{filename} does not exist")
1943
+
1944
+ try:
1945
+ job_event_log = htcondor.JobEventLog(str(filename))
1946
+ except htcondor.HTCondorIOError as ex:
1947
+ _LOG.error("Problem reading nodes log file (%s): %s", filename, ex)
1948
+ import traceback
1571
1949
 
1572
- info = {}
1573
- job_event_log = htcondor.JobEventLog(str(filename))
1950
+ traceback.print_stack()
1951
+ raise
1574
1952
  for event in job_event_log.events(stop_after=0):
1575
- id_ = f"{event['Cluster']}.{event['Proc']}"
1576
- if id_ not in info:
1577
- info[id_] = {}
1578
- info[id_].update(event)
1579
- info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1953
+ _LOG.debug("log event type = %s, keys = %s", event["EventTypeNumber"], event.keys())
1954
+
1955
+ try:
1956
+ id_ = f"{event['Cluster']}.{event['Proc']}"
1957
+ except KeyError:
1958
+ _LOG.warn(
1959
+ "Log event missing ids (DAGNodeName=%s, EventTime=%s, EventTypeNumber=%s)",
1960
+ event.get("DAGNodeName", "UNK"),
1961
+ event.get("EventTime", "UNK"),
1962
+ event.get("EventTypeNumber", "UNK"),
1963
+ )
1964
+ else:
1965
+ if id_ not in info:
1966
+ info[id_] = {}
1967
+ # Workaround: Please check to see if still problem in
1968
+ # future HTCondor versions. Sometimes get a
1969
+ # JobAbortedEvent for a subdag job after it already
1970
+ # terminated normally. Seems to happen when using job
1971
+ # plus subdags.
1972
+ if event["EventTypeNumber"] == 9 and info[id_].get("EventTypeNumber", -1) == 5:
1973
+ _LOG.debug("Skipping spurious JobAbortedEvent: %s", dict(event))
1974
+ else:
1975
+ _update_dicts(info[id_], event)
1976
+ info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1580
1977
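The guard above skips a JobAbortedEvent (user-log event code 9) that can follow a normal JobTerminatedEvent (code 5) for sub-DAG jobs. A minimal sketch of the same check in isolation, with a hypothetical helper name:

    def is_spurious_abort(previous_event_type: int, new_event_type: int) -> bool:
        # 5 = JobTerminatedEvent, 9 = JobAbortedEvent (HTCondor user-log codes).
        return new_event_type == 9 and previous_event_type == 5

    print(is_spurious_abort(5, 9))  # True  -> ignore the late abort
    print(is_spurious_abort(1, 9))  # False -> a real abort after an execute event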
 
1581
1978
  # Add more condor_q-like info to info parsed from log file.
1582
1979
  for job in info.values():
@@ -1585,17 +1982,54 @@ def read_dag_nodes_log(wms_path):
1585
1982
  return info
1586
1983
 
1587
1984
 
1588
- def read_dag_info(wms_path):
1985
+ def read_dag_nodes_log(wms_path: str | os.PathLike) -> dict[str, dict[str, Any]]:
1986
+ """Read job information from the DAGMan nodes log file.
1987
+
1988
+ Parameters
1989
+ ----------
1990
+ wms_path : `str` or `os.PathLike`
1991
+ Path containing the DAGMan nodes log file.
1992
+
1993
+ Returns
1994
+ -------
1995
+ info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1996
+ HTCondor job information read from the log file mapped to HTCondor
1997
+ job id.
1998
+
1999
+ Raises
2000
+ ------
2001
+ FileNotFoundError
2002
+ If cannot find DAGMan node log in given wms_path.
2003
+ """
2004
+ info: dict[str, dict[str, Any]] = {}
2005
+ for filename in Path(wms_path).glob("*.dag.nodes.log"):
2006
+ _LOG.debug("dag node log filename: %s", filename)
2007
+ _update_dicts(info, read_single_dag_nodes_log(filename))
2008
+
2009
+ # If submitted, the main nodes log file should exist
2010
+ if not info:
2011
+ raise FileNotFoundError(f"DAGMan node log not found in {wms_path}")
2012
+
2013
+ # Subdags will not have dag nodes log files if they haven't
2014
+ # started running yet (so missing is not an error).
2015
+ for filename in Path(wms_path).glob("subdags/*/*.dag.nodes.log"):
2016
+ _LOG.debug("dag node log filename: %s", filename)
2017
+ _update_dicts(info, read_single_dag_nodes_log(filename))
2018
+
2019
+ return info
2020
+
2021
+
2022
+ def read_dag_info(wms_path: str | os.PathLike) -> dict[str, dict[str, Any]]:
1589
2023
  """Read custom DAGMan job information from the file.
1590
2024
 
1591
2025
  Parameters
1592
2026
  ----------
1593
- wms_path : `str`
2027
+ wms_path : `str` or `os.PathLike`
1594
2028
  Path containing the file with the DAGMan job info.
1595
2029
 
1596
2030
  Returns
1597
2031
  -------
1598
- dag_info : `dict` [`str`, `dict` [`str`, Any]]
2032
+ dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1599
2033
  HTCondor job information.
1600
2034
 
1601
2035
  Raises
@@ -1603,6 +2037,7 @@ def read_dag_info(wms_path):
1603
2037
  FileNotFoundError
1604
2038
  If cannot find DAGMan job info file in the given location.
1605
2039
  """
2040
+ dag_info: dict[str, dict[str, Any]] = {}
1606
2041
  try:
1607
2042
  filename = next(Path(wms_path).glob("*.info.json"))
1608
2043
  except StopIteration as exc:
@@ -1613,7 +2048,6 @@ def read_dag_info(wms_path):
1613
2048
  dag_info = json.load(fh)
1614
2049
  except (OSError, PermissionError) as exc:
1615
2050
  _LOG.debug("Retrieving DAGMan job information failed: %s", exc)
1616
- dag_info = {}
1617
2051
  return dag_info
1618
2052
 
1619
2053
 
@@ -1624,7 +2058,7 @@ def write_dag_info(filename, dag_info):
1624
2058
  ----------
1625
2059
  filename : `str`
1626
2060
  Name of the file where the information will be stored.
1627
- dag_info : `dict` [`str` `dict` [`str`, Any]]
2061
+ dag_info : `dict` [`str` `dict` [`str`, `~typing.Any`]]
1628
2062
  Information about the DAGMan job.
1629
2063
  """
1630
2064
  schedd_name = next(iter(dag_info))
@@ -1647,7 +2081,7 @@ def _tweak_log_info(filename, job):
1647
2081
  ----------
1648
2082
  filename : `pathlib.Path`
1649
2083
  Name of the DAGMan log.
1650
- job : `dict` [ `str`, Any ]
2084
+ job : `dict` [ `str`, `~typing.Any` ]
1651
2085
  A mapping between HTCondor job id and job information read from
1652
2086
  the log.
1653
2087
  """
@@ -1661,37 +2095,47 @@ def _tweak_log_info(filename, job):
1661
2095
 
1662
2096
  match job["MyType"]:
1663
2097
  case "ExecuteEvent":
1664
- job["JobStatus"] = JobStatus.RUNNING
2098
+ job["JobStatus"] = htcondor.JobStatus.RUNNING
1665
2099
  case "JobTerminatedEvent" | "PostScriptTerminatedEvent":
1666
- job["JobStatus"] = JobStatus.COMPLETED
2100
+ job["JobStatus"] = htcondor.JobStatus.COMPLETED
1667
2101
  case "SubmitEvent":
1668
- job["JobStatus"] = JobStatus.IDLE
2102
+ job["JobStatus"] = htcondor.JobStatus.IDLE
1669
2103
  case "JobAbortedEvent":
1670
- job["JobStatus"] = JobStatus.REMOVED
2104
+ job["JobStatus"] = htcondor.JobStatus.REMOVED
1671
2105
  case "JobHeldEvent":
1672
- job["JobStatus"] = JobStatus.HELD
2106
+ job["JobStatus"] = htcondor.JobStatus.HELD
2107
+ case "JobReleaseEvent":
2108
+ # Shows up as the last event if a DAG job was held and released,
2109
+ # so assume the job is running. If a regular job is released, there
2110
+ # will be other events, so JobReleaseEvent won't be the last.
2111
+ job["JobStatus"] = htcondor.JobStatus.RUNNING
1673
2112
  case _:
1674
2113
  _LOG.debug("Unknown log event type: %s", job["MyType"])
1675
- job["JobStatus"] = JobStatus.UNEXPANDED
2114
+ job["JobStatus"] = None
1676
2115
 
1677
- if job["JobStatus"] in {JobStatus.COMPLETED, JobStatus.HELD}:
2116
+ if job["JobStatus"] in {htcondor.JobStatus.COMPLETED, htcondor.JobStatus.HELD}:
1678
2117
  new_job = HTC_JOB_AD_HANDLERS.handle(job)
1679
2118
  if new_job is not None:
1680
2119
  job = new_job
1681
2120
  else:
1682
2121
  _LOG.error("Could not determine exit status for job '%s.%s'", job["ClusterId"], job["ProcId"])
1683
2122
 
2123
+ if "LogNotes" in job:
2124
+ m = re.match(r"DAG Node: (\S+)", job["LogNotes"])
2125
+ if m:
2126
+ job["DAGNodeName"] = m.group(1)
2127
+
1684
2128
  except KeyError as e:
1685
2129
  _LOG.error("Missing key %s in job: %s", str(e), job)
1686
2130
  raise
1687
2131
 
1688
2132
 
1689
- def htc_check_dagman_output(wms_path):
2133
+ def htc_check_dagman_output(wms_path: str | os.PathLike) -> str:
1690
2134
  """Check the DAGMan output for error messages.
1691
2135
 
1692
2136
  Parameters
1693
2137
  ----------
1694
- wms_path : `str`
2138
+ wms_path : `str` or `os.PathLike`
1695
2139
  Directory containing the DAGman output file.
1696
2140
 
1697
2141
  Returns
@@ -1711,32 +2155,176 @@ def htc_check_dagman_output(wms_path):
1711
2155
  raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc
1712
2156
  _LOG.debug("dag output filename: %s", filename)
1713
2157
 
2158
+ p = re.compile(r"^(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) (Job submit try \d+/\d+ failed|Warning:.*$|ERROR:.*$)")
2159
+
1714
2160
  message = ""
1715
2161
  try:
1716
2162
  with open(filename) as fh:
1717
- last_submit_failed = ""
2163
+ last_submit_failed = "" # Since submit retries multiple times, only report the last one.
1718
2164
  for line in fh:
1719
- m = re.match(r"(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) Job submit try \d+/\d+ failed", line)
2165
+ m = p.match(line)
1720
2166
  if m:
1721
- last_submit_failed = m.group(1)
1722
- else:
1723
- m = re.search(r"Warning: (.+)", line)
1724
- if m:
1725
- if ".dag.nodes.log is in /tmp" in m.group(1):
2167
+ if m.group(2).startswith("Job submit try"):
2168
+ last_submit_failed = m.group(1)
2169
+ elif m.group(2).startswith("ERROR: submit attempt failed"):
2170
+ pass # Should be handled by Job submit try
2171
+ elif m.group(2).startswith("Warning"):
2172
+ if ".dag.nodes.log is in /tmp" in m.group(2):
1726
2173
  last_warning = "Cannot submit from /tmp."
1727
2174
  else:
1728
- last_warning = m.group(1)
2175
+ last_warning = m.group(2)
2176
+ elif m.group(2) == "ERROR: Warning is fatal error because of DAGMAN_USE_STRICT setting":
2177
+ message += "ERROR: "
2178
+ message += last_warning
2179
+ message += "\n"
2180
+ elif m.group(2) in [
2181
+ "ERROR: the following job(s) failed:",
2182
+ "ERROR: the following Node(s) failed:",
2183
+ ]:
2184
+ pass
1729
2185
  else:
1730
- m = re.search(r"(ERROR: .+)", line)
1731
- if m:
1732
- if (
1733
- m.group(1)
1734
- == "ERROR: Warning is fatal error because of DAGMAN_USE_STRICT setting"
1735
- ):
1736
- message += f"ERROR: {last_warning}"
2186
+ message += m.group(2)
2187
+ message += "\n"
2188
+
1737
2189
  if last_submit_failed:
1738
2190
  message += f"Warn: Job submission issues (last: {last_submit_failed})"
1739
2191
  except (OSError, PermissionError):
1740
2192
  message = f"Warn: Could not read dagman output file from {wms_path}."
1741
2193
  _LOG.debug("dag output file message: %s", message)
1742
2194
  return message
2195
+
2196
+
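The single compiled pattern above replaces the three separate matches used previously; each hit is then dispatched on its second capture group. A quick sketch of how it classifies typical-looking `.dagman.out` lines (the sample lines are invented for illustration):

import re

p = re.compile(r"^(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) (Job submit try \d+/\d+ failed|Warning:.*$|ERROR:.*$)")

samples = [
    "04/01/25 12:00:00 Job submit try 1/6 failed, will try again in >= 1 second.",
    "04/01/25 12:00:01 Warning: something looks suspicious",
    "04/01/25 12:00:02 ERROR: the following job(s) failed:",
    "04/01/25 12:00:03 an ordinary status line that should not match",
]
for line in samples:
    m = p.match(line)
    print(bool(m), m.group(2) if m else None)
# Matches the first three lines; group(2) carries the text the elif chain above inspects.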
2197
+ def _read_rescue_headers(infh: TextIO) -> tuple[list[str], list[str]]:
2198
+ """Read header lines from a rescue file.
2199
+
2200
+ Parameters
2201
+ ----------
2202
+ infh : `TextIO`
2203
+ The rescue file from which to read the header lines.
2204
+
2205
+ Returns
2206
+ -------
2207
+ header_lines : `list` [`str`]
2208
+ Header lines read from the rescue file.
2209
+ failed_subdags : `list` [`str`]
2210
+ Names of failed subdag jobs.
2211
+ """
2212
+ header_lines: list[str] = []
2213
+ failed = False
2214
+ failed_subdags: list[str] = []
2215
+
2216
+ for line in infh:
2217
+ line = line.strip()
2218
+ if line.startswith("#"):
2219
+ if line.startswith("# Nodes that failed:"):
2220
+ failed = True
2221
+ header_lines.append(line)
2222
+ elif failed:
2223
+ orig_failed_nodes = line[1:].strip().split(",")
2224
+ new_failed_nodes = []
2225
+ for node in orig_failed_nodes:
2226
+ if node.startswith("wms_check_status"):
2227
+ group_node = node[17:]
2228
+ failed_subdags.append(group_node)
2229
+ new_failed_nodes.append(group_node)
2230
+ else:
2231
+ new_failed_nodes.append(node)
2232
+ header_lines.append(f"# {','.join(new_failed_nodes)}")
2233
+ if orig_failed_nodes[-1] == "<ENDLIST>":
2234
+ failed = False
2235
+ else:
2236
+ header_lines.append(line)
2237
+ elif line.strip() == "": # end of headers
2238
+ break
2239
+ return header_lines, failed_subdags
2240
+
2241
+
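The parser above rewrites failed `wms_check_status_<name>` check nodes back to their group names and records those names as failed subdags. A sketch with a made-up rescue-file header (the layout mimics a DAGMan rescue file, but the counts and node names are invented):

import io

rescue_text = """\
# Total number of Nodes: 10
# Nodes premarked DONE: 7
# Nodes that failed: 2
#   wms_check_status_group1,someJob,<ENDLIST>

DONE jobA
DONE jobB
"""
header_lines, failed_subdags = _read_rescue_headers(io.StringIO(rescue_text))
# failed_subdags -> ["group1"]; the failed-nodes line becomes "# group1,someJob,<ENDLIST>"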
2242
+ def _write_rescue_headers(header_lines: list[str], failed_subdags: list[str], outfh: TextIO) -> None:
2243
+ """Write the header lines to the new rescue file.
2244
+
2245
+ Parameters
2246
+ ----------
2247
+ header_lines : `list` [`str`]
2248
+ Header lines to write to the new rescue file.
2249
+ failed_subdags : `list` [`str`]
2250
+ Job names of the failed subdags.
2251
+ outfh : `TextIO`
2252
+ New rescue file.
2253
+ """
2254
+ done_str = "# Nodes premarked DONE"
2255
+ pattern = f"^{done_str}:\\s+(\\d+)"
2256
+ for header_line in header_lines:
2257
+ m = re.match(pattern, header_line)
2258
+ if m:
2259
+ print(f"{done_str}: {int(m.group(1)) - len(failed_subdags)}", file=outfh)
2260
+ else:
2261
+ print(header_line, file=outfh)
2262
+
2263
+ print("", file=outfh)
2264
+
2265
+
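When the headers are written back, only the `# Nodes premarked DONE: N` count changes, since the failed subdags' check nodes are no longer counted as done. A self-contained sketch of that adjustment (header values invented):

import io

header_lines = [
    "# Nodes premarked DONE: 7",
    "# Nodes that failed: 2",
    "# group1,someJob,<ENDLIST>",
]
failed_subdags = ["group1"]
outfh = io.StringIO()
_write_rescue_headers(header_lines, failed_subdags, outfh)  # function defined above
# outfh now holds the headers with "# Nodes premarked DONE: 6" plus a trailing blank line.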
2266
+ def _copy_done_lines(failed_subdags: list[str], infh: TextIO, outfh: TextIO) -> None:
2267
+ """Copy the DONE lines from the original rescue file skipping
2268
+ the failed group jobs.
2269
+
2270
+ Parameters
2271
+ ----------
2272
+ failed_subdags : `list` [`str`]
2273
+ List of job names for the failed subdags.
2274
+ infh : `TextIO`
2275
+ Original rescue file to copy from.
2276
+ outfh : `TextIO`
2277
+ New rescue file to copy to.
2278
+ """
2279
+ for line in infh:
2280
+ line = line.strip()
2281
+ try:
2282
+ _, node_name = line.split()
2283
+ except ValueError:
2284
+ _LOG.error("Unexpected line in rescue file = '%s'", line)
2285
+ raise
2286
+ if node_name not in failed_subdags:
2287
+ print(line, file=outfh)
2288
+
2289
+
2290
+ def _update_rescue_file(rescue_file: Path) -> None:
2291
+ """Update the subdag failures in the main rescue file
2292
+ and backup the failed subdag dirs.
2293
+
2294
+ Parameters
2295
+ ----------
2296
+ rescue_file : `pathlib.Path`
2297
+ The main rescue file that needs to be updated.
2298
+ """
2299
+ # To reduce memory requirements, avoid reading the entire file into memory.
2300
+ rescue_tmp = rescue_file.with_suffix(rescue_file.suffix + ".tmp")
2301
+ with open(rescue_file) as infh:
2302
+ header_lines, failed_subdags = _read_rescue_headers(infh)
2303
+ with open(rescue_tmp, "w") as outfh:
2304
+ _write_rescue_headers(header_lines, failed_subdags, outfh)
2305
+ _copy_done_lines(failed_subdags, infh, outfh)
2306
+ rescue_file.unlink()
2307
+ rescue_tmp.rename(rescue_file)
2308
+ for failed_subdag in failed_subdags:
2309
+ htc_backup_files(
2310
+ rescue_file.parent / "subdags" / failed_subdag, subdir=f"backups/subdags/{failed_subdag}"
2311
+ )
2312
+
2313
+
2314
+ def _update_dicts(dict1, dict2):
2315
+ """Update dict1 with info in dict2.
2316
+
2317
+ (Basically an update for nested dictionaries.)
2318
+
2319
+ Parameters
2320
+ ----------
2321
+ dict1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
2322
+ HTCondor job information to be updated.
2323
+ dict2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
2324
+ Additional HTCondor job information.
2325
+ """
2326
+ for key, value in dict2.items():
2327
+ if key in dict1 and isinstance(dict1[key], dict) and isinstance(value, dict):
2328
+ _update_dicts(dict1[key], value)
2329
+ else:
2330
+ dict1[key] = value
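`_update_dicts` recurses so that per-job sub-dictionaries coming from different nodes logs are merged rather than replaced wholesale. A tiny illustration with made-up job ads:

info = {"1234.0": {"ClusterId": 1234, "JobStatus": 1}}               # invented job ads
newer = {"1234.0": {"JobStatus": 2}, "1235.0": {"ClusterId": 1235}}
_update_dicts(info, newer)
# info == {"1234.0": {"ClusterId": 1234, "JobStatus": 2}, "1235.0": {"ClusterId": 1235}}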