lsst-ctrl-bps-htcondor 29.0.1rc1__py3-none-any.whl → 29.1.0rc2__py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -38,9 +38,9 @@ __all__ = [
38
38
  "DagStatus",
39
39
  "HTCDag",
40
40
  "HTCJob",
41
- "JobStatus",
42
41
  "NodeStatus",
43
42
  "RestrictedDict",
43
+ "WmsNodeType",
44
44
  "condor_history",
45
45
  "condor_q",
46
46
  "condor_search",
@@ -65,7 +65,6 @@ __all__ = [
65
65
  "read_node_status",
66
66
  "summarize_dag",
67
67
  "update_job_info",
68
- "update_job_info",
69
68
  "write_dag_info",
70
69
  ]
71
70
 
@@ -77,23 +76,24 @@ import os
77
76
  import pprint
78
77
  import re
79
78
  import subprocess
80
- from collections import defaultdict
79
+ from collections import Counter, defaultdict
81
80
  from collections.abc import MutableMapping
82
81
  from datetime import datetime, timedelta
83
- from enum import IntEnum
82
+ from enum import IntEnum, auto
84
83
  from pathlib import Path
85
- from typing import Any
84
+ from typing import Any, TextIO
86
85
 
87
86
  import classad
88
87
  import htcondor
89
88
  import networkx
89
+ from deprecated.sphinx import deprecated
90
90
  from packaging import version
91
91
 
92
92
  from .handlers import HTC_JOB_AD_HANDLERS
93
93
 
94
94
  _LOG = logging.getLogger(__name__)
95
95
 
96
- MISSING_ID = -99999
96
+ MISSING_ID = "-99999"
97
97
 
98
98
 
99
99
  class DagStatus(IntEnum):
@@ -108,6 +108,13 @@ class DagStatus(IntEnum):
108
108
  SUSPENDED = 6 # the DAG has been suspended (see section 2.10.8)
109
109
 
110
110
 
111
+ @deprecated(
112
+ reason="The JobStatus is internally replaced by htcondor.JobStatus. "
113
+ "External reporting code should be using ctrl_bps.WmsStates. "
114
+ "This class will be removed after v30.",
115
+ version="v30.0",
116
+ category=FutureWarning,
117
+ )
111
118
  class JobStatus(IntEnum):
112
119
  """HTCondor's statuses for jobs."""
113
120
 
@@ -155,6 +162,31 @@ class NodeStatus(IntEnum):
155
162
  FUTILE = 7
156
163
 
157
164
 
165
+ class WmsNodeType(IntEnum):
166
+ """HTCondor plugin node types to help with payload reporting."""
167
+
168
+ UNKNOWN = auto()
169
+ """Dummy value when missing."""
170
+
171
+ PAYLOAD = auto()
172
+ """Payload job."""
173
+
174
+ FINAL = auto()
175
+ """Final job."""
176
+
177
+ SERVICE = auto()
178
+ """Service job."""
179
+
180
+ NOOP = auto()
181
+ """NOOP job used for ordering jobs."""
182
+
183
+ SUBDAG = auto()
184
+ """SUBDAG job used for ordering jobs."""
185
+
186
+ SUBDAG_CHECK = auto()
187
+ """Job used to correctly prune jobs after a subdag."""
188
+
189
+
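
The WmsNodeType values above let the plugin separate payload jobs from bookkeeping nodes (NOOP, SUBDAG, service, and final jobs) when reporting. A minimal sketch of how downstream reporting code might tally nodes by type; the import path and the shape of the jobs mapping (dictionaries carrying a "wms_node_type" key, as set by read_node_status later in this file) are assumptions:

    from collections import Counter

    from lsst.ctrl.bps.htcondor.lssthtc import WmsNodeType  # import path assumed

    def count_by_node_type(jobs: dict[str, dict]) -> Counter:
        """Tally jobs per WmsNodeType, treating a missing value as UNKNOWN."""
        return Counter(
            job.get("wms_node_type", WmsNodeType.UNKNOWN) for job in jobs.values()
        )

    # e.g. Counter({WmsNodeType.PAYLOAD: 24, WmsNodeType.FINAL: 1, WmsNodeType.SERVICE: 1})
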
158
190
  HTC_QUOTE_KEYS = {"environment"}
159
191
  HTC_VALID_JOB_KEYS = {
160
192
  "universe",
@@ -189,7 +221,18 @@ HTC_VALID_JOB_KEYS = {
189
221
  "accounting_group",
190
222
  "accounting_group_user",
191
223
  }
192
- HTC_VALID_JOB_DAG_KEYS = {"vars", "pre", "post", "retry", "retry_unless_exit", "abort_dag_on", "abort_exit"}
224
+ HTC_VALID_JOB_DAG_KEYS = {
225
+ "dir",
226
+ "noop",
227
+ "done",
228
+ "vars",
229
+ "pre",
230
+ "post",
231
+ "retry",
232
+ "retry_unless_exit",
233
+ "abort_dag_on",
234
+ "abort_exit",
235
+ }
193
236
  HTC_VERSION = version.parse(htcondor.__version__)
194
237
 
195
238
 
@@ -224,7 +267,7 @@ class RestrictedDict(MutableMapping):
224
267
 
225
268
  Returns
226
269
  -------
227
- value : `~collections.abc.Any`
270
+ value : `~typing.Any`
228
271
  Value associated with given key.
229
272
 
230
273
  Raises
@@ -256,7 +299,7 @@ class RestrictedDict(MutableMapping):
256
299
  ----------
257
300
  key : `str`
258
301
  Identifier to associate with given value.
259
- value : `~collections.abc.Any`
302
+ value : `~typing.Any`
260
303
  Value to store.
261
304
 
262
305
  Raises
@@ -278,7 +321,9 @@ class RestrictedDict(MutableMapping):
278
321
  return str(self.data)
279
322
 
280
323
 
281
- def htc_backup_files(wms_path, subdir=None, limit=100):
324
+ def htc_backup_files(
325
+ wms_path: str | os.PathLike, subdir: str | os.PathLike | None = None, limit: int = 100
326
+ ) -> Path | None:
282
327
  """Backup select HTCondor files in the submit directory.
283
328
 
284
329
  Files will be saved in separate subdirectories which will be created in
@@ -293,9 +338,9 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
293
338
 
294
339
  Parameters
295
340
  ----------
296
- wms_path : `str` or `pathlib.Path`
341
+ wms_path : `str` or `os.PathLike`
297
342
  Path to the submit directory either absolute or relative.
298
- subdir : `str` or `pathlib.Path`, optional
343
+ subdir : `str` or `os.PathLike`, optional
299
344
  A path, relative to the submit directory, where all subdirectories with
300
345
  backup files will be kept. Defaults to None which means that the backup
301
346
  subdirectories will be placed directly in the submit directory.
@@ -305,6 +350,11 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
305
350
  to match the default value of HTCondor's DAGMAN_MAX_RESCUE_NUM in
306
351
  version 8.8+.
307
352
 
353
+ Returns
354
+ -------
355
+ last_rescue_file : `pathlib.Path` or None
356
+ Path to the latest rescue file or None if one doesn't exist.
357
+
308
358
  Raises
309
359
  ------
310
360
  FileNotFoundError
@@ -327,17 +377,18 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
327
377
  raise FileNotFoundError(f"Directory {path} not found")
328
378
 
329
379
  # Initialize the backup counter.
330
- rescue_dags = list(Path(wms_path).glob("*.rescue*"))
380
+ rescue_dags = list(path.glob("*.rescue[0-9][0-9][0-9]"))
331
381
  counter = min(len(rescue_dags), limit)
332
382
 
333
383
  # Create the backup directory and move select files there.
334
- dest = Path(wms_path)
384
+ dest = path
335
385
  if subdir:
336
386
  # PurePath.is_relative_to() is not available before Python 3.9. Hence
337
387
  # we need to check is 'subdir' is in the submit directory in some other
338
388
  # way if it is an absolute path.
339
389
  subdir = Path(subdir)
340
390
  if subdir.is_absolute():
391
+ subdir = subdir.resolve() # Since resolve was run on path, must run it here
341
392
  if dest not in subdir.parents:
342
393
  _LOG.warning(
343
394
  "Invalid backup location: '%s' not in the submit directory, will use '%s' instead.",
@@ -349,21 +400,66 @@ def htc_backup_files(wms_path, subdir=None, limit=100):
349
400
  else:
350
401
  dest /= subdir
351
402
  dest /= f"{counter:0{width}}"
403
+ _LOG.debug("dest = %s", dest)
352
404
  try:
353
405
  dest.mkdir(parents=True, exist_ok=False if counter < limit else True)
354
406
  except FileExistsError:
355
407
  _LOG.warning("Refusing to do backups: target directory '%s' already exists", dest)
356
408
  else:
357
- for patt in ["*.info.*", "*.dag.metrics", "*.dag.nodes.log", "*.node_status"]:
358
- for source in path.glob(patt):
359
- if source.is_file():
360
- target = dest / source.relative_to(path)
361
- try:
362
- source.rename(target)
363
- except OSError as exc:
364
- raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None
365
- else:
366
- raise FileNotFoundError(f"Backing up '{source}' failed: not a file")
409
+ htc_backup_files_single_path(path, dest)
410
+
411
+ # also back up any subdag info
412
+ for subdag_dir in path.glob("subdags/*"):
413
+ subdag_dest = dest / subdag_dir.relative_to(path)
414
+ subdag_dest.mkdir(parents=True, exist_ok=False)
415
+ htc_backup_files_single_path(subdag_dir, subdag_dest)
416
+
417
+ last_rescue_file = rescue_dags[-1] if rescue_dags else None
418
+ _LOG.debug("last_rescue_file = %s", last_rescue_file)
419
+ return last_rescue_file
420
+
421
+
422
+ def htc_backup_files_single_path(src: str | os.PathLike, dest: str | os.PathLike) -> None:
423
+ """Move particular htc files to a different directory for later debugging.
424
+
425
+ Parameters
426
+ ----------
427
+ src : `str` or `os.PathLike`
428
+ Directory from which to backup particular files.
429
+ dest : `str` or `os.PathLike`
430
+ Directory to which particular files are moved.
431
+
432
+ Raises
433
+ ------
434
+ RuntimeError
435
+ If given dest directory matches given src directory.
436
+ OSError
437
+ If problems moving file.
438
+ FileNotFoundError
439
+ Item matching pattern in src directory isn't a file.
440
+ """
441
+ src = Path(src)
442
+ dest = Path(dest)
443
+ if dest.samefile(src):
444
+ raise RuntimeError(f"Destination directory is same as the source directory ({src})")
445
+
446
+ for patt in [
447
+ "*.info.*",
448
+ "*.dag.metrics",
449
+ "*.dag.nodes.log",
450
+ "*.node_status",
451
+ "wms_*.dag.post.out",
452
+ "wms_*.status.txt",
453
+ ]:
454
+ for source in src.glob(patt):
455
+ if source.is_file():
456
+ target = dest / source.relative_to(src)
457
+ try:
458
+ source.rename(target)
459
+ except OSError as exc:
460
+ raise type(exc)(f"Backing up '{source}' failed: {exc.strerror}") from None
461
+ else:
462
+ raise FileNotFoundError(f"Backing up '{source}' failed: not a file")
367
463
 
368
464
 
369
465
  def htc_escape(value):
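
htc_backup_files now returns the newest rescue DAG it found (or None) and also sweeps any subdags/* directories through the new htc_backup_files_single_path helper. A sketch of a caller using the return value; the submit directory path is made up:

    last_rescue = htc_backup_files("submit/u/user/pipelines_check/20240101T000000Z", subdir="backups")
    if last_rescue is None:
        print("No rescue DAG present; nothing has been restarted yet.")
    else:
        print(f"Newest rescue DAG: {last_rescue.name}")
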
@@ -371,12 +467,12 @@ def htc_escape(value):
371
467
 
372
468
  Parameters
373
469
  ----------
374
- value : `~collections.abc.Any`
470
+ value : `~typing.Any`
375
471
  Value that needs to have characters escaped if string.
376
472
 
377
473
  Returns
378
474
  -------
379
- new_value : `~collections.abc.Any`
475
+ new_value : `~typing.Any`
380
476
  Given value with characters escaped appropriate for HTCondor if string.
381
477
  """
382
478
  if isinstance(value, str):
@@ -407,12 +503,14 @@ def htc_write_attribs(stream, attrs):
407
503
  print(f"+{key} = {pval}", file=stream)
408
504
 
409
505
 
410
- def htc_write_condor_file(filename, job_name, job, job_attrs):
506
+ def htc_write_condor_file(
507
+ filename: str | os.PathLike, job_name: str, job: RestrictedDict, job_attrs: dict[str, Any]
508
+ ) -> None:
411
509
  """Write an HTCondor submit file.
412
510
 
413
511
  Parameters
414
512
  ----------
415
- filename : `str`
513
+ filename : `str` or `os.PathLike`
416
514
  Filename for the HTCondor submit file.
417
515
  job_name : `str`
418
516
  Job name to use in submit file.
@@ -463,7 +561,7 @@ if HTC_VERSION < version.parse("8.9.8"):
463
561
 
464
562
  Returns
465
563
  -------
466
- kwargs : `dict` [`str`, Any]
564
+ kwargs : `dict` [`str`, `~typing.Any`]
467
565
  Keywords arguments that are guaranteed to work with the Python
468
566
  HTCondor API.
469
567
 
@@ -501,7 +599,7 @@ else:
501
599
 
502
600
  Returns
503
601
  -------
504
- kwargs : `dict` [`str`, Any]
602
+ kwargs : `dict` [`str`, `~typing.Any`]
505
603
  Keywords arguments that were passed to the function.
506
604
  """
507
605
  return kwargs
@@ -521,7 +619,7 @@ def htc_query_history(schedds, **kwargs):
521
619
  ------
522
620
  schedd_name : `str`
523
621
  Name of the HTCondor scheduler managing the job queue.
524
- job_ad : `dict` [`str`, Any]
622
+ job_ad : `dict` [`str`, `~typing.Any`]
525
623
  A dictionary representing HTCondor ClassAd describing a job. It maps
526
624
  job attributes names to values of the ClassAd expressions they
527
625
  represent.
@@ -549,7 +647,7 @@ def htc_query_present(schedds, **kwargs):
549
647
  ------
550
648
  schedd_name : `str`
551
649
  Name of the HTCondor scheduler managing the job queue.
552
- job_ad : `dict` [`str`, Any]
650
+ job_ad : `dict` [`str`, `~typing.Any`]
553
651
  A dictionary representing HTCondor ClassAd describing a job. It maps
554
652
  job attributes names to values of the ClassAd expressions they
555
653
  represent.
@@ -581,7 +679,8 @@ def htc_submit_dag(sub):
581
679
 
582
680
  Returns
583
681
  -------
584
- schedd_job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
682
+ schedd_job_info : `dict` [`str`, `dict` [`str`, \
683
+ `dict` [`str`, `~typing.Any`]]]
585
684
  Information about jobs satisfying the search criteria where for each
586
685
  Scheduler, local HTCondor job ids are mapped to their respective
587
686
  classads.
@@ -604,14 +703,14 @@ def htc_submit_dag(sub):
604
703
  return schedd_dag_info
605
704
 
606
705
 
607
- def htc_create_submit_from_dag(dag_filename, submit_options=None):
706
+ def htc_create_submit_from_dag(dag_filename: str, submit_options: dict[str, Any]) -> htcondor.Submit:
608
707
  """Create a DAGMan job submit description.
609
708
 
610
709
  Parameters
611
710
  ----------
612
711
  dag_filename : `str`
613
712
  Name of file containing HTCondor DAG commands.
614
- submit_options : `dict` [`str`, Any], optional
713
+ submit_options : `dict` [`str`, `~typing.Any`], optional
615
714
  Contains extra options for command line (Value of None means flag).
616
715
 
617
716
  Returns
@@ -624,6 +723,34 @@ def htc_create_submit_from_dag(dag_filename, submit_options=None):
624
723
  Use with HTCondor versions which support htcondor.Submit.from_dag(),
625
724
  i.e., 8.9.3 or newer.
626
725
  """
726
+ # Passing do_recurse as a submit_option does not seem to
727
+ # override DAGMAN_GENERATE_SUBDAG_SUBMITS as the manual implies.
728
+ # So set it and the other bps-required settings here as
729
+ # environment variables if they don't exist.
730
+ var_name = "_CONDOR_DAGMAN_MANAGER_JOB_APPEND_GETENV"
731
+ if var_name not in os.environ:
732
+ os.environ[var_name] = "True"
733
+
734
+ if "do_recurse" in submit_options:
735
+ var_name = "_CONDOR_DAGMAN_GENERATE_SUBDAG_SUBMITS"
736
+ if var_name not in os.environ:
737
+ os.environ[var_name] = str(submit_options["do_recurse"])
738
+
739
+ # Config and environment variables do not seem to override -MaxIdle
740
+ # on the .dag.condor.sub's command line (broken in some 24.0.x versions).
741
+ # Explicitly forward them as a submit_option if either exists.
742
+ # Note: auto-generated subdag submit files still use -MaxIdle=1000
743
+ # in the broken versions.
744
+ if "MaxIdle" not in submit_options:
745
+ max_jobs_idle: int | None = None
746
+ config_var_name = "DAGMAN_MAX_JOBS_IDLE"
747
+ if f"_CONDOR_{config_var_name}" in os.environ:
748
+ max_jobs_idle = int(os.environ[f"_CONDOR_{config_var_name}"])
749
+ elif config_var_name in htcondor.param:
750
+ max_jobs_idle = htcondor.param[config_var_name]
751
+ if max_jobs_idle:
752
+ submit_options["MaxIdle"] = max_jobs_idle
753
+
627
754
  return htcondor.Submit.from_dag(dag_filename, submit_options)
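
htc_create_submit_from_dag now seeds _CONDOR_DAGMAN_MANAGER_JOB_APPEND_GETENV and _CONDOR_DAGMAN_GENERATE_SUBDAG_SUBMITS in the environment and forwards DAGMAN_MAX_JOBS_IDLE as an explicit -MaxIdle submit option. A simplified, dependency-free sketch of the MaxIdle precedence (an explicit submit option wins, then the environment, then the HTCondor config value); the helper name is hypothetical:

    def pick_max_idle(submit_options: dict, environ: dict, param: dict) -> dict:
        """Fill in MaxIdle, roughly mirroring the precedence used above."""
        if "MaxIdle" in submit_options:
            return submit_options                      # caller's value wins
        if "_CONDOR_DAGMAN_MAX_JOBS_IDLE" in environ:
            submit_options["MaxIdle"] = int(environ["_CONDOR_DAGMAN_MAX_JOBS_IDLE"])
        elif "DAGMAN_MAX_JOBS_IDLE" in param:
            submit_options["MaxIdle"] = param["DAGMAN_MAX_JOBS_IDLE"]
        return submit_options

    print(pick_max_idle({}, {"_CONDOR_DAGMAN_MAX_JOBS_IDLE": "2000"}, {}))
    # {'MaxIdle': 2000}
    print(pick_max_idle({"MaxIdle": 500}, {"_CONDOR_DAGMAN_MAX_JOBS_IDLE": "2000"}, {}))
    # {'MaxIdle': 500}
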
628
755
 
629
756
 
@@ -637,7 +764,7 @@ def htc_create_submit_from_cmd(dag_filename, submit_options=None):
637
764
  ----------
638
765
  dag_filename : `str`
639
766
  Name of file containing HTCondor DAG commands.
640
- submit_options : `dict` [`str`, Any], optional
767
+ submit_options : `dict` [`str`, `~typing.Any`], optional
641
768
  Contains extra options for command line (Value of None means flag).
642
769
 
643
770
  Returns
@@ -702,7 +829,7 @@ def htc_create_submit_from_file(submit_file):
702
829
  return htcondor.Submit(descriptors)
703
830
 
704
831
 
705
- def _htc_write_job_commands(stream, name, jobs):
832
+ def _htc_write_job_commands(stream, name, commands):
706
833
  """Output the DAGMan job lines for single job in DAG.
707
834
 
708
835
  Parameters
@@ -711,40 +838,60 @@ def _htc_write_job_commands(stream, name, jobs):
711
838
  Writeable text stream (typically an opened file).
712
839
  name : `str`
713
840
  Job name.
714
- jobs : `RestrictedDict`
715
- DAG job keys and values.
841
+ commands : `RestrictedDict`
842
+ DAG commands for a job.
716
843
  """
717
- if "pre" in jobs:
718
- print(
719
- f"SCRIPT {jobs['pre'].get('defer', '')} PRE {name}"
720
- f"{jobs['pre']['executable']} {jobs['pre'].get('arguments', '')}",
721
- file=stream,
722
- )
723
-
724
- if "post" in jobs:
725
- print(
726
- f"SCRIPT {jobs['post'].get('defer', '')} PRE {name}"
727
- f"{jobs['post']['executable']} {jobs['post'].get('arguments', '')}",
728
- file=stream,
729
- )
730
-
731
- if "vars" in jobs:
732
- for key, value in jobs["vars"]:
844
+ # Note: optional pieces of commands include a space at the beginning.
845
+ # also making sure values aren't empty strings as placeholders.
846
+ if "pre" in commands and commands["pre"]:
847
+ defer = ""
848
+ if "defer" in commands["pre"] and commands["pre"]["defer"]:
849
+ defer = f" DEFER {commands['pre']['defer']['status']} {commands['pre']['defer']['time']}"
850
+
851
+ debug = ""
852
+ if "debug" in commands["pre"] and commands["pre"]["debug"]:
853
+ debug = f" DEBUG {commands['pre']['debug']['filename']} {commands['pre']['debug']['type']}"
854
+
855
+ arguments = ""
856
+ if "arguments" in commands["pre"] and commands["pre"]["arguments"]:
857
+ arguments = f" {commands['pre']['arguments']}"
858
+
859
+ executable = commands["pre"]["executable"]
860
+ print(f"SCRIPT{defer}{debug} PRE {name} {executable}{arguments}", file=stream)
861
+
862
+ if "post" in commands and commands["post"]:
863
+ defer = ""
864
+ if "defer" in commands["post"] and commands["post"]["defer"]:
865
+ defer = f" DEFER {commands['post']['defer']['status']} {commands['post']['defer']['time']}"
866
+
867
+ debug = ""
868
+ if "debug" in commands["post"] and commands["post"]["debug"]:
869
+ debug = f" DEBUG {commands['post']['debug']['filename']} {commands['post']['debug']['type']}"
870
+
871
+ arguments = ""
872
+ if "arguments" in commands["post"] and commands["post"]["arguments"]:
873
+ arguments = f" {commands['post']['arguments']}"
874
+
875
+ executable = commands["post"]["executable"]
876
+ print(f"SCRIPT{defer}{debug} POST {name} {executable}{arguments}", file=stream)
877
+
878
+ if "vars" in commands and commands["vars"]:
879
+ for key, value in commands["vars"].items():
733
880
  print(f'VARS {name} {key}="{htc_escape(value)}"', file=stream)
734
881
 
735
- if "pre_skip" in jobs:
736
- print(f"PRE_SKIP {name} {jobs['pre_skip']}", file=stream)
882
+ if "pre_skip" in commands and commands["pre_skip"]:
883
+ print(f"PRE_SKIP {name} {commands['pre_skip']}", file=stream)
737
884
 
738
- if "retry" in jobs and jobs["retry"]:
739
- print(f"RETRY {name} {jobs['retry']} ", end="", file=stream)
740
- if "retry_unless_exit" in jobs:
741
- print(f"UNLESS-EXIT {jobs['retry_unless_exit']}", end="", file=stream)
742
- print("\n", file=stream)
885
+ if "retry" in commands and commands["retry"]:
886
+ print(f"RETRY {name} {commands['retry']}", end="", file=stream)
887
+ if "retry_unless_exit" in commands:
888
+ print(f" UNLESS-EXIT {commands['retry_unless_exit']}", end="", file=stream)
889
+ print("", file=stream) # Since previous prints don't include new line
743
890
 
744
- if "abort_dag_on" in jobs and jobs["abort_dag_on"]:
891
+ if "abort_dag_on" in commands and commands["abort_dag_on"]:
745
892
  print(
746
- f"ABORT-DAG-ON {name} {jobs['abort_dag_on']['node_exit']}"
747
- f" RETURN {jobs['abort_dag_on']['abort_exit']}",
893
+ f"ABORT-DAG-ON {name} {commands['abort_dag_on']['node_exit']}"
894
+ f" RETURN {commands['abort_dag_on']['abort_exit']}",
748
895
  file=stream,
749
896
  )
750
897
 
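
The rewritten _htc_write_job_commands builds each optional fragment (DEFER, DEBUG, arguments) with a leading space so every emitted DAG command stays on one single-spaced line. A small sketch of the output for one node, calling the module-private helper directly and using a plain dict in place of the RestrictedDict; the node and script names are invented:

    import io

    commands = {
        "pre": {"executable": "pre_check.sh", "arguments": "run1"},
        "retry": 3,
        "retry_unless_exit": 2,
    }
    stream = io.StringIO()
    _htc_write_job_commands(stream, "pipetaskInit", commands)
    print(stream.getvalue())
    # SCRIPT PRE pipetaskInit pre_check.sh run1
    # RETRY pipetaskInit 3 UNLESS-EXIT 2
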
@@ -773,6 +920,8 @@ class HTCJob:
773
920
  self.dagcmds = RestrictedDict(HTC_VALID_JOB_DAG_KEYS, initdagcmds)
774
921
  self.attrs = initattrs
775
922
  self.subfile = None
923
+ self.subdir = None
924
+ self.subdag = None
776
925
 
777
926
  def __str__(self):
778
927
  return self.name
@@ -810,33 +959,54 @@ class HTCJob:
810
959
  if new_attrs:
811
960
  self.attrs.update(new_attrs)
812
961
 
813
- def write_submit_file(self, submit_path, job_subdir=""):
962
+ def write_submit_file(self, submit_path: str | os.PathLike) -> None:
814
963
  """Write job description to submit file.
815
964
 
816
965
  Parameters
817
966
  ----------
818
- submit_path : `str`
967
+ submit_path : `str` or `os.PathLike`
819
968
  Prefix path for the submit file.
820
- job_subdir : `str`, optional
821
- Template for job subdir.
822
969
  """
823
970
  if not self.subfile:
824
971
  self.subfile = f"{self.name}.sub"
825
- job_subdir = job_subdir.format(self=self)
826
- if job_subdir:
827
- self.subfile = os.path.join(job_subdir, self.subfile)
828
- htc_write_condor_file(os.path.join(submit_path, self.subfile), self.name, self.cmds, self.attrs)
829
972
 
830
- def write_dag_commands(self, stream):
973
+ subfile = self.subfile
974
+ if self.subdir:
975
+ subfile = Path(self.subdir) / subfile
976
+
977
+ subfile = Path(os.path.expandvars(subfile))
978
+ if not subfile.is_absolute():
979
+ subfile = Path(submit_path) / subfile
980
+ if not subfile.exists():
981
+ htc_write_condor_file(subfile, self.name, self.cmds, self.attrs)
982
+
983
+ def write_dag_commands(self, stream, dag_rel_path, command_name="JOB"):
831
984
  """Write DAG commands for single job to output stream.
832
985
 
833
986
  Parameters
834
987
  ----------
835
988
  stream : `IO` or `str`
836
989
  Output Stream.
990
+ dag_rel_path : `str`
991
+ Relative path of dag to submit directory.
992
+ command_name : `str`
993
+ Name of the DAG command (e.g., JOB, FINAL).
837
994
  """
838
- print(f'JOB {self.name} "{self.subfile}"', file=stream)
839
- _htc_write_job_commands(stream, self.name, self.dagcmds)
995
+ subfile = os.path.expandvars(self.subfile)
996
+
997
+ # JOB NodeName SubmitDescription [DIR directory] [NOOP] [DONE]
998
+ job_line = f'{command_name} {self.name} "{subfile}"'
999
+ if "dir" in self.dagcmds:
1000
+ dir_val = self.dagcmds["dir"]
1001
+ if dag_rel_path:
1002
+ dir_val = os.path.join(dag_rel_path, dir_val)
1003
+ job_line += f' DIR "{dir_val}"'
1004
+ if self.dagcmds.get("noop", False):
1005
+ job_line += " NOOP"
1006
+
1007
+ print(job_line, file=stream)
1008
+ if self.dagcmds:
1009
+ _htc_write_job_commands(stream, self.name, self.dagcmds)
840
1010
 
841
1011
  def dump(self, fh):
842
1012
  """Dump job information to output stream.
@@ -871,6 +1041,7 @@ class HTCDag(networkx.DiGraph):
871
1041
  self.graph["submit_path"] = None
872
1042
  self.graph["final_job"] = None
873
1043
  self.graph["service_job"] = None
1044
+ self.graph["submit_options"] = {}
874
1045
 
875
1046
  def __str__(self):
876
1047
  """Represent basic DAG info as string.
@@ -906,6 +1077,7 @@ class HTCDag(networkx.DiGraph):
906
1077
  Names of child jobs.
907
1078
  """
908
1079
  assert isinstance(job, HTCJob)
1080
+ _LOG.debug("Adding job %s to dag", job.name)
909
1081
 
910
1082
  # Add dag level attributes to each job
911
1083
  job.add_job_attrs(self.graph["attr"])
@@ -913,10 +1085,10 @@ class HTCDag(networkx.DiGraph):
913
1085
  self.add_node(job.name, data=job)
914
1086
 
915
1087
  if parent_names is not None:
916
- self.add_job_relationships(parent_names, job.name)
1088
+ self.add_job_relationships(parent_names, [job.name])
917
1089
 
918
1090
  if child_names is not None:
919
- self.add_job_relationships(child_names, job.name)
1091
+ self.add_job_relationships(child_names, [job.name])
920
1092
 
921
1093
  def add_job_relationships(self, parents, children):
922
1094
  """Add DAG edge between parents and children jobs.
@@ -972,24 +1144,48 @@ class HTCDag(networkx.DiGraph):
972
1144
  # Delete job node (which deletes its edges).
973
1145
  self.remove_node(job_name)
974
1146
 
975
- def write(self, submit_path, job_subdir=""):
1147
+ def write(self, submit_path, job_subdir="", dag_subdir="", dag_rel_path=""):
976
1148
  """Write DAG to a file.
977
1149
 
978
1150
  Parameters
979
1151
  ----------
980
1152
  submit_path : `str`
981
- Prefix path for dag filename to be combined with DAG name.
1153
+ Prefix path for all outputs.
982
1154
  job_subdir : `str`, optional
983
- Template for job subdir.
1155
+ Template for job subdir (submit_path + job_subdir).
1156
+ dag_subdir : `str`, optional
1157
+ DAG subdir (submit_path + dag_subdir).
1158
+ dag_rel_path : `str`, optional
1159
+ Prefix to job_subdir for jobs inside subdag.
984
1160
  """
985
1161
  self.graph["submit_path"] = submit_path
986
- self.graph["dag_filename"] = os.path.join(submit_path, f"{self.graph['name']}.dag")
987
- os.makedirs(submit_path, exist_ok=True)
988
- with open(self.graph["dag_filename"], "w") as fh:
989
- for _, nodeval in self.nodes().items():
990
- job = nodeval["data"]
991
- job.write_submit_file(submit_path, job_subdir)
992
- job.write_dag_commands(fh)
1162
+ self.graph["dag_filename"] = os.path.join(dag_subdir, f"{self.graph['name']}.dag")
1163
+ full_filename = os.path.join(submit_path, self.graph["dag_filename"])
1164
+ os.makedirs(os.path.dirname(full_filename), exist_ok=True)
1165
+ with open(full_filename, "w") as fh:
1166
+ for name, nodeval in self.nodes().items():
1167
+ try:
1168
+ job = nodeval["data"]
1169
+ except KeyError:
1170
+ _LOG.error("Job %s doesn't have data (keys: %s).", name, nodeval.keys())
1171
+ raise
1172
+ if job.subdag:
1173
+ dag_subdir = f"subdags/{job.name}"
1174
+ if "dir" in job.dagcmds:
1175
+ subdir = job.dagcmds["dir"]
1176
+ else:
1177
+ subdir = job_subdir
1178
+ job.subdag.write(submit_path, subdir, dag_subdir, "../..")
1179
+ fh.write(
1180
+ f"SUBDAG EXTERNAL {job.name} {Path(job.subdag.graph['dag_filename']).name} "
1181
+ f"DIR {dag_subdir}\n"
1182
+ )
1183
+ if job.dagcmds:
1184
+ _htc_write_job_commands(fh, job.name, job.dagcmds)
1185
+ else:
1186
+ job.write_submit_file(submit_path)
1187
+ job.write_dag_commands(fh, dag_rel_path)
1188
+
993
1189
  for edge in self.edges():
994
1190
  print(f"PARENT {edge[0]} CHILD {edge[1]}", file=fh)
995
1191
  print(f"DOT {self.name}.dot", file=fh)
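
When a node carries a subdag, HTCDag.write now writes the nested DAG under subdags/<node name> and references it with a SUBDAG EXTERNAL line instead of a JOB line. An illustrative fragment of the resulting top-level .dag file; the node, label, and run names are hypothetical:

    top_level_dag_fragment = "\n".join([
        'JOB pipetaskInit "pipetaskInit.sub" DIR "jobs/pipetaskInit"',
        "SUBDAG EXTERNAL wms_group_visit42 wms_group_visit42.dag DIR subdags/wms_group_visit42",
        "PARENT pipetaskInit CHILD wms_group_visit42",
        "DOT u_user_run1.dot",
    ])
    print(top_level_dag_fragment)
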
@@ -1006,12 +1202,8 @@ class HTCDag(networkx.DiGraph):
1006
1202
  }
1007
1203
  for dagcmd, job in special_jobs.items():
1008
1204
  if job is not None:
1009
- job.write_submit_file(submit_path, job_subdir)
1010
- print(f"{dagcmd} {job.name} {job.subfile}", file=fh)
1011
- if "pre" in job.dagcmds:
1012
- print(f"SCRIPT PRE {job.name} {job.dagcmds['pre']}", file=fh)
1013
- if "post" in job.dagcmds:
1014
- print(f"SCRIPT POST {job.name} {job.dagcmds['post']}", file=fh)
1205
+ job.write_submit_file(submit_path)
1206
+ job.write_dag_commands(fh, dag_rel_path, dagcmd)
1015
1207
 
1016
1208
  def dump(self, fh):
1017
1209
  """Dump DAG info to output stream.
@@ -1061,7 +1253,7 @@ def condor_q(constraint=None, schedds=None, **kwargs):
1061
1253
 
1062
1254
  Returns
1063
1255
  -------
1064
- job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
1256
+ job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
1065
1257
  Information about jobs satisfying the search criteria where for each
1066
1258
  Scheduler, local HTCondor job ids are mapped to their respective
1067
1259
  classads.
@@ -1086,7 +1278,7 @@ def condor_history(constraint=None, schedds=None, **kwargs):
1086
1278
 
1087
1279
  Returns
1088
1280
  -------
1089
- job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
1281
+ job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
1090
1282
  Information about jobs satisfying the search criteria where for each
1091
1283
  Scheduler, local HTCondor job ids are mapped to their respective
1092
1284
  classads.
@@ -1117,7 +1309,7 @@ def condor_query(constraint=None, schedds=None, query_func=htc_query_present, **
1117
1309
 
1118
1310
  Returns
1119
1311
  -------
1120
- job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
1312
+ job_info : `dict` [`str`, `dict` [`str`, `dict` [`str`, `~typing.Any`]]]
1121
1313
  Information about jobs satisfying the search criteria where for each
1122
1314
  Scheduler, local HTCondor job ids are mapped to their respective
1123
1315
  classads.
@@ -1172,7 +1364,7 @@ def condor_search(constraint=None, hist=None, schedds=None):
1172
1364
 
1173
1365
  Returns
1174
1366
  -------
1175
- job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` Any]]]
1367
+ job_info : `dict` [`str`, `dict` [`str`, `dict` [`str` `~typing.Any`]]]
1176
1368
  Information about jobs satisfying the search criteria where for each
1177
1369
  Scheduler, local HTCondor job ids are mapped to their respective
1178
1370
  classads.
@@ -1203,7 +1395,7 @@ def condor_status(constraint=None, coll=None):
1203
1395
 
1204
1396
  Returns
1205
1397
  -------
1206
- pool_info : `dict` [`str`, `dict` [`str`, Any]]
1398
+ pool_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1207
1399
  Mapping between HTCondor slot names and slot information (classAds).
1208
1400
  """
1209
1401
  if coll is None:
@@ -1225,14 +1417,14 @@ def update_job_info(job_info, other_info):
1225
1417
 
1226
1418
  Parameters
1227
1419
  ----------
1228
- job_info : `dict` [`str`, `dict` [`str`, Any]]
1420
+ job_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1229
1421
  Results of the job query that needs to be updated.
1230
- other_info : `dict` [`str`, `dict` [`str`, Any]]
1422
+ other_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1231
1423
  Results of the other job query.
1232
1424
 
1233
1425
  Returns
1234
1426
  -------
1235
- job_info : `dict` [`str`, `dict` [`str`, Any]]
1427
+ job_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1236
1428
  The updated results.
1237
1429
  """
1238
1430
  for schedd_name, others in other_info.items():
@@ -1246,7 +1438,98 @@ def update_job_info(job_info, other_info):
1246
1438
  return job_info
1247
1439
 
1248
1440
 
1249
- def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, str]]:
1441
+ def count_jobs_in_single_dag(
1442
+ filename: str | os.PathLike,
1443
+ ) -> tuple[Counter[str], dict[str, str], dict[str, WmsNodeType]]:
1444
+ """Count jobs and gather job name mappings from a single dag file.
1445
+
1446
+ Parameters
1447
+ ----------
1448
+ filename : `str` or `os.PathLike`
1449
+ Path of the dag file to parse.
1450
+
1451
+ Returns
1452
+ -------
1453
+ counts : `Counter` [`str`]
1454
+ Counts of payload jobs per job label
1455
+ (insertion order from the dag file is preserved).
1456
+ job_name_to_label : `dict` [`str`, `str`]
1457
+ Mapping of job names to job labels.
1458
+ job_name_to_type : `dict` [`str`, `lsst.ctrl.bps.htcondor.WmsNodeType`]
1459
+ Mapping of job names to job types
1460
+ (e.g., payload, final, service).
1461
+ """
1462
+ # Later code depends upon insertion order
1463
+ counts: Counter = Counter() # counts of payload jobs per label
1464
+ job_name_to_label: dict[str, str] = {}
1465
+ job_name_to_type: dict[str, WmsNodeType] = {}
1466
+ with open(filename) as fh:
1467
+ for line in fh:
1468
+ job_name = ""
1469
+ m = re.match(
1470
+ r"(?P<command>JOB|FINAL|SERVICE|SUBDAG EXTERNAL)\s+"
1471
+ r'(?P<jobname>(?P<wms>wms_)?\S+)\s+"?(?P<subfile>\S+)"?\s*'
1472
+ r'(DIR "?(?P<dir>[^\s"]+)"?)?\s*(?P<noop>NOOP)?',
1473
+ line,
1474
+ )
1475
+ if m:
1476
+ job_name = m.group("jobname")
1477
+ name_parts = job_name.split("_")
1478
+
1479
+ label = ""
1480
+ if m.group("dir"):
1481
+ dir_match = re.search(r"jobs/([^\s/]+)", m.group("dir"))
1482
+ if dir_match:
1483
+ label = dir_match.group(1)
1484
+ else:
1485
+ _LOG.debug("Parse DAG: unparsed dir = %s", line)
1486
+ elif m.group("subfile"):
1487
+ subfile_match = re.search(r"jobs/([^\s/]+)", m.group("subfile"))
1488
+ if subfile_match:
1489
+ label = m.group("subfile").split("/")[1]
1490
+ else:
1491
+ label = pegasus_name_to_label(job_name)
1492
+
1493
+ match m.group("command"):
1494
+ case "JOB":
1495
+ if m.group("noop"):
1496
+ job_type = WmsNodeType.NOOP
1497
+ # wms_noop_label
1498
+ label = name_parts[2]
1499
+ elif m.group("wms"):
1500
+ if name_parts[1] == "check":
1501
+ job_type = WmsNodeType.SUBDAG_CHECK
1502
+ # wms_check_status_wms_group_label
1503
+ label = name_parts[5]
1504
+ else:
1505
+ _LOG.warning(
1506
+ "Unexpected skipping of dag line due to unknown wms job: %s", line
1507
+ )
1508
+ else:
1509
+ job_type = WmsNodeType.PAYLOAD
1510
+ if label == "init":
1511
+ label = "pipetaskInit"
1512
+ counts[label] += 1
1513
+ case "FINAL":
1514
+ job_type = WmsNodeType.FINAL
1515
+ counts[label] += 1 # final counts a payload job.
1516
+ case "SERVICE":
1517
+ job_type = WmsNodeType.SERVICE
1518
+ case "SUBDAG EXTERNAL":
1519
+ job_type = WmsNodeType.SUBDAG
1520
+ label = name_parts[2]
1521
+
1522
+ job_name_to_label[job_name] = label
1523
+ job_name_to_type[job_name] = job_type
1524
+ elif not line.startswith(("VARS", "PARENT", "DOT", "NODE_STATUS_FILE", "SET_JOB_ATTR", "SCRIPT")):
1525
+ # Only print warning if not a line wanting to skip
1526
+ # Probably means problem with regex in above match pattern.
1527
+ _LOG.warning("Unexpected skipping of dag line: %s", line)
1528
+
1529
+ return counts, job_name_to_label, job_name_to_type
1530
+
1531
+
1532
+ def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, WmsNodeType]]:
1250
1533
  """Build bps_run_summary string from dag file.
1251
1534
 
1252
1535
  Parameters
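
count_jobs_in_single_dag replaces the old per-command matching with the single regular expression shown above, capturing the DAG command, node name, submit file, optional DIR, and optional NOOP in one pass. A quick stand-alone check of that pattern against a representative line; the node and directory names are invented:

    import re

    dag_line = re.compile(
        r"(?P<command>JOB|FINAL|SERVICE|SUBDAG EXTERNAL)\s+"
        r'(?P<jobname>(?P<wms>wms_)?\S+)\s+"?(?P<subfile>\S+)"?\s*'
        r'(DIR "?(?P<dir>[^\s"]+)"?)?\s*(?P<noop>NOOP)?'
    )
    m = dag_line.match('JOB calexp_903334_22 "calexp_903334_22.sub" DIR "jobs/calexp/903334"')
    print(m.group("command"), m.group("jobname"), m.group("dir"))
    # JOB calexp_903334_22 jobs/calexp/903334
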
@@ -1261,56 +1544,25 @@ def summarize_dag(dir_name: str) -> tuple[str, dict[str, str], dict[str, str]]:
1261
1544
  (Same format as saved in dag classad).
1262
1545
  job_name_to_label : `dict` [`str`, `str`]
1263
1546
  Mapping of job names to job labels.
1264
- job_name_to_type : `dict` [`str`, `str`]
1547
+ job_name_to_type : `dict` [`str`, `lsst.ctrl.bps.htcondor.WmsNodeType`]
1265
1548
  Mapping of job names to job types
1266
1549
  (e.g., payload, final, service).
1267
1550
  """
1268
1551
  # Later code depends upon insertion order
1269
- counts: defaultdict[str, int] = defaultdict(int) # counts of payload jobs per label
1270
- job_name_to_label = {}
1271
- job_name_to_type = {}
1272
- try:
1273
- dag = next(Path(dir_name).glob("*.dag"))
1274
- with open(dag) as fh:
1275
- for line in fh:
1276
- job_name = ""
1277
- if line.startswith("JOB"):
1278
- m = re.match(r'JOB (\S+) "?jobs/([^/]+)/', line)
1279
- if m:
1280
- job_name = m.group(1)
1281
- label = m.group(2)
1282
- if label == "init":
1283
- label = "pipetaskInit"
1284
- counts[label] += 1
1285
- else: # Check if Pegasus submission
1286
- m = re.match(r"JOB (\S+) (\S+)", line)
1287
- if m:
1288
- job_name = m.group(1)
1289
- label = pegasus_name_to_label(m.group(1))
1290
- counts[label] += 1
1291
- else:
1292
- _LOG.warning("Parse DAG: unmatched job line: %s", line)
1293
- job_type = "payload"
1294
- elif line.startswith("FINAL"):
1295
- m = re.match(r"FINAL (\S+) jobs/([^/]+)/", line)
1296
- if m:
1297
- job_name = m.group(1)
1298
- label = m.group(2)
1299
- counts[label] += 1 # final counts a payload job.
1300
- job_type = "final"
1301
- elif line.startswith("SERVICE"):
1302
- m = re.match(r"SERVICE (\S+) jobs/([^/]+)/", line)
1303
- if m:
1304
- job_name = m.group(1)
1305
- label = m.group(2)
1306
- job_type = "service"
1307
-
1308
- if job_name:
1309
- job_name_to_label[job_name] = label
1310
- job_name_to_type[job_name] = job_type
1311
-
1312
- except (OSError, PermissionError, StopIteration):
1313
- pass
1552
+ counts: Counter[str] = Counter() # counts of payload jobs per label
1553
+ job_name_to_label: dict[str, str] = {}
1554
+ job_name_to_type: dict[str, WmsNodeType] = {}
1555
+ for filename in Path(dir_name).glob("*.dag"):
1556
+ single_counts, single_job_name_to_label, single_job_name_to_type = count_jobs_in_single_dag(filename)
1557
+ counts += single_counts
1558
+ _update_dicts(job_name_to_label, single_job_name_to_label)
1559
+ _update_dicts(job_name_to_type, single_job_name_to_type)
1560
+
1561
+ for filename in Path(dir_name).glob("subdags/*/*.dag"):
1562
+ single_counts, single_job_name_to_label, single_job_name_to_type = count_jobs_in_single_dag(filename)
1563
+ counts += single_counts
1564
+ _update_dicts(job_name_to_label, single_job_name_to_label)
1565
+ _update_dicts(job_name_to_type, single_job_name_to_type)
1314
1566
 
1315
1567
  summary = ";".join([f"{name}:{counts[name]}" for name in counts])
1316
1568
  _LOG.debug("summarize_dag: %s %s %s", summary, job_name_to_label, job_name_to_type)
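
summarize_dag now sums per-label Counters from the top-level DAG and every subdags/*/*.dag before building the semicolon-separated summary stored in the DAG classad. For example, combining hypothetical counts from a main DAG and one subdag:

    from collections import Counter

    main_counts = Counter({"pipetaskInit": 1, "isr": 4})
    subdag_counts = Counter({"isr": 20, "calexp": 24})
    counts = main_counts + subdag_counts

    summary = ";".join(f"{name}:{counts[name]}" for name in counts)
    print(summary)
    # pipetaskInit:1;isr:24;calexp:24
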
@@ -1343,69 +1595,100 @@ def pegasus_name_to_label(name):
1343
1595
  return label
1344
1596
 
1345
1597
 
1346
- def read_dag_status(wms_path):
1598
+ def read_single_dag_status(filename: str | os.PathLike) -> dict[str, Any]:
1347
1599
  """Read the node status file for DAG summary information.
1348
1600
 
1349
1601
  Parameters
1350
1602
  ----------
1351
- wms_path : `str`
1352
- Path that includes node status file for a run.
1603
+ filename : `str` or `pathlib.Path`
1604
+ Node status filename.
1353
1605
 
1354
1606
  Returns
1355
1607
  -------
1356
- dag_ad : `dict` [`str`, Any]
1608
+ dag_ad : `dict` [`str`, `~typing.Any`]
1357
1609
  DAG summary information.
1358
1610
  """
1359
- dag_ad = {}
1611
+ dag_ad: dict[str, Any] = {}
1360
1612
 
1361
1613
  # While this is probably more up to date than dag classad, only read from
1362
1614
  # file if need to.
1363
1615
  try:
1364
- try:
1365
- node_stat_file = next(Path(wms_path).glob("*.node_status"))
1366
- _LOG.debug("Reading Node Status File %s", node_stat_file)
1367
- with open(node_stat_file) as infh:
1368
- dag_ad = classad.parseNext(infh) # pylint: disable=E1101
1369
- except StopIteration:
1370
- pass
1616
+ node_stat_file = Path(filename)
1617
+ _LOG.debug("Reading Node Status File %s", node_stat_file)
1618
+ with open(node_stat_file) as infh:
1619
+ dag_ad = dict(classad.parseNext(infh)) # pylint: disable=E1101
1371
1620
 
1372
1621
  if not dag_ad:
1373
1622
  # Pegasus check here
1374
- try:
1375
- metrics_file = next(Path(wms_path).glob("*.dag.metrics"))
1623
+ metrics_file = node_stat_file.with_suffix(".dag.metrics")
1624
+ if metrics_file.exists():
1376
1625
  with open(metrics_file) as infh:
1377
1626
  metrics = json.load(infh)
1378
1627
  dag_ad["NodesTotal"] = metrics.get("jobs", 0)
1379
1628
  dag_ad["NodesFailed"] = metrics.get("jobs_failed", 0)
1380
1629
  dag_ad["NodesDone"] = metrics.get("jobs_succeeded", 0)
1381
- dag_ad["pegasus_version"] = metrics.get("planner_version", "")
1382
- except StopIteration:
1383
- try:
1384
- metrics_file = next(Path(wms_path).glob("*.metrics"))
1385
- with open(metrics_file) as infh:
1386
- metrics = json.load(infh)
1387
- dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"]
1388
- dag_ad["pegasus_version"] = metrics.get("version", "")
1389
- except StopIteration:
1390
- pass
1630
+ metrics_file = node_stat_file.with_suffix(".metrics")
1631
+ with open(metrics_file) as infh:
1632
+ metrics = json.load(infh)
1633
+ dag_ad["NodesTotal"] = metrics["wf_metrics"]["total_jobs"]
1391
1634
  except (OSError, PermissionError):
1392
1635
  pass
1393
1636
 
1394
1637
  _LOG.debug("read_dag_status: %s", dag_ad)
1395
- return dict(dag_ad)
1638
+ return dag_ad
1396
1639
 
1397
1640
 
1398
- def read_node_status(wms_path):
1399
- """Read entire node status file.
1641
+ def read_dag_status(wms_path: str | os.PathLike) -> dict[str, Any]:
1642
+ """Read the node status file for DAG summary information.
1400
1643
 
1401
1644
  Parameters
1402
1645
  ----------
1403
- wms_path : `str`
1646
+ wms_path : `str` or `os.PathLike`
1404
1647
  Path that includes node status file for a run.
1405
1648
 
1406
1649
  Returns
1407
1650
  -------
1408
- jobs : `dict` [`str`, Any]
1651
+ dag_ad : `dict` [`str`, `~typing.Any`]
1652
+ DAG summary information, counts summed across any subdags.
1653
+ """
1654
+ dag_ads: dict[str, Any] = {}
1655
+ path = Path(wms_path)
1656
+ try:
1657
+ node_stat_file = next(path.glob("*.node_status"))
1658
+ except StopIteration as exc:
1659
+ raise FileNotFoundError(f"DAGMan node status not found in {wms_path}") from exc
1660
+
1661
+ dag_ads = read_single_dag_status(node_stat_file)
1662
+
1663
+ for node_stat_file in path.glob("subdags/*/*.node_status"):
1664
+ dag_ad = read_single_dag_status(node_stat_file)
1665
+ dag_ads["JobProcsHeld"] += dag_ad.get("JobProcsHeld", 0)
1666
+ dag_ads["NodesPost"] += dag_ad.get("NodesPost", 0)
1667
+ dag_ads["JobProcsIdle"] += dag_ad.get("JobProcsIdle", 0)
1668
+ dag_ads["NodesTotal"] += dag_ad.get("NodesTotal", 0)
1669
+ dag_ads["NodesFailed"] += dag_ad.get("NodesFailed", 0)
1670
+ dag_ads["NodesDone"] += dag_ad.get("NodesDone", 0)
1671
+ dag_ads["NodesQueued"] += dag_ad.get("NodesQueued", 0)
1672
+ dag_ads["NodesPre"] += dag_ad.get("NodesReady", 0)
1673
+ dag_ads["NodesFutile"] += dag_ad.get("NodesFutile", 0)
1674
+ dag_ads["NodesUnready"] += dag_ad.get("NodesUnready", 0)
1675
+
1676
+ return dag_ads
1677
+
1678
+
1679
+ def read_single_node_status(filename: str | os.PathLike, init_fake_id: int) -> dict[str, Any]:
1680
+ """Read entire node status file.
1681
+
1682
+ Parameters
1683
+ ----------
1684
+ filename : `str` or `pathlib.Path`
1685
+ Node status filename.
1686
+ init_fake_id : `int`
1687
+ Initial fake id value.
1688
+
1689
+ Returns
1690
+ -------
1691
+ jobs : `dict` [`str`, `~typing.Any`]
1409
1692
  DAG summary information compiled from the node status file combined
1410
1693
  with the information found in the node event log.
1411
1694
 
@@ -1413,28 +1696,34 @@ def read_node_status(wms_path):
1413
1696
  from the event log takes precedence over the value from the node status
1414
1697
  file.
1415
1698
  """
1699
+ filename = Path(filename)
1700
+
1416
1701
  # Get jobid info from other places to fill in gaps in info from node_status
1417
- _, job_name_to_label, job_name_to_type = summarize_dag(wms_path)
1418
- wms_workflow_id, loginfo = read_dag_log(wms_path)
1419
- loginfo = read_dag_nodes_log(wms_path)
1702
+ _, job_name_to_label, job_name_to_type = count_jobs_in_single_dag(filename.with_suffix(".dag"))
1703
+ loginfo: dict[str, dict[str, Any]] = {}
1704
+ try:
1705
+ wms_workflow_id, loginfo = read_single_dag_log(filename.with_suffix(".dag.dagman.log"))
1706
+ loginfo = read_single_dag_nodes_log(filename.with_suffix(".dag.nodes.log"))
1707
+ except (OSError, PermissionError):
1708
+ pass
1709
+
1710
+ job_name_to_id: dict[str, str] = {}
1420
1711
  _LOG.debug("loginfo = %s", loginfo)
1421
- job_name_to_id = {}
1712
+ log_job_name_to_id: dict[str, str] = {}
1422
1713
  for job_id, job_info in loginfo.items():
1423
1714
  if "LogNotes" in job_info:
1424
1715
  m = re.match(r"DAG Node: (\S+)", job_info["LogNotes"])
1425
1716
  if m:
1426
1717
  job_name = m.group(1)
1427
- job_name_to_id[job_name] = job_id
1718
+ log_job_name_to_id[job_name] = job_id
1428
1719
  job_info["DAGNodeName"] = job_name
1429
- job_info["bps_job_type"] = job_name_to_type[job_name]
1720
+ job_info["wms_node_type"] = job_name_to_type[job_name]
1430
1721
  job_info["bps_job_label"] = job_name_to_label[job_name]
1431
1722
 
1432
- jobs = loginfo
1433
- fake_id = -1.0 # For nodes that do not yet have a job id, give fake one
1723
+ jobs = {}
1724
+ fake_id = init_fake_id # For nodes that do not yet have a job id, give fake one
1434
1725
  try:
1435
- node_status = next(Path(wms_path).glob("*.node_status"))
1436
-
1437
- with open(node_status) as fh:
1726
+ with open(filename) as fh:
1438
1727
  for ad in classad.parseAds(fh):
1439
1728
  match ad["Type"]:
1440
1729
  case "DagStatus":
@@ -1449,21 +1738,23 @@ def read_node_status(wms_path):
1449
1738
  else:
1450
1739
  job_label = job_name
1451
1740
 
1452
- # Make job info as if came from condor_q.
1453
- if job_name in job_name_to_id:
1454
- job_id = str(job_name_to_id[job_name])
1455
- job = jobs[job_id]
1741
+ job = dict(ad)
1742
+ if job_name in log_job_name_to_id:
1743
+ job_id = str(log_job_name_to_id[job_name])
1744
+ _update_dicts(job, loginfo[job_id])
1456
1745
  else:
1457
1746
  job_id = str(fake_id)
1458
- job_name_to_id[job_name] = job_id
1459
1747
  job = dict(ad)
1460
- jobs[job_id] = job
1461
1748
  fake_id -= 1
1749
+ jobs[job_id] = job
1750
+ job_name_to_id[job_name] = job_id
1751
+
1752
+ # Make job info as if came from condor_q.
1462
1753
  job["ClusterId"] = int(float(job_id))
1463
1754
  job["DAGManJobID"] = wms_workflow_id
1464
1755
  job["DAGNodeName"] = job_name
1465
1756
  job["bps_job_label"] = job_label
1466
- job["bps_job_type"] = job_name_to_type[job_name]
1757
+ job["wms_node_type"] = job_name_to_type[job_name]
1467
1758
 
1468
1759
  case "StatusEnd":
1469
1760
  # Skip node status file "epilog".
@@ -1472,41 +1763,104 @@ def read_node_status(wms_path):
1472
1763
  _LOG.debug(
1473
1764
  "Ignoring unknown classad type '%s' in the node status file '%s'",
1474
1765
  ad["Type"],
1475
- wms_path,
1766
+ filename,
1476
1767
  )
1477
- except (StopIteration, OSError, PermissionError):
1768
+ except (OSError, PermissionError):
1478
1769
  pass
1479
1770
 
1480
1771
  # Check for missing jobs (e.g., submission failure or not submitted yet)
1481
1772
  # Use dag info to create job placeholders
1482
1773
  for name in set(job_name_to_label) - set(job_name_to_id):
1483
- job = {}
1484
- job["ClusterId"] = int(float(fake_id))
1774
+ if name in log_job_name_to_id: # job was in nodes.log, but not node_status
1775
+ job_id = str(log_job_name_to_id[name])
1776
+ job = dict(loginfo[job_id])
1777
+ else:
1778
+ job_id = str(fake_id)
1779
+ fake_id -= 1
1780
+ job = {}
1781
+ job["NodeStatus"] = NodeStatus.NOT_READY
1782
+
1783
+ job["ClusterId"] = int(float(job_id))
1485
1784
  job["ProcId"] = 0
1486
1785
  job["DAGManJobID"] = wms_workflow_id
1487
1786
  job["DAGNodeName"] = name
1488
1787
  job["bps_job_label"] = job_name_to_label[name]
1489
- job["bps_job_type"] = job_name_to_type[name]
1490
- job["NodeStatus"] = NodeStatus.NOT_READY
1788
+ job["wms_node_type"] = job_name_to_type[name]
1491
1789
  jobs[f"{job['ClusterId']}.{job['ProcId']}"] = job
1492
- fake_id -= 1
1790
+
1791
+ for job_info in jobs.values():
1792
+ job_info["from_dag_job"] = f"wms_{filename.stem}"
1793
+
1794
+ return jobs
1795
+
1796
+
1797
+ def read_node_status(wms_path: str | os.PathLike) -> dict[str, dict[str, Any]]:
1798
+ """Read entire node status file.
1799
+
1800
+ Parameters
1801
+ ----------
1802
+ wms_path : `str` or `os.PathLike`
1803
+ Path that includes node status file for a run.
1804
+
1805
+ Returns
1806
+ -------
1807
+ jobs : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1808
+ DAG summary information compiled from the node status file combined
1809
+ with the information found in the node event log.
1810
+
1811
+ Currently, if the same job attribute is found in both files, its value
1812
+ from the event log takes precedence over the value from the node status
1813
+ file.
1814
+ """
1815
+ jobs: dict[str, dict[str, Any]] = {}
1816
+ init_fake_id = -1
1817
+
1818
+ # subdags may not have run so wouldn't have node_status file
1819
+ # use dag files and let read_single_node_status handle missing
1820
+ # node_status file.
1821
+ for dag_filename in Path(wms_path).glob("*.dag"):
1822
+ filename = dag_filename.with_suffix(".node_status")
1823
+ info = read_single_node_status(filename, init_fake_id)
1824
+ init_fake_id -= len(info)
1825
+ _update_dicts(jobs, info)
1826
+
1827
+ for dag_filename in Path(wms_path).glob("subdags/*/*.dag"):
1828
+ filename = dag_filename.with_suffix(".node_status")
1829
+ info = read_single_node_status(filename, init_fake_id)
1830
+ init_fake_id -= len(info)
1831
+ _update_dicts(jobs, info)
1832
+
1833
+ # Propagate pruned from subdags to jobs
1834
+ name_to_id: dict[str, str] = {}
1835
+ missing_status: dict[str, list[str]] = {}
1836
+ for id_, job in jobs.items():
1837
+ if job["DAGNodeName"].startswith("wms_"):
1838
+ name_to_id[job["DAGNodeName"]] = id_
1839
+ if "NodeStatus" not in job or job["NodeStatus"] == NodeStatus.NOT_READY:
1840
+ missing_status.setdefault(job["from_dag_job"], []).append(id_)
1841
+
1842
+ for name, dag_id in name_to_id.items():
1843
+ dag_status = jobs[dag_id].get("NodeStatus", NodeStatus.NOT_READY)
1844
+ if dag_status in {NodeStatus.NOT_READY, NodeStatus.FUTILE}:
1845
+ for id_ in missing_status.get(name, []):
1846
+ jobs[id_]["NodeStatus"] = dag_status
1493
1847
 
1494
1848
  return jobs
1495
1849
 
1496
1850
 
1497
- def read_dag_log(wms_path: str) -> tuple[str, dict[str, Any]]:
1851
+ def read_single_dag_log(log_filename: str | os.PathLike) -> tuple[str, dict[str, dict[str, Any]]]:
1498
1852
  """Read job information from the DAGMan log file.
1499
1853
 
1500
1854
  Parameters
1501
1855
  ----------
1502
- wms_path : `str`
1503
- Path containing the DAGMan log file.
1856
+ log_filename : `str` or `os.PathLike`
1857
+ DAGMan log filename.
1504
1858
 
1505
1859
  Returns
1506
1860
  -------
1507
1861
  wms_workflow_id : `str`
1508
1862
  HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job.
1509
- dag_info : `dict` [`str`, `~collections.abc.Any`]
1863
+ dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1510
1864
  HTCondor job information read from the log file mapped to HTCondor
1511
1865
  job id.
1512
1866
 
@@ -1515,25 +1869,21 @@ def read_dag_log(wms_path: str) -> tuple[str, dict[str, Any]]:
1515
1869
  FileNotFoundError
1516
1870
  If cannot find DAGMan log in given wms_path.
1517
1871
  """
1518
- wms_workflow_id = 0
1519
- dag_info = {}
1872
+ wms_workflow_id = "0"
1873
+ dag_info: dict[str, dict[str, Any]] = {}
1520
1874
 
1521
- path = Path(wms_path)
1522
- if path.exists():
1523
- try:
1524
- filename = next(path.glob("*.dag.dagman.log"))
1525
- except StopIteration as exc:
1526
- raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc
1875
+ filename = Path(log_filename)
1876
+ if filename.exists():
1527
1877
  _LOG.debug("dag node log filename: %s", filename)
1528
1878
 
1529
- info = {}
1879
+ info: dict[str, Any] = {}
1530
1880
  job_event_log = htcondor.JobEventLog(str(filename))
1531
1881
  for event in job_event_log.events(stop_after=0):
1532
1882
  id_ = f"{event['Cluster']}.{event['Proc']}"
1533
1883
  if id_ not in info:
1534
1884
  info[id_] = {}
1535
1885
  wms_workflow_id = id_ # taking last job id in case of restarts
1536
- info[id_].update(event)
1886
+ _update_dicts(info[id_], event)
1537
1887
  info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1538
1888
 
1539
1889
  # only save latest DAG job
@@ -1544,17 +1894,53 @@ def read_dag_log(wms_path: str) -> tuple[str, dict[str, Any]]:
1544
1894
  return wms_workflow_id, dag_info
1545
1895
 
1546
1896
 
1547
- def read_dag_nodes_log(wms_path):
1897
+ def read_dag_log(wms_path: str | os.PathLike) -> tuple[str, dict[str, Any]]:
1898
+ """Read job information from the DAGMan log file.
1899
+
1900
+ Parameters
1901
+ ----------
1902
+ wms_path : `str` or `os.PathLike`
1903
+ Path containing the DAGMan log file.
1904
+
1905
+ Returns
1906
+ -------
1907
+ wms_workflow_id : `str`
1908
+ HTCondor job id (i.e., <ClusterId>.<ProcId>) of the DAGMan job.
1909
+ dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1910
+ HTCondor job information read from the log file mapped to HTCondor
1911
+ job id.
1912
+
1913
+ Raises
1914
+ ------
1915
+ FileNotFoundError
1916
+ If cannot find DAGMan log in given wms_path.
1917
+ """
1918
+ wms_workflow_id = MISSING_ID
1919
+ dag_info: dict[str, dict[str, Any]] = {}
1920
+
1921
+ path = Path(wms_path)
1922
+ if path.exists():
1923
+ try:
1924
+ filename = next(path.glob("*.dag.dagman.log"))
1925
+ except StopIteration as exc:
1926
+ raise FileNotFoundError(f"DAGMan log not found in {wms_path}") from exc
1927
+ _LOG.debug("dag node log filename: %s", filename)
1928
+ wms_workflow_id, dag_info = read_single_dag_log(filename)
1929
+
1930
+ return wms_workflow_id, dag_info
1931
+
1932
+
1933
+ def read_single_dag_nodes_log(filename: str | os.PathLike) -> dict[str, dict[str, Any]]:
1548
1934
  """Read job information from the DAGMan nodes log file.
1549
1935
 
1550
1936
  Parameters
1551
1937
  ----------
1552
- wms_path : `str`
1938
+ filename : `str` or `os.PathLike`
1553
1939
  Path containing the DAGMan nodes log file.
1554
1940
 
1555
1941
  Returns
1556
1942
  -------
1557
- info : `dict` [`str`, Any]
1943
+ info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1558
1944
  HTCondor job information read from the log file mapped to HTCondor
1559
1945
  job id.
1560
1946
 
@@ -1563,20 +1949,46 @@ def read_dag_nodes_log(wms_path):
1563
1949
  FileNotFoundError
1564
1950
  If cannot find DAGMan node log in given wms_path.
1565
1951
  """
1566
- try:
1567
- filename = next(Path(wms_path).glob("*.dag.nodes.log"))
1568
- except StopIteration as exc:
1569
- raise FileNotFoundError(f"DAGMan node log not found in {wms_path}") from exc
1570
1952
  _LOG.debug("dag node log filename: %s", filename)
1953
+ filename = Path(filename)
1954
+
1955
+ info: dict[str, dict[str, Any]] = {}
1956
+ if not filename.exists():
1957
+ raise FileNotFoundError(f"{filename} does not exist")
1958
+
1959
+ try:
1960
+ job_event_log = htcondor.JobEventLog(str(filename))
1961
+ except htcondor.HTCondorIOError as ex:
1962
+ _LOG.error("Problem reading nodes log file (%s): %s", filename, ex)
1963
+ import traceback
1571
1964
 
1572
- info = {}
1573
- job_event_log = htcondor.JobEventLog(str(filename))
1965
+ traceback.print_stack()
1966
+ raise
1574
1967
  for event in job_event_log.events(stop_after=0):
1575
- id_ = f"{event['Cluster']}.{event['Proc']}"
1576
- if id_ not in info:
1577
- info[id_] = {}
1578
- info[id_].update(event)
1579
- info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1968
+ _LOG.debug("log event type = %s, keys = %s", event["EventTypeNumber"], event.keys())
1969
+
1970
+ try:
1971
+ id_ = f"{event['Cluster']}.{event['Proc']}"
1972
+ except KeyError:
1973
+ _LOG.warn(
1974
+ "Log event missing ids (DAGNodeName=%s, EventTime=%s, EventTypeNumber=%s)",
1975
+ event.get("DAGNodeName", "UNK"),
1976
+ event.get("EventTime", "UNK"),
1977
+ event.get("EventTypeNumber", "UNK"),
1978
+ )
1979
+ else:
1980
+ if id_ not in info:
1981
+ info[id_] = {}
1982
+ # Workaround: Please check to see if still problem in
1983
+ # future HTCondor versions. Sometimes get a
1984
+ # JobAbortedEvent for a subdag job after it already
1985
+ # terminated normally. Seems to happen when using job
1986
+ # plus subdags.
1987
+ if event["EventTypeNumber"] == 9 and info[id_].get("EventTypeNumber", -1) == 5:
1988
+ _LOG.debug("Skipping spurious JobAbortedEvent: %s", dict(event))
1989
+ else:
1990
+ _update_dicts(info[id_], event)
1991
+ info[id_][f"{event.type.name.lower()}_time"] = event["EventTime"]
1580
1992
 
1581
1993
  # Add more condor_q-like info to info parsed from log file.
1582
1994
  for job in info.values():
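
The workaround above drops a JobAbortedEvent (event type 9) that shows up after the same job already logged a JobTerminatedEvent (event type 5), something that has been seen when a DAG mixes regular jobs with subdags. A stand-alone illustration of the guard using plain dictionaries:

    info = {"123.0": {"EventTypeNumber": 5}}                    # job already terminated normally
    event = {"Cluster": 123, "Proc": 0, "EventTypeNumber": 9}   # late, spurious abort event

    id_ = f"{event['Cluster']}.{event['Proc']}"
    if event["EventTypeNumber"] == 9 and info[id_].get("EventTypeNumber", -1) == 5:
        print("Skipping spurious JobAbortedEvent")              # keep the terminated info
    else:
        info[id_].update(event)
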
@@ -1585,17 +1997,54 @@ def read_dag_nodes_log(wms_path):
1585
1997
  return info
1586
1998
 
1587
1999
 
1588
- def read_dag_info(wms_path):
2000
+ def read_dag_nodes_log(wms_path: str | os.PathLike) -> dict[str, dict[str, Any]]:
2001
+ """Read job information from the DAGMan nodes log file.
2002
+
2003
+ Parameters
2004
+ ----------
2005
+ wms_path : `str` or `os.PathLike`
2006
+ Path containing the DAGMan nodes log file.
2007
+
2008
+ Returns
2009
+ -------
2010
+ info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
2011
+ HTCondor job information read from the log file mapped to HTCondor
2012
+ job id.
2013
+
2014
+ Raises
2015
+ ------
2016
+ FileNotFoundError
2017
+ If cannot find DAGMan node log in given wms_path.
2018
+ """
2019
+ info: dict[str, dict[str, Any]] = {}
2020
+ for filename in Path(wms_path).glob("*.dag.nodes.log"):
2021
+ _LOG.debug("dag node log filename: %s", filename)
2022
+ _update_dicts(info, read_single_dag_nodes_log(filename))
2023
+
2024
+ # If submitted, the main nodes log file should exist
2025
+ if not info:
2026
+ raise FileNotFoundError(f"DAGMan node log not found in {wms_path}")
2027
+
2028
+ # Subdags will not have dag nodes log files if they haven't
2029
+ # started running yet (so missing is not an error).
2030
+ for filename in Path(wms_path).glob("subdags/*/*.dag.nodes.log"):
2031
+ _LOG.debug("dag node log filename: %s", filename)
2032
+ _update_dicts(info, read_single_dag_nodes_log(filename))
2033
+
2034
+ return info
2035
+
2036
+
2037
+ def read_dag_info(wms_path: str | os.PathLike) -> dict[str, dict[str, Any]]:
1589
2038
  """Read custom DAGMan job information from the file.
1590
2039
 
1591
2040
  Parameters
1592
2041
  ----------
1593
- wms_path : `str`
2042
+ wms_path : `str` or `os.PathLike`
1594
2043
  Path containing the file with the DAGMan job info.
1595
2044
 
1596
2045
  Returns
1597
2046
  -------
1598
- dag_info : `dict` [`str`, `dict` [`str`, Any]]
2047
+ dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1599
2048
  HTCondor job information.
1600
2049
 
1601
2050
  Raises
@@ -1603,6 +2052,7 @@ def read_dag_info(wms_path):
1603
2052
  FileNotFoundError
1604
2053
  If cannot find DAGMan job info file in the given location.
1605
2054
  """
2055
+ dag_info: dict[str, dict[str, Any]] = {}
1606
2056
  try:
1607
2057
  filename = next(Path(wms_path).glob("*.info.json"))
1608
2058
  except StopIteration as exc:
@@ -1613,7 +2063,6 @@ def read_dag_info(wms_path):
1613
2063
  dag_info = json.load(fh)
1614
2064
  except (OSError, PermissionError) as exc:
1615
2065
  _LOG.debug("Retrieving DAGMan job information failed: %s", exc)
1616
- dag_info = {}
1617
2066
  return dag_info
1618
2067
 
1619
2068
 
@@ -1624,7 +2073,7 @@ def write_dag_info(filename, dag_info):
1624
2073
  ----------
1625
2074
  filename : `str`
1626
2075
  Name of the file where the information will be stored.
1627
- dag_info : `dict` [`str` `dict` [`str`, Any]]
2076
+ dag_info : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
1628
2077
  Information about the DAGMan job.
1629
2078
  """
1630
2079
  schedd_name = next(iter(dag_info))
@@ -1647,7 +2096,7 @@ def _tweak_log_info(filename, job):
1647
2096
  ----------
1648
2097
  filename : `pathlib.Path`
1649
2098
  Name of the DAGMan log.
1650
- job : `dict` [ `str`, Any ]
2099
+ job : `dict` [ `str`, `~typing.Any` ]
1651
2100
  A mapping between HTCondor job id and job information read from
1652
2101
  the log.
1653
2102
  """
@@ -1661,37 +2110,47 @@ def _tweak_log_info(filename, job):
1661
2110
 
1662
2111
  match job["MyType"]:
1663
2112
  case "ExecuteEvent":
1664
- job["JobStatus"] = JobStatus.RUNNING
2113
+ job["JobStatus"] = htcondor.JobStatus.RUNNING
1665
2114
  case "JobTerminatedEvent" | "PostScriptTerminatedEvent":
1666
- job["JobStatus"] = JobStatus.COMPLETED
2115
+ job["JobStatus"] = htcondor.JobStatus.COMPLETED
1667
2116
  case "SubmitEvent":
1668
- job["JobStatus"] = JobStatus.IDLE
2117
+ job["JobStatus"] = htcondor.JobStatus.IDLE
1669
2118
  case "JobAbortedEvent":
1670
- job["JobStatus"] = JobStatus.REMOVED
2119
+ job["JobStatus"] = htcondor.JobStatus.REMOVED
1671
2120
  case "JobHeldEvent":
1672
- job["JobStatus"] = JobStatus.HELD
2121
+ job["JobStatus"] = htcondor.JobStatus.HELD
2122
+ case "JobReleaseEvent":
2123
+ # Shows up as the last event if a DAG job was held and released,
2124
+ # so assume the job is running. If a regular job is released,
2125
+ # there will be other events, so JobReleaseEvent won't be the last.
2126
+ job["JobStatus"] = htcondor.JobStatus.RUNNING
1673
2127
  case _:
1674
2128
  _LOG.debug("Unknown log event type: %s", job["MyType"])
1675
- job["JobStatus"] = JobStatus.UNEXPANDED
2129
+ job["JobStatus"] = None
1676
2130
 
1677
- if job["JobStatus"] in {JobStatus.COMPLETED, JobStatus.HELD}:
2131
+ if job["JobStatus"] in {htcondor.JobStatus.COMPLETED, htcondor.JobStatus.HELD}:
1678
2132
  new_job = HTC_JOB_AD_HANDLERS.handle(job)
1679
2133
  if new_job is not None:
1680
2134
  job = new_job
1681
2135
  else:
1682
2136
  _LOG.error("Could not determine exit status for job '%s.%s'", job["ClusterId"], job["ProcId"])
1683
2137
 
2138
+ if "LogNotes" in job:
2139
+ m = re.match(r"DAG Node: (\S+)", job["LogNotes"])
2140
+ if m:
2141
+ job["DAGNodeName"] = m.group(1)
2142
+
1684
2143
  except KeyError as e:
1685
2144
  _LOG.error("Missing key %s in job: %s", str(e), job)
1686
2145
  raise
1687
2146
 
1688
2147
 
1689
- def htc_check_dagman_output(wms_path):
2148
+ def htc_check_dagman_output(wms_path: str | os.PathLike) -> str:
1690
2149
  """Check the DAGMan output for error messages.
1691
2150
 
1692
2151
  Parameters
1693
2152
  ----------
1694
- wms_path : `str`
2153
+ wms_path : `str` or `os.PathLike`
1695
2154
  Directory containing the DAGman output file.
1696
2155
 
1697
2156
  Returns
@@ -1711,32 +2170,176 @@ def htc_check_dagman_output(wms_path):
1711
2170
  raise FileNotFoundError(f"DAGMan standard output file not found in {wms_path}") from exc
1712
2171
  _LOG.debug("dag output filename: %s", filename)
1713
2172
 
2173
+ p = re.compile(r"^(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) (Job submit try \d+/\d+ failed|Warning:.*$|ERROR:.*$)")
2174
+
1714
2175
  message = ""
1715
2176
  try:
1716
2177
  with open(filename) as fh:
1717
- last_submit_failed = ""
2178
+ last_submit_failed = "" # Since submission is retried multiple times, only report the last one
1718
2179
  for line in fh:
1719
- m = re.match(r"(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) Job submit try \d+/\d+ failed", line)
2180
+ m = p.match(line)
1720
2181
  if m:
1721
- last_submit_failed = m.group(1)
1722
- else:
1723
- m = re.search(r"Warning: (.+)", line)
1724
- if m:
1725
- if ".dag.nodes.log is in /tmp" in m.group(1):
2182
+ if m.group(2).startswith("Job submit try"):
2183
+ last_submit_failed = m.group(1)
2184
+ elif m.group(2).startswith("ERROR: submit attempt failed"):
2185
+ pass # Should be handled by Job submit try
2186
+ elif m.group(2).startswith("Warning"):
2187
+ if ".dag.nodes.log is in /tmp" in m.group(2):
1726
2188
  last_warning = "Cannot submit from /tmp."
1727
2189
  else:
1728
- last_warning = m.group(1)
2190
+ last_warning = m.group(2)
2191
+ elif m.group(2) == "ERROR: Warning is fatal error because of DAGMAN_USE_STRICT setting":
2192
+ message += "ERROR: "
2193
+ message += last_warning
2194
+ message += "\n"
2195
+ elif m.group(2) in [
2196
+ "ERROR: the following job(s) failed:",
2197
+ "ERROR: the following Node(s) failed:",
2198
+ ]:
2199
+ pass
1729
2200
  else:
1730
- m = re.search(r"(ERROR: .+)", line)
1731
- if m:
1732
- if (
1733
- m.group(1)
1734
- == "ERROR: Warning is fatal error because of DAGMAN_USE_STRICT setting"
1735
- ):
1736
- message += f"ERROR: {last_warning}"
2201
+ message += m.group(2)
2202
+ message += "\n"
2203
+
1737
2204
  if last_submit_failed:
1738
2205
  message += f"Warn: Job submission issues (last: {last_submit_failed})"
1739
2206
  except (OSError, PermissionError):
1740
2207
  message = f"Warn: Could not read dagman output file from {wms_path}."
1741
2208
  _LOG.debug("dag output file message: %s", message)
1742
2209
  return message
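The rewritten scan above folds the separate searches into one anchored pattern that captures the timestamp and the interesting remainder of each dagman .out line. A quick check of that pattern against two invented lines:

    import re

    p = re.compile(
        r"^(\d\d/\d\d/\d\d \d\d:\d\d:\d\d) (Job submit try \d+/\d+ failed|Warning:.*$|ERROR:.*$)"
    )
    for line in [
        "01/02/25 03:04:05 Job submit try 3/6 failed, will try again in >= 40 seconds.",
        "01/02/25 03:04:06 Warning: the DAG node log a.dag.nodes.log is in /tmp",
    ]:
        m = p.match(line)
        if m:
            print(m.group(1), "->", m.group(2))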
2210
+
2211
+
2212
+ def _read_rescue_headers(infh: TextIO) -> tuple[list[str], list[str]]:
2213
+ """Read header lines from a rescue file.
2214
+
2215
+ Parameters
2216
+ ----------
2217
+ infh : `TextIO`
2218
+ The rescue file from which to read the header lines.
2219
+
2220
+ Returns
2221
+ -------
2222
+ header_lines : `list` [`str`]
2223
+ Header lines read from the rescue file.
2224
+ failed_subdags : `list` [`str`]
2225
+ Names of failed subdag jobs.
2226
+ """
2227
+ header_lines: list[str] = []
2228
+ failed = False
2229
+ failed_subdags: list[str] = []
2230
+
2231
+ for line in infh:
2232
+ line = line.strip()
2233
+ if line.startswith("#"):
2234
+ if line.startswith("# Nodes that failed:"):
2235
+ failed = True
2236
+ header_lines.append(line)
2237
+ elif failed:
2238
+ orig_failed_nodes = line[1:].strip().split(",")
2239
+ new_failed_nodes = []
2240
+ for node in orig_failed_nodes:
2241
+ if node.startswith("wms_check_status"):
2242
+ group_node = node[17:]
2243
+ failed_subdags.append(group_node)
2244
+ new_failed_nodes.append(group_node)
2245
+ else:
2246
+ new_failed_nodes.append(node)
2247
+ header_lines.append(f"# {','.join(new_failed_nodes)}")
2248
+ if orig_failed_nodes[-1] == "<ENDLIST>":
2249
+ failed = False
2250
+ else:
2251
+ header_lines.append(line)
2252
+ elif line.strip() == "": # end of headers
2253
+ break
2254
+ return header_lines, failed_subdags
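_read_rescue_headers walks the comment header at the top of a DAGMan rescue file and rewrites the node list that follows a "# Nodes that failed:" line: any wms_check_status_<name> entry is recorded as a failed subdag and replaced by <name>. A sketch driving it with an invented header via io.StringIO, assuming the function is in scope:

    import io

    sample = io.StringIO(
        "# Total number of Nodes: 5\n"
        "# Nodes premarked DONE: 3\n"
        "# Nodes that failed: 2\n"
        "#   wms_check_status_group1,job2,<ENDLIST>\n"
        "\n"
        "DONE job0\n"
        "DONE job1\n"
    )
    headers, failed_subdags = _read_rescue_headers(sample)  # function assumed in scope
    print(failed_subdags)   # -> ['group1']
    print(headers[-1])      # -> # group1,job2,<ENDLIST>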
2255
+
2256
+
2257
+ def _write_rescue_headers(header_lines: list[str], failed_subdags: list[str], outfh: TextIO) -> None:
2258
+ """Write the header lines to the new rescue file.
2259
+
2260
+ Parameters
2261
+ ----------
2262
+ header_lines : `list` [`str`]
2263
+ Header lines to write to the new rescue file.
2264
+ failed_subdags : `list` [`str`]
2265
+ Job names of the failed subdags.
2266
+ outfh : `TextIO`
2267
+ New rescue file.
2268
+ """
2269
+ done_str = "# Nodes premarked DONE"
2270
+ pattern = f"^{done_str}:\\s+(\\d+)"
2271
+ for header_line in header_lines:
2272
+ m = re.match(pattern, header_line)
2273
+ if m:
2274
+ print(f"{done_str}: {int(m.group(1)) - len(failed_subdags)}", file=outfh)
2275
+ else:
2276
+ print(header_line, file=outfh)
2277
+
2278
+ print("", file=outfh)
2279
+
2280
+
2281
+ def _copy_done_lines(failed_subdags: list[str], infh: TextIO, outfh: TextIO) -> None:
2282
+ """Copy the DONE lines from the original rescue file skipping
2283
+ the failed group jobs.
2284
+
2285
+ Parameters
2286
+ ----------
2287
+ failed_subdags : `list` [`str`]
2288
+ List of job names for the failed subdags.
2289
+ infh : `TextIO`
2290
+ Original rescue file to copy from.
2291
+ outfh : `TextIO`
2292
+ New rescue file to copy to.
2293
+ """
2294
+ for line in infh:
2295
+ line = line.strip()
2296
+ try:
2297
+ _, node_name = line.split()
2298
+ except ValueError:
2299
+ _LOG.error(f"Unexpected line in rescue file = '{line}'")
2300
+ raise
2301
+ if node_name not in failed_subdags:
2302
+ print(line, file=outfh)
2303
+
2304
+
2305
+ def _update_rescue_file(rescue_file: Path) -> None:
2306
+ """Update the subdag failures in the main rescue file
2307
+ and backup the failed subdag dirs.
2308
+
2309
+ Parameters
2310
+ ----------
2311
+ rescue_file : `pathlib.Path`
2312
+ The main rescue file that needs to be updated.
2313
+ """
2314
+ # To reduce memory requirements, avoid reading the entire file into memory.
2315
+ rescue_tmp = rescue_file.with_suffix(rescue_file.suffix + ".tmp")
2316
+ with open(rescue_file) as infh:
2317
+ header_lines, failed_subdags = _read_rescue_headers(infh)
2318
+ with open(rescue_tmp, "w") as outfh:
2319
+ _write_rescue_headers(header_lines, failed_subdags, outfh)
2320
+ _copy_done_lines(failed_subdags, infh, outfh)
2321
+ rescue_file.unlink()
2322
+ rescue_tmp.rename(rescue_file)
2323
+ for failed_subdag in failed_subdags:
2324
+ htc_backup_files(
2325
+ rescue_file.parent / "subdags" / failed_subdag, subdir=f"backups/subdags/{failed_subdag}"
2326
+ )
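_update_rescue_file rewrites the rescue file through a .tmp sibling and then swaps it into place before backing up the failed subdag directories with htc_backup_files. For a typical DAGMan rescue file name the temporary path works out as below (the name itself is invented):

    from pathlib import Path

    rescue_file = Path("/submit/dir/pipeline.dag.rescue001")  # made-up name
    rescue_tmp = rescue_file.with_suffix(rescue_file.suffix + ".tmp")
    print(rescue_tmp.name)  # -> pipeline.dag.rescue001.tmp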
2327
+
2328
+
2329
+ def _update_dicts(dict1, dict2):
2330
+ """Update dict1 with info in dict2.
2331
+
2332
+ (Basically an update for nested dictionaries.)
2333
+
2334
+ Parameters
2335
+ ----------
2336
+ dict1 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
2337
+ HTCondor job information to be updated.
2338
+ dict2 : `dict` [`str`, `dict` [`str`, `~typing.Any`]]
2339
+ Additional HTCondor job information.
2340
+ """
2341
+ for key, value in dict2.items():
2342
+ if key in dict1 and isinstance(dict1[key], dict) and isinstance(value, dict):
2343
+ _update_dicts(dict1[key], value)
2344
+ else:
2345
+ dict1[key] = value
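_update_dicts is a recursive merge: when both sides hold a dictionary under the same key the two are merged key by key, otherwise the value from dict2 wins. A quick example with invented values, assuming the function is in scope:

    info = {"123.0": {"JobStatus": 1, "Owner": "someone"}}
    extra = {"123.0": {"JobStatus": 2}, "124.0": {"JobStatus": 1}}
    _update_dicts(info, extra)  # function assumed in scope
    print(info)
    # -> {'123.0': {'JobStatus': 2, 'Owner': 'someone'}, '124.0': {'JobStatus': 1}}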