hpcflow-new2 0.2.0a190__py3-none-any.whl → 0.2.0a200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. hpcflow/__pyinstaller/hook-hpcflow.py +1 -0
  2. hpcflow/_version.py +1 -1
  3. hpcflow/data/scripts/bad_script.py +2 -0
  4. hpcflow/data/scripts/do_nothing.py +2 -0
  5. hpcflow/data/scripts/env_specifier_test/input_file_generator_pass_env_spec.py +4 -0
  6. hpcflow/data/scripts/env_specifier_test/main_script_test_pass_env_spec.py +8 -0
  7. hpcflow/data/scripts/env_specifier_test/output_file_parser_pass_env_spec.py +4 -0
  8. hpcflow/data/scripts/env_specifier_test/v1/input_file_generator_basic.py +4 -0
  9. hpcflow/data/scripts/env_specifier_test/v1/main_script_test_direct_in_direct_out.py +7 -0
  10. hpcflow/data/scripts/env_specifier_test/v1/output_file_parser_basic.py +4 -0
  11. hpcflow/data/scripts/env_specifier_test/v2/main_script_test_direct_in_direct_out.py +7 -0
  12. hpcflow/data/scripts/input_file_generator_basic.py +3 -0
  13. hpcflow/data/scripts/input_file_generator_basic_FAIL.py +3 -0
  14. hpcflow/data/scripts/input_file_generator_test_stdout_stderr.py +8 -0
  15. hpcflow/data/scripts/main_script_test_direct_in.py +3 -0
  16. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2.py +6 -0
  17. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed.py +6 -0
  18. hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed_group.py +7 -0
  19. hpcflow/data/scripts/main_script_test_direct_in_direct_out_3.py +6 -0
  20. hpcflow/data/scripts/main_script_test_direct_in_group_direct_out_3.py +6 -0
  21. hpcflow/data/scripts/main_script_test_direct_in_group_one_fail_direct_out_3.py +6 -0
  22. hpcflow/data/scripts/main_script_test_hdf5_in_obj_2.py +12 -0
  23. hpcflow/data/scripts/main_script_test_json_out_FAIL.py +3 -0
  24. hpcflow/data/scripts/main_script_test_shell_env_vars.py +12 -0
  25. hpcflow/data/scripts/main_script_test_std_out_std_err.py +6 -0
  26. hpcflow/data/scripts/output_file_parser_basic.py +3 -0
  27. hpcflow/data/scripts/output_file_parser_basic_FAIL.py +7 -0
  28. hpcflow/data/scripts/output_file_parser_test_stdout_stderr.py +8 -0
  29. hpcflow/data/scripts/script_exit_test.py +5 -0
  30. hpcflow/data/template_components/environments.yaml +1 -1
  31. hpcflow/sdk/__init__.py +5 -0
  32. hpcflow/sdk/app.py +166 -92
  33. hpcflow/sdk/cli.py +263 -84
  34. hpcflow/sdk/cli_common.py +99 -5
  35. hpcflow/sdk/config/callbacks.py +38 -1
  36. hpcflow/sdk/config/config.py +102 -13
  37. hpcflow/sdk/config/errors.py +19 -5
  38. hpcflow/sdk/config/types.py +3 -0
  39. hpcflow/sdk/core/__init__.py +25 -1
  40. hpcflow/sdk/core/actions.py +914 -262
  41. hpcflow/sdk/core/cache.py +76 -34
  42. hpcflow/sdk/core/command_files.py +14 -128
  43. hpcflow/sdk/core/commands.py +35 -6
  44. hpcflow/sdk/core/element.py +122 -50
  45. hpcflow/sdk/core/errors.py +58 -2
  46. hpcflow/sdk/core/execute.py +207 -0
  47. hpcflow/sdk/core/loop.py +408 -50
  48. hpcflow/sdk/core/loop_cache.py +4 -4
  49. hpcflow/sdk/core/parameters.py +382 -37
  50. hpcflow/sdk/core/run_dir_files.py +13 -40
  51. hpcflow/sdk/core/skip_reason.py +7 -0
  52. hpcflow/sdk/core/task.py +119 -30
  53. hpcflow/sdk/core/task_schema.py +68 -0
  54. hpcflow/sdk/core/test_utils.py +66 -27
  55. hpcflow/sdk/core/types.py +54 -1
  56. hpcflow/sdk/core/utils.py +136 -19
  57. hpcflow/sdk/core/workflow.py +1587 -356
  58. hpcflow/sdk/data/workflow_spec_schema.yaml +2 -0
  59. hpcflow/sdk/demo/cli.py +7 -0
  60. hpcflow/sdk/helper/cli.py +1 -0
  61. hpcflow/sdk/log.py +42 -15
  62. hpcflow/sdk/persistence/base.py +405 -53
  63. hpcflow/sdk/persistence/json.py +177 -52
  64. hpcflow/sdk/persistence/pending.py +237 -69
  65. hpcflow/sdk/persistence/store_resource.py +3 -2
  66. hpcflow/sdk/persistence/types.py +15 -4
  67. hpcflow/sdk/persistence/zarr.py +928 -81
  68. hpcflow/sdk/submission/jobscript.py +1408 -489
  69. hpcflow/sdk/submission/schedulers/__init__.py +40 -5
  70. hpcflow/sdk/submission/schedulers/direct.py +33 -19
  71. hpcflow/sdk/submission/schedulers/sge.py +51 -16
  72. hpcflow/sdk/submission/schedulers/slurm.py +44 -16
  73. hpcflow/sdk/submission/schedulers/utils.py +7 -2
  74. hpcflow/sdk/submission/shells/base.py +68 -20
  75. hpcflow/sdk/submission/shells/bash.py +222 -129
  76. hpcflow/sdk/submission/shells/powershell.py +200 -150
  77. hpcflow/sdk/submission/submission.py +852 -119
  78. hpcflow/sdk/submission/types.py +18 -21
  79. hpcflow/sdk/typing.py +24 -5
  80. hpcflow/sdk/utils/arrays.py +71 -0
  81. hpcflow/sdk/utils/deferred_file.py +55 -0
  82. hpcflow/sdk/utils/hashing.py +16 -0
  83. hpcflow/sdk/utils/patches.py +12 -0
  84. hpcflow/sdk/utils/strings.py +33 -0
  85. hpcflow/tests/api/test_api.py +32 -0
  86. hpcflow/tests/conftest.py +19 -0
  87. hpcflow/tests/data/benchmark_script_runner.yaml +26 -0
  88. hpcflow/tests/data/multi_path_sequences.yaml +29 -0
  89. hpcflow/tests/data/workflow_test_run_abort.yaml +34 -35
  90. hpcflow/tests/schedulers/sge/test_sge_submission.py +36 -0
  91. hpcflow/tests/scripts/test_input_file_generators.py +282 -0
  92. hpcflow/tests/scripts/test_main_scripts.py +821 -70
  93. hpcflow/tests/scripts/test_non_snippet_script.py +46 -0
  94. hpcflow/tests/scripts/test_ouput_file_parsers.py +353 -0
  95. hpcflow/tests/shells/wsl/test_wsl_submission.py +6 -0
  96. hpcflow/tests/unit/test_action.py +176 -0
  97. hpcflow/tests/unit/test_app.py +20 -0
  98. hpcflow/tests/unit/test_cache.py +46 -0
  99. hpcflow/tests/unit/test_cli.py +133 -0
  100. hpcflow/tests/unit/test_config.py +122 -1
  101. hpcflow/tests/unit/test_element_iteration.py +47 -0
  102. hpcflow/tests/unit/test_jobscript_unit.py +757 -0
  103. hpcflow/tests/unit/test_loop.py +1332 -27
  104. hpcflow/tests/unit/test_meta_task.py +325 -0
  105. hpcflow/tests/unit/test_multi_path_sequences.py +229 -0
  106. hpcflow/tests/unit/test_parameter.py +13 -0
  107. hpcflow/tests/unit/test_persistence.py +190 -8
  108. hpcflow/tests/unit/test_run.py +109 -3
  109. hpcflow/tests/unit/test_run_directories.py +29 -0
  110. hpcflow/tests/unit/test_shell.py +20 -0
  111. hpcflow/tests/unit/test_submission.py +5 -76
  112. hpcflow/tests/unit/test_workflow_template.py +31 -0
  113. hpcflow/tests/unit/utils/test_arrays.py +40 -0
  114. hpcflow/tests/unit/utils/test_deferred_file_writer.py +34 -0
  115. hpcflow/tests/unit/utils/test_hashing.py +65 -0
  116. hpcflow/tests/unit/utils/test_patches.py +5 -0
  117. hpcflow/tests/unit/utils/test_redirect_std.py +50 -0
  118. hpcflow/tests/workflows/__init__.py +0 -0
  119. hpcflow/tests/workflows/test_directory_structure.py +31 -0
  120. hpcflow/tests/workflows/test_jobscript.py +332 -0
  121. hpcflow/tests/workflows/test_run_status.py +198 -0
  122. hpcflow/tests/workflows/test_skip_downstream.py +696 -0
  123. hpcflow/tests/workflows/test_submission.py +140 -0
  124. hpcflow/tests/workflows/test_workflows.py +142 -2
  125. hpcflow/tests/workflows/test_zip.py +18 -0
  126. hpcflow/viz_demo.ipynb +6587 -3
  127. {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a200.dist-info}/METADATA +7 -4
  128. hpcflow_new2-0.2.0a200.dist-info/RECORD +222 -0
  129. hpcflow_new2-0.2.0a190.dist-info/RECORD +0 -165
  130. {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a200.dist-info}/LICENSE +0 -0
  131. {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a200.dist-info}/WHEEL +0 -0
  132. {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a200.dist-info}/entry_points.txt +0 -0
hpcflow/sdk/core/element.py

@@ -4,12 +4,23 @@ Elements are components of tasks.
 
 from __future__ import annotations
 import copy
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, fields
+from operator import attrgetter
 from itertools import chain
 import os
-from typing import cast, overload, TYPE_CHECKING
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    cast,
+    overload,
+    TYPE_CHECKING,
+)
 
 from hpcflow.sdk.core.enums import ParallelMode
+from hpcflow.sdk.core.skip_reason import SkipReason
 from hpcflow.sdk.core.errors import UnsupportedOSError, UnsupportedSchedulerError
 from hpcflow.sdk.core.json_like import ChildObjectSpec, JSONLike
 from hpcflow.sdk.core.loop_cache import LoopIndex
@@ -23,6 +34,7 @@ from hpcflow.sdk.core.utils import (
 )
 from hpcflow.sdk.log import TimeIt
 from hpcflow.sdk.submission.shells import get_shell
+from hpcflow.sdk.utils.hashing import get_hash
 
 if TYPE_CHECKING:
     from collections.abc import Iterable, Iterator, Mapping, Sequence
@@ -270,6 +282,12 @@ class ElementResources(JSONLike):
         Whether to use array jobs.
     max_array_items: int
         If using array jobs, up to how many items should be in the job array.
+    write_app_logs: bool
+        Whether an app log file should be written.
+    combine_jobscript_std: bool
+        Whether jobscript standard output and error streams should be combined.
+    combine_scripts: bool
+        Whether Python scripts should be combined.
     time_limit: str
         How long to run for.
     scheduler_args: dict[str, Any]
@@ -280,6 +298,13 @@ class ElementResources(JSONLike):
         Which OS to use.
     environments: dict
         Which execution environments to use.
+    resources_id: int
+        An arbitrary integer that can be used to force multiple jobscripts.
+    skip_downstream_on_failure: bool
+        Whether to skip downstream dependents on failure.
+    allow_failed_dependencies: int | float | bool | None
+        The failure tolerance with respect to dependencies, specified as a number or
+        proportion.
     SGE_parallel_env: str
         Which SGE parallel environment to request.
     SLURM_partition: str
@@ -317,6 +342,12 @@ class ElementResources(JSONLike):
     use_job_array: bool | None = None
     #: If using array jobs, up to how many items should be in the job array.
     max_array_items: int | None = None
+    #: Whether an app log file should be written.
+    write_app_logs: bool = False
+    #: Whether jobscript standard output and error streams should be combined.
+    combine_jobscript_std: bool = field(default_factory=lambda: os.name != "nt")
+    #: Whether Python scripts should be combined.
+    combine_scripts: bool | None = None
     #: How long to run for.
     time_limit: str | None = None
 
@@ -328,6 +359,13 @@ class ElementResources(JSONLike):
     os_name: str | None = None
     #: Which execution environments to use.
    environments: dict[str, dict[str, Any]] | None = None
+    #: An arbitrary integer that can be used to force multiple jobscripts.
+    resources_id: int | None = None
+    #: Whether to skip downstream dependents on failure.
+    skip_downstream_on_failure: bool = True
+    #: The failure tolerance with respect to dependencies, specified as a number or
+    #: proportion.
+    allow_failed_dependencies: int | float | bool | None = False
 
     # SGE scheduler specific:
     #: Which SGE parallel environment to request.
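
A note on the platform-dependent default above: `field(default_factory=lambda: os.name != "nt")` is evaluated when each resources object is created, so `combine_jobscript_std` defaults to True on POSIX and False on Windows (where it is not yet supported; see `validate_against_machine` below). A minimal standalone sketch of the same pattern; the `Demo` class is illustrative, not hpcflow code:

import os
from dataclasses import dataclass, field

@dataclass
class Demo:
    # same pattern as `combine_jobscript_std` above: the default is computed
    # at instantiation time, on the machine constructing the object
    combine_jobscript_std: bool = field(default_factory=lambda: os.name != "nt")

print(Demo().combine_jobscript_std)  # True on POSIX, False on Windows
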
@@ -357,37 +395,34 @@ class ElementResources(JSONLike):
         if self.parallel_mode:
             self.parallel_mode = get_enum_by_name_or_val(ParallelMode, self.parallel_mode)
 
-    def __eq__(self, other: Any) -> bool:
-        return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
+        self.scheduler_args = self.scheduler_args or {}
+        self.shell_args = self.shell_args or {}
+
+    def __eq__(self, other) -> bool:
+        if type(self) != type(other):
+            return False
+        else:
+            return self.__dict__ == other.__dict__
 
+    @TimeIt.decorator
     def get_jobscript_hash(self) -> int:
         """Get hash from all arguments that distinguish jobscripts."""
 
-        def _hash_dict(d: dict) -> int:
-            if not d:
-                return -1
-            keys, vals = zip(*d.items())
-            return hash(tuple((keys, vals)))
+        exclude = ["time_limit", "skip_downstream_on_failure"]
+        if not self.combine_scripts:
+            # usually environment selection need not distinguish jobscripts because
+            # environments become effective/active within the command files, but if we
+            # are combining scripts, then the environments must be the same:
+            exclude.append("environments")
 
-        exclude = {"time_limit"}
         dct = {k: copy.deepcopy(v) for k, v in self.__dict__.items() if k not in exclude}
 
-        scheduler_args = dct["scheduler_args"]
-        shell_args = dct["shell_args"]
-        envs = dct["environments"]
-
-        if "options" in scheduler_args:
-            dct["scheduler_args"]["options"] = _hash_dict(scheduler_args["options"])
-        dct["scheduler_args"] = _hash_dict(dct["scheduler_args"])
-
-        dct["shell_args"] = _hash_dict(shell_args)
-
-        if isinstance(envs, dict):
-            for k, v in envs.items():
-                dct["environments"][k] = _hash_dict(v)
-            dct["environments"] = _hash_dict(dct["environments"])
+        # `combine_scripts==False` and `combine_scripts==None` should have an equivalent
+        # contribution to the hash, so always set it to `False` if unset at this point:
+        if self.combine_scripts is None:
+            dct["combine_scripts"] = False
 
-        return _hash_dict(dct)
+        return get_hash(dct)
 
     @property
     def is_parallel(self) -> bool:
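
The removed `_hash_dict` helper handled only one level of nesting, which is why `scheduler_args["options"]` and each entry of `environments` needed special-casing. The new `get_hash` (from `hpcflow/sdk/utils/hashing.py`, added in this release) evidently hashes nested structures generically. A rough standalone sketch of the idea, assuming an order-insensitive recursive hash; this is not the actual implementation:

def get_hash_sketch(obj) -> int:
    # dicts: order-insensitive hash of recursively-hashed items
    if isinstance(obj, dict):
        return hash(frozenset((k, get_hash_sketch(v)) for k, v in obj.items()))
    # lists/tuples: order-sensitive
    if isinstance(obj, (list, tuple)):
        return hash(tuple(get_hash_sketch(i) for i in obj))
    return hash(obj)

# nested dicts such as `scheduler_args["options"]` need no special-casing:
assert get_hash_sketch({"a": {"b": 1}}) == get_hash_sketch({"a": {"b": 1}})
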
@@ -416,6 +451,7 @@ class ElementResources(JSONLike):
         return ("num_cores",)  # TODO: filter on `parallel_mode` later
 
     @staticmethod
+    @TimeIt.decorator
     def get_default_os_name() -> str:
         """
         Get the default value for OS name.
@@ -423,6 +459,7 @@ class ElementResources(JSONLike):
         return os.name
 
     @classmethod
+    @TimeIt.decorator
    def get_default_shell(cls) -> str:
         """
         Get the default value for shell name.
@@ -430,6 +467,7 @@ class ElementResources(JSONLike):
         return cls._app.config.default_shell
 
     @classmethod
+    @TimeIt.decorator
     def get_default_scheduler(cls, os_name: str, shell_name: str) -> str:
         """
         Get the default value for scheduler.
@@ -439,6 +477,7 @@ class ElementResources(JSONLike):
             return "direct_posix"
         return cls._app.config.default_scheduler
 
+    @TimeIt.decorator
     def set_defaults(self):
         """
         Set defaults for unspecified values that need defaults.
@@ -464,9 +503,11 @@
         cfg_defs = cfg_sched.get("defaults", {})
         cfg_opts = cfg_defs.pop("options", {})
         opts = {**cfg_opts, **self.scheduler_args.get("options", {})}
-        self.scheduler_args["options"] = opts
+        if opts:
+            self.scheduler_args["options"] = opts
         self.scheduler_args = {**cfg_defs, **self.scheduler_args}
 
+    @TimeIt.decorator
     def validate_against_machine(self):
         """Validate the values for `os_name`, `shell` and `scheduler` against those
         supported on this machine (as specified by the app configuration)."""
@@ -477,6 +518,12 @@
             scheduler=self.scheduler,
             supported=self._app.config.schedulers,
         )
+
+        if self.os_name == "nt" and self.combine_jobscript_std:
+            raise NotImplementedError(
+                "`combine_jobscript_std` is not yet supported on Windows."
+            )
+
         # might raise `UnsupportedShellError`:
         get_shell(shell_name=self.shell, os_name=self.os_name)
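
In `set_defaults` above, user-supplied scheduler options take precedence over configured defaults because later entries win in a dict merge; the change additionally avoids writing an empty `options` key. A worked example (the option names are hypothetical):

cfg_opts = {"partition": "compute", "mem": "4G"}  # hypothetical config defaults
user_opts = {"mem": "16G"}                        # hypothetical scheduler_args options
opts = {**cfg_opts, **user_opts}
assert opts == {"partition": "compute", "mem": "16G"}  # user value wins
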
@@ -626,10 +673,21 @@ class ElementIteration(AppAware):
     @property
     def EAR_IDs(self) -> Mapping[int, Sequence[int]]:
         """
-        Mapping from iteration number to EAR ID, where known.
+        Mapping from action index to EAR ID, where known.
         """
         return self._EAR_IDs
 
+    @property
+    def loop_skipped(self) -> bool:
+        """True if the iteration was skipped entirely due to a loop termination."""
+        if not self.action_runs:
+            # this includes when runs are not initialised
+            return False
+        else:
+            return all(
+                i.skip_reason is SkipReason.LOOP_TERMINATION for i in self.action_runs
+            )
+
     @property
     def EAR_IDs_flat(self) -> Iterable[int]:
         """
@@ -1256,6 +1314,10 @@
                 resources["os_name"], resources["shell"]
             )
 
+        # unset inapplicable items:
+        if "combine_scripts" in resources and not action.script_is_python_snippet:
+            del resources["combine_scripts"]
+
         return resources
 
     def get_resources_obj(
@@ -1446,47 +1508,60 @@ class Element(AppAware):
         """
         return self.iterations[-1]
 
+    @property
+    def latest_iteration_non_skipped(self):
+        """Get the latest iteration that is not loop-skipped."""
+        for iter_i in self.iterations[::-1]:
+            if not iter_i.loop_skipped:
+                return iter_i
+
     @property
     def inputs(self) -> ElementInputs:
         """
-        The inputs to this element (or its most recent iteration).
+        The inputs to this element's most recent iteration (that was not skipped due to
+        loop termination).
         """
-        return self.latest_iteration.inputs
+        return self.latest_iteration_non_skipped.inputs
 
     @property
     def outputs(self) -> ElementOutputs:
         """
-        The outputs from this element (or its most recent iteration).
+        The outputs from this element's most recent iteration (that was not skipped due to
+        loop termination).
         """
-        return self.latest_iteration.outputs
+        return self.latest_iteration_non_skipped.outputs
 
     @property
     def input_files(self) -> ElementInputFiles:
         """
-        The input files to this element (or its most recent iteration).
+        The input files to this element's most recent iteration (that was not skipped due
+        to loop termination).
         """
-        return self.latest_iteration.input_files
+        return self.latest_iteration_non_skipped.input_files
 
     @property
     def output_files(self) -> ElementOutputFiles:
         """
-        The output files from this element (or its most recent iteration).
+        The output files from this element's most recent iteration (that was not skipped
+        due to loop termination).
         """
-        return self.latest_iteration.output_files
+        return self.latest_iteration_non_skipped.output_files
 
     @property
     def schema_parameters(self) -> Sequence[str]:
         """
-        The schema-defined parameters to this element (or its most recent iteration).
+        The schema-defined parameters to this element's most recent iteration (that was
+        not skipped due to loop termination).
         """
-        return self.latest_iteration.schema_parameters
+        return self.latest_iteration_non_skipped.schema_parameters
 
     @property
     def actions(self) -> Mapping[int, ElementAction]:
         """
-        The actions of this element (or its most recent iteration).
+        The actions of this element's most recent iteration (that was not skipped due to
+        loop termination).
         """
-        return self.latest_iteration.actions
+        return self.latest_iteration_non_skipped.actions
 
     @property
     def action_runs(self) -> Sequence[ElementActionRun]:
@@ -1494,13 +1569,7 @@
         A list of element action runs from the latest iteration, where only the
         final run is taken for each element action.
         """
-        return self.latest_iteration.action_runs
-
-    def init_loop_index(self, loop_name: str) -> None:
-        """
-        Initialise the loop index if necessary.
-        """
-        pass
+        return self.latest_iteration_non_skipped.action_runs
 
     def to_element_set_data(self) -> tuple[list[InputValue], list[ResourceSpec]]:
         """Generate lists of workflow-bound InputValues and ResourceList."""
@@ -1550,14 +1619,15 @@
         action_idx: int | None = None,
         run_idx: int = -1,
     ) -> DataIndex:
-        """Get the data index of the most recent element iteration.
+        """Get the data index of the most recent element iteration that
+        is not loop-skipped.
 
         Parameters
         ----------
         action_idx
             The index of the action within the schema.
         """
-        return self.latest_iteration.get_data_idx(
+        return self.latest_iteration_non_skipped.get_data_idx(
             path=path,
             action_idx=action_idx,
             run_idx=run_idx,
@@ -1633,8 +1703,9 @@
         raise_on_missing: bool = False,
         raise_on_unset: bool = False,
     ) -> Any:
-        """Get element data of the most recent iteration from the persistent store."""
-        return self.latest_iteration.get(
+        """Get element data of the most recent iteration that is not
+        loop-skipped."""
+        return self.latest_iteration_non_skipped.get(
             path=path,
             action_idx=action_idx,
             run_idx=run_idx,
@@ -1651,6 +1722,7 @@
     def get_EAR_dependencies(self, as_objects: Literal[False] = False) -> set[int]:
         ...
 
+    @TimeIt.decorator
     def get_EAR_dependencies(
         self, as_objects: bool = False
     ) -> set[int] | list[ElementActionRun]:
hpcflow/sdk/core/errors.py

@@ -9,10 +9,12 @@ from textwrap import indent
 from typing import Any, TYPE_CHECKING
 
 if TYPE_CHECKING:
+    from logging import Logger
     from .enums import ParallelMode
     from .object_list import WorkflowLoopList
-    from .parameters import InputSource, ValueSequence
+    from .parameters import InputSource, ValueSequence, SchemaInput
     from .types import ScriptData
+    from .task import WorkflowTask
 
 
 class InputValueDuplicateSequenceAddress(ValueError):
@@ -410,7 +412,13 @@ class WorkflowLimitsError(ValueError):
     # FIXME: never used
 
 
-class UnsetParameterDataError(Exception):
+class UnsetParameterDataErrorBase(Exception):
+    """
+    Exceptions related to attempts to retrieve unset parameters.
+    """
+
+
+class UnsetParameterDataError(UnsetParameterDataErrorBase):
     """
     Tried to read from an unset parameter.
     """
@@ -422,6 +430,50 @@ class UnsetParameterDataError(Exception):
         )
 
 
+class UnsetParameterFractionLimitExceededError(UnsetParameterDataErrorBase):
+    """
+    Given the specified `allow_failed_dependencies`, the fraction of failed dependencies
+    (unset parameter data) is too high."""
+
+    def __init__(
+        self,
+        schema_inp: SchemaInput,
+        task: WorkflowTask,
+        unset_fraction: float,
+        log: Logger | None = None,
+    ):
+        msg = (
+            f"Input {schema_inp.parameter.typ!r} of task {task.name!r}: higher "
+            f"proportion of dependencies failed ({unset_fraction!r}) than allowed "
+            f"({schema_inp.allow_failed_dependencies!r})."
+        )
+        if log:
+            log.info(msg)
+        super().__init__(msg)
+
+
+class UnsetParameterNumberLimitExceededError(UnsetParameterDataErrorBase):
+    """
+    Given the specified `allow_failed_dependencies`, the number of failed dependencies
+    (unset parameter data) is too high."""
+
+    def __init__(
+        self,
+        schema_inp: SchemaInput,
+        task: WorkflowTask,
+        unset_num: int,
+        log: Logger | None = None,
+    ):
+        msg = (
+            f"Input {schema_inp.parameter.typ!r} of task {task.name!r}: higher number of "
+            f"dependencies failed ({unset_num!r}) than allowed "
+            f"({schema_inp.allow_failed_dependencies!r})."
+        )
+        if log:
+            log.info(msg)
+        super().__init__(msg)
+
+
 class LoopAlreadyExistsError(Exception):
     """
     A particular loop (or its name) already exists.
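
Together, these two classes indicate how `allow_failed_dependencies` is interpreted: a float is a maximum allowed proportion of failed dependencies, an int a maximum allowed number, with `False`/`None` tolerating none and `True` presumably tolerating any. A hypothetical sketch of a check that might raise them; `check_failed_dependencies`, `n_unset` and `n_total` are illustrative names, not hpcflow API:

def check_failed_dependencies(schema_inp, task, n_unset, n_total, log=None):
    allowed = schema_inp.allow_failed_dependencies
    if allowed is True:
        return  # any number of failed dependencies tolerated
    if isinstance(allowed, bool) or allowed is None:
        allowed = 0  # False/None: no failures tolerated
    if isinstance(allowed, float):
        frac = n_unset / n_total  # compare as a proportion
        if frac > allowed:
            raise UnsetParameterFractionLimitExceededError(schema_inp, task, frac, log)
    elif n_unset > allowed:  # compare as an absolute count
        raise UnsetParameterNumberLimitExceededError(schema_inp, task, n_unset, log)
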
@@ -730,6 +782,10 @@ class MissingParameterData(_MissingStoreItemError):
         super().__init__(id_lst, self._item_type)
 
 
+class ParametersMetadataReadOnlyError(RuntimeError):
+    pass
+
+
 class NotSubmitMachineError(RuntimeError):
     """
     The requested machine can't be submitted to.
hpcflow/sdk/core/execute.py (new file)

@@ -0,0 +1,207 @@
+import asyncio
+import os
+import queue
+import struct
+import threading
+import time
+
+import zmq
+
+from hpcflow.sdk.core.app_aware import AppAware
+
+
+class Executor(AppAware):
+    def __init__(self, cmd, env, package_name):
+
+        # TODO: make zmq_server optional (but required if action is abortable, or if
+        # `script_data_in`/`out` is "zeromq")
+
+        self.cmd = cmd
+        self.env = env
+        self.package_name = package_name
+
+        # initialise a global ZeroMQ context for use in all threads:
+        zmq.Context()
+
+        self._q = None  # queue for inter-thread communication
+
+        # assigned by `start_zmq_server`:
+        self.port_number = None
+        self.server_thread = None
+
+        # assigned on (non-aborted) completion of the subprocess via `_subprocess_runner`:
+        self.return_code = None
+
+    @property
+    def q(self):
+        if not self._q:
+            self._q = queue.Queue()
+        return self._q
+
+    @property
+    def zmq_context(self):
+        return zmq.Context.instance()
+
+    def _zmq_server(self):
+        """Start a ZeroMQ server on a random port.
+
+        This method is invoked in a separate thread via `start_zmq_server`.
+
+        """
+        socket = self.zmq_context.socket(zmq.REP)
+        port_number = socket.bind_to_random_port("tcp://*")
+        self._app.logger.info(f"zmq_server: started on port {port_number}")
+
+        # send port number back to main thread:
+        self.q.put(port_number)
+
+        self._app.logger.info(f"zmq_server: port number sent to main thread.")
+
+        # TODO: exception handling
+
+        while True:
+            message = socket.recv_string()
+            self._app.logger.info(f"zmq_server: received request: {message}")
+
+            # Check if the received message is a shutdown signal
+            if message in ("shutdown", "abort"):
+                self.q.put(message)
+                socket.send_string("shutting down the server")
+                break
+
+            else:
+                socket.send_string(f"received request: {message}")
+
+        socket.close()
+        self._app.logger.info("zmq_server: server stopped")
+
+    def start_zmq_server(self) -> int:
+
+        # start the server thread
+        server_thread = threading.Thread(target=self._zmq_server)
+        server_thread.start()
+
+        self._app.logger.info(f"server thread started")
+
+        if os.name == "nt":
+            # some sort of race condition seems to exist on Windows, where self.q.get()
+            # will occasionally hang on the Github Actions runners. This seems to resolve
+            # it.
+            time.sleep(0.1)
+
+        # block until port number received:
+        port_number = self.q.get(timeout=5)
+        self._app.logger.info(f"received port number from server thread: {port_number}")
+
+        self.port_number = port_number
+        self.server_thread = server_thread
+
+        return port_number
+
+    def stop_zmq_server(self):
+
+        # send a shutdown signal to the server:
+        socket = self.zmq_context.socket(zmq.REQ)
+        address = f"tcp://localhost:{self.port_number}"
+        socket.connect(address)
+        self._app.logger.info(
+            f"stop_zmq_server: about to send shutdown message to server: {address!r}"
+        )
+        socket.send_string("shutdown")
+        send_shutdown_out = socket.recv()
+        self._app.logger.info(f"stop_zmq_server: received reply: {send_shutdown_out!r}")
+        socket.close()
+
+        # wait for the server thread to finish:
+        self._app.logger.info(f"stop_zmq_server: joining server thread")
+        self.server_thread.join()
+
+        self._app.logger.info(f"stop_zmq_server: terminating ZMQ context")
+        self.zmq_context.term()
+        if self.server_thread.is_alive():
+            raise RuntimeError("Server thread is still alive!")
+
+    def run(self):
+        """Launch the subprocess to execute the commands, and once complete, stop the
+        ZMQ server. Kill the subprocess if a "shutdown" or "abort" message is sent to the
+        server."""
+        asyncio.run(self._run())
+        return self.return_code
+
+    def _receive_stop(self):
+        """Wait until the queue receives a shutdown message from the server"""
+        while True:
+            if self.q.get() in ("shutdown", "abort"):
+                return
+
+    async def _subprocess_runner(self):
+        app_caps = self.package_name.upper()
+        env = {**self.env, f"{app_caps}_RUN_PORT": str(self.port_number)}
+        try:
+            process = await asyncio.create_subprocess_exec(*self.cmd, env=env)
+            self._app.logger.info(
+                f"_subprocess_runner: started subprocess: {process=!r}."
+            )
+            ret_code = await process.wait()
+            self._app.logger.info(
+                f"_subprocess_runner: subprocess finished with return code: {ret_code!r}."
+            )
+            self.return_code = ret_code
+
+        except asyncio.CancelledError:
+            process.kill()
+
+    async def _run(self):
+
+        # create tasks for the subprocess and a synchronous Queue.get retrieval:
+        try:
+            wait_abort_thread = asyncio.to_thread(self._receive_stop)
+        except AttributeError:
+            # Python 3.8
+            from hpcflow.sdk.core.utils import to_thread
+
+            wait_abort_thread = to_thread(self._receive_stop)
+
+        wait_abort_task = asyncio.create_task(wait_abort_thread)
+        subprocess_task = asyncio.create_task(self._subprocess_runner())
+
+        # wait for either: subprocess to finish, or a stop signal from the server:
+        _, pending = await asyncio.wait(
+            [wait_abort_task, subprocess_task],
+            return_when=asyncio.FIRST_COMPLETED,
+        )
+
+        # TODO: test we can SIGTERM and SIGINT the subprocess successfully?
+        # - add an API for sending signals to the process via the server?
+
+        if pending == {wait_abort_task}:
+            # subprocess completed; need to shutdown the server
+            self._app.logger.info(f"_run: subprocess completed; stopping zmq server")
+            self.stop_zmq_server()
+
+        else:
+            # subprocess still running but got a stop request; need to kill subprocess:
+            self._app.logger.info(f"_run: stop request; killing subprocess")
+            subprocess_task.cancel()
+
+        if self.return_code and os.name == "nt":
+            # Windows return codes are defined as 32-bit unsigned integers, but
+            # some programs might still return negative numbers, so convert to a
+            # signed 32-bit integer:
+            self.return_code = struct.unpack("i", struct.pack("I", self.return_code))[0]
+
+    @classmethod
+    def send_abort(cls, hostname, port_number):
+        """Send an abort message to a running server."""
+        context = zmq.Context()
+        socket = context.socket(zmq.REQ)
+        address = f"tcp://{hostname}:{port_number}"
+        socket.connect(address)
+        cls._app.logger.info(
+            f"send_abort: about to send abort message to server: {address!r}"
+        )
+        socket.send_string("abort")
+        abort_rep = socket.recv()
+        cls._app.logger.info(f"send_abort: received reply: {abort_rep!r}")
+        socket.close()
+        context.term()
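
For reference, aborting a run from another process mirrors `send_abort` above: connect a REQ socket to the run's server and send "abort". A minimal client sketch; the hostname and port are assumptions (the port is whatever `start_zmq_server` reported for the run in question):

import zmq

context = zmq.Context()
socket = context.socket(zmq.REQ)
socket.connect("tcp://localhost:55155")  # hypothetical port from start_zmq_server
socket.send_string("abort")
print(socket.recv_string())  # server replies: "shutting down the server"
socket.close()
context.term()
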