hpcflow-new2 0.2.0a190__py3-none-any.whl → 0.2.0a200__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpcflow/__pyinstaller/hook-hpcflow.py +1 -0
- hpcflow/_version.py +1 -1
- hpcflow/data/scripts/bad_script.py +2 -0
- hpcflow/data/scripts/do_nothing.py +2 -0
- hpcflow/data/scripts/env_specifier_test/input_file_generator_pass_env_spec.py +4 -0
- hpcflow/data/scripts/env_specifier_test/main_script_test_pass_env_spec.py +8 -0
- hpcflow/data/scripts/env_specifier_test/output_file_parser_pass_env_spec.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v1/input_file_generator_basic.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v1/main_script_test_direct_in_direct_out.py +7 -0
- hpcflow/data/scripts/env_specifier_test/v1/output_file_parser_basic.py +4 -0
- hpcflow/data/scripts/env_specifier_test/v2/main_script_test_direct_in_direct_out.py +7 -0
- hpcflow/data/scripts/input_file_generator_basic.py +3 -0
- hpcflow/data/scripts/input_file_generator_basic_FAIL.py +3 -0
- hpcflow/data/scripts/input_file_generator_test_stdout_stderr.py +8 -0
- hpcflow/data/scripts/main_script_test_direct_in.py +3 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_2_fail_allowed_group.py +7 -0
- hpcflow/data/scripts/main_script_test_direct_in_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_group_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_direct_in_group_one_fail_direct_out_3.py +6 -0
- hpcflow/data/scripts/main_script_test_hdf5_in_obj_2.py +12 -0
- hpcflow/data/scripts/main_script_test_json_out_FAIL.py +3 -0
- hpcflow/data/scripts/main_script_test_shell_env_vars.py +12 -0
- hpcflow/data/scripts/main_script_test_std_out_std_err.py +6 -0
- hpcflow/data/scripts/output_file_parser_basic.py +3 -0
- hpcflow/data/scripts/output_file_parser_basic_FAIL.py +7 -0
- hpcflow/data/scripts/output_file_parser_test_stdout_stderr.py +8 -0
- hpcflow/data/scripts/script_exit_test.py +5 -0
- hpcflow/data/template_components/environments.yaml +1 -1
- hpcflow/sdk/__init__.py +5 -0
- hpcflow/sdk/app.py +166 -92
- hpcflow/sdk/cli.py +263 -84
- hpcflow/sdk/cli_common.py +99 -5
- hpcflow/sdk/config/callbacks.py +38 -1
- hpcflow/sdk/config/config.py +102 -13
- hpcflow/sdk/config/errors.py +19 -5
- hpcflow/sdk/config/types.py +3 -0
- hpcflow/sdk/core/__init__.py +25 -1
- hpcflow/sdk/core/actions.py +914 -262
- hpcflow/sdk/core/cache.py +76 -34
- hpcflow/sdk/core/command_files.py +14 -128
- hpcflow/sdk/core/commands.py +35 -6
- hpcflow/sdk/core/element.py +122 -50
- hpcflow/sdk/core/errors.py +58 -2
- hpcflow/sdk/core/execute.py +207 -0
- hpcflow/sdk/core/loop.py +408 -50
- hpcflow/sdk/core/loop_cache.py +4 -4
- hpcflow/sdk/core/parameters.py +382 -37
- hpcflow/sdk/core/run_dir_files.py +13 -40
- hpcflow/sdk/core/skip_reason.py +7 -0
- hpcflow/sdk/core/task.py +119 -30
- hpcflow/sdk/core/task_schema.py +68 -0
- hpcflow/sdk/core/test_utils.py +66 -27
- hpcflow/sdk/core/types.py +54 -1
- hpcflow/sdk/core/utils.py +136 -19
- hpcflow/sdk/core/workflow.py +1587 -356
- hpcflow/sdk/data/workflow_spec_schema.yaml +2 -0
- hpcflow/sdk/demo/cli.py +7 -0
- hpcflow/sdk/helper/cli.py +1 -0
- hpcflow/sdk/log.py +42 -15
- hpcflow/sdk/persistence/base.py +405 -53
- hpcflow/sdk/persistence/json.py +177 -52
- hpcflow/sdk/persistence/pending.py +237 -69
- hpcflow/sdk/persistence/store_resource.py +3 -2
- hpcflow/sdk/persistence/types.py +15 -4
- hpcflow/sdk/persistence/zarr.py +928 -81
- hpcflow/sdk/submission/jobscript.py +1408 -489
- hpcflow/sdk/submission/schedulers/__init__.py +40 -5
- hpcflow/sdk/submission/schedulers/direct.py +33 -19
- hpcflow/sdk/submission/schedulers/sge.py +51 -16
- hpcflow/sdk/submission/schedulers/slurm.py +44 -16
- hpcflow/sdk/submission/schedulers/utils.py +7 -2
- hpcflow/sdk/submission/shells/base.py +68 -20
- hpcflow/sdk/submission/shells/bash.py +222 -129
- hpcflow/sdk/submission/shells/powershell.py +200 -150
- hpcflow/sdk/submission/submission.py +852 -119
- hpcflow/sdk/submission/types.py +18 -21
- hpcflow/sdk/typing.py +24 -5
- hpcflow/sdk/utils/arrays.py +71 -0
- hpcflow/sdk/utils/deferred_file.py +55 -0
- hpcflow/sdk/utils/hashing.py +16 -0
- hpcflow/sdk/utils/patches.py +12 -0
- hpcflow/sdk/utils/strings.py +33 -0
- hpcflow/tests/api/test_api.py +32 -0
- hpcflow/tests/conftest.py +19 -0
- hpcflow/tests/data/benchmark_script_runner.yaml +26 -0
- hpcflow/tests/data/multi_path_sequences.yaml +29 -0
- hpcflow/tests/data/workflow_test_run_abort.yaml +34 -35
- hpcflow/tests/schedulers/sge/test_sge_submission.py +36 -0
- hpcflow/tests/scripts/test_input_file_generators.py +282 -0
- hpcflow/tests/scripts/test_main_scripts.py +821 -70
- hpcflow/tests/scripts/test_non_snippet_script.py +46 -0
- hpcflow/tests/scripts/test_ouput_file_parsers.py +353 -0
- hpcflow/tests/shells/wsl/test_wsl_submission.py +6 -0
- hpcflow/tests/unit/test_action.py +176 -0
- hpcflow/tests/unit/test_app.py +20 -0
- hpcflow/tests/unit/test_cache.py +46 -0
- hpcflow/tests/unit/test_cli.py +133 -0
- hpcflow/tests/unit/test_config.py +122 -1
- hpcflow/tests/unit/test_element_iteration.py +47 -0
- hpcflow/tests/unit/test_jobscript_unit.py +757 -0
- hpcflow/tests/unit/test_loop.py +1332 -27
- hpcflow/tests/unit/test_meta_task.py +325 -0
- hpcflow/tests/unit/test_multi_path_sequences.py +229 -0
- hpcflow/tests/unit/test_parameter.py +13 -0
- hpcflow/tests/unit/test_persistence.py +190 -8
- hpcflow/tests/unit/test_run.py +109 -3
- hpcflow/tests/unit/test_run_directories.py +29 -0
- hpcflow/tests/unit/test_shell.py +20 -0
- hpcflow/tests/unit/test_submission.py +5 -76
- hpcflow/tests/unit/test_workflow_template.py +31 -0
- hpcflow/tests/unit/utils/test_arrays.py +40 -0
- hpcflow/tests/unit/utils/test_deferred_file_writer.py +34 -0
- hpcflow/tests/unit/utils/test_hashing.py +65 -0
- hpcflow/tests/unit/utils/test_patches.py +5 -0
- hpcflow/tests/unit/utils/test_redirect_std.py +50 -0
- hpcflow/tests/workflows/__init__.py +0 -0
- hpcflow/tests/workflows/test_directory_structure.py +31 -0
- hpcflow/tests/workflows/test_jobscript.py +332 -0
- hpcflow/tests/workflows/test_run_status.py +198 -0
- hpcflow/tests/workflows/test_skip_downstream.py +696 -0
- hpcflow/tests/workflows/test_submission.py +140 -0
- hpcflow/tests/workflows/test_workflows.py +142 -2
- hpcflow/tests/workflows/test_zip.py +18 -0
- hpcflow/viz_demo.ipynb +6587 -3
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a200.dist-info}/METADATA +7 -4
- hpcflow_new2-0.2.0a200.dist-info/RECORD +222 -0
- hpcflow_new2-0.2.0a190.dist-info/RECORD +0 -165
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a200.dist-info}/LICENSE +0 -0
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a200.dist-info}/WHEEL +0 -0
- {hpcflow_new2-0.2.0a190.dist-info → hpcflow_new2-0.2.0a200.dist-info}/entry_points.txt +0 -0
hpcflow/sdk/core/element.py
CHANGED
@@ -4,12 +4,23 @@ Elements are components of tasks.
 
 from __future__ import annotations
 import copy
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, fields
+from operator import attrgetter
 from itertools import chain
 import os
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    cast,
+    overload,
+    TYPE_CHECKING,
+)
 
 from hpcflow.sdk.core.enums import ParallelMode
+from hpcflow.sdk.core.skip_reason import SkipReason
 from hpcflow.sdk.core.errors import UnsupportedOSError, UnsupportedSchedulerError
 from hpcflow.sdk.core.json_like import ChildObjectSpec, JSONLike
 from hpcflow.sdk.core.loop_cache import LoopIndex
@@ -23,6 +34,7 @@ from hpcflow.sdk.core.utils import (
 )
 from hpcflow.sdk.log import TimeIt
 from hpcflow.sdk.submission.shells import get_shell
+from hpcflow.sdk.utils.hashing import get_hash
 
 if TYPE_CHECKING:
     from collections.abc import Iterable, Iterator, Mapping, Sequence
@@ -270,6 +282,12 @@ class ElementResources(JSONLike):
        Whether to use array jobs.
    max_array_items: int
        If using array jobs, up to how many items should be in the job array.
+    write_app_logs: bool
+        Whether an app log file should be written.
+    combine_jobscript_std: bool
+        Whether jobscript standard output and error streams should be combined.
+    combine_scripts: bool
+        Whether Python scripts should be combined.
    time_limit: str
        How long to run for.
    scheduler_args: dict[str, Any]
@@ -280,6 +298,13 @@ class ElementResources(JSONLike):
        Which OS to use.
    environments: dict
        Which execution environments to use.
+    resources_id: int
+        An arbitrary integer that can be used to force multiple jobscripts.
+    skip_downstream_on_failure: bool
+        Whether to skip downstream dependents on failure.
+    allow_failed_dependencies: int | float | bool | None
+        The failure tolerance with respect to dependencies, specified as a number or
+        proportion.
    SGE_parallel_env: str
        Which SGE parallel environment to request.
    SLURM_partition: str
@@ -317,6 +342,12 @@ class ElementResources(JSONLike):
     use_job_array: bool | None = None
     #: If using array jobs, up to how many items should be in the job array.
     max_array_items: int | None = None
+    #: Whether an app log file should be written.
+    write_app_logs: bool = False
+    #: Whether jobscript standard output and error streams should be combined.
+    combine_jobscript_std: bool = field(default_factory=lambda: os.name != "nt")
+    #: Whether Python scripts should be combined.
+    combine_scripts: bool | None = None
     #: How long to run for.
     time_limit: str | None = None
 
@@ -328,6 +359,13 @@ class ElementResources(JSONLike):
     os_name: str | None = None
     #: Which execution environments to use.
     environments: dict[str, dict[str, Any]] | None = None
+    #: An arbitrary integer that can be used to force multiple jobscripts.
+    resources_id: int | None = None
+    #: Whether to skip downstream dependents on failure.
+    skip_downstream_on_failure: bool = True
+    #: The failure tolerance with respect to dependencies, specified as a number or
+    #: proportion.
+    allow_failed_dependencies: int | float | bool | None = False
 
     # SGE scheduler specific:
     #: Which SGE parallel environment to request.
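
Taken together, the new `ElementResources` fields control app-log writing, jobscript stream combination, script combination, jobscript grouping, and failure propagation. A minimal sketch of setting them (illustrative only: direct construction is shown, whereas in practice these values are set via a task's `resources` specification; `num_cores` is the only other field assumed here):

    res = ElementResources(
        num_cores=2,
        write_app_logs=True,                # write an app log file for each run
        combine_scripts=True,               # combine Python snippet scripts
        resources_id=7,                     # arbitrary ID to force a separate jobscript
        skip_downstream_on_failure=False,   # do not skip dependents on failure
    )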
@@ -357,37 +395,34 @@ class ElementResources(JSONLike):
         if self.parallel_mode:
             self.parallel_mode = get_enum_by_name_or_val(ParallelMode, self.parallel_mode)
 
-
-
+        self.scheduler_args = self.scheduler_args or {}
+        self.shell_args = self.shell_args or {}
+
+    def __eq__(self, other) -> bool:
+        if type(self) != type(other):
+            return False
+        else:
+            return self.__dict__ == other.__dict__
 
+    @TimeIt.decorator
     def get_jobscript_hash(self) -> int:
         """Get hash from all arguments that distinguish jobscripts."""
 
-
-
-
-
-
+        exclude = ["time_limit", "skip_downstream_on_failure"]
+        if not self.combine_scripts:
+            # usually environment selection need not distinguish jobscripts because
+            # environments become effective/active within the command files, but if we
+            # are combining scripts, then the environments must be the same:
+            exclude.append("environments")
 
-        exclude = {"time_limit"}
         dct = {k: copy.deepcopy(v) for k, v in self.__dict__.items() if k not in exclude}
 
-
-
-
-
-        if "options" in scheduler_args:
-            dct["scheduler_args"]["options"] = _hash_dict(scheduler_args["options"])
-        dct["scheduler_args"] = _hash_dict(dct["scheduler_args"])
-
-        dct["shell_args"] = _hash_dict(shell_args)
-
-        if isinstance(envs, dict):
-            for k, v in envs.items():
-                dct["environments"][k] = _hash_dict(v)
-        dct["environments"] = _hash_dict(dct["environments"])
+        # `combine_scripts==False` and `combine_scripts==None` should have an equivalent
+        # contribution to the hash, so always set it to `False` if unset at this point:
+        if self.combine_scripts is None:
+            dct["combine_scripts"] = False
 
-        return
+        return get_hash(dct)
 
     @property
     def is_parallel(self) -> bool:
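
The rewritten `get_jobscript_hash` delegates to the new `get_hash` utility and excludes fields that should not force separate jobscripts. A sketch of the implied behaviour (inferred from the `exclude` list above, not from a test in this diff): two specifications differing only in an excluded field such as `time_limit` hash identically and can therefore share a jobscript:

    a = ElementResources(num_cores=2, time_limit="1:00:00")
    b = ElementResources(num_cores=2, time_limit="2:00:00")
    assert a.get_jobscript_hash() == b.get_jobscript_hash()  # time_limit is excluded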
@@ -416,6 +451,7 @@ class ElementResources(JSONLike):
         return ("num_cores",)  # TODO: filter on `parallel_mode` later
 
     @staticmethod
+    @TimeIt.decorator
     def get_default_os_name() -> str:
         """
         Get the default value for OS name.
@@ -423,6 +459,7 @@ class ElementResources(JSONLike):
         return os.name
 
     @classmethod
+    @TimeIt.decorator
     def get_default_shell(cls) -> str:
         """
         Get the default value for name.
@@ -430,6 +467,7 @@ class ElementResources(JSONLike):
         return cls._app.config.default_shell
 
     @classmethod
+    @TimeIt.decorator
     def get_default_scheduler(cls, os_name: str, shell_name: str) -> str:
         """
         Get the default value for scheduler.
@@ -439,6 +477,7 @@ class ElementResources(JSONLike):
             return "direct_posix"
         return cls._app.config.default_scheduler
 
+    @TimeIt.decorator
     def set_defaults(self):
         """
         Set defaults for unspecified values that need defaults.
@@ -464,9 +503,11 @@ class ElementResources(JSONLike):
             cfg_defs = cfg_sched.get("defaults", {})
             cfg_opts = cfg_defs.pop("options", {})
             opts = {**cfg_opts, **self.scheduler_args.get("options", {})}
-
+            if opts:
+                self.scheduler_args["options"] = opts
             self.scheduler_args = {**cfg_defs, **self.scheduler_args}
 
+    @TimeIt.decorator
     def validate_against_machine(self):
         """Validate the values for `os_name`, `shell` and `scheduler` against those
         supported on this machine (as specified by the app configuration)."""
@@ -477,6 +518,12 @@ class ElementResources(JSONLike):
                 scheduler=self.scheduler,
                 supported=self._app.config.schedulers,
             )
+
+        if self.os_name == "nt" and self.combine_jobscript_std:
+            raise NotImplementedError(
+                "`combine_jobscript_std` is not yet supported on Windows."
+            )
+
         # might raise `UnsupportedShellError`:
         get_shell(shell_name=self.shell, os_name=self.os_name)
 
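
Per the check added above, requesting combined streams on Windows now fails at validation time rather than at submission. A hedged sketch (`validate_against_machine` also validates the shell and scheduler against the app configuration, so it needs a configured app; the `shell` value is illustrative):

    res = ElementResources(os_name="nt", shell="powershell", combine_jobscript_std=True)
    res.validate_against_machine()  # raises NotImplementedError on Windows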
@@ -626,10 +673,21 @@ class ElementIteration(AppAware):
     @property
     def EAR_IDs(self) -> Mapping[int, Sequence[int]]:
         """
-        Mapping from
+        Mapping from action index to EAR ID, where known.
         """
         return self._EAR_IDs
 
+    @property
+    def loop_skipped(self) -> bool:
+        """True if the iteration was skipped entirely due to a loop termination."""
+        if not self.action_runs:
+            # this includes when runs are not initialised
+            return False
+        else:
+            return all(
+                i.skip_reason is SkipReason.LOOP_TERMINATION for i in self.action_runs
+            )
+
     @property
     def EAR_IDs_flat(self) -> Iterable[int]:
         """
@@ -1256,6 +1314,10 @@ class ElementIteration(AppAware):
             resources["os_name"], resources["shell"]
         )
 
+        # unset inapplicable items:
+        if "combine_scripts" in resources and not action.script_is_python_snippet:
+            del resources["combine_scripts"]
+
         return resources
 
     def get_resources_obj(
@@ -1446,47 +1508,60 @@ class Element(AppAware):
         """
         return self.iterations[-1]
 
+    @property
+    def latest_iteration_non_skipped(self):
+        """Get the latest iteration that is not loop-skipped."""
+        for iter_i in self.iterations[::-1]:
+            if not iter_i.loop_skipped:
+                return iter_i
+
     @property
     def inputs(self) -> ElementInputs:
         """
-        The inputs to this element (
+        The inputs to this element's most recent iteration (that was not skipped due to
+        loop termination).
         """
-        return self.
+        return self.latest_iteration_non_skipped.inputs
 
     @property
     def outputs(self) -> ElementOutputs:
         """
-        The outputs from this element (
+        The outputs from this element's most recent iteration (that was not skipped due to
+        loop termination).
         """
-        return self.
+        return self.latest_iteration_non_skipped.outputs
 
     @property
     def input_files(self) -> ElementInputFiles:
         """
-        The input files to this element (
+        The input files to this element's most recent iteration (that was not skipped due
+        to loop termination).
         """
-        return self.
+        return self.latest_iteration_non_skipped.input_files
 
     @property
     def output_files(self) -> ElementOutputFiles:
         """
-        The output files from this element
+        The output files from this element's most recent iteration (that was not skipped
+        due to loop termination).
         """
-        return self.
+        return self.latest_iteration_non_skipped.output_files
 
     @property
     def schema_parameters(self) -> Sequence[str]:
         """
-        The schema-defined parameters to this element
+        The schema-defined parameters to this element's most recent iteration (that was
+        not skipped due to loop termination).
         """
-        return self.
+        return self.latest_iteration_non_skipped.schema_parameters
 
     @property
     def actions(self) -> Mapping[int, ElementAction]:
         """
-        The actions of this element (
+        The actions of this element's most recent iteration (that was not skipped due to
+        loop termination).
         """
-        return self.
+        return self.latest_iteration_non_skipped.actions
 
     @property
     def action_runs(self) -> Sequence[ElementActionRun]:
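
With this hunk, the element-level convenience properties (`inputs`, `outputs`, `input_files`, `output_files`, `schema_parameters`, `actions`) resolve against the latest iteration that was not loop-skipped, rather than simply `iterations[-1]`. An assumed access pattern (the workflow wiring is not part of this diff):

    elem = workflow.tasks[0].elements[0]    # illustrative names
    it = elem.latest_iteration_non_skipped  # newest iteration with loop_skipped == False
    outs = elem.outputs                     # equivalent to it.outputs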
@@ -1494,13 +1569,7 @@ class Element(AppAware):
         A list of element action runs from the latest iteration, where only the
         final run is taken for each element action.
         """
-        return self.
-
-    def init_loop_index(self, loop_name: str) -> None:
-        """
-        Initialise the loop index if necessary.
-        """
-        pass
+        return self.latest_iteration_non_skipped.action_runs
 
     def to_element_set_data(self) -> tuple[list[InputValue], list[ResourceSpec]]:
         """Generate lists of workflow-bound InputValues and ResourceList."""
@@ -1550,14 +1619,15 @@ class Element(AppAware):
         action_idx: int | None = None,
         run_idx: int = -1,
     ) -> DataIndex:
-        """Get the data index of the most recent element iteration
+        """Get the data index of the most recent element iteration that
+        is not loop-skipped.
 
         Parameters
         ----------
         action_idx
             The index of the action within the schema.
         """
-        return self.
+        return self.latest_iteration_non_skipped.get_data_idx(
             path=path,
             action_idx=action_idx,
             run_idx=run_idx,
@@ -1633,8 +1703,9 @@ class Element(AppAware):
         raise_on_missing: bool = False,
         raise_on_unset: bool = False,
     ) -> Any:
-        """Get element data of the most recent iteration
-
+        """Get element data of the most recent iteration that is not
+        loop-skipped."""
+        return self.latest_iteration_non_skipped.get(
             path=path,
             action_idx=action_idx,
             run_idx=run_idx,
@@ -1651,6 +1722,7 @@ class Element(AppAware):
     def get_EAR_dependencies(self, as_objects: Literal[False] = False) -> set[int]:
         ...
 
+    @TimeIt.decorator
     def get_EAR_dependencies(
         self, as_objects: bool = False
     ) -> set[int] | list[ElementActionRun]:
hpcflow/sdk/core/errors.py
CHANGED
@@ -9,10 +9,12 @@ from textwrap import indent
 from typing import Any, TYPE_CHECKING
 
 if TYPE_CHECKING:
+    from logging import Logger
     from .enums import ParallelMode
     from .object_list import WorkflowLoopList
-    from .parameters import InputSource, ValueSequence
+    from .parameters import InputSource, ValueSequence, SchemaInput
     from .types import ScriptData
+    from .task import WorkflowTask
 
 
 class InputValueDuplicateSequenceAddress(ValueError):
@@ -410,7 +412,13 @@ class WorkflowLimitsError(ValueError):
     # FIXME: never used
 
 
-class
+class UnsetParameterDataErrorBase(Exception):
+    """
+    Exceptions related to attempts to retrieve unset parameters.
+    """
+
+
+class UnsetParameterDataError(UnsetParameterDataErrorBase):
     """
     Tried to read from an unset parameter.
     """
@@ -422,6 +430,50 @@ class UnsetParameterDataError(Exception):
         )
 
 
+class UnsetParameterFractionLimitExceededError(UnsetParameterDataErrorBase):
+    """
+    Given the specified `allow_failed_dependencies`, the fraction of failed dependencies
+    (unset parameter data) is too high."""
+
+    def __init__(
+        self,
+        schema_inp: SchemaInput,
+        task: WorkflowTask,
+        unset_fraction: float,
+        log: Logger | None = None,
+    ):
+        msg = (
+            f"Input {schema_inp.parameter.typ!r} of task {task.name!r}: higher "
+            f"proportion of dependencies failed ({unset_fraction!r}) than allowed "
+            f"({schema_inp.allow_failed_dependencies!r})."
+        )
+        if log:
+            log.info(msg)
+        super().__init__(msg)
+
+
+class UnsetParameterNumberLimitExceededError(UnsetParameterDataErrorBase):
+    """
+    Given the specified `allow_failed_dependencies`, the number of failed dependencies
+    (unset parameter data) is too high."""
+
+    def __init__(
+        self,
+        schema_inp: SchemaInput,
+        task: WorkflowTask,
+        unset_num: int,
+        log: Logger | None = None,
+    ):
+        msg = (
+            f"Input {schema_inp.parameter.typ!r} of task {task.name!r}: higher number of "
+            f"dependencies failed ({unset_num!r}) than allowed "
+            f"({schema_inp.allow_failed_dependencies!r})."
+        )
+        if log:
+            log.info(msg)
+        super().__init__(msg)
+
+
 class LoopAlreadyExistsError(Exception):
     """
     A particular loop (or its name) already exists.
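
Both new limit errors derive from `UnsetParameterDataErrorBase`, as does `UnsetParameterDataError` itself, so callers can catch the whole family at once. A hypothetical handler (where the SDK raises these internally is outside this hunk; `run_downstream_task` is a made-up placeholder):

    from hpcflow.sdk.core.errors import UnsetParameterDataErrorBase

    try:
        run_downstream_task()
    except UnsetParameterDataErrorBase as err:
        # covers UnsetParameterDataError and both limit-exceeded variants
        print(f"failed-dependency tolerance exceeded: {err}")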
@@ -730,6 +782,10 @@ class MissingParameterData(_MissingStoreItemError):
         super().__init__(id_lst, self._item_type)
 
 
+class ParametersMetadataReadOnlyError(RuntimeError):
+    pass
+
+
 class NotSubmitMachineError(RuntimeError):
     """
     The requested machine can't be submitted to.
hpcflow/sdk/core/execute.py
ADDED
@@ -0,0 +1,207 @@
+import asyncio
+import os
+import queue
+import struct
+import threading
+import time
+
+import zmq
+
+from hpcflow.sdk.core.app_aware import AppAware
+
+
+class Executor(AppAware):
+    def __init__(self, cmd, env, package_name):
+
+        # TODO: make zmq_server optional (but required if action is abortable, or if
+        # `script_data_in`/`out`` is "zeromq")
+
+        self.cmd = cmd
+        self.env = env
+        self.package_name = package_name
+
+        # initialise a global ZeroMQ context for use in all threads:
+        zmq.Context()
+
+        self._q = None  # queue for inter-thread communication
+
+        # assigned by `start_zmq_server`:
+        self.port_number = None
+        self.server_thread = None
+
+        # assigned on (non-aborted) completion of the subprocess via `_subprocess_runner`:
+        self.return_code = None
+
+    @property
+    def q(self):
+        if not self._q:
+            self._q = queue.Queue()
+        return self._q
+
+    @property
+    def zmq_context(self):
+        return zmq.Context.instance()
+
+    def _zmq_server(self):
+        """Start a ZeroMQ server on a random port.
+
+        This method is invoked in a separate thread via `start_zmq_server`.
+
+        """
+        socket = self.zmq_context.socket(zmq.REP)
+        port_number = socket.bind_to_random_port("tcp://*")
+        self._app.logger.info(f"zmq_server: started on port {port_number}")
+
+        # send port number back to main thread:
+        self.q.put(port_number)
+
+        self._app.logger.info(f"zmq_server: port number sent to main thread.")
+
+        # TODO: exception handling
+
+        while True:
+            message = socket.recv_string()
+            self._app.logger.info(f"zmq_server: received request: {message}")
+
+            # Check if the received message is a shutdown signal
+            if message in ("shutdown", "abort"):
+                self.q.put(message)
+                socket.send_string("shutting down the server")
+                break
+
+            else:
+                socket.send_string(f"received request: {message}")
+
+        socket.close()
+        self._app.logger.info("zmq_server: server stopped")
+
+    def start_zmq_server(self) -> int:
+
+        # start the server thread
+        server_thread = threading.Thread(target=self._zmq_server)
+        server_thread.start()
+
+        self._app.logger.info(f"server thread started")
+
+        if os.name == "nt":
+            # some sort of race condition seems to exist on Windows, where self.q.get()
+            # will occasionally hang on the Github Actions runners. This seems to resolve
+            # it.
+            time.sleep(0.1)
+
+        # block until port number received:
+        port_number = self.q.get(timeout=5)
+        self._app.logger.info(f"received port number from server thread: {port_number}")
+
+        self.port_number = port_number
+        self.server_thread = server_thread
+
+        return port_number
+
+    def stop_zmq_server(self):
+
+        # send a shutdown signal to the server:
+        socket = self.zmq_context.socket(zmq.REQ)
+        address = f"tcp://localhost:{self.port_number}"
+        socket.connect(address)
+        self._app.logger.info(
+            f"stop_zmq_server: about to send shutdown message to server: {address!r}"
+        )
+        socket.send_string("shutdown")
+        send_shutdown_out = socket.recv()
+        self._app.logger.info(f"stop_zmq_server: received reply: {send_shutdown_out!r}")
+        socket.close()
+
+        # wait for the server thread to finish:
+        self._app.logger.info(f"stop_zmq_server: joining server thread")
+        self.server_thread.join()
+
+        self._app.logger.info(f"stop_zmq_server: terminating ZMQ context")
+        self.zmq_context.term()
+        if self.server_thread.is_alive():
+            raise RuntimeError("Server thread is still alive!")
+
+    def run(self):
+        """Launch the subprocess to execute the commands, and once complete, stop the
+        ZMQ server. Kill the subprocess if a "shutdown" or "abort" message is sent to the
+        server."""
+        asyncio.run(self._run())
+        return self.return_code
+
+    def _receive_stop(self):
+        """Wait until the queue receives a shutdown message from the server"""
+        while True:
+            if self.q.get() in ("shutdown", "abort"):
+                return
+
+    async def _subprocess_runner(self):
+        app_caps = self.package_name.upper()
+        env = {**self.env, f"{app_caps}_RUN_PORT": str(self.port_number)}
+        try:
+            process = await asyncio.create_subprocess_exec(*self.cmd, env=env)
+            self._app.logger.info(
+                f"_subprocess_runner: started subprocess: {process=!r}."
+            )
+            ret_code = await process.wait()
+            self._app.logger.info(
+                f"_subprocess_runner: subprocess finished with return code: {ret_code!r}."
+            )
+            self.return_code = ret_code
+
+        except asyncio.CancelledError:
+            process.kill()
+
+    async def _run(self):
+
+        # create tasks for the subprocess and a synchronous Queue.get retrieval:
+        try:
+            wait_abort_thread = asyncio.to_thread(self._receive_stop)
+        except AttributeError:
+            # Python 3.8
+            from hpcflow.sdk.core.utils import to_thread
+
+            wait_abort_thread = to_thread(self._receive_stop)
+
+        wait_abort_task = asyncio.create_task(wait_abort_thread)
+        subprocess_task = asyncio.create_task(self._subprocess_runner())
+
+        # wait for either: subprocess to finish, or a stop signal from the server:
+        _, pending = await asyncio.wait(
+            [wait_abort_task, subprocess_task],
+            return_when=asyncio.FIRST_COMPLETED,
+        )
+
+        # TODO: test we can SIGTERM and SIGINT the subprocess successfully?
+        # - add an API for sending signals to the process via the server?
+
+        if pending == {wait_abort_task}:
+            # subprocess completed; need to shutdown the server
+            self._app.logger.info(f"_run: subprocess completed; stopping zmq server")
+            self.stop_zmq_server()
+
+        else:
+            # subprocess still running but got a stop request; need to kill subprocess:
+            self._app.logger.info(f"_run: stop request; killing subprocess")
+            subprocess_task.cancel()
+
+        if self.return_code and os.name == "nt":
+            # Windows return codes are defined as 32-bit unsigned integers, but
+            # some programs might still return negative numbers, so convert to a
+            # signed 32-bit integer:
+            self.return_code = struct.unpack("i", struct.pack("I", self.return_code))[0]
+
+    @classmethod
+    def send_abort(cls, hostname, port_number):
+        """Send an abort message to a running server."""
+        context = zmq.Context()
+        socket = context.socket(zmq.REQ)
+        address = f"tcp://{hostname}:{port_number}"
+        socket.connect(address)
+        cls._app.logger.info(
+            f"send_abort: about to send abort message to server: {address!r}"
+        )
+        socket.send_string("abort")
+        abort_rep = socket.recv()
+        cls._app.logger.info(f"send_abort: received reply: {abort_rep!r}")
+        socket.close()
+        context.term()