parsl 2025.9.8__py3-none-any.whl → 2025.11.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/app/bash.py +1 -1
- parsl/benchmark/perf.py +73 -17
- parsl/concurrent/__init__.py +95 -14
- parsl/curvezmq.py +0 -16
- parsl/data_provider/globus.py +3 -1
- parsl/dataflow/dflow.py +106 -204
- parsl/dataflow/memoization.py +146 -19
- parsl/dataflow/states.py +5 -5
- parsl/executors/base.py +2 -2
- parsl/executors/execute_task.py +2 -8
- parsl/executors/flux/executor.py +4 -6
- parsl/executors/globus_compute.py +0 -4
- parsl/executors/high_throughput/executor.py +86 -24
- parsl/executors/high_throughput/interchange.py +39 -20
- parsl/executors/high_throughput/mpi_executor.py +1 -2
- parsl/executors/high_throughput/mpi_resource_management.py +7 -14
- parsl/executors/high_throughput/process_worker_pool.py +32 -7
- parsl/executors/high_throughput/zmq_pipes.py +36 -67
- parsl/executors/radical/executor.py +2 -6
- parsl/executors/radical/rpex_worker.py +2 -2
- parsl/executors/taskvine/executor.py +5 -1
- parsl/executors/threads.py +5 -2
- parsl/jobs/states.py +2 -2
- parsl/jobs/strategy.py +7 -6
- parsl/monitoring/monitoring.py +2 -2
- parsl/monitoring/radios/filesystem.py +2 -1
- parsl/monitoring/radios/htex.py +2 -1
- parsl/monitoring/radios/multiprocessing.py +2 -1
- parsl/monitoring/radios/udp.py +2 -1
- parsl/multiprocessing.py +0 -49
- parsl/providers/base.py +24 -37
- parsl/providers/pbspro/pbspro.py +1 -1
- parsl/serialize/__init__.py +6 -9
- parsl/serialize/facade.py +0 -32
- parsl/tests/configs/local_threads_globus.py +18 -14
- parsl/tests/configs/taskvine_ex.py +1 -1
- parsl/tests/sites/test_concurrent.py +51 -3
- parsl/tests/test_checkpointing/test_periodic.py +15 -9
- parsl/tests/test_checkpointing/test_regression_233.py +0 -1
- parsl/tests/test_curvezmq.py +0 -42
- parsl/tests/test_execute_task.py +2 -11
- parsl/tests/test_htex/test_command_concurrency_regression_1321.py +54 -0
- parsl/tests/test_htex/test_htex.py +36 -1
- parsl/tests/test_htex/test_interchange_exit_bad_registration.py +2 -2
- parsl/tests/test_htex/test_priority_queue.py +26 -3
- parsl/tests/test_htex/test_zmq_binding.py +2 -1
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +18 -43
- parsl/tests/test_python_apps/test_basic.py +0 -14
- parsl/tests/test_python_apps/test_depfail_propagation.py +11 -1
- parsl/tests/test_python_apps/test_exception.py +19 -0
- parsl/tests/test_python_apps/test_garbage_collect.py +1 -6
- parsl/tests/test_python_apps/test_memoize_2.py +11 -1
- parsl/tests/test_regression/test_3874.py +47 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +1 -0
- parsl/tests/test_staging/test_staging_globus.py +2 -2
- parsl/tests/unit/test_globus_compute_executor.py +11 -2
- parsl/utils.py +8 -3
- parsl/version.py +1 -1
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/interchange.py +39 -20
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/process_worker_pool.py +32 -7
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/METADATA +64 -50
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/RECORD +68 -74
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/WHEEL +1 -1
- parsl/tests/configs/local_threads_checkpoint_periodic.py +0 -11
- parsl/tests/configs/local_threads_no_cache.py +0 -11
- parsl/tests/site_tests/test_provider.py +0 -88
- parsl/tests/site_tests/test_site.py +0 -70
- parsl/tests/test_aalst_patterns.py +0 -474
- parsl/tests/test_docs/test_workflow2.py +0 -42
- parsl/tests/test_error_handling/test_rand_fail.py +0 -171
- parsl/tests/test_regression/test_854.py +0 -62
- parsl/tests/test_serialization/test_pack_resource_spec.py +0 -23
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/entry_points.txt +0 -0
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info/licenses}/LICENSE +0 -0
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/top_level.txt +0 -0
parsl/tests/test_python_apps/test_basic.py
CHANGED
@@ -14,12 +14,6 @@ def import_square(x):
     return math.pow(x, 2)
 
 
-@python_app
-def custom_exception():
-    from globus_sdk import GlobusError
-    raise GlobusError('foobar')
-
-
 def test_simple(n=2):
     x = double(n)
     assert x.result() == n * 2
@@ -38,11 +32,3 @@ def test_parallel_for(n):
 
     for i in d:
         assert d[i].result() == 2 * i
-
-
-def test_custom_exception():
-    from globus_sdk import GlobusError
-
-    x = custom_exception()
-    with pytest.raises(GlobusError):
-        x.result()
parsl/tests/test_python_apps/test_depfail_propagation.py
CHANGED
@@ -1,5 +1,7 @@
+import parsl
 from parsl import python_app
 from parsl.dataflow.errors import DependencyError
+from parsl.dataflow.states import States
 
 
 @python_app
@@ -14,6 +16,7 @@ def depends(parent):
 
 def test_depfail_once():
     """Test the simplest dependency failure case"""
+    start_dep_fail_count = parsl.dfk().task_state_counts[States.dep_fail]
     f1 = fails()
     f2 = depends(f1)
 
@@ -25,9 +28,12 @@ def test_depfail_once():
     # in the DependencyError message
     assert ("task " + str(f1.task_record['id'])) in str(f2.exception())
 
+    assert parsl.dfk().task_state_counts[States.dep_fail] == start_dep_fail_count + 1
+
 
 def test_depfail_chain():
     """Test that dependency failures chain"""
+    start_dep_fail_count = parsl.dfk().task_state_counts[States.dep_fail]
     f1 = fails()
     f2 = depends(f1)
     f3 = depends(f2)
@@ -39,11 +45,13 @@ def test_depfail_chain():
     assert isinstance(f3.exception(), DependencyError)
     assert isinstance(f4.exception(), DependencyError)
 
+    assert parsl.dfk().task_state_counts[States.dep_fail] == start_dep_fail_count + 3
+
 
 def test_depfail_branches():
     """Test that dependency failures propagate in the
     presence of multiple downstream tasks."""
-
+    start_dep_fail_count = parsl.dfk().task_state_counts[States.dep_fail]
     f1 = fails()
     f2 = depends(f1)
     f3 = depends(f1)
@@ -52,3 +60,5 @@ def test_depfail_branches():
     assert not isinstance(f1.exception(), DependencyError)
     assert isinstance(f2.exception(), DependencyError)
     assert isinstance(f3.exception(), DependencyError)
+
+    assert parsl.dfk().task_state_counts[States.dep_fail] == start_dep_fail_count + 2
parsl/tests/test_python_apps/test_exception.py
CHANGED
@@ -0,0 +1,19 @@
+import pytest
+
+from parsl.app.app import python_app
+
+
+class CustomException(Exception):
+    pass
+
+
+@python_app
+def custom_exception():
+    from parsl.tests.test_python_apps.test_exception import CustomException
+    raise CustomException('foobar')
+
+
+def test_custom_exception():
+    x = custom_exception()
+    with pytest.raises(CustomException):
+        x.result()
parsl/tests/test_python_apps/test_garbage_collect.py
CHANGED
@@ -27,10 +27,5 @@ def test_garbage_collect():
 
     evt.set()
     assert x.result() == 10 * 4
-
-    # We explicit call checkpoint if checkpoint_mode is enabled covering
-    # cases like manual/periodic where checkpointing may be deferred.
-    parsl.dfk().checkpoint()
-
-    time.sleep(0.01)  # Give enough time for task wipes to work
+    time.sleep(0.01)  # Give enough time for task wipes to work - see issue #1279
     assert x.tid not in parsl.dfk().tasks, "Task record should be wiped after task completion"
parsl/tests/test_python_apps/test_memoize_2.py
CHANGED
@@ -4,7 +4,17 @@ import pytest
 
 import parsl
 from parsl.app.app import python_app
-from parsl.
+from parsl.config import Config
+from parsl.executors.threads import ThreadPoolExecutor
+
+
+def local_config():
+    return Config(
+        executors=[
+            ThreadPoolExecutor(max_threads=4),
+        ],
+        app_cache=False
+    )
 
 
 @python_app
parsl/tests/test_regression/test_3874.py
CHANGED
@@ -0,0 +1,47 @@
+import shutil
+
+import pytest
+
+import parsl
+from parsl.app.app import python_app
+from parsl.config import Config
+from parsl.executors import HighThroughputExecutor
+
+
+@python_app
+def noop():
+    pass
+
+
+@pytest.mark.local
+def test_regression_3874(tmpd_cwd_session):
+    # HTEX run 1
+
+    rundir_1 = str(tmpd_cwd_session / "1")
+
+    config = Config(executors=[HighThroughputExecutor()], strategy_period=0.5)
+    config.run_dir = rundir_1
+
+    with parsl.load(config):
+        noop().result()
+
+    # It is necessary to delete this rundir to exercise the bug. Otherwise,
+    # the next run will be able to continue looking at this directory - the
+    # bug manifests when it cannot.
+
+    shutil.rmtree(rundir_1)
+
+    # HTEX run 2
+    # In the case of issue 3874, this run hangs (rather than failing) as the
+    # JobStatusPoller fails to collect status of all of its managed tasks
+    # every iteration, without converging towards failure.
+
+    rundir_2 = str(tmpd_cwd_session / "2")
+
+    config = Config(executors=[HighThroughputExecutor()], strategy_period=0.5)
+    config.run_dir = rundir_2
+
+    with parsl.load(config):
+        noop().result()
+
+    shutil.rmtree(rundir_2)
parsl/tests/test_scaling/test_regression_3696_oscillation.py
CHANGED
@@ -51,6 +51,7 @@ def test_htex_strategy_does_not_oscillate(ns):
     executor.outstanding = lambda: n_tasks
     executor.status_facade = statuses
     executor.workers_per_node = n_workers
+    executor.bad_state_is_set = False
 
     provider.parallelism = 1
     provider.init_blocks = 0
parsl/tests/test_staging/test_staging_globus.py
CHANGED
@@ -3,9 +3,9 @@ import pytest
 import parsl
 from parsl.app.app import python_app
 from parsl.data_provider.files import File
-from parsl.tests.configs.local_threads_globus import
+from parsl.tests.configs.local_threads_globus import fresh_config, remote_writeable
 
-local_config =
+local_config = fresh_config
 
 
 @python_app
parsl/tests/unit/test_globus_compute_executor.py
CHANGED
@@ -2,18 +2,21 @@ import random
 from unittest import mock
 
 import pytest
-from globus_compute_sdk import Executor
 
 from parsl.executors import GlobusComputeExecutor
 
 
 @pytest.fixture
 def mock_ex():
-    # Not Parsl's job to test GC's Executor
+    # Not Parsl's job to test GC's Executor, although it
+    # still needs to be importable for these test cases.
+    from globus_compute_sdk import Executor
+
     yield mock.Mock(spec=Executor)
 
 
 @pytest.mark.local
+@pytest.mark.globus_compute
 def test_gc_executor_mock_spec(mock_ex):
     # a test of tests -- make sure we're using spec= in the mock
     with pytest.raises(AttributeError):
@@ -21,12 +24,14 @@ def test_gc_executor_mock_spec(mock_ex):
 
 
 @pytest.mark.local
+@pytest.mark.globus_compute
 def test_gc_executor_label_default(mock_ex):
     gce = GlobusComputeExecutor(mock_ex)
     assert gce.label == type(gce).__name__, "Expect reasonable default label"
 
 
 @pytest.mark.local
+@pytest.mark.globus_compute
 def test_gc_executor_label(mock_ex, randomstring):
     exp_label = randomstring()
     gce = GlobusComputeExecutor(mock_ex, label=exp_label)
@@ -34,6 +39,7 @@ def test_gc_executor_label(mock_ex, randomstring):
 
 
 @pytest.mark.local
+@pytest.mark.globus_compute
 def test_gc_executor_resets_spec_after_submit(mock_ex, randomstring):
     submit_res = {randomstring(): "some submit res"}
     res = {"some": randomstring(), "spec": randomstring()}
@@ -57,6 +63,7 @@ def test_gc_executor_resets_spec_after_submit(mock_ex, randomstring):
 
 
 @pytest.mark.local
+@pytest.mark.globus_compute
 def test_gc_executor_resets_uep_after_submit(mock_ex, randomstring):
     uep_conf = randomstring()
     res = {"some": randomstring()}
@@ -79,6 +86,7 @@ def test_gc_executor_resets_uep_after_submit(mock_ex, randomstring):
 
 
 @pytest.mark.local
+@pytest.mark.globus_compute
 def test_gc_executor_happy_path(mock_ex, randomstring):
     mock_fn = mock.Mock()
     args = tuple(randomstring() for _ in range(random.randint(0, 3)))
@@ -95,6 +103,7 @@ def test_gc_executor_happy_path(mock_ex, randomstring):
 
 
 @pytest.mark.local
+@pytest.mark.globus_compute
 def test_gc_executor_shuts_down_asynchronously(mock_ex):
     gce = GlobusComputeExecutor(mock_ex)
     gce.shutdown()
parsl/utils.py
CHANGED
@@ -11,7 +11,6 @@ from types import TracebackType
 from typing import (
     IO,
     Any,
-    AnyStr,
     Callable,
     Dict,
     Generator,
@@ -132,7 +131,13 @@ def get_std_fname_mode(
         mode = 'a+'
     elif isinstance(stdfspec, tuple):
         if len(stdfspec) != 2:
-            msg = (f"std descriptor {fdname} has incorrect tuple length "
+            # this is annotated as unreachable because the type annotation says
+            # it cannot be reached. Earlier versions of typeguard did not enforce
+            # that type annotation at runtime, though, and the parameters to this
+            # function come from the user.
+            # When typeguard lower bound is raised to around version 4, this
+            # unreachable can be removed.
+            msg = (f"std descriptor {fdname} has incorrect tuple length "  # type: ignore[unreachable]
                    f"{len(stdfspec)}")
             raise pe.BadStdStreamFile(msg)
         fname, mode = stdfspec
@@ -157,7 +162,7 @@ def wait_for_file(path: str, seconds: int = 10) -> Generator[None, None, None]:
 
 
 @contextmanager
-def time_limited_open(path: str, mode: str, seconds: int = 1) -> Generator[IO[AnyStr], None, None]:
+def time_limited_open(path: str, mode: str, seconds: int = 1) -> Generator[IO, None, None]:
     with wait_for_file(path, seconds):
         logger.debug("wait_for_file yielded")
         f = open(path, mode)
parsl/version.py
CHANGED
{parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/interchange.py
CHANGED
@@ -23,7 +23,6 @@ from parsl.monitoring.radios.base import MonitoringRadioSender
 from parsl.monitoring.radios.zmq import ZMQRadioSender
 from parsl.process_loggers import wrap_with_logs
 from parsl.serialize import serialize as serialize_object
-from parsl.utils import setproctitle
 from parsl.version import VERSION as PARSL_VERSION
 
 PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)
@@ -56,6 +55,7 @@ class Interchange:
                  cert_dir: Optional[str],
                  manager_selector: ManagerSelector,
                  run_id: str,
+                 _check_python_mismatch: bool,
                  ) -> None:
         """
         Parameters
@@ -99,6 +99,11 @@ class Interchange:
 
         cert_dir : str | None
             Path to the certificate directory.
+
+        _check_python_mismatch : bool
+            If True, the interchange and worker managers must run the same version of
+            Python. Running different versions can cause inter-process communication
+            errors, so proceed with caution.
         """
         self.cert_dir = cert_dir
         self.logdir = logdir
@@ -126,15 +131,13 @@ class Interchange:
         logger.info("Connected to client")
 
         self.run_id = run_id
+        self._check_python_mismatch = _check_python_mismatch
 
         self.hub_address = hub_address
         self.hub_zmq_port = hub_zmq_port
 
         self.pending_task_queue: SortedList[Any] = SortedList(key=lambda tup: (tup[0], tup[1]))
 
-        # count of tasks that have been received from the submit side
-        self.task_counter = 0
-
         # count of tasks that have been sent out to worker pools
         self.count = 0
 
@@ -157,6 +160,7 @@ class Interchange:
         logger.info(f"Bound to port {worker_port} for incoming worker connections")
 
         self._ready_managers: Dict[bytes, ManagerRecord] = {}
+        self._logged_manager_count_token: object = None
         self.connected_block_history: List[str] = []
 
         self.heartbeat_threshold = heartbeat_threshold
@@ -213,7 +217,7 @@ class Interchange:
 
         reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)
 
-        if self.
+        if self.socks.get(self.command_channel) == zmq.POLLIN:
             logger.debug("entering command_server section")
 
             command_req = self.command_channel.recv_pyobj()
@@ -310,6 +314,7 @@ class Interchange:
             self.process_manager_socket_message(interesting_managers, monitoring_radio, kill_event)
             self.expire_bad_managers(interesting_managers, monitoring_radio)
             self.expire_drained_managers(interesting_managers, monitoring_radio)
+            self.log_manager_counts(interesting_managers)
             self.process_tasks_to_send(interesting_managers, monitoring_radio)
 
         self.zmq_context.destroy()
@@ -321,20 +326,20 @@ class Interchange:
         """Process incoming task message(s).
        """
 
-        if self.
+        if self.socks.get(self.task_incoming) == zmq.POLLIN:
            logger.debug("start task_incoming section")
            msg = self.task_incoming.recv_pyobj()
 
            # Process priority, higher number = lower priority
-
+            task_id = msg['task_id']
+            resource_spec = msg['context'].get('resource_spec', {})
            priority = resource_spec.get('priority', float('inf'))
-            queue_entry = (-priority, -
+            queue_entry = (-priority, -task_id, msg)
 
-            logger.debug("
+            logger.debug("Putting task %s onto pending_task_queue", task_id)
 
            self.pending_task_queue.add(queue_entry)
-
-            logger.debug(f"Fetched {self.task_counter} tasks so far")
+            logger.debug("Put task %s onto pending_task_queue", task_id)
 
    def process_manager_socket_message(
        self,
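The task-incoming hunk above stores each task as (-priority, -task_id, msg) in the SortedList shown in the surrounding context, which compares only the first two fields. A minimal sketch of the resulting ordering, assuming the sortedcontainers SortedList that the type annotation suggests; the priorities and task IDs here are invented for illustration:

from sortedcontainers import SortedList

# Same key as the interchange's pending_task_queue: compare only the
# (-priority, -task_id) pair and ignore the message payload.
queue = SortedList(key=lambda tup: (tup[0], tup[1]))

# (priority, task_id) pairs; a higher priority number means lower priority.
for priority, task_id in [(10, 1), (1, 2), (1, 3), (float('inf'), 4)]:
    queue.add((-priority, -task_id, {"task_id": task_id}))

# Ascending key order places the least urgent task first; the most urgent
# task (lowest priority number, then earliest task_id) ends up last.
print([entry[2]["task_id"] for entry in queue])  # [4, 1, 3, 2]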
@@ -354,9 +359,10 @@ class Interchange:
             mtype = meta['type']
         except Exception as e:
             logger.warning(
-
+                'Failed to read manager message; ignoring message'
+                f' (Exception: [{type(e).__name__}] {e})'
             )
-            logger.debug('
+            logger.debug('Raw message bytes:\n %r\n', msg_parts, exc_info=e)
             return
 
         logger.debug(
@@ -396,7 +402,9 @@ class Interchange:
             logger.info(f'Registration info for manager {manager_id!r}: {meta}')
             self._send_monitoring_info(monitoring_radio, new_rec)
 
-
+            python_mismatch: bool = ix_minor_py != mgr_minor_py
+            parsl_mismatch: bool = ix_parsl_v != mgr_parsl_v
+            if parsl_mismatch or (self._check_python_mismatch and python_mismatch):
                 kill_event.set()
                 vm_exc = VersionMismatch(
                     f"py.v={ix_minor_py} parsl.v={ix_parsl_v}",
@@ -517,15 +525,24 @@ class Interchange:
                 m['active'] = False
                 self._send_monitoring_info(monitoring_radio, m)
 
+    def log_manager_counts(self, interesting_managers: Set[bytes]) -> None:
+        count_interesting = len(interesting_managers)
+        count_ready = len(self._ready_managers)
+
+        new_logged_manager_count_token = (count_interesting, count_ready)
+
+        if self._logged_manager_count_token != new_logged_manager_count_token:
+
+            logger.debug(
+                "Managers count (interesting/total): %d/%d",
+                count_interesting,
+                count_ready
+            )
+            self._logged_manager_count_token = new_logged_manager_count_token
+
     def process_tasks_to_send(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
         # Check if there are tasks that could be sent to managers
 
-        logger.debug(
-            "Managers count (interesting/total): %d/%d",
-            len(interesting_managers),
-            len(self._ready_managers)
-        )
-
         if interesting_managers and self.pending_task_queue:
             shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)
 
@@ -618,6 +635,8 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:
 
 
 if __name__ == "__main__":
+    from parsl.utils import setproctitle
+
     setproctitle("parsl: HTEX interchange")
 
     config = pickle.load(sys.stdin.buffer)
{parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/process_worker_pool.py
CHANGED
@@ -1,6 +1,7 @@
 #!python
 
 import argparse
+import importlib
 import logging
 import math
 import multiprocessing
@@ -17,7 +18,7 @@ from importlib.metadata import distributions
 from multiprocessing.context import SpawnProcess
 from multiprocessing.managers import DictProxy
 from multiprocessing.sharedctypes import Synchronized
-from typing import Dict, List, Optional, Sequence
+from typing import Callable, Dict, List, Optional, Sequence
 
 import psutil
 import zmq
@@ -348,7 +349,7 @@ class Manager:
 
         logger.debug(
             'ready workers: %d, pending tasks: %d',
-            self.ready_worker_count.value,
+            self.ready_worker_count.value,
             pending_task_count,
         )
 
@@ -373,10 +374,12 @@ class Manager:
             if socks.get(ix_sock) == zmq.POLLIN:
                 pkl_msg = ix_sock.recv()
                 tasks = pickle.loads(pkl_msg)
+                del pkl_msg
+
                 last_interchange_contact = time.time()
 
                 if tasks == HEARTBEAT_CODE:
-                    logger.debug("Got heartbeat from interchange")
+                    logger.debug("Got heartbeat response from interchange")
                 elif tasks == DRAINED_CODE:
                     logger.info("Got fully drained message from interchange - setting kill flag")
                     self._stop_event.set()
@@ -454,6 +457,7 @@ class Manager:
                                   'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
                 pkl_package = pickle.dumps(result_package)
                 self.pending_result_queue.put(pkl_package)
+                del pkl_package
             except KeyError:
                 logger.info("Worker {} was not busy when it died".format(worker_id))
 
@@ -603,6 +607,10 @@ def update_resource_spec_env_vars(mpi_launcher: str, resource_spec: Dict, node_i
 
 
 def _init_mpi_env(mpi_launcher: str, resource_spec: Dict):
+    for varname in resource_spec:
+        envname = "PARSL_" + str(varname).upper()
+        os.environ[envname] = str(resource_spec[varname])
+
     node_list = resource_spec.get("MPI_NODELIST")
     if node_list is None:
         return
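The new lines at the top of _init_mpi_env export every entry of the task's resource specification as a PARSL_-prefixed environment variable before the MPI node-list handling runs. A minimal standalone sketch of that mapping; the resource-spec keys used here are invented purely for illustration:

import os

def export_resource_spec(resource_spec: dict) -> None:
    # Same transformation as the added loop: upper-case each key and
    # publish its stringified value under a PARSL_ prefix.
    for varname in resource_spec:
        envname = "PARSL_" + str(varname).upper()
        os.environ[envname] = str(resource_spec[varname])

export_resource_spec({"num_nodes": 2, "ranks_per_node": 4})
print(os.environ["PARSL_NUM_NODES"])       # "2"
print(os.environ["PARSL_RANKS_PER_NODE"])  # "4"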
@@ -753,8 +761,8 @@ def worker(
             worker_enqueued = True
 
         try:
-            # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
             req = task_queue.get(timeout=task_queue_timeout)
+            # req is {'task_id':<tid>, 'buffer':<buf>, 'resource_spec':<dict>}
         except queue.Empty:
             continue
 
@@ -766,17 +774,33 @@ def worker(
             ready_worker_count.value -= 1
             worker_enqueued = False
 
-
+        ctxt = req["context"]
+        res_spec = ctxt.get("resource_spec", {})
+
+        _init_mpi_env(mpi_launcher=mpi_launcher, resource_spec=res_spec)
+
+        exec_func: Callable = execute_task
+        exec_args = ()
+        exec_kwargs = {}
 
         try:
-
+            if task_executor := ctxt.get("task_executor", None):
+                mod_name, _, fn_name = task_executor["f"].rpartition(".")
+                exec_mod = importlib.import_module(mod_name)
+                exec_func = getattr(exec_mod, fn_name)
+
+                exec_args = task_executor.get("a", ())
+                exec_kwargs = task_executor.get("k", {})
+
+            result = exec_func(req['buffer'], *exec_args, **exec_kwargs)
             serialized_result = serialize(result, buffer_threshold=1000000)
         except Exception as e:
             logger.info('Caught an exception: {}'.format(e))
             result_package = {'type': 'result', 'task_id': tid, 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
         else:
             result_package = {'type': 'result', 'task_id': tid, 'result': serialized_result}
-
+            del serialized_result
+        del req
 
         logger.info("Completed executor task {}".format(tid))
         try:
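The hunk above lets a task's context optionally name a replacement for the default execute_task call: a dotted function path under "f", with extra positional and keyword arguments under "a" and "k", resolved at run time with importlib. A minimal sketch of just that resolution step, using a made-up spec that points at a standard-library function rather than anything Parsl-specific:

import importlib

def resolve_task_executor(task_executor: dict):
    # Split "package.module.function" into module path and attribute name,
    # mirroring the rpartition(".") / import_module / getattr sequence above.
    mod_name, _, fn_name = task_executor["f"].rpartition(".")
    exec_func = getattr(importlib.import_module(mod_name), fn_name)
    return exec_func, task_executor.get("a", ()), task_executor.get("k", {})

# Hypothetical spec, shaped like the worker's "task_executor" entry.
func, args, kwargs = resolve_task_executor({"f": "math.pow", "a": (2,), "k": {}})
# The worker passes the task buffer first, then the extra arguments.
print(func(10.0, *args, **kwargs))  # math.pow(10.0, 2) -> 100.0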
@@ -788,6 +812,7 @@ def worker(
             })
 
             result_queue.put(pkl_package)
+            del pkl_package, result_package
             tasks_in_progress.pop(worker_id)
             logger.info("All processing finished for executor task {}".format(tid))
 