parsl 2024.6.10__py3-none-any.whl → 2024.6.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/app/app.py +0 -2
- parsl/executors/high_throughput/executor.py +36 -31
- parsl/executors/high_throughput/interchange.py +5 -8
- parsl/providers/kubernetes/kube.py +3 -3
- parsl/tests/test_htex/test_htex.py +24 -7
- parsl/version.py +1 -1
- parsl-2024.6.17.data/scripts/interchange.py +681 -0
- {parsl-2024.6.10.dist-info → parsl-2024.6.17.dist-info}/METADATA +2 -2
- {parsl-2024.6.10.dist-info → parsl-2024.6.17.dist-info}/RECORD +16 -15
- {parsl-2024.6.10.data → parsl-2024.6.17.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.6.10.data → parsl-2024.6.17.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.6.10.data → parsl-2024.6.17.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.6.10.dist-info → parsl-2024.6.17.dist-info}/LICENSE +0 -0
- {parsl-2024.6.10.dist-info → parsl-2024.6.17.dist-info}/WHEEL +0 -0
- {parsl-2024.6.10.dist-info → parsl-2024.6.17.dist-info}/entry_points.txt +0 -0
- {parsl-2024.6.10.dist-info → parsl-2024.6.17.dist-info}/top_level.txt +0 -0
parsl/app/app.py
CHANGED
@@ -66,8 +66,6 @@ class AppBase(metaclass=ABCMeta):
             self.kwargs['walltime'] = params['walltime'].default
         if 'parsl_resource_specification' in params:
             self.kwargs['parsl_resource_specification'] = params['parsl_resource_specification'].default
-        self.outputs = params['outputs'].default if 'outputs' in params else []
-        self.inputs = params['inputs'].default if 'inputs' in params else []
 
     @abstractmethod
     def __call__(self, *args: Any, **kwargs: Any) -> AppFuture:
parsl/executors/high_throughput/executor.py
CHANGED
@@ -1,13 +1,13 @@
 import logging
 import math
 import pickle
+import subprocess
 import threading
 import typing
 import warnings
 from collections import defaultdict
 from concurrent.futures import Future
 from dataclasses import dataclass
-from multiprocessing import Process
 from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 import typeguard
@@ -18,7 +18,7 @@ from parsl.addresses import get_all_addresses
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.data_provider.staging import Staging
 from parsl.executors.errors import BadMessage, ScalingFailed
-from parsl.executors.high_throughput import interchange, zmq_pipes
+from parsl.executors.high_throughput import zmq_pipes
 from parsl.executors.high_throughput.errors import CommandClientTimeoutError
 from parsl.executors.high_throughput.mpi_prefix_composer import (
     VALID_LAUNCHERS,
@@ -26,7 +26,6 @@ from parsl.executors.high_throughput.mpi_prefix_composer import (
 )
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
-from parsl.multiprocessing import ForkProcess
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
@@ -305,7 +304,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         self._task_counter = 0
         self.worker_ports = worker_ports
         self.worker_port_range = worker_port_range
-        self.interchange_proc: Optional[ForkProcess] = None
+        self.interchange_proc: Optional[subprocess.Popen] = None
         self.interchange_port_range = interchange_port_range
         self.heartbeat_threshold = heartbeat_threshold
         self.heartbeat_period = heartbeat_period
@@ -520,38 +519,45 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
 
         logger.info("Queue management worker finished")
 
-    def _start_local_interchange_process(self):
+    def _start_local_interchange_process(self) -> None:
         """ Starts the interchange process locally
 
-        Starts the interchange process locally and uses
+        Starts the interchange process locally and uses the command queue to
         get the worker task and result ports that the interchange has bound to.
         """
-        self.interchange_proc = ForkProcess(target=interchange.starter,
-                                            kwargs={"client_address": "127.0.0.1",
-                                                    "client_ports": (self.outgoing_q.port,
-                                                                     self.incoming_q.port,
-                                                                     self.command_client.port),
-                                                    "interchange_address": self.address,
-                                                    "worker_ports": self.worker_ports,
-                                                    "worker_port_range": self.worker_port_range,
-                                                    "hub_address": self.hub_address,
-                                                    "hub_zmq_port": self.hub_zmq_port,
-                                                    "logdir": self.logdir,
-                                                    "heartbeat_threshold": self.heartbeat_threshold,
-                                                    "poll_period": self.poll_period,
-                                                    "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
-                                                    "cert_dir": self.cert_dir,
-                                                    },
-                                            daemon=True,
-                                            name="HTEX-Interchange"
-                                            )
-        self.interchange_proc.start()
 
+        interchange_config = {"client_address": "127.0.0.1",
+                              "client_ports": (self.outgoing_q.port,
+                                               self.incoming_q.port,
+                                               self.command_client.port),
+                              "interchange_address": self.address,
+                              "worker_ports": self.worker_ports,
+                              "worker_port_range": self.worker_port_range,
+                              "hub_address": self.hub_address,
+                              "hub_zmq_port": self.hub_zmq_port,
+                              "logdir": self.logdir,
+                              "heartbeat_threshold": self.heartbeat_threshold,
+                              "poll_period": self.poll_period,
+                              "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
+                              "cert_dir": self.cert_dir,
+                              }
+
+        config_pickle = pickle.dumps(interchange_config)
+
+        self.interchange_proc = subprocess.Popen(b"interchange.py", stdin=subprocess.PIPE)
+        stdin = self.interchange_proc.stdin
+        assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
+
+        logger.debug("Popened interchange process. Writing config object")
+        stdin.write(config_pickle)
+        stdin.flush()
+        logger.debug("Sent config object. Requesting worker ports")
         try:
             (self.worker_task_port, self.worker_result_port) = self.command_client.run("WORKER_PORTS", timeout_s=120)
         except CommandClientTimeoutError:
-            logger.error("Interchange has not completed initialization
+            logger.error("Interchange has not completed initialization. Aborting")
             raise Exception("Interchange failed to start")
+        logger.debug("Got worker ports")
 
     def _start_queue_management_thread(self):
         """Method to start the management thread as a daemon.
@@ -810,13 +816,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         logger.info("Attempting HighThroughputExecutor shutdown")
 
         self.interchange_proc.terminate()
-
-
+        try:
+            self.interchange_proc.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
             logger.info("Unable to terminate Interchange process; sending SIGKILL")
             self.interchange_proc.kill()
 
-        self.interchange_proc.close()
-
         logger.info("Finished HighThroughputExecutor shutdown attempt")
 
     def get_usage_information(self):
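The executor-side change above replaces the forked in-process interchange with a pipe-fed child process: the keyword arguments that used to go to a ForkProcess target are now collected into a dictionary, pickled, and written to the child's stdin. A minimal parent-side sketch of that pattern (hypothetical helper script name and config keys, not parsl's exact code):

import pickle
import subprocess

def launch_helper(script="./helper.py", **config):
    # Open the child with a stdin pipe so arbitrary Python objects can be
    # streamed to it at startup instead of being passed as fork arguments.
    proc = subprocess.Popen([script], stdin=subprocess.PIPE)
    assert proc.stdin is not None  # guaranteed by stdin=subprocess.PIPE

    # One pickle frame carries the whole configuration dictionary.
    proc.stdin.write(pickle.dumps(config))
    proc.stdin.flush()
    return proc

# e.g. proc = launch_helper(logdir="runinfo", poll_period=10); later proc.wait(timeout=10)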
parsl/executors/high_throughput/interchange.py
CHANGED
@@ -672,13 +672,10 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:
     logger.addHandler(handler)
 
 
-
-def starter(*args: Any, **kwargs: Any) -> None:
-    """Start the interchange process
-
-    The executor is expected to call this function. The args, kwargs match that of the Interchange.__init__
-    """
+if __name__ == "__main__":
     setproctitle("parsl: HTEX interchange")
-
-
+
+    config = pickle.load(sys.stdin.buffer)
+
+    ic = Interchange(**config)
     ic.start()
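On the receiving side, the new __main__ block shown above unpickles that configuration from its own stdin before constructing the Interchange. The same child-side idiom as a standalone, hedged sketch (hypothetical Worker class, not the parsl implementation):

#!/usr/bin/env python3
import pickle
import sys

class Worker:
    def __init__(self, **config):
        self.config = config

    def start(self):
        print("worker starting with", self.config)

if __name__ == "__main__":
    # sys.stdin.buffer is the binary layer of stdin; pickle needs bytes, not text.
    config = pickle.load(sys.stdin.buffer)
    Worker(**config).start()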
parsl/providers/kubernetes/kube.py
CHANGED
@@ -243,13 +243,13 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         for jid in to_poll_job_ids:
             phase = None
             try:
-
+                pod = self.kube_client.read_namespaced_pod(name=jid, namespace=self.namespace)
             except Exception:
                 logger.exception("Failed to poll pod {} status, most likely because pod was terminated".format(jid))
                 if self.resources[jid]['status'] is JobStatus(JobState.RUNNING):
                     phase = 'Unknown'
             else:
-                phase =
+                phase = pod.status.phase
             if phase:
                 status = translate_table.get(phase, JobState.UNKNOWN)
                 logger.debug("Updating pod {} with status {} to parsl status {}".format(jid,
@@ -286,7 +286,7 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         # Create the environment variables and command to initiate IPP
         environment_vars = client.V1EnvVar(name="TEST", value="SOME DATA")
 
-        launch_args = ["-c", "{0}
+        launch_args = ["-c", "{0}".format(cmd_string)]
 
         volume_mounts = []
         # Create mount paths for the volumes
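The kube.py hunk above polls each pod through the Kubernetes Python client and maps the reported phase onto a parsl job state. A hedged, self-contained sketch of that polling idea (hypothetical translate table and function, assuming the official kubernetes client package):

from kubernetes import client, config

translate = {"Pending": "PENDING", "Running": "RUNNING",
             "Succeeded": "COMPLETED", "Failed": "FAILED"}

def poll_pod_states(names, namespace="default"):
    config.load_kube_config()  # or load_incluster_config() when running inside a cluster
    v1 = client.CoreV1Api()
    states = {}
    for name in names:
        try:
            pod = v1.read_namespaced_pod(name=name, namespace=namespace)
            phase = pod.status.phase
        except Exception:
            # A deleted or unreachable pod is treated as unknown rather than fatal.
            phase = "Unknown"
        states[name] = translate.get(phase, "UNKNOWN")
    return states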
parsl/tests/test_htex/test_htex.py
CHANGED
@@ -1,11 +1,11 @@
 import pathlib
 import warnings
+from subprocess import Popen, TimeoutExpired
 from unittest import mock
 
 import pytest
 
 from parsl import HighThroughputExecutor, curvezmq
-from parsl.multiprocessing import ForkProcess
 
 _MOCK_BASE = "parsl.executors.high_throughput.executor"
 
@@ -78,16 +78,33 @@ def test_htex_shutdown(
     timeout_expires: bool,
     htex: HighThroughputExecutor,
 ):
-    mock_ix_proc = mock.Mock(spec=ForkProcess)
+    mock_ix_proc = mock.Mock(spec=Popen)
 
     if started:
         htex.interchange_proc = mock_ix_proc
-
+
+    # This will, in the absence of any exit trigger, block forever if
+    # no timeout is given and if the interchange does not terminate.
+    # Raise an exception to report that, rather than actually block,
+    # and hope that nothing is catching that exception.
+
+    # this function implements the behaviour if the interchange has
+    # not received a termination call
+    def proc_wait_alive(timeout):
+        if timeout:
+            raise TimeoutExpired(cmd="mock-interchange", timeout=timeout)
+        else:
+            raise RuntimeError("This wait call would hang forever")
+
+    def proc_wait_terminated(timeout):
+        return 0
+
+    mock_ix_proc.wait.side_effect = proc_wait_alive
 
     if not timeout_expires:
         # Simulate termination of the Interchange process
         def kill_interchange(*args, **kwargs):
-            mock_ix_proc.
+            mock_ix_proc.wait.side_effect = proc_wait_terminated
 
         mock_ix_proc.terminate.side_effect = kill_interchange
 
@@ -96,8 +113,8 @@ def test_htex_shutdown(
     mock_logs = mock_logger.info.call_args_list
     if started:
         assert mock_ix_proc.terminate.called
-        assert mock_ix_proc.
-        assert {"timeout": 10} == mock_ix_proc.
+        assert mock_ix_proc.wait.called
+        assert {"timeout": 10} == mock_ix_proc.wait.call_args[1]
         if timeout_expires:
             assert "Unable to terminate Interchange" in mock_logs[1][0][0]
             assert mock_ix_proc.kill.called
@@ -105,7 +122,7 @@
         assert "Finished" in mock_logs[-1][0][0]
     else:
         assert not mock_ix_proc.terminate.called
-        assert not mock_ix_proc.
+        assert not mock_ix_proc.wait.called
         assert "has not started" in mock_logs[0][0][0]
 
 
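The reworked test above drives shutdown against a mock.Mock(spec=Popen) whose wait() keeps raising TimeoutExpired until terminate() swaps the side effect, so the SIGKILL branch can be exercised without a real child process. A condensed, hedged sketch of that mocking pattern (hypothetical shutdown helper, not the parsl test itself):

from subprocess import Popen, TimeoutExpired
from unittest import mock

def shutdown(proc, timeout=10):
    proc.terminate()
    try:
        proc.wait(timeout=timeout)
    except TimeoutExpired:
        proc.kill()

def test_shutdown_escalates_to_kill():
    proc = mock.Mock(spec=Popen)
    # Simulate a child that never exits: every wait() call times out.
    proc.wait.side_effect = TimeoutExpired(cmd="child", timeout=10)

    shutdown(proc)

    assert proc.terminate.called
    assert proc.wait.call_args == mock.call(timeout=10)
    assert proc.kill.called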
parsl/version.py
CHANGED
parsl-2024.6.17.data/scripts/interchange.py
ADDED
@@ -0,0 +1,681 @@
+#!python
+import datetime
+import json
+import logging
+import os
+import pickle
+import platform
+import queue
+import random
+import signal
+import sys
+import threading
+import time
+from typing import Any, Dict, List, NoReturn, Optional, Sequence, Set, Tuple, cast
+
+import zmq
+
+from parsl import curvezmq
+from parsl.app.errors import RemoteExceptionWrapper
+from parsl.executors.high_throughput.manager_record import ManagerRecord
+from parsl.monitoring.message_type import MessageType
+from parsl.process_loggers import wrap_with_logs
+from parsl.serialize import serialize as serialize_object
+from parsl.utils import setproctitle
+from parsl.version import VERSION as PARSL_VERSION
+
+PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)
+PKL_DRAINED_CODE = pickle.dumps((2 ** 32) - 2)
+
+LOGGER_NAME = "interchange"
+logger = logging.getLogger(LOGGER_NAME)
+
+
+class ManagerLost(Exception):
+    ''' Task lost due to manager loss. Manager is considered lost when multiple heartbeats
+    have been missed.
+    '''
+    def __init__(self, manager_id: bytes, hostname: str) -> None:
+        self.manager_id = manager_id
+        self.tstamp = time.time()
+        self.hostname = hostname
+
+    def __str__(self) -> str:
+        return "Task failure due to loss of manager {} on host {}".format(self.manager_id.decode(), self.hostname)
+
+
+class VersionMismatch(Exception):
+    ''' Manager and Interchange versions do not match
+    '''
+    def __init__(self, interchange_version: str, manager_version: str):
+        self.interchange_version = interchange_version
+        self.manager_version = manager_version
+
+    def __str__(self) -> str:
+        return "Manager version info {} does not match interchange version info {}, causing a critical failure".format(
+            self.manager_version,
+            self.interchange_version)
+
+
+class Interchange:
+    """ Interchange is a task orchestrator for distributed systems.
+
+    1. Asynchronously queue large volume of tasks (>100K)
+    2. Allow for workers to join and leave the union
+    3. Detect workers that have failed using heartbeats
+    """
+    def __init__(self,
+                 *,
+                 client_address: str,
+                 interchange_address: Optional[str],
+                 client_ports: Tuple[int, int, int],
+                 worker_ports: Optional[Tuple[int, int]],
+                 worker_port_range: Tuple[int, int],
+                 hub_address: Optional[str],
+                 hub_zmq_port: Optional[int],
+                 heartbeat_threshold: int,
+                 logdir: str,
+                 logging_level: int,
+                 poll_period: int,
+                 cert_dir: Optional[str],
+                 ) -> None:
+        """
+        Parameters
+        ----------
+        client_address : str
+            The ip address at which the parsl client can be reached. Default: "127.0.0.1"
+
+        interchange_address : Optional str
+            If specified the interchange will only listen on this address for connections from workers
+            else, it binds to all addresses.
+
+        client_ports : triple(int, int, int)
+            The ports at which the client can be reached
+
+        worker_ports : tuple(int, int)
+            The specific two ports at which workers will connect to the Interchange.
+
+        worker_port_range : tuple(int, int)
+            The interchange picks ports at random from the range which will be used by workers.
+            This is overridden when the worker_ports option is set.
+
+        hub_address : str
+            The IP address at which the interchange can send info about managers to when monitoring is enabled.
+            When None, monitoring is disabled.
+
+        hub_zmq_port : str
+            The port at which the interchange can send info about managers to when monitoring is enabled.
+            When None, monitoring is disabled.
+
+        heartbeat_threshold : int
+            Number of seconds since the last heartbeat after which worker is considered lost.
+
+        logdir : str
+            Parsl log directory paths. Logs and temp files go here.
+
+        logging_level : int
+            Logging level as defined in the logging module.
+
+        poll_period : int
+            The main thread polling period, in milliseconds.
+
+        cert_dir : str | None
+            Path to the certificate directory.
+        """
+        self.cert_dir = cert_dir
+        self.logdir = logdir
+        os.makedirs(self.logdir, exist_ok=True)
+
+        start_file_logger("{}/interchange.log".format(self.logdir), level=logging_level)
+        logger.propagate = False
+        logger.debug("Initializing Interchange process")
+
+        self.client_address = client_address
+        self.interchange_address: str = interchange_address or "*"
+        self.poll_period = poll_period
+
+        logger.info("Attempting connection to client at {} on ports: {},{},{}".format(
+            client_address, client_ports[0], client_ports[1], client_ports[2]))
+        self.zmq_context = curvezmq.ServerContext(self.cert_dir)
+        self.task_incoming = self.zmq_context.socket(zmq.DEALER)
+        self.task_incoming.set_hwm(0)
+        self.task_incoming.connect("tcp://{}:{}".format(client_address, client_ports[0]))
+        self.results_outgoing = self.zmq_context.socket(zmq.DEALER)
+        self.results_outgoing.set_hwm(0)
+        self.results_outgoing.connect("tcp://{}:{}".format(client_address, client_ports[1]))
+
+        self.command_channel = self.zmq_context.socket(zmq.REP)
+        self.command_channel.connect("tcp://{}:{}".format(client_address, client_ports[2]))
+        logger.info("Connected to client")
+
+        self.hub_address = hub_address
+        self.hub_zmq_port = hub_zmq_port
+
+        self.pending_task_queue: queue.Queue[Any] = queue.Queue(maxsize=10 ** 6)
+        self.count = 0
+
+        self.worker_ports = worker_ports
+        self.worker_port_range = worker_port_range
+
+        self.task_outgoing = self.zmq_context.socket(zmq.ROUTER)
+        self.task_outgoing.set_hwm(0)
+        self.results_incoming = self.zmq_context.socket(zmq.ROUTER)
+        self.results_incoming.set_hwm(0)
+
+        if self.worker_ports:
+            self.worker_task_port = self.worker_ports[0]
+            self.worker_result_port = self.worker_ports[1]
+
+            self.task_outgoing.bind(f"tcp://{self.interchange_address}:{self.worker_task_port}")
+            self.results_incoming.bind(f"tcp://{self.interchange_address}:{self.worker_result_port}")
+
+        else:
+            self.worker_task_port = self.task_outgoing.bind_to_random_port(f"tcp://{self.interchange_address}",
+                                                                           min_port=worker_port_range[0],
+                                                                           max_port=worker_port_range[1], max_tries=100)
+            self.worker_result_port = self.results_incoming.bind_to_random_port(f"tcp://{self.interchange_address}",
+                                                                                min_port=worker_port_range[0],
+                                                                                max_port=worker_port_range[1], max_tries=100)
+
+        logger.info("Bound to ports {},{} for incoming worker connections".format(
+            self.worker_task_port, self.worker_result_port))
+
+        self._ready_managers: Dict[bytes, ManagerRecord] = {}
+        self.connected_block_history: List[str] = []
+
+        self.heartbeat_threshold = heartbeat_threshold
+
+        self.current_platform = {'parsl_v': PARSL_VERSION,
+                                 'python_v': "{}.{}.{}".format(sys.version_info.major,
+                                                               sys.version_info.minor,
+                                                               sys.version_info.micro),
+                                 'os': platform.system(),
+                                 'hostname': platform.node(),
+                                 'dir': os.getcwd()}
+
+        logger.info("Platform info: {}".format(self.current_platform))
+
+    def get_tasks(self, count: int) -> Sequence[dict]:
+        """ Obtains a batch of tasks from the internal pending_task_queue
+
+        Parameters
+        ----------
+        count: int
+            Count of tasks to get from the queue
+
+        Returns
+        -------
+        List of upto count tasks. May return fewer than count down to an empty list
+            eg. [{'task_id':<x>, 'buffer':<buf>} ... ]
+        """
+        tasks = []
+        for _ in range(0, count):
+            try:
+                x = self.pending_task_queue.get(block=False)
+            except queue.Empty:
+                break
+            else:
+                tasks.append(x)
+
+        return tasks
+
+    @wrap_with_logs(target="interchange")
+    def task_puller(self) -> NoReturn:
+        """Pull tasks from the incoming tasks zmq pipe onto the internal
+        pending task queue
+        """
+        logger.info("Starting")
+        task_counter = 0
+
+        while True:
+            logger.debug("launching recv_pyobj")
+            try:
+                msg = self.task_incoming.recv_pyobj()
+            except zmq.Again:
+                # We just timed out while attempting to receive
+                logger.debug("zmq.Again with {} tasks in internal queue".format(self.pending_task_queue.qsize()))
+                continue
+
+            logger.debug("putting message onto pending_task_queue")
+            self.pending_task_queue.put(msg)
+            task_counter += 1
+            logger.debug(f"Fetched {task_counter} tasks so far")
+
+    def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
+        if self.hub_address and self.hub_zmq_port:
+            logger.info("Connecting to MonitoringHub")
+            # This is a one-off because monitoring is unencrypted
+            hub_channel = zmq.Context().socket(zmq.DEALER)
+            hub_channel.set_hwm(0)
+            hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_zmq_port))
+            logger.info("Connected to MonitoringHub")
+            return hub_channel
+        else:
+            return None
+
+    def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
+        if hub_channel:
+            logger.info("Sending message {} to MonitoringHub".format(manager))
+
+            d: Dict = cast(Dict, manager.copy())
+            d['timestamp'] = datetime.datetime.now()
+            d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])
+
+            hub_channel.send_pyobj((MessageType.NODE_INFO, d))
+
+    @wrap_with_logs(target="interchange")
+    def _command_server(self) -> NoReturn:
+        """ Command server to run async command to the interchange
+        """
+        logger.debug("Command Server Starting")
+
+        # Need to create a new ZMQ socket for command server thread
+        hub_channel = self._create_monitoring_channel()
+
+        reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)
+
+        while True:
+            try:
+                command_req = self.command_channel.recv_pyobj()
+                logger.debug("Received command request: {}".format(command_req))
+                if command_req == "OUTSTANDING_C":
+                    outstanding = self.pending_task_queue.qsize()
+                    for manager in self._ready_managers.values():
+                        outstanding += len(manager['tasks'])
+                    reply = outstanding
+
+                elif command_req == "CONNECTED_BLOCKS":
+                    reply = self.connected_block_history
+
+                elif command_req == "WORKERS":
+                    num_workers = 0
+                    for manager in self._ready_managers.values():
+                        num_workers += manager['worker_count']
+                    reply = num_workers
+
+                elif command_req == "MANAGERS":
+                    reply = []
+                    for manager_id in self._ready_managers:
+                        m = self._ready_managers[manager_id]
+                        idle_since = m['idle_since']
+                        if idle_since is not None:
+                            idle_duration = time.time() - idle_since
+                        else:
+                            idle_duration = 0.0
+                        resp = {'manager': manager_id.decode('utf-8'),
+                                'block_id': m['block_id'],
+                                'worker_count': m['worker_count'],
+                                'tasks': len(m['tasks']),
+                                'idle_duration': idle_duration,
+                                'active': m['active'],
+                                'parsl_version': m['parsl_version'],
+                                'python_version': m['python_version'],
+                                'draining': m['draining']}
+                        reply.append(resp)
+
+                elif command_req.startswith("HOLD_WORKER"):
+                    cmd, s_manager = command_req.split(';')
+                    manager_id = s_manager.encode('utf-8')
+                    logger.info("Received HOLD_WORKER for {!r}".format(manager_id))
+                    if manager_id in self._ready_managers:
+                        m = self._ready_managers[manager_id]
+                        m['active'] = False
+                        self._send_monitoring_info(hub_channel, m)
+                    else:
+                        logger.warning("Worker to hold was not in ready managers list")
+
+                    reply = None
+
+                elif command_req == "WORKER_PORTS":
+                    reply = (self.worker_task_port, self.worker_result_port)
+
+                else:
+                    logger.error(f"Received unknown command: {command_req}")
+                    reply = None
+
+                logger.debug("Reply: {}".format(reply))
+                self.command_channel.send_pyobj(reply)
+
+            except zmq.Again:
+                logger.debug("Command thread is alive")
+                continue
+
+    @wrap_with_logs
+    def start(self) -> None:
+        """ Start the interchange
+        """
+
+        # If a user workflow has set its own signal handler for sigterm, that
+        # handler will be inherited by the interchange process because it is
+        # launched as a multiprocessing fork process.
+        # That can interfere with the interchange shutdown mechanism, which is
+        # to receive a SIGTERM and exit immediately.
+        # See Parsl issue #2343 (Threads and multiprocessing cannot be
+        # intermingled without deadlocks) which talks about other fork-related
+        # parent-process-inheritance problems.
+        signal.signal(signal.SIGTERM, signal.SIG_DFL)
+
+        logger.info("Incoming ports bound")
+
+        hub_channel = self._create_monitoring_channel()
+
+        poll_period = self.poll_period
+
+        start = time.time()
+
+        self._task_puller_thread = threading.Thread(target=self.task_puller,
+                                                    name="Interchange-Task-Puller",
+                                                    daemon=True)
+        self._task_puller_thread.start()
+
+        self._command_thread = threading.Thread(target=self._command_server,
+                                                name="Interchange-Command",
+                                                daemon=True)
+        self._command_thread.start()
+
+        kill_event = threading.Event()
+
+        poller = zmq.Poller()
+        poller.register(self.task_outgoing, zmq.POLLIN)
+        poller.register(self.results_incoming, zmq.POLLIN)
+
+        # These are managers which we should examine in an iteration
+        # for scheduling a job (or maybe any other attention?).
+        # Anything altering the state of the manager should add it
+        # onto this list.
+        interesting_managers: Set[bytes] = set()
+
+        while not kill_event.is_set():
+            self.socks = dict(poller.poll(timeout=poll_period))
+
+            self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event)
+            self.process_results_incoming(interesting_managers, hub_channel)
+            self.expire_bad_managers(interesting_managers, hub_channel)
+            self.expire_drained_managers(interesting_managers, hub_channel)
+            self.process_tasks_to_send(interesting_managers)
+
+        self.zmq_context.destroy()
+        delta = time.time() - start
+        logger.info("Processed {} tasks in {} seconds".format(self.count, delta))
+        logger.warning("Exiting")
+
+    def process_task_outgoing_incoming(
+            self,
+            interesting_managers: Set[bytes],
+            hub_channel: Optional[zmq.Socket],
+            kill_event: threading.Event
+    ) -> None:
+        """Process one message from manager on the task_outgoing channel.
+        Note that this message flow is in contradiction to the name of the
+        channel - it is not an outgoing message and it is not a task.
+        """
+        if self.task_outgoing in self.socks and self.socks[self.task_outgoing] == zmq.POLLIN:
+            logger.debug("starting task_outgoing section")
+            message = self.task_outgoing.recv_multipart()
+            manager_id = message[0]
+
+            try:
+                msg = json.loads(message[1].decode('utf-8'))
+            except Exception:
+                logger.warning("Got Exception reading message from manager: {!r}".format(
+                    manager_id), exc_info=True)
+                logger.debug("Message: \n{!r}\n".format(message[1]))
+                return
+
+            # perform a bit of validation on the structure of the deserialized
+            # object, at least enough to behave like a deserialization error
+            # in obviously malformed cases
+            if not isinstance(msg, dict) or 'type' not in msg:
+                logger.error(f"JSON message was not correctly formatted from manager: {manager_id!r}")
+                logger.debug("Message: \n{!r}\n".format(message[1]))
+                return
+
+            if msg['type'] == 'registration':
+                # We set up an entry only if registration works correctly
+                self._ready_managers[manager_id] = {'last_heartbeat': time.time(),
+                                                    'idle_since': time.time(),
+                                                    'block_id': None,
+                                                    'max_capacity': 0,
+                                                    'worker_count': 0,
+                                                    'active': True,
+                                                    'draining': False,
+                                                    'parsl_version': msg['parsl_v'],
+                                                    'python_version': msg['python_v'],
+                                                    'tasks': []}
+                self.connected_block_history.append(msg['block_id'])
+
+                interesting_managers.add(manager_id)
+                logger.info("Adding manager: {!r} to ready queue".format(manager_id))
+                m = self._ready_managers[manager_id]
+
+                # m is a ManagerRecord, but msg is a dict[Any,Any] and so can
+                # contain arbitrary fields beyond those in ManagerRecord (and
+                # indeed does - for example, python_v) which are then ignored
+                # later.
+                m.update(msg)  # type: ignore[typeddict-item]
+
+                logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
+                self._send_monitoring_info(hub_channel, m)
+
+                if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
+                    msg['parsl_v'] != self.current_platform['parsl_v']):
+                    logger.error("Manager {!r} has incompatible version info with the interchange".format(manager_id))
+                    logger.debug("Setting kill event")
+                    kill_event.set()
+                    e = VersionMismatch("py.v={} parsl.v={}".format(self.current_platform['python_v'].rsplit(".", 1)[0],
+                                                                    self.current_platform['parsl_v']),
+                                        "py.v={} parsl.v={}".format(msg['python_v'].rsplit(".", 1)[0],
+                                                                    msg['parsl_v'])
+                                        )
+                    result_package = {'type': 'result', 'task_id': -1, 'exception': serialize_object(e)}
+                    pkl_package = pickle.dumps(result_package)
+                    self.results_outgoing.send(pkl_package)
+                    logger.error("Sent failure reports, shutting down interchange")
+                else:
+                    logger.info("Manager {!r} has compatible Parsl version {}".format(manager_id, msg['parsl_v']))
+                    logger.info("Manager {!r} has compatible Python version {}".format(manager_id,
+                                                                                       msg['python_v'].rsplit(".", 1)[0]))
+            elif msg['type'] == 'heartbeat':
+                self._ready_managers[manager_id]['last_heartbeat'] = time.time()
+                logger.debug("Manager {!r} sent heartbeat via tasks connection".format(manager_id))
+                self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+            elif msg['type'] == 'drain':
+                self._ready_managers[manager_id]['draining'] = True
+                logger.debug(f"Manager {manager_id!r} requested drain")
+            else:
+                logger.error(f"Unexpected message type received from manager: {msg['type']}")
+            logger.debug("leaving task_outgoing section")
+
+    def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+
+        for manager_id in list(interesting_managers):
+            # is it always true that a draining manager will be in interesting managers?
+            # i think so because it will have outstanding capacity?
+            m = self._ready_managers[manager_id]
+            if m['draining'] and len(m['tasks']) == 0:
+                logger.info(f"Manager {manager_id!r} is drained - sending drained message to manager")
+                self.task_outgoing.send_multipart([manager_id, b'', PKL_DRAINED_CODE])
+                interesting_managers.remove(manager_id)
+                self._ready_managers.pop(manager_id)
+
+                m['active'] = False
+                self._send_monitoring_info(hub_channel, m)
+
+    def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
+        # Check if there are tasks that could be sent to managers
+
+        logger.debug("Managers count (interesting/total): {interesting}/{total}".format(
+            total=len(self._ready_managers),
+            interesting=len(interesting_managers)))
+
+        if interesting_managers and not self.pending_task_queue.empty():
+            shuffled_managers = list(interesting_managers)
+            random.shuffle(shuffled_managers)
+
+            while shuffled_managers and not self.pending_task_queue.empty():  # cf. the if statement above...
+                manager_id = shuffled_managers.pop()
+                m = self._ready_managers[manager_id]
+                tasks_inflight = len(m['tasks'])
+                real_capacity = m['max_capacity'] - tasks_inflight
+
+                if (real_capacity and m['active'] and not m['draining']):
+                    tasks = self.get_tasks(real_capacity)
+                    if tasks:
+                        self.task_outgoing.send_multipart([manager_id, b'', pickle.dumps(tasks)])
+                        task_count = len(tasks)
+                        self.count += task_count
+                        tids = [t['task_id'] for t in tasks]
+                        m['tasks'].extend(tids)
+                        m['idle_since'] = None
+                        logger.debug("Sent tasks: {} to manager {!r}".format(tids, manager_id))
+                        # recompute real_capacity after sending tasks
+                        real_capacity = m['max_capacity'] - tasks_inflight
+                        if real_capacity > 0:
+                            logger.debug("Manager {!r} has free capacity {}".format(manager_id, real_capacity))
+                            # ... so keep it in the interesting_managers list
+                        else:
+                            logger.debug("Manager {!r} is now saturated".format(manager_id))
+                            interesting_managers.remove(manager_id)
+                else:
+                    interesting_managers.remove(manager_id)
+                    # logger.debug("Nothing to send to manager {}".format(manager_id))
+            logger.debug("leaving _ready_managers section, with {} managers still interesting".format(len(interesting_managers)))
+        else:
+            logger.debug("either no interesting managers or no tasks, so skipping manager pass")
+
+    def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+        # Receive any results and forward to client
+        if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
+            logger.debug("entering results_incoming section")
+            manager_id, *all_messages = self.results_incoming.recv_multipart()
+            if manager_id not in self._ready_managers:
+                logger.warning("Received a result from a un-registered manager: {!r}".format(manager_id))
+            else:
+                logger.debug(f"Got {len(all_messages)} result items in batch from manager {manager_id!r}")
+
+                b_messages = []
+
+                for p_message in all_messages:
+                    r = pickle.loads(p_message)
+                    if r['type'] == 'result':
+                        # process this for task ID and forward to executor
+                        b_messages.append((p_message, r))
+                    elif r['type'] == 'monitoring':
+                        # the monitoring code makes the assumption that no
+                        # monitoring messages will be received if monitoring
+                        # is not configured, and that hub_channel will only
+                        # be None when monitoring is not configurated.
+                        assert hub_channel is not None
+
+                        hub_channel.send_pyobj(r['payload'])
+                    elif r['type'] == 'heartbeat':
+                        logger.debug(f"Manager {manager_id!r} sent heartbeat via results connection")
+                        b_messages.append((p_message, r))
+                    else:
+                        logger.error("Interchange discarding result_queue message of unknown type: {}".format(r['type']))
+
+                got_result = False
+                m = self._ready_managers[manager_id]
+                for (_, r) in b_messages:
+                    assert 'type' in r, f"Message is missing type entry: {r}"
+                    if r['type'] == 'result':
+                        got_result = True
+                        try:
+                            logger.debug(f"Removing task {r['task_id']} from manager record {manager_id!r}")
+                            m['tasks'].remove(r['task_id'])
+                        except Exception:
+                            # If we reach here, there's something very wrong.
+                            logger.exception("Ignoring exception removing task_id {} for manager {!r} with task list {}".format(
+                                r['task_id'],
+                                manager_id,
+                                m['tasks']))
+
+                b_messages_to_send = []
+                for (b_message, _) in b_messages:
+                    b_messages_to_send.append(b_message)
+
+                if b_messages_to_send:
+                    logger.debug("Sending messages on results_outgoing")
+                    self.results_outgoing.send_multipart(b_messages_to_send)
+                    logger.debug("Sent messages on results_outgoing")
+
+                logger.debug(f"Current tasks on manager {manager_id!r}: {m['tasks']}")
+                if len(m['tasks']) == 0 and m['idle_since'] is None:
+                    m['idle_since'] = time.time()
+
+                # A manager is only made interesting here if a result was
+                # received, which means there should be capacity for a new
+                # task now. Heartbeats and monitoring messages do not make a
+                # manager become interesting.
+                if got_result:
+                    interesting_managers.add(manager_id)
+            logger.debug("leaving results_incoming section")
+
+    def expire_bad_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+        bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if
+                        time.time() - m['last_heartbeat'] > self.heartbeat_threshold]
+        for (manager_id, m) in bad_managers:
+            logger.debug("Last: {} Current: {}".format(m['last_heartbeat'], time.time()))
+            logger.warning(f"Too many heartbeats missed for manager {manager_id!r} - removing manager")
+            if m['active']:
+                m['active'] = False
+                self._send_monitoring_info(hub_channel, m)
+
+            logger.warning(f"Cancelling htex tasks {m['tasks']} on removed manager")
+            for tid in m['tasks']:
+                try:
+                    raise ManagerLost(manager_id, m['hostname'])
+                except Exception:
+                    result_package = {'type': 'result', 'task_id': tid, 'exception': serialize_object(RemoteExceptionWrapper(*sys.exc_info()))}
+                    pkl_package = pickle.dumps(result_package)
+                    self.results_outgoing.send(pkl_package)
+            logger.warning("Sent failure reports, unregistering manager")
+            self._ready_managers.pop(manager_id, 'None')
+            if manager_id in interesting_managers:
+                interesting_managers.remove(manager_id)
+
+
+def start_file_logger(filename: str, level: int = logging.DEBUG, format_string: Optional[str] = None) -> None:
+    """Add a stream log handler.
+
+    Parameters
+    ---------
+
+    filename: string
+        Name of the file to write logs to. Required.
+    level: logging.LEVEL
+        Set the logging level. Default=logging.DEBUG
+        - format_string (string): Set the format string
+    format_string: string
+        Format string to use.
+
+    Returns
+    -------
+        None.
+    """
+    if format_string is None:
+        format_string = (
+
+            "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d "
+            "%(processName)s(%(process)d) %(threadName)s "
+            "%(funcName)s [%(levelname)s] %(message)s"
+
+        )
+
+    global logger
+    logger = logging.getLogger(LOGGER_NAME)
+    logger.setLevel(level)
+    handler = logging.FileHandler(filename)
+    handler.setLevel(level)
+    formatter = logging.Formatter(format_string, datefmt='%Y-%m-%d %H:%M:%S')
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+
+if __name__ == "__main__":
+    setproctitle("parsl: HTEX interchange")
+
+    config = pickle.load(sys.stdin.buffer)
+
+    ic = Interchange(**config)
+    ic.start()
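The WORKER_PORTS round-trip that the executor performs against this script's command server is an ordinary ZeroMQ request/reply exchange. A hedged sketch of the client side (plain REQ socket with a receive timeout; parsl's real command client additionally layers CurveZMQ and its own timeout handling):

import zmq

def request_worker_ports(address="127.0.0.1", port=9000, timeout_ms=5000):
    ctx = zmq.Context()
    sock = ctx.socket(zmq.REQ)
    sock.setsockopt(zmq.RCVTIMEO, timeout_ms)  # fail instead of blocking forever
    sock.connect(f"tcp://{address}:{port}")
    try:
        sock.send_pyobj("WORKER_PORTS")
        return sock.recv_pyobj()  # e.g. (worker_task_port, worker_result_port)
    except zmq.Again:
        raise TimeoutError("command server did not reply in time")
    finally:
        sock.close()
        ctx.term()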
{parsl-2024.6.10.dist-info → parsl-2024.6.17.dist-info}/METADATA
CHANGED
@@ -1,9 +1,9 @@
 Metadata-Version: 2.1
 Name: parsl
-Version: 2024.6.10
+Version: 2024.6.17
 Summary: Simple data dependent workflows in Python
 Home-page: https://github.com/Parsl/parsl
-Download-URL: https://github.com/Parsl/parsl/archive/2024.06.10.tar.gz
+Download-URL: https://github.com/Parsl/parsl/archive/2024.06.17.tar.gz
 Author: The Parsl Team
 Author-email: parsl@googlegroups.com
 License: Apache 2.0
{parsl-2024.6.10.dist-info → parsl-2024.6.17.dist-info}/RECORD
CHANGED
@@ -8,9 +8,9 @@ parsl/multiprocessing.py,sha256=MyaEcEq-Qf860u7V98u-PZrPNdtzOZL_NW6EhIJnmfQ,1937
 parsl/process_loggers.py,sha256=uQ7Gd0W72Jz7rrcYlOMfLsAEhkRltxXJL2MgdduJjEw,1136
 parsl/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/utils.py,sha256=91FjQiTUY383ueAjkBAgE21My9nba6SP2a2SrbB1r1Q,11250
-parsl/version.py,sha256=
+parsl/version.py,sha256=bW2OSs-fGwn8vVUB6s290nY4Qe21tYD0iIBDkqvY198,131
 parsl/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parsl/app/app.py,sha256=
+parsl/app/app.py,sha256=D5Ok_gt99mlclM_QfZbquHUBkibyG4tYdUN9ijRwUnQ,8345
 parsl/app/bash.py,sha256=iTpWH1K5E0e60nH23bwl97zNgg5BssFIqfp-182wkjA,5656
 parsl/app/errors.py,sha256=nJmOEPglAISfD3R1UsTZH-avqiSOJgx_DkpdL9B591w,3917
 parsl/app/futures.py,sha256=XU1NwkoNVsxy3KF5y0Ihsla5hPbhhuSikZInfS7h7Uo,2910
@@ -80,8 +80,8 @@ parsl/executors/flux/executor.py,sha256=gPq49CQwtSZYZggLZ0dCXdpUlllKHJbvR8WRKeGh
 parsl/executors/flux/flux_instance_manager.py,sha256=2KVcphlybF-ALYD_3_YjMUi0f5LkjdoJOT_783CW4H0,2036
 parsl/executors/high_throughput/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/executors/high_throughput/errors.py,sha256=77ZGrw9suLh9tSWjyhCaIvnC9nRAOmrXsZmvHM6nT68,626
-parsl/executors/high_throughput/executor.py,sha256=
-parsl/executors/high_throughput/interchange.py,sha256=
+parsl/executors/high_throughput/executor.py,sha256=iRmAdQpHpmC0UDC5jDZ0O-BlZe_RhfItlqL5RIiD7os,37039
+parsl/executors/high_throughput/interchange.py,sha256=6avQQ8Ljtmuzpa5yjClswqdVEBPDnNBeKb_yn0XbVW4,31462
 parsl/executors/high_throughput/manager_record.py,sha256=9XppKjDW0DJ7SMkPNxsiDs-HvXGPLrTg6Ceyh4b6gNs,433
 parsl/executors/high_throughput/monitoring_info.py,sha256=HC0drp6nlXQpAop5PTUKNjdXMgtZVvrBL0JzZJebPP4,298
 parsl/executors/high_throughput/mpi_executor.py,sha256=B2CR1pHaGQzIwTrQ-_i08NZG-NwS6yr8y7nxPaa_rkA,3760
@@ -175,7 +175,7 @@ parsl/providers/grid_engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
 parsl/providers/grid_engine/grid_engine.py,sha256=jTQjKaJh4eEXGbhrrCcXFV4AVFo2t39iVpslDR8gF6o,8565
 parsl/providers/grid_engine/template.py,sha256=a7iViKr8LXcFTPmsf_qQeVK5o_RekOAIlUOF0X1q-2M,273
 parsl/providers/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parsl/providers/kubernetes/kube.py,sha256=
+parsl/providers/kubernetes/kube.py,sha256=K9-HsdjOJ_kKqUbqnSu4CjuZj9CjtJO8LDLIWIETiz0,14485
 parsl/providers/kubernetes/template.py,sha256=VsRz6cmNaII-y4OdMT6sCwzQy95SJX6NMB0hmmFBhX4,50
 parsl/providers/local/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/providers/local/local.py,sha256=pTEcl9NnjRcL8FHcMeMEtJj1IXiAOxZ2Cih97Q5jDPY,11388
@@ -343,7 +343,7 @@ parsl/tests/test_htex/test_connected_blocks.py,sha256=gaXZSr__pIaLvKY6rF-4r1p_4d
 parsl/tests/test_htex/test_cpu_affinity_explicit.py,sha256=DVHrRCskDbJIrfB5YSi3ZSbfR4WzijA46aZfZzjNcrU,1382
 parsl/tests/test_htex/test_disconnected_blocks.py,sha256=3V1Ol9gMS6knjLTgIjB5GrunRSp4ANsJ_2vAvpyMR6c,1858
 parsl/tests/test_htex/test_drain.py,sha256=Z2Z5-3NfLL9tMgJh4JkVKLZZDl3Z2gDAbEFHDSGdItw,2288
-parsl/tests/test_htex/test_htex.py,sha256
+parsl/tests/test_htex/test_htex.py,sha256=-gAD-c2h9EpgYG52IN4AUVBbsWnVD31-bIdaNffoGUY,4524
 parsl/tests/test_htex/test_manager_failure.py,sha256=N-obuSZ8f7XA_XcddoN2LWKSVtpKUZvTHb7BFelS3iQ,1143
 parsl/tests/test_htex/test_managers_command.py,sha256=Y-eUjtBzwW9erCYdph9bOesbkUvX8QUPqXt27DCgVS8,951
 parsl/tests/test_htex/test_missing_worker.py,sha256=gyp5i7_t-JHyJGtz_eXZKKBY5w8oqLOIxO6cJgGJMtQ,745
@@ -464,12 +464,13 @@ parsl/usage_tracking/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
 parsl/usage_tracking/api.py,sha256=iaCY58Dc5J4UM7_dJzEEs871P1p1HdxBMtNGyVdzc9g,1821
 parsl/usage_tracking/levels.py,sha256=xbfzYEsd55KiZJ-mzNgPebvOH4rRHum04hROzEf41tU,291
 parsl/usage_tracking/usage.py,sha256=qNEJ7nPimqd3Y7OWFLdYmNwJ6XDKlyfV_fTzasxsQw8,8690
-parsl-2024.6.
-parsl-2024.6.
-parsl-2024.6.
-parsl-2024.6.
-parsl-2024.6.
-parsl-2024.6.
-parsl-2024.6.
-parsl-2024.6.
-parsl-2024.6.
+parsl-2024.6.17.data/scripts/exec_parsl_function.py,sha256=RUkJ4JSJAjr7YyRZ58zhMdg8cR5dVV9odUl3AuzNf3k,7802
+parsl-2024.6.17.data/scripts/interchange.py,sha256=kI1fSF30txw_eEicPSsxbycuz6Sdxiiyy2xrrk7jlZU,31449
+parsl-2024.6.17.data/scripts/parsl_coprocess.py,sha256=zrVjEqQvFOHxsLufPi00xzMONagjVwLZbavPM7bbjK4,5722
+parsl-2024.6.17.data/scripts/process_worker_pool.py,sha256=weug6_LAMbqEKQhiI6ZMg8r3e-XBDw1-L5_COEt7caM,41879
+parsl-2024.6.17.dist-info/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
+parsl-2024.6.17.dist-info/METADATA,sha256=IwP3sPUnicwrXXDVoYqbCuTzXAA1am762Xpw2RuKfso,4124
+parsl-2024.6.17.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+parsl-2024.6.17.dist-info/entry_points.txt,sha256=XqnsWDYoEcLbsMcpnYGKLEnSBmaIe1YoM5YsBdJG2tI,176
+parsl-2024.6.17.dist-info/top_level.txt,sha256=PIheYoUFQtF2icLsgOykgU-Cjuwr2Oi6On2jo5RYgRM,6
+parsl-2024.6.17.dist-info/RECORD,,
Files without changes: exec_parsl_function.py, parsl_coprocess.py, process_worker_pool.py, LICENSE, WHEEL, entry_points.txt and top_level.txt (renamed from the parsl-2024.6.10 to the parsl-2024.6.17 data/dist-info directories, contents identical).