parsl 2024.11.18__py3-none-any.whl → 2024.12.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/addresses.py +19 -1
- parsl/channels/base.py +0 -28
- parsl/channels/local/local.py +0 -36
- parsl/configs/ASPIRE1.py +0 -1
- parsl/curvezmq.py +4 -0
- parsl/executors/execute_task.py +37 -0
- parsl/executors/flux/execute_parsl_task.py +1 -1
- parsl/executors/high_throughput/executor.py +29 -7
- parsl/executors/high_throughput/interchange.py +8 -7
- parsl/executors/high_throughput/mpi_executor.py +2 -0
- parsl/executors/high_throughput/mpi_resource_management.py +2 -3
- parsl/executors/high_throughput/probe.py +4 -4
- parsl/executors/high_throughput/process_worker_pool.py +15 -43
- parsl/executors/high_throughput/zmq_pipes.py +18 -8
- parsl/executors/radical/rpex_worker.py +2 -2
- parsl/executors/workqueue/exec_parsl_function.py +1 -1
- parsl/providers/condor/condor.py +3 -5
- parsl/providers/grid_engine/grid_engine.py +2 -3
- parsl/providers/local/local.py +1 -15
- parsl/providers/lsf/lsf.py +2 -12
- parsl/providers/pbspro/pbspro.py +1 -3
- parsl/providers/slurm/slurm.py +1 -11
- parsl/providers/torque/torque.py +1 -3
- parsl/serialize/facade.py +3 -3
- parsl/tests/configs/htex_local.py +1 -0
- parsl/tests/configs/htex_local_alternate.py +0 -1
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/manual_tests/test_udp_simple.py +0 -1
- parsl/tests/test_execute_task.py +29 -0
- parsl/tests/test_htex/test_zmq_binding.py +3 -2
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -1
- parsl/tests/test_monitoring/test_stdouterr.py +0 -1
- parsl/tests/unit/test_address.py +20 -0
- parsl/version.py +1 -1
- {parsl-2024.11.18.data → parsl-2024.12.2.data}/scripts/exec_parsl_function.py +1 -1
- {parsl-2024.11.18.data → parsl-2024.12.2.data}/scripts/interchange.py +8 -7
- {parsl-2024.11.18.data → parsl-2024.12.2.data}/scripts/process_worker_pool.py +15 -43
- {parsl-2024.11.18.dist-info → parsl-2024.12.2.dist-info}/METADATA +2 -2
- {parsl-2024.11.18.dist-info → parsl-2024.12.2.dist-info}/RECORD +44 -41
- {parsl-2024.11.18.dist-info → parsl-2024.12.2.dist-info}/WHEEL +1 -1
- {parsl-2024.11.18.data → parsl-2024.12.2.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.11.18.dist-info → parsl-2024.12.2.dist-info}/LICENSE +0 -0
- {parsl-2024.11.18.dist-info → parsl-2024.12.2.dist-info}/entry_points.txt +0 -0
- {parsl-2024.11.18.dist-info → parsl-2024.12.2.dist-info}/top_level.txt +0 -0
parsl/addresses.py
CHANGED
@@ -6,6 +6,7 @@ The helper to use depends on the network environment around the submitter,
|
|
6
6
|
so some experimentation will probably be needed to choose the correct one.
|
7
7
|
"""
|
8
8
|
|
9
|
+
import ipaddress
|
9
10
|
import logging
|
10
11
|
import platform
|
11
12
|
import socket
|
@@ -17,7 +18,7 @@ try:
|
|
17
18
|
except ImportError:
|
18
19
|
fcntl = None # type: ignore[assignment]
|
19
20
|
import struct
|
20
|
-
from typing import Callable, List, Set
|
21
|
+
from typing import Callable, List, Set, Union
|
21
22
|
|
22
23
|
import psutil
|
23
24
|
import typeguard
|
@@ -156,3 +157,20 @@ def get_any_address() -> str:
|
|
156
157
|
if addr == '':
|
157
158
|
raise Exception('Cannot find address of the local machine.')
|
158
159
|
return addr
|
160
|
+
|
161
|
+
|
162
|
+
def tcp_url(address: str, port: Union[str, int, None] = None) -> str:
|
163
|
+
"""Construct a tcp url safe for IPv4 and IPv6"""
|
164
|
+
if address == "*":
|
165
|
+
return "tcp://*"
|
166
|
+
|
167
|
+
ip_addr = ipaddress.ip_address(address)
|
168
|
+
|
169
|
+
port_suffix = f":{port}" if port else ""
|
170
|
+
|
171
|
+
if ip_addr.version == 6 and port_suffix:
|
172
|
+
url = f"tcp://[{address}]{port_suffix}"
|
173
|
+
else:
|
174
|
+
url = f"tcp://{address}{port_suffix}"
|
175
|
+
|
176
|
+
return url
|
parsl/channels/base.py
CHANGED
@@ -52,31 +52,3 @@ class Channel(metaclass=ABCMeta):
|
|
52
52
|
@script_dir.setter
|
53
53
|
def script_dir(self, value: str) -> None:
|
54
54
|
pass
|
55
|
-
|
56
|
-
@abstractmethod
|
57
|
-
def push_file(self, source: str, dest_dir: str) -> str:
|
58
|
-
''' Channel will take care of moving the file from source to the destination
|
59
|
-
directory
|
60
|
-
|
61
|
-
Args:
|
62
|
-
source (string) : Full filepath of the file to be moved
|
63
|
-
dest_dir (string) : Absolute path of the directory to move to
|
64
|
-
|
65
|
-
Returns:
|
66
|
-
destination_path (string)
|
67
|
-
'''
|
68
|
-
pass
|
69
|
-
|
70
|
-
@abstractmethod
|
71
|
-
def pull_file(self, remote_source: str, local_dir: str) -> str:
|
72
|
-
''' Transport file on the remote side to a local directory
|
73
|
-
|
74
|
-
Args:
|
75
|
-
remote_source (string): remote_source
|
76
|
-
local_dir (string): Local directory to copy to
|
77
|
-
|
78
|
-
|
79
|
-
Returns:
|
80
|
-
destination_path (string)
|
81
|
-
'''
|
82
|
-
pass
|
parsl/channels/local/local.py
CHANGED
@@ -1,10 +1,8 @@
|
|
1
1
|
import logging
|
2
2
|
import os
|
3
|
-
import shutil
|
4
3
|
import subprocess
|
5
4
|
|
6
5
|
from parsl.channels.base import Channel
|
7
|
-
from parsl.channels.errors import FileCopyException
|
8
6
|
from parsl.utils import RepresentationMixin
|
9
7
|
|
10
8
|
logger = logging.getLogger(__name__)
|
@@ -57,40 +55,6 @@ class LocalChannel(Channel, RepresentationMixin):
|
|
57
55
|
|
58
56
|
return (retcode, stdout.decode("utf-8"), stderr.decode("utf-8"))
|
59
57
|
|
60
|
-
def push_file(self, source, dest_dir):
|
61
|
-
''' If the source files dirpath is the same as dest_dir, a copy
|
62
|
-
is not necessary, and nothing is done. Else a copy is made.
|
63
|
-
|
64
|
-
Args:
|
65
|
-
- source (string) : Path to the source file
|
66
|
-
- dest_dir (string) : Path to the directory to which the files is to be copied
|
67
|
-
|
68
|
-
Returns:
|
69
|
-
- destination_path (String) : Absolute path of the destination file
|
70
|
-
|
71
|
-
Raises:
|
72
|
-
- FileCopyException : If file copy failed.
|
73
|
-
'''
|
74
|
-
|
75
|
-
local_dest = os.path.join(dest_dir, os.path.basename(source))
|
76
|
-
|
77
|
-
# Only attempt to copy if the target dir and source dir are different
|
78
|
-
if os.path.dirname(source) != dest_dir:
|
79
|
-
try:
|
80
|
-
shutil.copyfile(source, local_dest)
|
81
|
-
os.chmod(local_dest, 0o700)
|
82
|
-
|
83
|
-
except OSError as e:
|
84
|
-
raise FileCopyException(e, "localhost")
|
85
|
-
|
86
|
-
else:
|
87
|
-
os.chmod(local_dest, 0o700)
|
88
|
-
|
89
|
-
return local_dest
|
90
|
-
|
91
|
-
def pull_file(self, remote_source, local_dir):
|
92
|
-
return self.push_file(remote_source, local_dir)
|
93
|
-
|
94
58
|
@property
|
95
59
|
def script_dir(self):
|
96
60
|
return self._script_dir
|
parsl/configs/ASPIRE1.py
CHANGED
parsl/curvezmq.py
CHANGED
@@ -160,6 +160,9 @@ class ServerContext(BaseContext):
|
|
160
160
|
except zmq.ZMQError as e:
|
161
161
|
raise ValueError("Invalid CurveZMQ key format") from e
|
162
162
|
sock.setsockopt(zmq.CURVE_SERVER, True) # Must come before bind
|
163
|
+
|
164
|
+
# This flag enables IPV6 in addition to IPV4
|
165
|
+
sock.setsockopt(zmq.IPV6, True)
|
163
166
|
return sock
|
164
167
|
|
165
168
|
def term(self):
|
@@ -202,4 +205,5 @@ class ClientContext(BaseContext):
|
|
202
205
|
sock.setsockopt(zmq.CURVE_SERVERKEY, server_public_key)
|
203
206
|
except zmq.ZMQError as e:
|
204
207
|
raise ValueError("Invalid CurveZMQ key format") from e
|
208
|
+
sock.setsockopt(zmq.IPV6, True)
|
205
209
|
return sock
|
@@ -0,0 +1,37 @@
|
|
1
|
+
import os
|
2
|
+
|
3
|
+
from parsl.serialize import unpack_res_spec_apply_message
|
4
|
+
|
5
|
+
|
6
|
+
def execute_task(bufs: bytes):
|
7
|
+
"""Deserialize the buffer and execute the task.
|
8
|
+
Returns the result or throws exception.
|
9
|
+
"""
|
10
|
+
f, args, kwargs, resource_spec = unpack_res_spec_apply_message(bufs)
|
11
|
+
|
12
|
+
for varname in resource_spec:
|
13
|
+
envname = "PARSL_" + str(varname).upper()
|
14
|
+
os.environ[envname] = str(resource_spec[varname])
|
15
|
+
|
16
|
+
# We might need to look into callability of the function from itself
|
17
|
+
# since we change it's name in the new namespace
|
18
|
+
prefix = "parsl_"
|
19
|
+
fname = prefix + "f"
|
20
|
+
argname = prefix + "args"
|
21
|
+
kwargname = prefix + "kwargs"
|
22
|
+
resultname = prefix + "result"
|
23
|
+
|
24
|
+
code = "{0} = {1}(*{2}, **{3})".format(resultname, fname,
|
25
|
+
argname, kwargname)
|
26
|
+
|
27
|
+
user_ns = locals()
|
28
|
+
user_ns.update({
|
29
|
+
'__builtins__': __builtins__,
|
30
|
+
fname: f,
|
31
|
+
argname: args,
|
32
|
+
kwargname: kwargs,
|
33
|
+
resultname: resultname
|
34
|
+
})
|
35
|
+
|
36
|
+
exec(code, user_ns, user_ns)
|
37
|
+
return user_ns.get(resultname)
|
@@ -4,8 +4,8 @@ import argparse
|
|
4
4
|
import logging
|
5
5
|
import os
|
6
6
|
|
7
|
+
from parsl.executors.execute_task import execute_task
|
7
8
|
from parsl.executors.flux import TaskResult
|
8
|
-
from parsl.executors.high_throughput.process_worker_pool import execute_task
|
9
9
|
from parsl.serialize import serialize
|
10
10
|
|
11
11
|
|
@@ -86,7 +86,7 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionP
|
|
86
86
|
|
87
87
|
address : string
|
88
88
|
An address to connect to the main Parsl process which is reachable from the network in which
|
89
|
-
workers will be running. This field expects an IPv4 address
|
89
|
+
workers will be running. This field expects an IPv4 or IPv6 address.
|
90
90
|
Most login nodes on clusters have several network interfaces available, only some of which
|
91
91
|
can be reached from the compute nodes. This field can be used to limit the executor to listen
|
92
92
|
only on a specific interface, and limiting connections to the internal network.
|
@@ -94,6 +94,11 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionP
|
|
94
94
|
Setting an address here overrides the default behavior.
|
95
95
|
default=None
|
96
96
|
|
97
|
+
loopback_address: string
|
98
|
+
Specify address used for internal communication between executor and interchange.
|
99
|
+
Supports IPv4 and IPv6 addresses
|
100
|
+
default=127.0.0.1
|
101
|
+
|
97
102
|
worker_ports : (int, int)
|
98
103
|
Specify the ports to be used by workers to connect to Parsl. If this option is specified,
|
99
104
|
worker_port_range will not be honored.
|
@@ -224,6 +229,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
224
229
|
Parsl will create names as integers starting with 0.
|
225
230
|
|
226
231
|
default: empty list
|
232
|
+
|
227
233
|
"""
|
228
234
|
|
229
235
|
@typeguard.typechecked
|
@@ -233,6 +239,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
233
239
|
launch_cmd: Optional[str] = None,
|
234
240
|
interchange_launch_cmd: Optional[Sequence[str]] = None,
|
235
241
|
address: Optional[str] = None,
|
242
|
+
loopback_address: str = "127.0.0.1",
|
236
243
|
worker_ports: Optional[Tuple[int, int]] = None,
|
237
244
|
worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
|
238
245
|
interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
|
@@ -268,6 +275,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
268
275
|
self.address = address
|
269
276
|
self.address_probe_timeout = address_probe_timeout
|
270
277
|
self.manager_selector = manager_selector
|
278
|
+
self.loopback_address = loopback_address
|
279
|
+
|
271
280
|
if self.address:
|
272
281
|
self.all_addresses = address
|
273
282
|
else:
|
@@ -322,6 +331,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
322
331
|
interchange_launch_cmd = DEFAULT_INTERCHANGE_LAUNCH_CMD
|
323
332
|
self.interchange_launch_cmd = interchange_launch_cmd
|
324
333
|
|
334
|
+
self._result_queue_thread_exit = threading.Event()
|
335
|
+
self._result_queue_thread: Optional[threading.Thread] = None
|
336
|
+
|
325
337
|
radio_mode = "htex"
|
326
338
|
enable_mpi_mode: bool = False
|
327
339
|
mpi_launcher: str = "mpiexec"
|
@@ -408,13 +420,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
408
420
|
)
|
409
421
|
|
410
422
|
self.outgoing_q = zmq_pipes.TasksOutgoing(
|
411
|
-
|
423
|
+
self.loopback_address, self.interchange_port_range, self.cert_dir
|
412
424
|
)
|
413
425
|
self.incoming_q = zmq_pipes.ResultsIncoming(
|
414
|
-
|
426
|
+
self.loopback_address, self.interchange_port_range, self.cert_dir
|
415
427
|
)
|
416
428
|
self.command_client = zmq_pipes.CommandClient(
|
417
|
-
|
429
|
+
self.loopback_address, self.interchange_port_range, self.cert_dir
|
418
430
|
)
|
419
431
|
|
420
432
|
self._result_queue_thread = None
|
@@ -446,9 +458,11 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
446
458
|
"""
|
447
459
|
logger.debug("Result queue worker starting")
|
448
460
|
|
449
|
-
while not self.bad_state_is_set:
|
461
|
+
while not self.bad_state_is_set and not self._result_queue_thread_exit.is_set():
|
450
462
|
try:
|
451
|
-
msgs = self.incoming_q.get()
|
463
|
+
msgs = self.incoming_q.get(timeout_ms=self.poll_period)
|
464
|
+
if msgs is None: # timeout
|
465
|
+
continue
|
452
466
|
|
453
467
|
except IOError as e:
|
454
468
|
logger.exception("Caught broken queue with exception code {}: {}".format(e.errno, e))
|
@@ -506,6 +520,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
506
520
|
else:
|
507
521
|
raise BadMessage("Message received with unknown type {}".format(msg['type']))
|
508
522
|
|
523
|
+
logger.info("Closing result ZMQ pipe")
|
524
|
+
self.incoming_q.close()
|
509
525
|
logger.info("Result queue worker finished")
|
510
526
|
|
511
527
|
def _start_local_interchange_process(self) -> None:
|
@@ -515,7 +531,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
515
531
|
get the worker task and result ports that the interchange has bound to.
|
516
532
|
"""
|
517
533
|
|
518
|
-
interchange_config = {"client_address":
|
534
|
+
interchange_config = {"client_address": self.loopback_address,
|
519
535
|
"client_ports": (self.outgoing_q.port,
|
520
536
|
self.incoming_q.port,
|
521
537
|
self.command_client.port),
|
@@ -808,6 +824,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
808
824
|
|
809
825
|
logger.info("Attempting HighThroughputExecutor shutdown")
|
810
826
|
|
827
|
+
logger.info("Terminating interchange and result queue thread")
|
828
|
+
self._result_queue_thread_exit.set()
|
811
829
|
self.interchange_proc.terminate()
|
812
830
|
try:
|
813
831
|
self.interchange_proc.wait(timeout=timeout)
|
@@ -832,6 +850,10 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
832
850
|
logger.info("Closing command client")
|
833
851
|
self.command_client.close()
|
834
852
|
|
853
|
+
logger.info("Waiting for result queue thread exit")
|
854
|
+
if self._result_queue_thread:
|
855
|
+
self._result_queue_thread.join()
|
856
|
+
|
835
857
|
logger.info("Finished HighThroughputExecutor shutdown attempt")
|
836
858
|
|
837
859
|
def get_usage_information(self):
|
@@ -14,6 +14,7 @@ from typing import Any, Dict, List, NoReturn, Optional, Sequence, Set, Tuple, ca
|
|
14
14
|
import zmq
|
15
15
|
|
16
16
|
from parsl import curvezmq
|
17
|
+
from parsl.addresses import tcp_url
|
17
18
|
from parsl.app.errors import RemoteExceptionWrapper
|
18
19
|
from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch
|
19
20
|
from parsl.executors.high_throughput.manager_record import ManagerRecord
|
@@ -115,13 +116,13 @@ class Interchange:
|
|
115
116
|
self.zmq_context = curvezmq.ServerContext(self.cert_dir)
|
116
117
|
self.task_incoming = self.zmq_context.socket(zmq.DEALER)
|
117
118
|
self.task_incoming.set_hwm(0)
|
118
|
-
self.task_incoming.connect(
|
119
|
+
self.task_incoming.connect(tcp_url(client_address, client_ports[0]))
|
119
120
|
self.results_outgoing = self.zmq_context.socket(zmq.DEALER)
|
120
121
|
self.results_outgoing.set_hwm(0)
|
121
|
-
self.results_outgoing.connect(
|
122
|
+
self.results_outgoing.connect(tcp_url(client_address, client_ports[1]))
|
122
123
|
|
123
124
|
self.command_channel = self.zmq_context.socket(zmq.REP)
|
124
|
-
self.command_channel.connect(
|
125
|
+
self.command_channel.connect(tcp_url(client_address, client_ports[2]))
|
125
126
|
logger.info("Connected to client")
|
126
127
|
|
127
128
|
self.run_id = run_id
|
@@ -144,14 +145,14 @@ class Interchange:
|
|
144
145
|
self.worker_task_port = self.worker_ports[0]
|
145
146
|
self.worker_result_port = self.worker_ports[1]
|
146
147
|
|
147
|
-
self.task_outgoing.bind(
|
148
|
-
self.results_incoming.bind(
|
148
|
+
self.task_outgoing.bind(tcp_url(self.interchange_address, self.worker_task_port))
|
149
|
+
self.results_incoming.bind(tcp_url(self.interchange_address, self.worker_result_port))
|
149
150
|
|
150
151
|
else:
|
151
|
-
self.worker_task_port = self.task_outgoing.bind_to_random_port(
|
152
|
+
self.worker_task_port = self.task_outgoing.bind_to_random_port(tcp_url(self.interchange_address),
|
152
153
|
min_port=worker_port_range[0],
|
153
154
|
max_port=worker_port_range[1], max_tries=100)
|
154
|
-
self.worker_result_port = self.results_incoming.bind_to_random_port(
|
155
|
+
self.worker_result_port = self.results_incoming.bind_to_random_port(tcp_url(self.interchange_address),
|
155
156
|
min_port=worker_port_range[0],
|
156
157
|
max_port=worker_port_range[1], max_tries=100)
|
157
158
|
|
@@ -50,6 +50,7 @@ class MPIExecutor(HighThroughputExecutor):
|
|
50
50
|
launch_cmd: Optional[str] = None,
|
51
51
|
interchange_launch_cmd: Optional[str] = None,
|
52
52
|
address: Optional[str] = None,
|
53
|
+
loopback_address: str = "127.0.0.1",
|
53
54
|
worker_ports: Optional[Tuple[int, int]] = None,
|
54
55
|
worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
|
55
56
|
interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
|
@@ -78,6 +79,7 @@ class MPIExecutor(HighThroughputExecutor):
|
|
78
79
|
launch_cmd=launch_cmd,
|
79
80
|
interchange_launch_cmd=interchange_launch_cmd,
|
80
81
|
address=address,
|
82
|
+
loopback_address=loopback_address,
|
81
83
|
worker_ports=worker_ports,
|
82
84
|
worker_port_range=worker_port_range,
|
83
85
|
interchange_port_range=interchange_port_range,
|
@@ -160,9 +160,7 @@ class MPITaskScheduler(TaskScheduler):
|
|
160
160
|
"""Schedule task if resources are available otherwise backlog the task"""
|
161
161
|
user_ns = locals()
|
162
162
|
user_ns.update({"__builtins__": __builtins__})
|
163
|
-
_f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(
|
164
|
-
task_package["buffer"], user_ns, copy=False
|
165
|
-
)
|
163
|
+
_f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(task_package["buffer"])
|
166
164
|
|
167
165
|
nodes_needed = resource_spec.get("num_nodes")
|
168
166
|
if nodes_needed:
|
@@ -177,6 +175,7 @@ class MPITaskScheduler(TaskScheduler):
|
|
177
175
|
self._map_tasks_to_nodes[task_package["task_id"]] = allocated_nodes
|
178
176
|
buffer = pack_res_spec_apply_message(_f, _args, _kwargs, resource_spec)
|
179
177
|
task_package["buffer"] = buffer
|
178
|
+
task_package["resource_spec"] = resource_spec
|
180
179
|
|
181
180
|
self.pending_task_q.put(task_package)
|
182
181
|
|
@@ -6,7 +6,7 @@ import uuid
|
|
6
6
|
import zmq
|
7
7
|
from zmq.utils.monitor import recv_monitor_message
|
8
8
|
|
9
|
-
from parsl.addresses import get_all_addresses
|
9
|
+
from parsl.addresses import get_all_addresses, tcp_url
|
10
10
|
|
11
11
|
logger = logging.getLogger(__name__)
|
12
12
|
|
@@ -32,7 +32,8 @@ def probe_addresses(addresses, task_port, timeout=120):
|
|
32
32
|
for addr in addresses:
|
33
33
|
socket = context.socket(zmq.DEALER)
|
34
34
|
socket.setsockopt(zmq.LINGER, 0)
|
35
|
-
|
35
|
+
socket.setsockopt(zmq.IPV6, True)
|
36
|
+
url = tcp_url(addr, task_port)
|
36
37
|
logger.debug("Trying to connect back on {}".format(url))
|
37
38
|
socket.connect(url)
|
38
39
|
addr_map[addr] = {'sock': socket,
|
@@ -71,8 +72,7 @@ class TestWorker:
|
|
71
72
|
|
72
73
|
address = probe_addresses(addresses, port)
|
73
74
|
print("Viable address :", address)
|
74
|
-
self.task_incoming.connect(
|
75
|
-
print("Here")
|
75
|
+
self.task_incoming.connect(tcp_url(address, port))
|
76
76
|
|
77
77
|
def heartbeat(self):
|
78
78
|
""" Send heartbeat to the incoming task queue
|
@@ -22,7 +22,9 @@ import psutil
|
|
22
22
|
import zmq
|
23
23
|
|
24
24
|
from parsl import curvezmq
|
25
|
+
from parsl.addresses import tcp_url
|
25
26
|
from parsl.app.errors import RemoteExceptionWrapper
|
27
|
+
from parsl.executors.execute_task import execute_task
|
26
28
|
from parsl.executors.high_throughput.errors import WorkerLost
|
27
29
|
from parsl.executors.high_throughput.mpi_prefix_composer import (
|
28
30
|
VALID_LAUNCHERS,
|
@@ -35,7 +37,7 @@ from parsl.executors.high_throughput.mpi_resource_management import (
|
|
35
37
|
from parsl.executors.high_throughput.probe import probe_addresses
|
36
38
|
from parsl.multiprocessing import SpawnContext
|
37
39
|
from parsl.process_loggers import wrap_with_logs
|
38
|
-
from parsl.serialize import serialize
|
40
|
+
from parsl.serialize import serialize
|
39
41
|
from parsl.version import VERSION as PARSL_VERSION
|
40
42
|
|
41
43
|
HEARTBEAT_CODE = (2 ** 32) - 1
|
@@ -158,8 +160,8 @@ class Manager:
|
|
158
160
|
raise Exception("No viable address found")
|
159
161
|
else:
|
160
162
|
logger.info("Connection to Interchange successful on {}".format(ix_address))
|
161
|
-
task_q_url =
|
162
|
-
result_q_url =
|
163
|
+
task_q_url = tcp_url(ix_address, task_port)
|
164
|
+
result_q_url = tcp_url(ix_address, result_port)
|
163
165
|
logger.info("Task url : {}".format(task_q_url))
|
164
166
|
logger.info("Result url : {}".format(result_q_url))
|
165
167
|
except Exception:
|
@@ -590,45 +592,13 @@ def update_resource_spec_env_vars(mpi_launcher: str, resource_spec: Dict, node_i
|
|
590
592
|
os.environ[key] = prefix_table[key]
|
591
593
|
|
592
594
|
|
593
|
-
def
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
f, args, kwargs, resource_spec = unpack_res_spec_apply_message(bufs, user_ns, copy=False)
|
602
|
-
|
603
|
-
for varname in resource_spec:
|
604
|
-
envname = "PARSL_" + str(varname).upper()
|
605
|
-
os.environ[envname] = str(resource_spec[varname])
|
606
|
-
|
607
|
-
if resource_spec.get("MPI_NODELIST"):
|
608
|
-
worker_id = os.environ['PARSL_WORKER_RANK']
|
609
|
-
nodes_for_task = resource_spec["MPI_NODELIST"].split(',')
|
610
|
-
logger.info(f"Launching task on provisioned nodes: {nodes_for_task}")
|
611
|
-
assert mpi_launcher
|
612
|
-
update_resource_spec_env_vars(mpi_launcher,
|
613
|
-
resource_spec=resource_spec,
|
614
|
-
node_info=nodes_for_task)
|
615
|
-
# We might need to look into callability of the function from itself
|
616
|
-
# since we change it's name in the new namespace
|
617
|
-
prefix = "parsl_"
|
618
|
-
fname = prefix + "f"
|
619
|
-
argname = prefix + "args"
|
620
|
-
kwargname = prefix + "kwargs"
|
621
|
-
resultname = prefix + "result"
|
622
|
-
|
623
|
-
user_ns.update({fname: f,
|
624
|
-
argname: args,
|
625
|
-
kwargname: kwargs,
|
626
|
-
resultname: resultname})
|
627
|
-
|
628
|
-
code = "{0} = {1}(*{2}, **{3})".format(resultname, fname,
|
629
|
-
argname, kwargname)
|
630
|
-
exec(code, user_ns, user_ns)
|
631
|
-
return user_ns.get(resultname)
|
595
|
+
def _init_mpi_env(mpi_launcher: str, resource_spec: Dict):
|
596
|
+
node_list = resource_spec.get("MPI_NODELIST")
|
597
|
+
if node_list is None:
|
598
|
+
return
|
599
|
+
nodes_for_task = node_list.split(',')
|
600
|
+
logger.info(f"Launching task on provisioned nodes: {nodes_for_task}")
|
601
|
+
update_resource_spec_env_vars(mpi_launcher=mpi_launcher, resource_spec=resource_spec, node_info=nodes_for_task)
|
632
602
|
|
633
603
|
|
634
604
|
@wrap_with_logs(target="worker_log")
|
@@ -786,8 +756,10 @@ def worker(
|
|
786
756
|
ready_worker_count.value -= 1
|
787
757
|
worker_enqueued = False
|
788
758
|
|
759
|
+
_init_mpi_env(mpi_launcher=mpi_launcher, resource_spec=req["resource_spec"])
|
760
|
+
|
789
761
|
try:
|
790
|
-
result = execute_task(req['buffer']
|
762
|
+
result = execute_task(req['buffer'])
|
791
763
|
serialized_result = serialize(result, buffer_threshold=1000000)
|
792
764
|
except Exception as e:
|
793
765
|
logger.info('Caught an exception: {}'.format(e))
|
@@ -8,6 +8,7 @@ from typing import Optional
|
|
8
8
|
import zmq
|
9
9
|
|
10
10
|
from parsl import curvezmq
|
11
|
+
from parsl.addresses import tcp_url
|
11
12
|
from parsl.errors import InternalConsistencyError
|
12
13
|
from parsl.executors.high_throughput.errors import (
|
13
14
|
CommandClientBadError,
|
@@ -52,11 +53,11 @@ class CommandClient:
|
|
52
53
|
self.zmq_socket = self.zmq_context.socket(zmq.REQ)
|
53
54
|
self.zmq_socket.setsockopt(zmq.LINGER, 0)
|
54
55
|
if self.port is None:
|
55
|
-
self.port = self.zmq_socket.bind_to_random_port(
|
56
|
+
self.port = self.zmq_socket.bind_to_random_port(tcp_url(self.ip_address),
|
56
57
|
min_port=self.port_range[0],
|
57
58
|
max_port=self.port_range[1])
|
58
59
|
else:
|
59
|
-
self.zmq_socket.bind(
|
60
|
+
self.zmq_socket.bind(tcp_url(self.ip_address, self.port))
|
60
61
|
|
61
62
|
def run(self, message, max_retries=3, timeout_s=None):
|
62
63
|
""" This function needs to be fast at the same time aware of the possibility of
|
@@ -146,7 +147,7 @@ class TasksOutgoing:
|
|
146
147
|
self.zmq_context = curvezmq.ClientContext(cert_dir)
|
147
148
|
self.zmq_socket = self.zmq_context.socket(zmq.DEALER)
|
148
149
|
self.zmq_socket.set_hwm(0)
|
149
|
-
self.port = self.zmq_socket.bind_to_random_port(
|
150
|
+
self.port = self.zmq_socket.bind_to_random_port(tcp_url(ip_address),
|
150
151
|
min_port=port_range[0],
|
151
152
|
max_port=port_range[1])
|
152
153
|
self.poller = zmq.Poller()
|
@@ -202,15 +203,24 @@ class ResultsIncoming:
|
|
202
203
|
self.zmq_context = curvezmq.ClientContext(cert_dir)
|
203
204
|
self.results_receiver = self.zmq_context.socket(zmq.DEALER)
|
204
205
|
self.results_receiver.set_hwm(0)
|
205
|
-
self.port = self.results_receiver.bind_to_random_port(
|
206
|
+
self.port = self.results_receiver.bind_to_random_port(tcp_url(ip_address),
|
206
207
|
min_port=port_range[0],
|
207
208
|
max_port=port_range[1])
|
209
|
+
self.poller = zmq.Poller()
|
210
|
+
self.poller.register(self.results_receiver, zmq.POLLIN)
|
208
211
|
|
209
|
-
def get(self):
|
212
|
+
def get(self, timeout_ms=None):
|
213
|
+
"""Get a message from the queue, returning None if timeout expires
|
214
|
+
without a message. timeout is measured in milliseconds.
|
215
|
+
"""
|
210
216
|
logger.debug("Waiting for ResultsIncoming message")
|
211
|
-
|
212
|
-
|
213
|
-
|
217
|
+
socks = dict(self.poller.poll(timeout=timeout_ms))
|
218
|
+
if self.results_receiver in socks and socks[self.results_receiver] == zmq.POLLIN:
|
219
|
+
m = self.results_receiver.recv_multipart()
|
220
|
+
logger.debug("Received ResultsIncoming message")
|
221
|
+
return m
|
222
|
+
else:
|
223
|
+
return None
|
214
224
|
|
215
225
|
def close(self):
|
216
226
|
self.results_receiver.close()
|
@@ -4,7 +4,7 @@ import radical.pilot as rp
|
|
4
4
|
|
5
5
|
import parsl.app.errors as pe
|
6
6
|
from parsl.app.bash import remote_side_bash_executor
|
7
|
-
from parsl.executors.
|
7
|
+
from parsl.executors.execute_task import execute_task
|
8
8
|
from parsl.serialize import serialize, unpack_res_spec_apply_message
|
9
9
|
|
10
10
|
|
@@ -33,7 +33,7 @@ class ParslWorker:
|
|
33
33
|
|
34
34
|
try:
|
35
35
|
buffer = rp.utils.deserialize_bson(task['description']['executable'])
|
36
|
-
func, args, kwargs, _resource_spec = unpack_res_spec_apply_message(buffer
|
36
|
+
func, args, kwargs, _resource_spec = unpack_res_spec_apply_message(buffer)
|
37
37
|
ret = remote_side_bash_executor(func, *args, **kwargs)
|
38
38
|
exc = (None, None)
|
39
39
|
val = None
|
@@ -94,7 +94,7 @@ def unpack_source_code_function(function_info, user_namespace):
|
|
94
94
|
|
95
95
|
def unpack_byte_code_function(function_info, user_namespace):
|
96
96
|
from parsl.serialize import unpack_apply_message
|
97
|
-
func, args, kwargs = unpack_apply_message(function_info["byte code"]
|
97
|
+
func, args, kwargs = unpack_apply_message(function_info["byte code"])
|
98
98
|
return (func, 'parsl_function_name', args, kwargs)
|
99
99
|
|
100
100
|
|
parsl/providers/condor/condor.py
CHANGED
@@ -245,16 +245,14 @@ class CondorProvider(RepresentationMixin, ClusterProvider):
|
|
245
245
|
with open(userscript_path, 'w') as f:
|
246
246
|
f.write(job_config["worker_init"] + '\n' + wrapped_command)
|
247
247
|
|
248
|
-
|
249
|
-
the_input_files = [user_script_path] + self.transfer_input_files
|
248
|
+
the_input_files = [userscript_path] + self.transfer_input_files
|
250
249
|
job_config["input_files"] = ','.join(the_input_files)
|
251
|
-
job_config["job_script"] = os.path.basename(
|
250
|
+
job_config["job_script"] = os.path.basename(userscript_path)
|
252
251
|
|
253
252
|
# Construct and move the submit script
|
254
253
|
self._write_submit_script(template_string, script_path, job_name, job_config)
|
255
|
-
channel_script_path = self.channel.push_file(script_path, self.channel.script_dir)
|
256
254
|
|
257
|
-
cmd = "condor_submit {0}".format(
|
255
|
+
cmd = "condor_submit {0}".format(script_path)
|
258
256
|
try:
|
259
257
|
retcode, stdout, stderr = self.execute_wait(cmd)
|
260
258
|
except Exception as e:
|
@@ -142,11 +142,10 @@ class GridEngineProvider(ClusterProvider, RepresentationMixin):
|
|
142
142
|
logger.debug("Writing submit script")
|
143
143
|
self._write_submit_script(template_string, script_path, job_name, job_config)
|
144
144
|
|
145
|
-
channel_script_path = self.channel.push_file(script_path, self.channel.script_dir)
|
146
145
|
if self.queue is not None:
|
147
|
-
cmd = "qsub -q {0} -terse {1}".format(self.queue,
|
146
|
+
cmd = "qsub -q {0} -terse {1}".format(self.queue, script_path)
|
148
147
|
else:
|
149
|
-
cmd = "qsub -terse {0}".format(
|
148
|
+
cmd = "qsub -terse {0}".format(script_path)
|
150
149
|
retcode, stdout, stderr = self.execute_wait(cmd)
|
151
150
|
|
152
151
|
if retcode == 0:
|