parsl 2024.1.29__py3-none-any.whl → 2024.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/curvezmq.py +205 -0
- parsl/dataflow/dflow.py +1 -1
- parsl/executors/high_throughput/executor.py +78 -49
- parsl/executors/high_throughput/interchange.py +14 -7
- parsl/executors/high_throughput/process_worker_pool.py +15 -7
- parsl/executors/high_throughput/zmq_pipes.py +21 -15
- parsl/executors/taskvine/manager.py +44 -43
- parsl/monitoring/monitoring.py +18 -3
- parsl/providers/errors.py +4 -6
- parsl/providers/slurm/slurm.py +7 -6
- parsl/tests/configs/ad_hoc_cluster_htex.py +1 -0
- parsl/tests/configs/azure_single_node.py +1 -0
- parsl/tests/configs/bluewaters.py +1 -0
- parsl/tests/configs/bridges.py +1 -0
- parsl/tests/configs/cc_in2p3.py +1 -0
- parsl/tests/configs/comet.py +1 -0
- parsl/tests/configs/cooley_htex.py +1 -0
- parsl/tests/configs/ec2_single_node.py +1 -0
- parsl/tests/configs/ec2_spot.py +1 -0
- parsl/tests/configs/frontera.py +1 -0
- parsl/tests/configs/htex_ad_hoc_cluster.py +1 -0
- parsl/tests/configs/htex_local.py +1 -0
- parsl/tests/configs/htex_local_alternate.py +1 -0
- parsl/tests/configs/htex_local_intask_staging.py +1 -0
- parsl/tests/configs/htex_local_rsync_staging.py +1 -0
- parsl/tests/configs/local_adhoc.py +1 -0
- parsl/tests/configs/midway.py +1 -0
- parsl/tests/configs/nscc_singapore.py +1 -0
- parsl/tests/configs/osg_htex.py +1 -0
- parsl/tests/configs/petrelkube.py +1 -0
- parsl/tests/configs/summit.py +1 -0
- parsl/tests/configs/swan_htex.py +1 -0
- parsl/tests/configs/theta.py +1 -0
- parsl/tests/manual_tests/htex_local.py +1 -0
- parsl/tests/manual_tests/test_ad_hoc_htex.py +1 -0
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +1 -0
- parsl/tests/manual_tests/test_memory_limits.py +1 -0
- parsl/tests/scaling_tests/htex_local.py +1 -0
- parsl/tests/sites/test_affinity.py +1 -0
- parsl/tests/sites/test_concurrent.py +2 -1
- parsl/tests/sites/test_dynamic_executor.py +1 -0
- parsl/tests/sites/test_worker_info.py +1 -0
- parsl/tests/test_bash_apps/test_stdout.py +6 -1
- parsl/tests/test_curvezmq.py +455 -0
- parsl/tests/test_data/test_file_apps.py +5 -5
- parsl/tests/test_data/test_file_staging.py +3 -3
- parsl/tests/test_docs/test_kwargs.py +3 -3
- parsl/tests/test_htex/test_htex.py +46 -0
- parsl/tests/test_htex/test_htex_zmq_binding.py +53 -13
- parsl/tests/test_python_apps/test_futures.py +5 -5
- parsl/tests/test_regression/test_97_parallelism_0.py +1 -0
- parsl/tests/test_scaling/test_block_error_handler.py +6 -5
- parsl/tests/test_scaling/test_regression_1621.py +1 -0
- parsl/tests/test_scaling/test_scale_down.py +1 -0
- parsl/version.py +1 -1
- {parsl-2024.1.29.data → parsl-2024.2.5.data}/scripts/process_worker_pool.py +15 -7
- {parsl-2024.1.29.dist-info → parsl-2024.2.5.dist-info}/METADATA +3 -3
- {parsl-2024.1.29.dist-info → parsl-2024.2.5.dist-info}/RECORD +64 -61
- {parsl-2024.1.29.data → parsl-2024.2.5.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.1.29.data → parsl-2024.2.5.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.1.29.dist-info → parsl-2024.2.5.dist-info}/LICENSE +0 -0
- {parsl-2024.1.29.dist-info → parsl-2024.2.5.dist-info}/WHEEL +0 -0
- {parsl-2024.1.29.dist-info → parsl-2024.2.5.dist-info}/entry_points.txt +0 -0
- {parsl-2024.1.29.dist-info → parsl-2024.2.5.dist-info}/top_level.txt +0 -0
@@ -4,24 +4,28 @@ import zmq
|
|
4
4
|
import logging
|
5
5
|
import threading
|
6
6
|
|
7
|
+
from parsl import curvezmq
|
8
|
+
|
7
9
|
logger = logging.getLogger(__name__)
|
8
10
|
|
9
11
|
|
10
12
|
class CommandClient:
|
11
13
|
""" CommandClient
|
12
14
|
"""
|
13
|
-
def __init__(self, ip_address, port_range):
|
15
|
+
def __init__(self, zmq_context: curvezmq.ClientContext, ip_address, port_range):
|
14
16
|
"""
|
15
17
|
Parameters
|
16
18
|
----------
|
17
19
|
|
20
|
+
zmq_context: curvezmq.ClientContext
|
21
|
+
CurveZMQ client context used to create secure sockets
|
18
22
|
ip_address: str
|
19
23
|
IP address of the client (where Parsl runs)
|
20
24
|
port_range: tuple(int, int)
|
21
25
|
Port range for the comms between client and interchange
|
22
26
|
|
23
27
|
"""
|
24
|
-
self.context = zmq.Context()
|
28
|
+
self.zmq_context = zmq_context
|
25
29
|
self.ip_address = ip_address
|
26
30
|
self.port_range = port_range
|
27
31
|
self.port = None
|
@@ -33,7 +37,7 @@ class CommandClient:
|
|
33
37
|
|
34
38
|
Upon recreating the socket, we bind to the same port.
|
35
39
|
"""
|
36
|
-
self.zmq_socket = self.context.socket(zmq.REQ)
|
40
|
+
self.zmq_socket = self.zmq_context.socket(zmq.REQ)
|
37
41
|
self.zmq_socket.setsockopt(zmq.LINGER, 0)
|
38
42
|
if self.port is None:
|
39
43
|
self.port = self.zmq_socket.bind_to_random_port("tcp://{}".format(self.ip_address),
|
@@ -62,9 +66,7 @@ class CommandClient:
|
|
62
66
|
except zmq.ZMQError:
|
63
67
|
logger.exception("Potential ZMQ REQ-REP deadlock caught")
|
64
68
|
logger.info("Trying to reestablish context")
|
65
|
-
self.
|
66
|
-
self.context.destroy()
|
67
|
-
self.context = zmq.Context()
|
69
|
+
self.zmq_context.recreate()
|
68
70
|
self.create_socket_and_bind()
|
69
71
|
else:
|
70
72
|
break
|
@@ -77,25 +79,27 @@ class CommandClient:
|
|
77
79
|
|
78
80
|
def close(self):
|
79
81
|
self.zmq_socket.close()
|
80
|
-
self.context.term()
|
82
|
+
self.zmq_context.term()
|
81
83
|
|
82
84
|
|
83
85
|
class TasksOutgoing:
|
84
86
|
""" Outgoing task queue from the executor to the Interchange
|
85
87
|
"""
|
86
|
-
def __init__(self, ip_address, port_range):
|
88
|
+
def __init__(self, zmq_context: curvezmq.ClientContext, ip_address, port_range):
|
87
89
|
"""
|
88
90
|
Parameters
|
89
91
|
----------
|
90
92
|
|
93
|
+
zmq_context: curvezmq.ClientContext
|
94
|
+
CurveZMQ client context used to create secure sockets
|
91
95
|
ip_address: str
|
92
96
|
IP address of the client (where Parsl runs)
|
93
97
|
port_range: tuple(int, int)
|
94
98
|
Port range for the comms between client and interchange
|
95
99
|
|
96
100
|
"""
|
97
|
-
self.context = zmq.Context()
|
98
|
-
self.zmq_socket = self.context.socket(zmq.DEALER)
|
101
|
+
self.zmq_context = zmq_context
|
102
|
+
self.zmq_socket = self.zmq_context.socket(zmq.DEALER)
|
99
103
|
self.zmq_socket.set_hwm(0)
|
100
104
|
self.port = self.zmq_socket.bind_to_random_port("tcp://{}".format(ip_address),
|
101
105
|
min_port=port_range[0],
|
@@ -127,26 +131,28 @@ class TasksOutgoing:
|
|
127
131
|
|
128
132
|
def close(self):
|
129
133
|
self.zmq_socket.close()
|
130
|
-
self.context.term()
|
134
|
+
self.zmq_context.term()
|
131
135
|
|
132
136
|
|
133
137
|
class ResultsIncoming:
|
134
138
|
""" Incoming results queue from the Interchange to the executor
|
135
139
|
"""
|
136
140
|
|
137
|
-
def __init__(self, ip_address, port_range):
|
141
|
+
def __init__(self, zmq_context: curvezmq.ClientContext, ip_address, port_range):
|
138
142
|
"""
|
139
143
|
Parameters
|
140
144
|
----------
|
141
145
|
|
146
|
+
zmq_context: curvezmq.ClientContext
|
147
|
+
CurveZMQ client context used to create secure sockets
|
142
148
|
ip_address: str
|
143
149
|
IP address of the client (where Parsl runs)
|
144
150
|
port_range: tuple(int, int)
|
145
151
|
Port range for the comms between client and interchange
|
146
152
|
|
147
153
|
"""
|
148
|
-
self.context = zmq.Context()
|
149
|
-
self.results_receiver = self.context.socket(zmq.DEALER)
|
154
|
+
self.zmq_context = zmq_context
|
155
|
+
self.results_receiver = self.zmq_context.socket(zmq.DEALER)
|
150
156
|
self.results_receiver.set_hwm(0)
|
151
157
|
self.port = self.results_receiver.bind_to_random_port("tcp://{}".format(ip_address),
|
152
158
|
min_port=port_range[0],
|
@@ -160,4 +166,4 @@ class ResultsIncoming:
|
|
160
166
|
|
161
167
|
def close(self):
|
162
168
|
self.results_receiver.close()
|
163
|
-
self.context.term()
|
169
|
+
self.zmq_context.term()
|
@@ -47,6 +47,10 @@ def _set_manager_attributes(m, config):
|
|
47
47
|
if config.enable_peer_transfers:
|
48
48
|
m.enable_peer_transfers()
|
49
49
|
|
50
|
+
# Set catalog report to parsl if project name exists
|
51
|
+
if m.name:
|
52
|
+
m.set_property("framework", "parsl")
|
53
|
+
|
50
54
|
|
51
55
|
def _prepare_environment_serverless(manager_config, env_cache_dir, poncho_create_script):
|
52
56
|
# Return path to a packaged poncho environment
|
@@ -203,7 +207,7 @@ def _taskvine_submit_wait(ready_task_queue=None,
|
|
203
207
|
break
|
204
208
|
|
205
209
|
# Submit tasks
|
206
|
-
while ready_task_queue.qsize() > 0 and not should_stop.is_set():
|
210
|
+
while ready_task_queue.qsize() > 0 or m.empty() and not should_stop.is_set():
|
207
211
|
# Obtain task from ready_task_queue
|
208
212
|
try:
|
209
213
|
task = ready_task_queue.get(timeout=1)
|
@@ -393,48 +397,45 @@ def _taskvine_submit_wait(ready_task_queue=None,
|
|
393
397
|
|
394
398
|
# If the queue is not empty wait on the TaskVine queue for a task
|
395
399
|
task_found = True
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
result_file=None,
|
436
|
-
reason=reason,
|
437
|
-
status=t.exit_code))
|
400
|
+
while not m.empty() and task_found and not should_stop.is_set():
|
401
|
+
# Obtain the task from the queue
|
402
|
+
t = m.wait(1)
|
403
|
+
if t is None:
|
404
|
+
task_found = False
|
405
|
+
continue
|
406
|
+
logger.debug('Found a task')
|
407
|
+
executor_task_id = vine_id_to_executor_task_id[str(t.id)][0]
|
408
|
+
vine_id_to_executor_task_id.pop(str(t.id))
|
409
|
+
|
410
|
+
# When a task is found
|
411
|
+
result_file = result_file_of_task_id.pop(executor_task_id)
|
412
|
+
|
413
|
+
logger.debug(f"completed executor task info: {executor_task_id}, {t.category}, {t.command}, {t.std_output}")
|
414
|
+
|
415
|
+
# A task completes 'successfully' if it has a result file.
|
416
|
+
# A check whether the Python object represented using this file can be
|
417
|
+
# deserialized happens later in the collector thread of the executor
|
418
|
+
# process.
|
419
|
+
logger.debug("Looking for result in {}".format(result_file))
|
420
|
+
if os.path.exists(result_file):
|
421
|
+
logger.debug("Found result in {}".format(result_file))
|
422
|
+
finished_task_queue.put_nowait(VineTaskToParsl(executor_id=executor_task_id,
|
423
|
+
result_received=True,
|
424
|
+
result_file=result_file,
|
425
|
+
reason=None,
|
426
|
+
status=t.exit_code))
|
427
|
+
# If a result file could not be generated, explain the
|
428
|
+
# failure according to taskvine error codes.
|
429
|
+
else:
|
430
|
+
reason = _explain_taskvine_result(t)
|
431
|
+
logger.debug("Did not find result in {}".format(result_file))
|
432
|
+
logger.debug("Wrapper Script status: {}\nTaskVine Status: {}".format(t.exit_code, t.result))
|
433
|
+
logger.debug("Task with executor id {} / vine id {} failed because:\n{}".format(executor_task_id, t.id, reason))
|
434
|
+
finished_task_queue.put_nowait(VineTaskToParsl(executor_id=executor_task_id,
|
435
|
+
result_received=False,
|
436
|
+
result_file=None,
|
437
|
+
reason=reason,
|
438
|
+
status=t.exit_code))
|
438
439
|
|
439
440
|
logger.debug("Exiting TaskVine Monitoring Process")
|
440
441
|
return 0
|
parsl/monitoring/monitoring.py
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
import os
|
2
4
|
import socket
|
3
5
|
import time
|
@@ -11,7 +13,8 @@ import queue
|
|
11
13
|
import parsl.monitoring.remote
|
12
14
|
|
13
15
|
from parsl.multiprocessing import ForkProcess, SizedQueue
|
14
|
-
from multiprocessing import Process
|
16
|
+
from multiprocessing import Process
|
17
|
+
from multiprocessing.queues import Queue
|
15
18
|
from parsl.utils import RepresentationMixin
|
16
19
|
from parsl.process_loggers import wrap_with_logs
|
17
20
|
from parsl.utils import setproctitle
|
@@ -20,7 +23,7 @@ from parsl.serialize import deserialize
|
|
20
23
|
|
21
24
|
from parsl.monitoring.message_type import MessageType
|
22
25
|
from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
|
23
|
-
from typing import cast, Any, Callable, Dict, Optional, Sequence, Tuple, Union
|
26
|
+
from typing import cast, Any, Callable, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING
|
24
27
|
|
25
28
|
_db_manager_excepts: Optional[Exception]
|
26
29
|
|
@@ -171,7 +174,19 @@ class MonitoringHub(RepresentationMixin):
|
|
171
174
|
self.logger.debug("Initializing ZMQ Pipes to client")
|
172
175
|
self.monitoring_hub_active = True
|
173
176
|
|
174
|
-
|
177
|
+
# This annotation is incompatible with typeguard 4.x instrumentation
|
178
|
+
# of local variables: Queue is not subscriptable at runtime, as far
|
179
|
+
# as typeguard is concerned. The more general Queue annotation works,
|
180
|
+
# but does not restrict the contents of the Queue. Using TYPE_CHECKING
|
181
|
+
# here allows the stricter definition to be seen by mypy, and the
|
182
|
+
# simpler definition to be seen by typeguard. Hopefully at some point
|
183
|
+
# in the future, Queue will allow runtime subscripts.
|
184
|
+
|
185
|
+
if TYPE_CHECKING:
|
186
|
+
comm_q: Queue[Union[Tuple[int, int], str]]
|
187
|
+
else:
|
188
|
+
comm_q: Queue
|
189
|
+
|
175
190
|
comm_q = SizedQueue(maxsize=10)
|
176
191
|
|
177
192
|
self.exception_q: Queue[Tuple[str, str]]
|
parsl/providers/errors.py
CHANGED
@@ -51,20 +51,18 @@ class SubmitException(ExecutionProviderException):
|
|
51
51
|
'''Raised by the submit() method of a provider if there is an error in launching a job.
|
52
52
|
'''
|
53
53
|
|
54
|
-
def __init__(self, job_name, message, stdout=None, stderr=None):
|
54
|
+
def __init__(self, job_name, message, stdout=None, stderr=None, retcode=None):
|
55
55
|
self.job_name = job_name
|
56
56
|
self.message = message
|
57
57
|
self.stdout = stdout
|
58
58
|
self.stderr = stderr
|
59
|
+
self.retcode = retcode
|
59
60
|
|
60
61
|
@property
|
61
62
|
def task_name(self) -> str:
|
62
63
|
warnings.warn("task_name is deprecated; use .job_name instead. This will be removed after 2024-06.", DeprecationWarning)
|
63
64
|
return self.job_name
|
64
65
|
|
65
|
-
def __str__(self):
|
66
|
+
def __str__(self) -> str:
|
66
67
|
# TODO: make this more user-friendly
|
67
|
-
return "Cannot launch job {
|
68
|
-
self.message,
|
69
|
-
self.stdout,
|
70
|
-
self.stderr)
|
68
|
+
return f"Cannot launch job {self.job_name}: {self.message}; recode={self.retcode}, stdout={self.stdout}, stderr={self.stderr}"
|
parsl/providers/slurm/slurm.py
CHANGED
@@ -13,6 +13,7 @@ from parsl.jobs.states import JobState, JobStatus
|
|
13
13
|
from parsl.launchers import SingleNodeLauncher
|
14
14
|
from parsl.launchers.base import Launcher
|
15
15
|
from parsl.providers.cluster_provider import ClusterProvider
|
16
|
+
from parsl.providers.errors import SubmitException
|
16
17
|
from parsl.providers.slurm.template import template_string
|
17
18
|
from parsl.utils import RepresentationMixin, wtime_to_minutes
|
18
19
|
|
@@ -194,7 +195,7 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
|
|
194
195
|
logger.debug("Updating missing job {} to completed status".format(missing_job))
|
195
196
|
self.resources[missing_job]['status'] = JobStatus(JobState.COMPLETED)
|
196
197
|
|
197
|
-
def submit(self, command, tasks_per_node, job_name="parsl.slurm"):
|
198
|
+
def submit(self, command: str, tasks_per_node: int, job_name="parsl.slurm") -> str:
|
198
199
|
"""Submit the command as a slurm job.
|
199
200
|
|
200
201
|
Parameters
|
@@ -207,8 +208,8 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
|
|
207
208
|
Name for the job
|
208
209
|
Returns
|
209
210
|
-------
|
210
|
-
|
211
|
-
|
211
|
+
job id : str
|
212
|
+
A string identifier for the job
|
212
213
|
"""
|
213
214
|
|
214
215
|
scheduler_options = self.scheduler_options
|
@@ -254,21 +255,21 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
|
|
254
255
|
|
255
256
|
retcode, stdout, stderr = self.execute_wait("sbatch {0}".format(channel_script_path))
|
256
257
|
|
257
|
-
job_id = None
|
258
258
|
if retcode == 0:
|
259
259
|
for line in stdout.split('\n'):
|
260
260
|
match = re.match(self.regex_job_id, line)
|
261
261
|
if match:
|
262
262
|
job_id = match.group("id")
|
263
263
|
self.resources[job_id] = {'job_id': job_id, 'status': JobStatus(JobState.PENDING)}
|
264
|
-
|
264
|
+
return job_id
|
265
265
|
else:
|
266
266
|
logger.error("Could not read job ID from submit command standard output.")
|
267
267
|
logger.error("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip())
|
268
|
+
raise SubmitException(job_name, "Could not read job ID from submit command standard output", stdout=stdout, stderr=stderr, retcode=retcode)
|
268
269
|
else:
|
269
270
|
logger.error("Submit command failed")
|
270
271
|
logger.error("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip())
|
271
|
-
|
272
|
+
raise SubmitException(job_name, "Could not read job ID from submit command standard output", stdout=stdout, stderr=stderr, retcode=retcode)
|
272
273
|
|
273
274
|
def cancel(self, job_ids):
|
274
275
|
''' Cancels the jobs specified by a list of job ids
|
@@ -18,6 +18,7 @@ config = Config(
|
|
18
18
|
label='remote_htex',
|
19
19
|
max_workers=2,
|
20
20
|
worker_logdir_root=user_opts['adhoc']['script_dir'],
|
21
|
+
encrypted=True,
|
21
22
|
provider=AdHocProvider(
|
22
23
|
# Command to be run before starting a worker, such as:
|
23
24
|
# 'module load Anaconda; source activate parsl_env'.
|
parsl/tests/configs/bridges.py
CHANGED
parsl/tests/configs/cc_in2p3.py
CHANGED
parsl/tests/configs/comet.py
CHANGED
parsl/tests/configs/ec2_spot.py
CHANGED
parsl/tests/configs/frontera.py
CHANGED
parsl/tests/configs/midway.py
CHANGED
parsl/tests/configs/osg_htex.py
CHANGED
parsl/tests/configs/summit.py
CHANGED
parsl/tests/configs/swan_htex.py
CHANGED
parsl/tests/configs/theta.py
CHANGED
@@ -15,6 +15,7 @@ config = Config(
|
|
15
15
|
label='AdHoc',
|
16
16
|
max_workers=2,
|
17
17
|
worker_logdir_root="/scratch/midway2/yadunand/parsl_scripts",
|
18
|
+
encrypted=True,
|
18
19
|
provider=AdHocProvider(
|
19
20
|
worker_init="source /scratch/midway2/yadunand/parsl_env_setup.sh",
|
20
21
|
channels=[SSHChannel(hostname=m,
|
@@ -17,6 +17,7 @@ def make_config():
|
|
17
17
|
max_workers=2,
|
18
18
|
heartbeat_period=2,
|
19
19
|
heartbeat_threshold=4,
|
20
|
+
encrypted=True,
|
20
21
|
)
|
21
22
|
],
|
22
23
|
strategy='none',
|
@@ -24,7 +25,7 @@ def make_config():
|
|
24
25
|
|
25
26
|
|
26
27
|
@mark.local
|
27
|
-
def test_executor(
|
28
|
+
def test_executor():
|
28
29
|
my_config = make_config()
|
29
30
|
|
30
31
|
with ParslPoolExecutor(my_config) as exc:
|