parsl 2024.10.21__py3-none-any.whl → 2024.11.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/channels/base.py +0 -11
- parsl/channels/errors.py +0 -17
- parsl/channels/local/local.py +3 -16
- parsl/channels/ssh/ssh.py +0 -11
- parsl/dataflow/dflow.py +6 -6
- parsl/executors/high_throughput/executor.py +0 -1
- parsl/executors/high_throughput/interchange.py +8 -5
- parsl/executors/high_throughput/mpi_resource_management.py +0 -12
- parsl/executors/high_throughput/process_worker_pool.py +0 -8
- parsl/monitoring/db_manager.py +1 -1
- parsl/monitoring/monitoring.py +9 -11
- parsl/monitoring/radios.py +5 -16
- parsl/monitoring/remote.py +3 -5
- parsl/monitoring/router.py +4 -7
- parsl/monitoring/types.py +3 -6
- parsl/providers/__init__.py +0 -2
- parsl/providers/base.py +1 -17
- parsl/tests/conftest.py +4 -0
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/test_bash_apps/test_basic.py +3 -0
- parsl/tests/test_bash_apps/test_error_codes.py +4 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -0
- parsl/tests/test_bash_apps/test_memoize.py +2 -6
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +3 -0
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +1 -0
- parsl/tests/test_bash_apps/test_multiline.py +1 -0
- parsl/tests/test_bash_apps/test_stdout.py +2 -0
- parsl/tests/{integration/test_channels → test_channels}/test_local_channel.py +4 -8
- parsl/tests/test_docs/test_from_slides.py +3 -0
- parsl/tests/test_docs/test_kwargs.py +3 -0
- parsl/tests/test_monitoring/test_basic.py +13 -1
- parsl/tests/test_python_apps/test_outputs.py +1 -0
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +92 -0
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +1 -1
- parsl/tests/test_staging/test_docs_1.py +1 -0
- parsl/tests/test_staging/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +3 -0
- parsl/tests/test_staging/test_staging_stdout.py +2 -0
- parsl/version.py +1 -1
- {parsl-2024.10.21.data → parsl-2024.11.4.data}/scripts/interchange.py +8 -5
- {parsl-2024.10.21.data → parsl-2024.11.4.data}/scripts/process_worker_pool.py +0 -8
- {parsl-2024.10.21.dist-info → parsl-2024.11.4.dist-info}/METADATA +2 -2
- {parsl-2024.10.21.dist-info → parsl-2024.11.4.dist-info}/RECORD +51 -58
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -18
- {parsl-2024.10.21.data → parsl-2024.11.4.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.10.21.data → parsl-2024.11.4.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.10.21.dist-info → parsl-2024.11.4.dist-info}/LICENSE +0 -0
- {parsl-2024.10.21.dist-info → parsl-2024.11.4.dist-info}/WHEEL +0 -0
- {parsl-2024.10.21.dist-info → parsl-2024.11.4.dist-info}/entry_points.txt +0 -0
- {parsl-2024.10.21.dist-info → parsl-2024.11.4.dist-info}/top_level.txt +0 -0
parsl/channels/base.py
CHANGED
```diff
@@ -120,14 +120,3 @@ class Channel(metaclass=ABCMeta):
             Path of directory to check.
         """
         pass
-
-    @abstractmethod
-    def abspath(self, path: str) -> str:
-        """Return the absolute path.
-
-        Parameters
-        ----------
-        path : str
-            Path for which the absolute path will be returned.
-        """
-        pass
```
parsl/channels/errors.py
CHANGED
```diff
@@ -1,7 +1,5 @@
 ''' Exceptions raise by Apps.
 '''
-from typing import Optional
-
 from parsl.errors import ParslError
 
 
@@ -60,21 +58,6 @@ class BadPermsScriptPath(ChannelError):
         super().__init__("User does not have permissions to access the script_dir", e, hostname)
 
 
-class FileExists(ChannelError):
-    ''' Push or pull of file over channel fails since a file of the name already
-    exists on the destination.
-
-    Contains:
-    reason(string)
-    e (paramiko exception object)
-    hostname (string)
-    '''
-
-    def __init__(self, e: Exception, hostname: str, filename: Optional[str] = None) -> None:
-        super().__init__("File name collision in channel transport phase: {}".format(filename),
-                         e, hostname)
-
-
 class AuthException(ChannelError):
     ''' An error raised during execution of an app.
     What this exception contains depends entirely on context
```
parsl/channels/local/local.py
CHANGED
```diff
@@ -37,19 +37,16 @@ class LocalChannel(Channel, RepresentationMixin):
 
         Args:
             - cmd (string) : Commandline string to execute
-             - walltime (int) : walltime in seconds
+            - walltime (int) : walltime in seconds
 
         Kwargs:
             - envs (dict) : Dictionary of env variables. This will be used
                 to override the envs set at channel initialization.
 
         Returns:
-             - retcode : Return code from the execution
+            - retcode : Return code from the execution
             - stdout : stdout string
             - stderr : stderr string
-
-        Raises:
-        None.
         '''
         current_env = copy.deepcopy(self._envs)
         current_env.update(envs)
@@ -145,16 +142,6 @@ class LocalChannel(Channel, RepresentationMixin):
 
         return os.makedirs(path, mode, exist_ok)
 
-    def abspath(self, path):
-        """Return the absolute path.
-
-        Parameters
-        ----------
-        path : str
-            Path for which the absolute path will be returned.
-        """
-        return os.path.abspath(path)
-
     @property
     def script_dir(self):
         return self._script_dir
@@ -162,5 +149,5 @@ class LocalChannel(Channel, RepresentationMixin):
     @script_dir.setter
     def script_dir(self, value):
         if value is not None:
-            value = self.abspath(value)
+            value = os.path.abspath(value)
         self._script_dir = value
```
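The upshot of the LocalChannel change: path normalization no longer routes through a `Channel.abspath` method, and the `script_dir` setter calls `os.path.abspath` directly. A minimal, self-contained sketch of that pattern (the class below is illustrative, not parsl's):

```python
import os


class ScriptDirHolder:
    """Toy stand-in showing the post-change normalization pattern."""

    def __init__(self):
        self._script_dir = None

    @property
    def script_dir(self):
        return self._script_dir

    @script_dir.setter
    def script_dir(self, value):
        if value is not None:
            value = os.path.abspath(value)  # was: value = self.abspath(value)
        self._script_dir = value


holder = ScriptDirHolder()
holder.script_dir = "runinfo/000/submit_scripts"
print(holder.script_dir)  # absolute path rooted at the current working directory
```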
parsl/channels/ssh/ssh.py
CHANGED
```diff
@@ -214,7 +214,6 @@ class DeprecatedSSHChannel(Channel, RepresentationMixin):
             - str: Local path to file
 
         Raises:
-            - FileExists : Name collision at local directory.
             - FileCopyException : FileCopy failed.
         '''
 
@@ -287,16 +286,6 @@ class DeprecatedSSHChannel(Channel, RepresentationMixin):
         self.execute_wait('mkdir -p {}'.format(path))
         self._valid_sftp_client().chmod(path, mode)
 
-    def abspath(self, path):
-        """Return the absolute path on the remote side.
-
-        Parameters
-        ----------
-        path : str
-            Path for which the absolute path will be returned.
-        """
-        return self._valid_sftp_client().normalize(path)
-
     @property
     def script_dir(self):
         return self._script_dir
```
parsl/dataflow/dflow.py
CHANGED
```diff
@@ -162,8 +162,8 @@ class DataFlowKernel:
         }
 
         if self.monitoring:
-            self.monitoring.send(MessageType.WORKFLOW_INFO,
-                                 workflow_info)
+            self.monitoring.send((MessageType.WORKFLOW_INFO,
+                                  workflow_info))
 
         if config.checkpoint_files is not None:
             checkpoints = self.load_checkpoints(config.checkpoint_files)
@@ -238,7 +238,7 @@ class DataFlowKernel:
     def _send_task_log_info(self, task_record: TaskRecord) -> None:
         if self.monitoring:
             task_log_info = self._create_task_log_info(task_record)
-            self.monitoring.send(MessageType.TASK_INFO, task_log_info)
+            self.monitoring.send((MessageType.TASK_INFO, task_log_info))
 
     def _create_task_log_info(self, task_record: TaskRecord) -> Dict[str, Any]:
         """
@@ -987,7 +987,7 @@ class DataFlowKernel:
             - app_kwargs (dict) : Rest of the kwargs to the fn passed as dict.
 
         Returns:
-
+            AppFuture
 
         """
 
@@ -1295,12 +1295,12 @@ class DataFlowKernel:
 
         if self.monitoring:
             logger.info("Sending final monitoring message")
-            self.monitoring.send(MessageType.WORKFLOW_INFO,
+            self.monitoring.send((MessageType.WORKFLOW_INFO,
                                  {'tasks_failed_count': self.task_state_counts[States.failed],
                                   'tasks_completed_count': self.task_state_counts[States.exec_done],
                                   "time_began": self.time_began,
                                   'time_completed': self.time_completed,
-                                  'run_id': self.run_id, 'rundir': self.run_dir})
+                                  'run_id': self.run_id, 'rundir': self.run_dir}))
 
         logger.info("Terminating monitoring")
         self.monitoring.close()
```
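Across dflow.py the monitoring hub's `send` now takes one pre-tagged `(MessageType, payload)` tuple instead of separate `mtype` and `message` arguments. A hedged sketch of the new calling convention; `FakeHub` and the enum values are stand-ins, not parsl's API:

```python
from enum import Enum
from typing import Any, Dict, Tuple

TaggedMonitoringMessage = Tuple["MessageType", Dict[str, Any]]


class MessageType(Enum):  # illustrative stub of parsl.monitoring.message_type
    TASK_INFO = 0
    WORKFLOW_INFO = 1


class FakeHub:
    def send(self, message: TaggedMonitoringMessage) -> None:
        mtype, payload = message  # the tag now travels with the payload
        print(f"{mtype.name}: {payload}")


hub = FakeHub()
# old style (two arguments) is gone: hub.send(MessageType.TASK_INFO, {...})
hub.send((MessageType.TASK_INFO, {"task_id": 1, "status": "exec_done"}))
```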
parsl/executors/high_throughput/executor.py
CHANGED
```diff
@@ -63,7 +63,6 @@ DEFAULT_INTERCHANGE_LAUNCH_CMD = ["interchange.py"]
 
 GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
        Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
-        :class:`~parsl.providers.cobalt.cobalt.Cobalt`,
         :class:`~parsl.providers.condor.condor.Condor`,
         :class:`~parsl.providers.googlecloud.googlecloud.GoogleCloud`,
         :class:`~parsl.providers.gridEngine.gridEngine.GridEngine`,
```
parsl/executors/high_throughput/interchange.py
CHANGED
```diff
@@ -66,7 +66,7 @@ class Interchange:
         If specified the interchange will only listen on this address for connections from workers
         else, it binds to all addresses.
 
-    client_ports :
+    client_ports : tuple(int, int, int)
         The ports at which the client can be reached
 
     worker_ports : tuple(int, int)
@@ -104,7 +104,6 @@ class Interchange:
         os.makedirs(self.logdir, exist_ok=True)
 
         start_file_logger("{}/interchange.log".format(self.logdir), level=logging_level)
-        logger.propagate = False
         logger.debug("Initializing Interchange process")
 
         self.client_address = client_address
@@ -437,9 +436,13 @@ class Interchange:
             logger.info(f"Manager {manager_id!r} has compatible Parsl version {msg['parsl_v']}")
             logger.info(f"Manager {manager_id!r} has compatible Python version {msg['python_v'].rsplit('.', 1)[0]}")
         elif msg['type'] == 'heartbeat':
-            self._ready_managers[manager_id]['last_heartbeat'] = time.time()
-            logger.debug("Manager %r sent heartbeat via tasks connection", manager_id)
-            self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+            manager = self._ready_managers.get(manager_id)
+            if manager:
+                manager['last_heartbeat'] = time.time()
+                logger.debug("Manager %r sent heartbeat via tasks connection", manager_id)
+                self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+            else:
+                logger.warning("Received heartbeat via tasks connection for not-registered manager %r", manager_id)
         elif msg['type'] == 'drain':
             self._ready_managers[manager_id]['draining'] = True
             logger.debug("Manager %r requested drain", manager_id)
```
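The reworked heartbeat branch (covered by the new test_worker_interchange_bad_messages_3262.py in the listing) looks the manager up with `.get()` before mutating it, so a heartbeat from a never-registered manager logs a warning instead of raising `KeyError`. A self-contained sketch of the guard, with assumed names:

```python
import logging
import time

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("interchange-sketch")

ready_managers = {}  # manager_id -> manager record, like _ready_managers


def handle_heartbeat(manager_id: bytes) -> None:
    manager = ready_managers.get(manager_id)  # .get(), not [], so no KeyError
    if manager:
        manager['last_heartbeat'] = time.time()
        logger.debug("Manager %r sent heartbeat", manager_id)
    else:
        logger.warning("Heartbeat from not-registered manager %r", manager_id)


ready_managers[b'mgr-1'] = {'last_heartbeat': 0.0}
handle_heartbeat(b'mgr-1')  # updates last_heartbeat
handle_heartbeat(b'mgr-2')  # warns instead of crashing the interchange
```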
parsl/executors/high_throughput/mpi_resource_management.py
CHANGED
```diff
@@ -17,7 +17,6 @@ class Scheduler(Enum):
     Unknown = 0
     Slurm = 1
     PBS = 2
-    Cobalt = 3
 
 
 def get_slurm_hosts_list() -> List[str]:
@@ -37,13 +36,6 @@ def get_pbs_hosts_list() -> List[str]:
         return [line.strip() for line in f.readlines()]
 
 
-def get_cobalt_hosts_list() -> List[str]:
-    """Get list of COBALT hosts from envvar: COBALT_NODEFILE"""
-    nodefile_name = os.environ["COBALT_NODEFILE"]
-    with open(nodefile_name) as f:
-        return [line.strip() for line in f.readlines()]
-
-
 def get_nodes_in_batchjob(scheduler: Scheduler) -> List[str]:
     """Get nodelist from all supported schedulers"""
     nodelist = []
@@ -51,8 +43,6 @@ def get_nodes_in_batchjob(scheduler: Scheduler) -> List[str]:
         nodelist = get_slurm_hosts_list()
     elif scheduler == Scheduler.PBS:
         nodelist = get_pbs_hosts_list()
-    elif scheduler == Scheduler.Cobalt:
-        nodelist = get_cobalt_hosts_list()
     else:
         raise RuntimeError(f"mpi_mode does not support scheduler:{scheduler}")
     return nodelist
@@ -64,8 +54,6 @@ def identify_scheduler() -> Scheduler:
         return Scheduler.Slurm
     elif os.environ.get("PBS_NODEFILE"):
         return Scheduler.PBS
-    elif os.environ.get("COBALT_NODEFILE"):
-        return Scheduler.Cobalt
     else:
         return Scheduler.Unknown
 
```
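Condensed from the hunks above: scheduler detection now recognizes only Slurm and PBS node files, so an environment that sets only `COBALT_NODEFILE` falls through to `Scheduler.Unknown`. A runnable distillation (not the full parsl module):

```python
import os
from enum import Enum


class Scheduler(Enum):
    Unknown = 0
    Slurm = 1
    PBS = 2


def identify_scheduler() -> Scheduler:
    # Post-change detection order; COBALT_NODEFILE is no longer consulted.
    if os.environ.get("SLURM_NODELIST"):
        return Scheduler.Slurm
    elif os.environ.get("PBS_NODEFILE"):
        return Scheduler.PBS
    else:
        return Scheduler.Unknown


os.environ["COBALT_NODEFILE"] = "/var/tmp/cobalt.nodefile"
print(identify_scheduler())  # Scheduler.Unknown
```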
parsl/executors/high_throughput/process_worker_pool.py
CHANGED
```diff
@@ -650,14 +650,6 @@ def worker(
         debug: bool,
         mpi_launcher: str,
 ):
-    """
-
-    Put request token into queue
-    Get task from task_queue
-    Pop request from queue
-    Put result into result_queue
-    """
-
     # override the global logger inherited from the __main__ process (which
     # usually logs to manager.log) with one specific to this worker.
     global logger
```
parsl/monitoring/db_manager.py
CHANGED
```diff
@@ -556,7 +556,7 @@ class DatabaseManager:
             logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s",
                          kill_event.is_set(), logs_queue.qsize() != 0)
             try:
-                x, addr = logs_queue.get(timeout=0.1)
+                x = logs_queue.get(timeout=0.1)
             except queue.Empty:
                 continue
             else:
```
parsl/monitoring/monitoring.py
CHANGED
```diff
@@ -7,16 +7,15 @@ import queue
 import time
 from multiprocessing import Event, Process
 from multiprocessing.queues import Queue
-from typing import TYPE_CHECKING, Any, Literal, Optional, Tuple, Union, cast
+from typing import TYPE_CHECKING, Literal, Optional, Tuple, Union, cast
 
 import typeguard
 
 from parsl.log_utils import set_file_logger
 from parsl.monitoring.errors import MonitoringHubStartError
-from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.radios import MultiprocessingQueueRadioSender
 from parsl.monitoring.router import router_starter
-from parsl.monitoring.types import AddressedMonitoringMessage
+from parsl.monitoring.types import TaggedMonitoringMessage
 from parsl.multiprocessing import ForkProcess, SizedQueue
 from parsl.process_loggers import wrap_with_logs
 from parsl.serialize import deserialize
@@ -138,7 +137,7 @@ class MonitoringHub(RepresentationMixin):
         self.exception_q: Queue[Tuple[str, str]]
         self.exception_q = SizedQueue(maxsize=10)
 
-        self.resource_msgs: Queue[Union[AddressedMonitoringMessage, Tuple[Literal["STOP"], Literal[None]]]]
+        self.resource_msgs: Queue[Union[TaggedMonitoringMessage, Literal["STOP"]]]
         self.resource_msgs = SizedQueue()
 
         self.router_exit_event: ms.Event
@@ -202,10 +201,9 @@ class MonitoringHub(RepresentationMixin):
 
         self.hub_zmq_port = zmq_port
 
-    def send(self, mtype: MessageType, message: Any) -> None:
-        logger.debug("Sending message type {}".format(mtype))
-
-        self.radio.send((mtype, message))
+    def send(self, message: TaggedMonitoringMessage) -> None:
+        logger.debug("Sending message type {}".format(message[0]))
+        self.radio.send(message)
 
     def close(self) -> None:
         logger.info("Terminating Monitoring Hub")
@@ -237,7 +235,7 @@ class MonitoringHub(RepresentationMixin):
         logger.debug("Finished waiting for router termination")
         if len(exception_msgs) == 0:
             logger.debug("Sending STOP to DBM")
-            self.resource_msgs.put(("STOP", None))
+            self.resource_msgs.put("STOP")
         else:
             logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
         logger.debug("Waiting for DB termination")
@@ -261,7 +259,7 @@ class MonitoringHub(RepresentationMixin):
 
 
 @wrap_with_logs
-def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
+def filesystem_receiver(logdir: str, q: "queue.Queue[TaggedMonitoringMessage]", run_dir: str) -> None:
     logger = set_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
                              name="monitoring_filesystem_radio",
                              level=logging.INFO)
@@ -288,7 +286,7 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
                 message = deserialize(f.read())
                 logger.debug(f"Message received is: {message}")
                 assert isinstance(message, tuple)
-                q.put(cast(AddressedMonitoringMessage, message))
+                q.put(cast(TaggedMonitoringMessage, message))
                 os.remove(full_path_filename)
             except Exception:
                 logger.exception(f"Exception processing {filename} - probably will be retried next iteration")
```
parsl/monitoring/radios.py
CHANGED
```diff
@@ -5,15 +5,11 @@ import socket
 import uuid
 from abc import ABCMeta, abstractmethod
 from multiprocessing.queues import Queue
-from typing import Optional
 
 import zmq
 
 from parsl.serialize import serialize
 
-_db_manager_excepts: Optional[Exception]
-
-
 logger = logging.getLogger(__name__)
 
 
@@ -41,9 +37,8 @@ class FilesystemRadioSender(MonitoringRadioSender):
     the UDP radio, but should be much more reliable.
     """
 
-    def __init__(self, *, monitoring_url: str, source_id: int, timeout: int = 10, run_dir: str):
+    def __init__(self, *, monitoring_url: str, timeout: int = 10, run_dir: str):
         logger.info("filesystem based monitoring channel initializing")
-        self.source_id = source_id
         self.base_path = f"{run_dir}/monitor-fs-radio/"
         self.tmp_path = f"{self.base_path}/tmp"
         self.new_path = f"{self.base_path}/new"
@@ -58,7 +53,7 @@ class FilesystemRadioSender(MonitoringRadioSender):
 
         tmp_filename = f"{self.tmp_path}/{unique_id}"
         new_filename = f"{self.new_path}/{unique_id}"
-        buffer = (message, "NA")
+        buffer = message
 
         # this will write the message out then atomically
         # move it into new/, so that a partially written
@@ -70,19 +65,16 @@ class FilesystemRadioSender(MonitoringRadioSender):
 
 class HTEXRadioSender(MonitoringRadioSender):
 
-    def __init__(self, monitoring_url: str, source_id: int, timeout: int = 10):
+    def __init__(self, monitoring_url: str, timeout: int = 10):
         """
         Parameters
         ----------
 
         monitoring_url : str
             URL of the form <scheme>://<IP>:<PORT>
-        source_id : str
-            String identifier of the source
         timeout : int
             timeout, default=10s
         """
-        self.source_id = source_id
         logger.info("htex-based monitoring channel initialising")
 
     def send(self, message: object) -> None:
@@ -124,21 +116,18 @@ class HTEXRadioSender(MonitoringRadioSender):
 
 class UDPRadioSender(MonitoringRadioSender):
 
-    def __init__(self, monitoring_url: str, source_id: int, timeout: int = 10):
+    def __init__(self, monitoring_url: str, timeout: int = 10):
         """
         Parameters
         ----------
 
         monitoring_url : str
             URL of the form <scheme>://<IP>:<PORT>
-        source_id : str
-            String identifier of the source
         timeout : int
             timeout, default=10s
         """
         self.monitoring_url = monitoring_url
         self.sock_timeout = timeout
-        self.source_id = source_id
         try:
             self.scheme, self.ip, port = (x.strip('/') for x in monitoring_url.split(':'))
             self.port = int(port)
@@ -187,7 +176,7 @@ class MultiprocessingQueueRadioSender(MonitoringRadioSender):
         self.queue = queue
 
     def send(self, message: object) -> None:
-        self.queue.put((message, 0))
+        self.queue.put(message)
 
 
 class ZMQRadioSender(MonitoringRadioSender):
```
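After this change none of the radio sender constructors accept `source_id`. Going by the signatures in the hunks above, calls now look like the sketch below (the URL and `run_dir` values are placeholders, and this assumes a 2024.11.4 install):

```python
from parsl.monitoring.radios import FilesystemRadioSender, UDPRadioSender

# Constructor shapes as shown in the diff; values are placeholders.
udp_radio = UDPRadioSender("udp://127.0.0.1:55055", timeout=10)
fs_radio = FilesystemRadioSender(monitoring_url="filesystem://",
                                 run_dir="/tmp/parsl-run")

# send() takes any picklable message object; under the old API both
# constructors would also have required source_id=<task id>.
udp_radio.send(("hello", {"task_id": 0}))
```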
parsl/monitoring/remote.py
CHANGED
```diff
@@ -103,14 +103,12 @@ def monitor_wrapper(*,
 def get_radio(radio_mode: str, monitoring_hub_url: str, task_id: int, run_dir: str) -> MonitoringRadioSender:
     radio: MonitoringRadioSender
     if radio_mode == "udp":
-        radio = UDPRadioSender(monitoring_hub_url,
-                               source_id=task_id)
+        radio = UDPRadioSender(monitoring_hub_url)
     elif radio_mode == "htex":
-        radio = HTEXRadioSender(monitoring_hub_url,
-                                source_id=task_id)
+        radio = HTEXRadioSender(monitoring_hub_url)
     elif radio_mode == "filesystem":
         radio = FilesystemRadioSender(monitoring_url=monitoring_hub_url,
-                                      source_id=task_id, run_dir=run_dir)
+                                      run_dir=run_dir)
     else:
         raise RuntimeError(f"Unknown radio mode: {radio_mode}")
     return radio
```
parsl/monitoring/router.py
CHANGED
```diff
@@ -14,7 +14,7 @@ import typeguard
 import zmq
 
 from parsl.log_utils import set_file_logger
-from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
+from parsl.monitoring.types import TaggedMonitoringMessage
 from parsl.process_loggers import wrap_with_logs
 from parsl.utils import setproctitle
 
@@ -125,7 +125,7 @@ class MonitoringRouter:
             data, addr = self.udp_sock.recvfrom(2048)
             resource_msg = pickle.loads(data)
             self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
-            self.resource_msgs.put((resource_msg, addr))
+            self.resource_msgs.put(resource_msg)
         except socket.timeout:
             pass
 
@@ -136,7 +136,7 @@ class MonitoringRouter:
             data, addr = self.udp_sock.recvfrom(2048)
             msg = pickle.loads(data)
             self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
-            self.resource_msgs.put((msg, addr))
+            self.resource_msgs.put(msg)
             last_msg_received_time = time.time()
         except socket.timeout:
             pass
@@ -160,10 +160,7 @@ class MonitoringRouter:
             assert len(msg) >= 1, "ZMQ Receiver expects tuples of length at least 1, got {}".format(msg)
             assert len(msg) == 2, "ZMQ Receiver expects message tuples of exactly length 2, got {}".format(msg)
 
-            msg_0: AddressedMonitoringMessage
-            msg_0 = (msg, 0)
-
-            self.resource_msgs.put(msg_0)
+            self.resource_msgs.put(msg)
         except zmq.Again:
             pass
         except Exception:
```
parsl/monitoring/types.py
CHANGED
```diff
@@ -1,14 +1,11 @@
-from typing import Any, Dict, Tuple, Union
+from typing import Any, Dict, Tuple
 
 from typing_extensions import TypeAlias
 
 from parsl.monitoring.message_type import MessageType
 
-# A
-#
-# a TaggedMonitoringMessage, and then that can be further tagged with
-# an often unused sender address, giving an AddressedMonitoringMessage.
+# A MonitoringMessage dictionary can be tagged, giving a
+# TaggedMonitoringMessage.
 
 MonitoringMessage: TypeAlias = Dict[str, Any]
 TaggedMonitoringMessage: TypeAlias = Tuple[MessageType, MonitoringMessage]
-AddressedMonitoringMessage: TypeAlias = Tuple[TaggedMonitoringMessage, Union[str, int]]
```
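With `AddressedMonitoringMessage` gone, everything downstream of the radios traffics in the bare tagged tuple. A minimal, self-contained illustration of the surviving aliases (`MessageType` is stubbed here; parsl's real enum lives in parsl.monitoring.message_type):

```python
from enum import Enum
from queue import Queue
from typing import Any, Dict, Tuple

from typing_extensions import TypeAlias  # same dependency parsl itself uses


class MessageType(Enum):  # stub standing in for parsl's enum
    TASK_INFO = 0


MonitoringMessage: TypeAlias = Dict[str, Any]
TaggedMonitoringMessage: TypeAlias = Tuple[MessageType, MonitoringMessage]

q: "Queue[TaggedMonitoringMessage]" = Queue()
q.put((MessageType.TASK_INFO, {"task_id": 7}))

# Consumers previously unpacked ((mtype, payload), sender_addr); now:
mtype, payload = q.get()
print(mtype, payload)
```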
parsl/providers/__init__.py
CHANGED
```diff
@@ -1,7 +1,6 @@
 # Cloud Providers
 from parsl.providers.aws.aws import AWSProvider
 from parsl.providers.azure.azure import AzureProvider
-from parsl.providers.cobalt.cobalt import CobaltProvider
 from parsl.providers.condor.condor import CondorProvider
 from parsl.providers.googlecloud.googlecloud import GoogleCloudProvider
 from parsl.providers.grid_engine.grid_engine import GridEngineProvider
@@ -15,7 +14,6 @@ from parsl.providers.slurm.slurm import SlurmProvider
 from parsl.providers.torque.torque import TorqueProvider
 
 __all__ = ['LocalProvider',
-           'CobaltProvider',
            'CondorProvider',
            'GridEngineProvider',
            'SlurmProvider',
```
parsl/providers/base.py
CHANGED
```diff
@@ -2,7 +2,6 @@ import logging
 from abc import ABCMeta, abstractmethod, abstractproperty
 from typing import Any, Dict, List, Optional
 
-from parsl.channels.base import Channel
 from parsl.jobs.states import JobStatus
 
 logger = logging.getLogger(__name__)
@@ -12,7 +11,7 @@ class ExecutionProvider(metaclass=ABCMeta):
     """Execution providers are responsible for managing execution resources
     that have a Local Resource Manager (LRM). For instance, campus clusters
     and supercomputers generally have LRMs (schedulers) such as Slurm,
-    Torque/PBS, Condor and Cobalt. Clouds, on the other hand, have API
+    Torque/PBS, and Condor. Clouds, on the other hand, have API
     interfaces that allow much more fine-grained composition of an execution
     environment. An execution provider abstracts these types of resources and
     provides a single uniform interface to them.
@@ -154,18 +153,3 @@ class ExecutionProvider(metaclass=ABCMeta):
         :return: the number of seconds to wait between calls to status()
         """
         pass
-
-
-class Channeled():
-    """A marker type to indicate that parsl should manage a Channel for this provider"""
-    def __init__(self) -> None:
-        self.channel: Channel
-        pass
-
-
-class MultiChanneled():
-    """A marker type to indicate that parsl should manage multiple Channels for this provider"""
-
-    def __init__(self) -> None:
-        self.channels: List[Channel]
-        pass
```
parsl/tests/conftest.py
CHANGED
```diff
@@ -163,6 +163,10 @@ def pytest_configure(config):
         'markers',
         'executor_supports_std_stream_tuples: Marks tests that require tuple support for stdout/stderr'
     )
+    config.addinivalue_line(
+        'markers',
+        'shared_fs: Marks tests that require a shared_fs between the workers are the test client'
+    )
 
 
 @pytest.fixture(autouse=True, scope='session')
```
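With the marker registered above, test modules can tag tests that assume a filesystem shared between the workers and the test client, as the test files below now do; such tests can then be deselected with `pytest -m "not shared_fs"`. A hypothetical example of the marker in use:

```python
import pytest


@pytest.mark.shared_fs
def test_reads_worker_side_stdout(tmp_path):
    # Illustrative only: stands in for a bash_app whose stdout file must be
    # readable from the submitting host.
    out = tmp_path / "std.out"
    out.write_text("1 4 10")
    assert out.read_text() == "1 4 10"
```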
parsl/tests/site_tests/site_config_selector.py
CHANGED
```diff
@@ -7,12 +7,7 @@ def fresh_config():
     hostname = os.getenv('PARSL_HOSTNAME', platform.uname().node)
     print("Loading config for {}".format(hostname))
 
-    if 'theta' in hostname:
-        from parsl.tests.configs.theta import fresh_config
-        config = fresh_config()
-        print("Loading Theta config")
-
-    elif 'frontera' in hostname:
+    if 'frontera' in hostname:
         print("Loading Frontera config")
         from parsl.tests.configs.frontera import fresh_config
         config = fresh_config()
```
parsl/tests/test_bash_apps/test_basic.py
CHANGED
```diff
@@ -24,6 +24,7 @@ def foo(x, y, z=10, stdout=None, label=None):
     return f"echo {x} {y} {z}"
 
 
+@pytest.mark.shared_fs
 def test_command_format_1(tmpd_cwd):
     """Testing command format for BashApps"""
 
@@ -38,6 +39,7 @@ def test_command_format_1(tmpd_cwd):
     assert so_content == "1 4 10"
 
 
+@pytest.mark.shared_fs
 def test_auto_log_filename_format(caplog):
     """Testing auto log filename format for BashApps
     """
@@ -66,6 +68,7 @@ def test_auto_log_filename_format(caplog):
     assert record.levelno < logging.ERROR
 
 
+@pytest.mark.shared_fs
 def test_parallel_for(tmpd_cwd, n=3):
     """Testing a simple parallel for loop"""
     outdir = tmpd_cwd / "outputs/test_parallel"
```
parsl/tests/test_bash_apps/test_error_codes.py
CHANGED
```diff
@@ -58,6 +58,7 @@ test_matrix = {
 whitelist = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'configs', '*threads*')
 
 
+@pytest.mark.shared_fs
 def test_div_0(test_fn=div_0):
     err_code = test_matrix[test_fn]['exit_code']
     f = test_fn()
@@ -73,6 +74,7 @@ def test_div_0(test_fn=div_0):
     os.remove('std.out')
 
 
+@pytest.mark.shared_fs
 def test_bash_misuse(test_fn=bash_misuse):
     err_code = test_matrix[test_fn]['exit_code']
     f = test_fn()
@@ -87,6 +89,7 @@ def test_bash_misuse(test_fn=bash_misuse):
     os.remove('std.out')
 
 
+@pytest.mark.shared_fs
 def test_command_not_found(test_fn=command_not_found):
     err_code = test_matrix[test_fn]['exit_code']
     f = test_fn()
@@ -103,6 +106,7 @@ def test_command_not_found(test_fn=command_not_found):
     return True
 
 
+@pytest.mark.shared_fs
 def test_not_executable(test_fn=not_executable):
     err_code = test_matrix[test_fn]['exit_code']
     f = test_fn()
```