parsl 2024.10.28__py3-none-any.whl → 2024.11.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/channels/base.py +6 -46
- parsl/channels/errors.py +0 -67
- parsl/channels/local/local.py +5 -56
- parsl/dataflow/dflow.py +6 -61
- parsl/executors/high_throughput/executor.py +0 -1
- parsl/executors/high_throughput/mpi_resource_management.py +0 -12
- parsl/executors/taskvine/manager.py +6 -0
- parsl/executors/taskvine/manager_config.py +5 -0
- parsl/monitoring/monitoring.py +23 -26
- parsl/monitoring/radios.py +4 -17
- parsl/monitoring/remote.py +3 -5
- parsl/providers/__init__.py +0 -2
- parsl/providers/base.py +1 -1
- parsl/providers/cluster_provider.py +1 -4
- parsl/providers/condor/condor.py +1 -4
- parsl/providers/grid_engine/grid_engine.py +1 -4
- parsl/providers/lsf/lsf.py +1 -4
- parsl/providers/pbspro/pbspro.py +1 -4
- parsl/providers/slurm/slurm.py +1 -4
- parsl/providers/torque/torque.py +1 -4
- parsl/tests/configs/user_opts.py +0 -7
- parsl/tests/conftest.py +4 -4
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/test_bash_apps/test_basic.py +3 -0
- parsl/tests/test_bash_apps/test_error_codes.py +4 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -0
- parsl/tests/test_bash_apps/test_memoize.py +2 -6
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +3 -0
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +1 -0
- parsl/tests/test_bash_apps/test_multiline.py +1 -0
- parsl/tests/test_bash_apps/test_stdout.py +2 -0
- parsl/tests/test_channels/test_local_channel.py +0 -19
- parsl/tests/test_docs/test_from_slides.py +3 -0
- parsl/tests/test_docs/test_kwargs.py +3 -0
- parsl/tests/test_monitoring/test_basic.py +13 -1
- parsl/tests/test_providers/test_local_provider.py +0 -135
- parsl/tests/test_providers/test_pbspro_template.py +2 -1
- parsl/tests/test_providers/test_slurm_template.py +2 -1
- parsl/tests/test_python_apps/test_outputs.py +1 -0
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_staging/test_docs_1.py +1 -0
- parsl/tests/test_staging/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +3 -0
- parsl/tests/test_staging/test_staging_stdout.py +2 -0
- parsl/version.py +1 -1
- {parsl-2024.10.28.dist-info → parsl-2024.11.11.dist-info}/METADATA +2 -8
- {parsl-2024.10.28.dist-info → parsl-2024.11.11.dist-info}/RECORD +56 -74
- {parsl-2024.10.28.dist-info → parsl-2024.11.11.dist-info}/WHEEL +1 -1
- parsl/channels/oauth_ssh/__init__.py +0 -0
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -119
- parsl/channels/ssh/__init__.py +0 -0
- parsl/channels/ssh/ssh.py +0 -295
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -85
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -252
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/sites/test_local_adhoc.py +0 -62
- parsl/tests/test_channels/test_dfk_close.py +0 -26
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -18
- {parsl-2024.10.28.data → parsl-2024.11.11.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.10.28.data → parsl-2024.11.11.data}/scripts/interchange.py +0 -0
- {parsl-2024.10.28.data → parsl-2024.11.11.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.10.28.data → parsl-2024.11.11.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.10.28.dist-info → parsl-2024.11.11.dist-info}/LICENSE +0 -0
- {parsl-2024.10.28.dist-info → parsl-2024.11.11.dist-info}/entry_points.txt +0 -0
- {parsl-2024.10.28.dist-info → parsl-2024.11.11.dist-info}/top_level.txt +0 -0
parsl/channels/base.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
from abc import ABCMeta, abstractmethod, abstractproperty
|
2
|
-
from typing import
|
2
|
+
from typing import Tuple
|
3
3
|
|
4
4
|
|
5
5
|
class Channel(metaclass=ABCMeta):
|
@@ -8,33 +8,27 @@ class Channel(metaclass=ABCMeta):
|
|
8
8
|
|
9
9
|
For certain resources such as campus clusters or supercomputers at
|
10
10
|
research laboratories, resource requirements may require authentication.
|
11
|
-
For instance some resources may allow access to their job schedulers from
|
12
|
-
only their login-nodes which require you to authenticate through SSH, or
|
13
|
-
require two factor authentication.
|
14
11
|
|
15
|
-
The
|
16
|
-
shell
|
12
|
+
The only remaining Channel, *LocalChannel*, executes commands locally in a
|
13
|
+
shell.
|
17
14
|
|
18
15
|
Channels provide the ability to execute commands remotely, using the
|
19
16
|
execute_wait method, and manipulate the remote file system using methods
|
20
17
|
such as push_file, pull_file and makedirs.
|
21
18
|
|
22
19
|
Channels should ensure that each launched command runs in a new process
|
23
|
-
group, so that providers (such as
|
24
|
-
|
20
|
+
group, so that providers (such as LocalProvider) which terminate long
|
21
|
+
running commands using process groups can do so.
|
25
22
|
"""
|
26
23
|
|
27
24
|
@abstractmethod
|
28
|
-
def execute_wait(self, cmd: str, walltime: int = 0
|
25
|
+
def execute_wait(self, cmd: str, walltime: int = 0) -> Tuple[int, str, str]:
|
29
26
|
''' Executes the cmd, with a defined walltime.
|
30
27
|
|
31
28
|
Args:
|
32
29
|
- cmd (string): Command string to execute over the channel
|
33
30
|
- walltime (int) : Timeout in seconds
|
34
31
|
|
35
|
-
KWargs:
|
36
|
-
- envs (Dict[str, str]) : Environment variables to push to the remote side
|
37
|
-
|
38
32
|
Returns:
|
39
33
|
- (exit_code, stdout, stderr) (int, string, string)
|
40
34
|
'''
|
@@ -86,37 +80,3 @@ class Channel(metaclass=ABCMeta):
|
|
86
80
|
destination_path (string)
|
87
81
|
'''
|
88
82
|
pass
|
89
|
-
|
90
|
-
@abstractmethod
|
91
|
-
def close(self) -> None:
|
92
|
-
''' Closes the channel.
|
93
|
-
'''
|
94
|
-
pass
|
95
|
-
|
96
|
-
@abstractmethod
|
97
|
-
def makedirs(self, path: str, mode: int = 0o511, exist_ok: bool = False) -> None:
|
98
|
-
"""Create a directory.
|
99
|
-
|
100
|
-
If intermediate directories do not exist, they will be created.
|
101
|
-
|
102
|
-
Parameters
|
103
|
-
----------
|
104
|
-
path : str
|
105
|
-
Path of directory to create.
|
106
|
-
mode : int
|
107
|
-
Permissions (posix-style) for the newly-created directory.
|
108
|
-
exist_ok : bool
|
109
|
-
If False, raise an OSError if the target directory already exists.
|
110
|
-
"""
|
111
|
-
pass
|
112
|
-
|
113
|
-
@abstractmethod
|
114
|
-
def isdir(self, path: str) -> bool:
|
115
|
-
"""Return true if the path refers to an existing directory.
|
116
|
-
|
117
|
-
Parameters
|
118
|
-
----------
|
119
|
-
path : str
|
120
|
-
Path of directory to check.
|
121
|
-
"""
|
122
|
-
pass
|
parsl/channels/errors.py
CHANGED
@@ -17,73 +17,6 @@ class ChannelError(ParslError):
|
|
17
17
|
return "Hostname:{0}, Reason:{1}".format(self.hostname, self.reason)
|
18
18
|
|
19
19
|
|
20
|
-
class BadHostKeyException(ChannelError):
|
21
|
-
''' SSH channel could not be created since server's host keys could not
|
22
|
-
be verified
|
23
|
-
|
24
|
-
Contains:
|
25
|
-
reason(string)
|
26
|
-
e (paramiko exception object)
|
27
|
-
hostname (string)
|
28
|
-
'''
|
29
|
-
|
30
|
-
def __init__(self, e: Exception, hostname: str) -> None:
|
31
|
-
super().__init__("SSH channel could not be created since server's host keys could not be "
|
32
|
-
"verified", e, hostname)
|
33
|
-
|
34
|
-
|
35
|
-
class BadScriptPath(ChannelError):
|
36
|
-
''' An error raised during execution of an app.
|
37
|
-
What this exception contains depends entirely on context
|
38
|
-
Contains:
|
39
|
-
reason(string)
|
40
|
-
e (paramiko exception object)
|
41
|
-
hostname (string)
|
42
|
-
'''
|
43
|
-
|
44
|
-
def __init__(self, e: Exception, hostname: str) -> None:
|
45
|
-
super().__init__("Inaccessible remote script dir. Specify script_dir", e, hostname)
|
46
|
-
|
47
|
-
|
48
|
-
class BadPermsScriptPath(ChannelError):
|
49
|
-
''' User does not have permissions to access the script_dir on the remote site
|
50
|
-
|
51
|
-
Contains:
|
52
|
-
reason(string)
|
53
|
-
e (paramiko exception object)
|
54
|
-
hostname (string)
|
55
|
-
'''
|
56
|
-
|
57
|
-
def __init__(self, e: Exception, hostname: str) -> None:
|
58
|
-
super().__init__("User does not have permissions to access the script_dir", e, hostname)
|
59
|
-
|
60
|
-
|
61
|
-
class AuthException(ChannelError):
|
62
|
-
''' An error raised during execution of an app.
|
63
|
-
What this exception contains depends entirely on context
|
64
|
-
Contains:
|
65
|
-
reason(string)
|
66
|
-
e (paramiko exception object)
|
67
|
-
hostname (string)
|
68
|
-
'''
|
69
|
-
|
70
|
-
def __init__(self, e: Exception, hostname: str) -> None:
|
71
|
-
super().__init__("Authentication to remote server failed", e, hostname)
|
72
|
-
|
73
|
-
|
74
|
-
class SSHException(ChannelError):
|
75
|
-
''' if there was any other error connecting or establishing an SSH session
|
76
|
-
|
77
|
-
Contains:
|
78
|
-
reason(string)
|
79
|
-
e (paramiko exception object)
|
80
|
-
hostname (string)
|
81
|
-
'''
|
82
|
-
|
83
|
-
def __init__(self, e: Exception, hostname: str) -> None:
|
84
|
-
super().__init__("Error connecting or establishing an SSH session", e, hostname)
|
85
|
-
|
86
|
-
|
87
20
|
class FileCopyException(ChannelError):
|
88
21
|
''' File copy operation failed
|
89
22
|
|
parsl/channels/local/local.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
import copy
|
2
1
|
import logging
|
3
2
|
import os
|
4
3
|
import shutil
|
@@ -16,49 +15,32 @@ class LocalChannel(Channel, RepresentationMixin):
|
|
16
15
|
and done so infrequently that they do not need a persistent channel
|
17
16
|
'''
|
18
17
|
|
19
|
-
def __init__(self
|
18
|
+
def __init__(self):
|
20
19
|
''' Initialize the local channel. script_dir is required by set to a default.
|
21
20
|
|
22
21
|
KwArgs:
|
23
|
-
- userhome (string): (default='.') This is provided as a way to override and set a specific userhome
|
24
|
-
- envs (dict) : A dictionary of env variables to be set when launching the shell
|
25
22
|
- script_dir (string): Directory to place scripts
|
26
23
|
'''
|
27
|
-
self.
|
28
|
-
|
29
|
-
|
30
|
-
local_env = os.environ.copy()
|
31
|
-
self._envs = copy.deepcopy(local_env)
|
32
|
-
self._envs.update(envs)
|
33
|
-
self.script_dir = script_dir
|
34
|
-
|
35
|
-
def execute_wait(self, cmd, walltime=None, envs={}):
|
24
|
+
self.script_dir = None
|
25
|
+
|
26
|
+
def execute_wait(self, cmd, walltime=None):
|
36
27
|
''' Synchronously execute a commandline string on the shell.
|
37
28
|
|
38
29
|
Args:
|
39
30
|
- cmd (string) : Commandline string to execute
|
40
31
|
- walltime (int) : walltime in seconds
|
41
32
|
|
42
|
-
Kwargs:
|
43
|
-
- envs (dict) : Dictionary of env variables. This will be used
|
44
|
-
to override the envs set at channel initialization.
|
45
|
-
|
46
33
|
Returns:
|
47
34
|
- retcode : Return code from the execution
|
48
35
|
- stdout : stdout string
|
49
36
|
- stderr : stderr string
|
50
37
|
'''
|
51
|
-
current_env = copy.deepcopy(self._envs)
|
52
|
-
current_env.update(envs)
|
53
|
-
|
54
38
|
try:
|
55
39
|
logger.debug("Creating process with command '%s'", cmd)
|
56
40
|
proc = subprocess.Popen(
|
57
41
|
cmd,
|
58
42
|
stdout=subprocess.PIPE,
|
59
43
|
stderr=subprocess.PIPE,
|
60
|
-
cwd=self.userhome,
|
61
|
-
env=current_env,
|
62
44
|
shell=True,
|
63
45
|
preexec_fn=os.setpgrp
|
64
46
|
)
|
@@ -99,7 +81,7 @@ class LocalChannel(Channel, RepresentationMixin):
|
|
99
81
|
os.chmod(local_dest, 0o700)
|
100
82
|
|
101
83
|
except OSError as e:
|
102
|
-
raise FileCopyException(e,
|
84
|
+
raise FileCopyException(e, "localhost")
|
103
85
|
|
104
86
|
else:
|
105
87
|
os.chmod(local_dest, 0o700)
|
@@ -109,39 +91,6 @@ class LocalChannel(Channel, RepresentationMixin):
|
|
109
91
|
def pull_file(self, remote_source, local_dir):
|
110
92
|
return self.push_file(remote_source, local_dir)
|
111
93
|
|
112
|
-
def close(self) -> None:
|
113
|
-
''' There's nothing to close here, and so this doesn't do anything
|
114
|
-
'''
|
115
|
-
pass
|
116
|
-
|
117
|
-
def isdir(self, path):
|
118
|
-
"""Return true if the path refers to an existing directory.
|
119
|
-
|
120
|
-
Parameters
|
121
|
-
----------
|
122
|
-
path : str
|
123
|
-
Path of directory to check.
|
124
|
-
"""
|
125
|
-
|
126
|
-
return os.path.isdir(path)
|
127
|
-
|
128
|
-
def makedirs(self, path, mode=0o700, exist_ok=False):
|
129
|
-
"""Create a directory.
|
130
|
-
|
131
|
-
If intermediate directories do not exist, they will be created.
|
132
|
-
|
133
|
-
Parameters
|
134
|
-
----------
|
135
|
-
path : str
|
136
|
-
Path of directory to create.
|
137
|
-
mode : int
|
138
|
-
Permissions (posix-style) for the newly-created directory.
|
139
|
-
exist_ok : bool
|
140
|
-
If False, raise an OSError if the target directory already exists.
|
141
|
-
"""
|
142
|
-
|
143
|
-
return os.makedirs(path, mode, exist_ok)
|
144
|
-
|
145
94
|
@property
|
146
95
|
def script_dir(self):
|
147
96
|
return self._script_dir
|
parsl/dataflow/dflow.py
CHANGED
@@ -6,7 +6,6 @@ import datetime
|
|
6
6
|
import inspect
|
7
7
|
import logging
|
8
8
|
import os
|
9
|
-
import pathlib
|
10
9
|
import pickle
|
11
10
|
import random
|
12
11
|
import sys
|
@@ -25,7 +24,6 @@ from typeguard import typechecked
|
|
25
24
|
import parsl
|
26
25
|
from parsl.app.errors import RemoteExceptionWrapper
|
27
26
|
from parsl.app.futures import DataFuture
|
28
|
-
from parsl.channels import Channel
|
29
27
|
from parsl.config import Config
|
30
28
|
from parsl.data_provider.data_manager import DataManager
|
31
29
|
from parsl.data_provider.files import File
|
@@ -49,7 +47,6 @@ from parsl.monitoring import MonitoringHub
|
|
49
47
|
from parsl.monitoring.message_type import MessageType
|
50
48
|
from parsl.monitoring.remote import monitor_wrapper
|
51
49
|
from parsl.process_loggers import wrap_with_logs
|
52
|
-
from parsl.providers.base import ExecutionProvider
|
53
50
|
from parsl.usage_tracking.usage import UsageTracker
|
54
51
|
from parsl.utils import Timer, get_all_checkpoints, get_std_fname_mode, get_version
|
55
52
|
|
@@ -162,8 +159,8 @@ class DataFlowKernel:
|
|
162
159
|
}
|
163
160
|
|
164
161
|
if self.monitoring:
|
165
|
-
self.monitoring.send(MessageType.WORKFLOW_INFO,
|
166
|
-
workflow_info)
|
162
|
+
self.monitoring.send((MessageType.WORKFLOW_INFO,
|
163
|
+
workflow_info))
|
167
164
|
|
168
165
|
if config.checkpoint_files is not None:
|
169
166
|
checkpoints = self.load_checkpoints(config.checkpoint_files)
|
@@ -238,7 +235,7 @@ class DataFlowKernel:
|
|
238
235
|
def _send_task_log_info(self, task_record: TaskRecord) -> None:
|
239
236
|
if self.monitoring:
|
240
237
|
task_log_info = self._create_task_log_info(task_record)
|
241
|
-
self.monitoring.send(MessageType.TASK_INFO, task_log_info)
|
238
|
+
self.monitoring.send((MessageType.TASK_INFO, task_log_info))
|
242
239
|
|
243
240
|
def _create_task_log_info(self, task_record: TaskRecord) -> Dict[str, Any]:
|
244
241
|
"""
|
@@ -1143,36 +1140,6 @@ class DataFlowKernel:
|
|
1143
1140
|
|
1144
1141
|
logger.info("End of summary")
|
1145
1142
|
|
1146
|
-
def _create_remote_dirs_over_channel(self, provider: ExecutionProvider, channel: Channel) -> None:
|
1147
|
-
"""Create script directories across a channel
|
1148
|
-
|
1149
|
-
Parameters
|
1150
|
-
----------
|
1151
|
-
provider: Provider obj
|
1152
|
-
Provider for which scripts dirs are being created
|
1153
|
-
channel: Channel obj
|
1154
|
-
Channel over which the remote dirs are to be created
|
1155
|
-
"""
|
1156
|
-
run_dir = self.run_dir
|
1157
|
-
if channel.script_dir is None:
|
1158
|
-
|
1159
|
-
# This case will be detected as unreachable by mypy, because of
|
1160
|
-
# the type of script_dir, which is str, not Optional[str].
|
1161
|
-
# The type system doesn't represent the initialized/uninitialized
|
1162
|
-
# state of a channel so cannot represent that a channel needs
|
1163
|
-
# its script directory set or not.
|
1164
|
-
|
1165
|
-
channel.script_dir = os.path.join(run_dir, 'submit_scripts') # type: ignore[unreachable]
|
1166
|
-
|
1167
|
-
# Only create dirs if we aren't on a shared-fs
|
1168
|
-
if not channel.isdir(run_dir):
|
1169
|
-
parent, child = pathlib.Path(run_dir).parts[-2:]
|
1170
|
-
remote_run_dir = os.path.join(parent, child)
|
1171
|
-
channel.script_dir = os.path.join(remote_run_dir, 'remote_submit_scripts')
|
1172
|
-
provider.script_dir = os.path.join(run_dir, 'local_submit_scripts')
|
1173
|
-
|
1174
|
-
channel.makedirs(channel.script_dir, exist_ok=True)
|
1175
|
-
|
1176
1143
|
def add_executors(self, executors: Sequence[ParslExecutor]) -> None:
|
1177
1144
|
for executor in executors:
|
1178
1145
|
executor.run_id = self.run_id
|
@@ -1186,12 +1153,7 @@ class DataFlowKernel:
|
|
1186
1153
|
executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
|
1187
1154
|
os.makedirs(executor.provider.script_dir, exist_ok=True)
|
1188
1155
|
|
1189
|
-
|
1190
|
-
logger.debug("Creating script_dir across multiple channels")
|
1191
|
-
for channel in executor.provider.channels:
|
1192
|
-
self._create_remote_dirs_over_channel(executor.provider, channel)
|
1193
|
-
else:
|
1194
|
-
self._create_remote_dirs_over_channel(executor.provider, executor.provider.channel)
|
1156
|
+
executor.provider.channel.script_dir = executor.provider.script_dir
|
1195
1157
|
|
1196
1158
|
self.executors[executor.label] = executor
|
1197
1159
|
executor.start()
|
@@ -1273,34 +1235,17 @@ class DataFlowKernel:
|
|
1273
1235
|
executor.shutdown()
|
1274
1236
|
logger.info(f"Shut down executor {executor.label}")
|
1275
1237
|
|
1276
|
-
if hasattr(executor, 'provider'):
|
1277
|
-
if hasattr(executor.provider, 'script_dir'):
|
1278
|
-
logger.info(f"Closing channel(s) for {executor.label}")
|
1279
|
-
|
1280
|
-
if hasattr(executor.provider, 'channels'):
|
1281
|
-
for channel in executor.provider.channels:
|
1282
|
-
logger.info(f"Closing channel {channel}")
|
1283
|
-
channel.close()
|
1284
|
-
logger.info(f"Closed channel {channel}")
|
1285
|
-
else:
|
1286
|
-
assert hasattr(executor.provider, 'channel'), "If provider has no .channels, it must have .channel"
|
1287
|
-
logger.info(f"Closing channel {executor.provider.channel}")
|
1288
|
-
executor.provider.channel.close()
|
1289
|
-
logger.info(f"Closed channel {executor.provider.channel}")
|
1290
|
-
|
1291
|
-
logger.info(f"Closed executor channel(s) for {executor.label}")
|
1292
|
-
|
1293
1238
|
logger.info("Terminated executors")
|
1294
1239
|
self.time_completed = datetime.datetime.now()
|
1295
1240
|
|
1296
1241
|
if self.monitoring:
|
1297
1242
|
logger.info("Sending final monitoring message")
|
1298
|
-
self.monitoring.send(MessageType.WORKFLOW_INFO,
|
1243
|
+
self.monitoring.send((MessageType.WORKFLOW_INFO,
|
1299
1244
|
{'tasks_failed_count': self.task_state_counts[States.failed],
|
1300
1245
|
'tasks_completed_count': self.task_state_counts[States.exec_done],
|
1301
1246
|
"time_began": self.time_began,
|
1302
1247
|
'time_completed': self.time_completed,
|
1303
|
-
'run_id': self.run_id, 'rundir': self.run_dir})
|
1248
|
+
'run_id': self.run_id, 'rundir': self.run_dir}))
|
1304
1249
|
|
1305
1250
|
logger.info("Terminating monitoring")
|
1306
1251
|
self.monitoring.close()
|
@@ -63,7 +63,6 @@ DEFAULT_INTERCHANGE_LAUNCH_CMD = ["interchange.py"]
|
|
63
63
|
|
64
64
|
GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
|
65
65
|
Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
|
66
|
-
:class:`~parsl.providers.cobalt.cobalt.Cobalt`,
|
67
66
|
:class:`~parsl.providers.condor.condor.Condor`,
|
68
67
|
:class:`~parsl.providers.googlecloud.googlecloud.GoogleCloud`,
|
69
68
|
:class:`~parsl.providers.gridEngine.gridEngine.GridEngine`,
|
@@ -17,7 +17,6 @@ class Scheduler(Enum):
|
|
17
17
|
Unknown = 0
|
18
18
|
Slurm = 1
|
19
19
|
PBS = 2
|
20
|
-
Cobalt = 3
|
21
20
|
|
22
21
|
|
23
22
|
def get_slurm_hosts_list() -> List[str]:
|
@@ -37,13 +36,6 @@ def get_pbs_hosts_list() -> List[str]:
|
|
37
36
|
return [line.strip() for line in f.readlines()]
|
38
37
|
|
39
38
|
|
40
|
-
def get_cobalt_hosts_list() -> List[str]:
|
41
|
-
"""Get list of COBALT hosts from envvar: COBALT_NODEFILE"""
|
42
|
-
nodefile_name = os.environ["COBALT_NODEFILE"]
|
43
|
-
with open(nodefile_name) as f:
|
44
|
-
return [line.strip() for line in f.readlines()]
|
45
|
-
|
46
|
-
|
47
39
|
def get_nodes_in_batchjob(scheduler: Scheduler) -> List[str]:
|
48
40
|
"""Get nodelist from all supported schedulers"""
|
49
41
|
nodelist = []
|
@@ -51,8 +43,6 @@ def get_nodes_in_batchjob(scheduler: Scheduler) -> List[str]:
|
|
51
43
|
nodelist = get_slurm_hosts_list()
|
52
44
|
elif scheduler == Scheduler.PBS:
|
53
45
|
nodelist = get_pbs_hosts_list()
|
54
|
-
elif scheduler == Scheduler.Cobalt:
|
55
|
-
nodelist = get_cobalt_hosts_list()
|
56
46
|
else:
|
57
47
|
raise RuntimeError(f"mpi_mode does not support scheduler:{scheduler}")
|
58
48
|
return nodelist
|
@@ -64,8 +54,6 @@ def identify_scheduler() -> Scheduler:
|
|
64
54
|
return Scheduler.Slurm
|
65
55
|
elif os.environ.get("PBS_NODEFILE"):
|
66
56
|
return Scheduler.PBS
|
67
|
-
elif os.environ.get("COBALT_NODEFILE"):
|
68
|
-
return Scheduler.Cobalt
|
69
57
|
else:
|
70
58
|
return Scheduler.Unknown
|
71
59
|
|
@@ -44,11 +44,17 @@ def _set_manager_attributes(m, config):
|
|
44
44
|
# Enable peer transfer feature between workers if specified
|
45
45
|
if config.enable_peer_transfers:
|
46
46
|
m.enable_peer_transfers()
|
47
|
+
else:
|
48
|
+
m.disable_peer_transfers()
|
47
49
|
|
48
50
|
# Set catalog report to parsl if project name exists
|
49
51
|
if m.name:
|
50
52
|
m.set_property("framework", "parsl")
|
51
53
|
|
54
|
+
if config.tune_parameters is not None:
|
55
|
+
for k, v in config.tune_parameters.items():
|
56
|
+
m.tune(k, v)
|
57
|
+
|
52
58
|
|
53
59
|
def _prepare_environment_serverless(manager_config, env_cache_dir, poncho_create_script):
|
54
60
|
# Return path to a packaged poncho environment
|
@@ -156,6 +156,10 @@ class TaskVineManagerConfig:
|
|
156
156
|
Directory to store TaskVine logging facilities.
|
157
157
|
Default is None, in which all TaskVine logs will be contained
|
158
158
|
in the Parsl logging directory.
|
159
|
+
|
160
|
+
tune_parameters: Optional[dict]
|
161
|
+
Extended vine_tune parameters, expressed in a dictionary
|
162
|
+
by { 'tune-parameter' : value }.
|
159
163
|
"""
|
160
164
|
|
161
165
|
# Connection and communication settings
|
@@ -181,6 +185,7 @@ class TaskVineManagerConfig:
|
|
181
185
|
autocategory: bool = True
|
182
186
|
enable_peer_transfers: bool = True
|
183
187
|
wait_for_workers: Optional[int] = None
|
188
|
+
tune_parameters: Optional[dict] = None
|
184
189
|
|
185
190
|
# Logging settings
|
186
191
|
vine_log_dir: Optional[str] = None
|
parsl/monitoring/monitoring.py
CHANGED
@@ -3,23 +3,22 @@ from __future__ import annotations
|
|
3
3
|
import logging
|
4
4
|
import multiprocessing.synchronize as ms
|
5
5
|
import os
|
6
|
+
import pickle
|
6
7
|
import queue
|
7
8
|
import time
|
8
|
-
from multiprocessing import Event
|
9
|
+
from multiprocessing import Event
|
9
10
|
from multiprocessing.queues import Queue
|
10
|
-
from typing import TYPE_CHECKING,
|
11
|
+
from typing import TYPE_CHECKING, Literal, Optional, Tuple, Union, cast
|
11
12
|
|
12
13
|
import typeguard
|
13
14
|
|
14
15
|
from parsl.log_utils import set_file_logger
|
15
16
|
from parsl.monitoring.errors import MonitoringHubStartError
|
16
|
-
from parsl.monitoring.message_type import MessageType
|
17
17
|
from parsl.monitoring.radios import MultiprocessingQueueRadioSender
|
18
18
|
from parsl.monitoring.router import router_starter
|
19
19
|
from parsl.monitoring.types import TaggedMonitoringMessage
|
20
20
|
from parsl.multiprocessing import ForkProcess, SizedQueue
|
21
21
|
from parsl.process_loggers import wrap_with_logs
|
22
|
-
from parsl.serialize import deserialize
|
23
22
|
from parsl.utils import RepresentationMixin, setproctitle
|
24
23
|
|
25
24
|
_db_manager_excepts: Optional[Exception]
|
@@ -170,15 +169,15 @@ class MonitoringHub(RepresentationMixin):
|
|
170
169
|
daemon=True,
|
171
170
|
)
|
172
171
|
self.dbm_proc.start()
|
173
|
-
logger.info("Started the router process
|
172
|
+
logger.info("Started the router process %s and DBM process %s", self.router_proc.pid, self.dbm_proc.pid)
|
174
173
|
|
175
|
-
self.filesystem_proc =
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
174
|
+
self.filesystem_proc = ForkProcess(target=filesystem_receiver,
|
175
|
+
args=(self.logdir, self.resource_msgs, dfk_run_dir),
|
176
|
+
name="Monitoring-Filesystem-Process",
|
177
|
+
daemon=True
|
178
|
+
)
|
180
179
|
self.filesystem_proc.start()
|
181
|
-
logger.info(
|
180
|
+
logger.info("Started filesystem radio receiver process %s", self.filesystem_proc.pid)
|
182
181
|
|
183
182
|
self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)
|
184
183
|
|
@@ -191,7 +190,7 @@ class MonitoringHub(RepresentationMixin):
|
|
191
190
|
raise MonitoringHubStartError()
|
192
191
|
|
193
192
|
if isinstance(comm_q_result, str):
|
194
|
-
logger.error(
|
193
|
+
logger.error("MonitoringRouter sent an error message: %s", comm_q_result)
|
195
194
|
raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")
|
196
195
|
|
197
196
|
udp_port, zmq_port = comm_q_result
|
@@ -202,10 +201,9 @@ class MonitoringHub(RepresentationMixin):
|
|
202
201
|
|
203
202
|
self.hub_zmq_port = zmq_port
|
204
203
|
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
self.radio.send((mtype, message))
|
204
|
+
def send(self, message: TaggedMonitoringMessage) -> None:
|
205
|
+
logger.debug("Sending message type %s", message[0])
|
206
|
+
self.radio.send(message)
|
209
207
|
|
210
208
|
def close(self) -> None:
|
211
209
|
logger.info("Terminating Monitoring Hub")
|
@@ -221,10 +219,9 @@ class MonitoringHub(RepresentationMixin):
|
|
221
219
|
if exception_msgs:
|
222
220
|
for exception_msg in exception_msgs:
|
223
221
|
logger.error(
|
224
|
-
"
|
225
|
-
|
226
|
-
|
227
|
-
)
|
222
|
+
"%s process delivered an exception: %s. Terminating all monitoring processes immediately.",
|
223
|
+
exception_msg[0],
|
224
|
+
exception_msg[1]
|
228
225
|
)
|
229
226
|
self.router_proc.terminate()
|
230
227
|
self.dbm_proc.terminate()
|
@@ -261,7 +258,7 @@ class MonitoringHub(RepresentationMixin):
|
|
261
258
|
|
262
259
|
|
263
260
|
@wrap_with_logs
|
264
|
-
def filesystem_receiver(logdir: str, q:
|
261
|
+
def filesystem_receiver(logdir: str, q: Queue[TaggedMonitoringMessage], run_dir: str) -> None:
|
265
262
|
logger = set_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
|
266
263
|
name="monitoring_filesystem_radio",
|
267
264
|
level=logging.INFO)
|
@@ -271,7 +268,7 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[TaggedMonitoringMessage]",
|
|
271
268
|
base_path = f"{run_dir}/monitor-fs-radio/"
|
272
269
|
tmp_dir = f"{base_path}/tmp/"
|
273
270
|
new_dir = f"{base_path}/new/"
|
274
|
-
logger.debug(
|
271
|
+
logger.debug("Creating new and tmp paths under %s", base_path)
|
275
272
|
|
276
273
|
os.makedirs(tmp_dir, exist_ok=True)
|
277
274
|
os.makedirs(new_dir, exist_ok=True)
|
@@ -282,15 +279,15 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[TaggedMonitoringMessage]",
|
|
282
279
|
# iterate over files in new_dir
|
283
280
|
for filename in os.listdir(new_dir):
|
284
281
|
try:
|
285
|
-
logger.info(
|
282
|
+
logger.info("Processing filesystem radio file %s", filename)
|
286
283
|
full_path_filename = f"{new_dir}/{filename}"
|
287
284
|
with open(full_path_filename, "rb") as f:
|
288
|
-
message =
|
289
|
-
logger.debug(
|
285
|
+
message = pickle.load(f)
|
286
|
+
logger.debug("Message received is: %s", message)
|
290
287
|
assert isinstance(message, tuple)
|
291
288
|
q.put(cast(TaggedMonitoringMessage, message))
|
292
289
|
os.remove(full_path_filename)
|
293
290
|
except Exception:
|
294
|
-
logger.exception(
|
291
|
+
logger.exception("Exception processing %s - probably will be retried next iteration", filename)
|
295
292
|
|
296
293
|
time.sleep(1) # whats a good time for this poll?
|