parsl 2024.11.4__py3-none-any.whl → 2024.11.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/channels/base.py +6 -46
- parsl/channels/errors.py +0 -67
- parsl/channels/local/local.py +5 -56
- parsl/dataflow/dflow.py +1 -58
- parsl/executors/taskvine/manager.py +6 -0
- parsl/executors/taskvine/manager_config.py +5 -0
- parsl/monitoring/db_manager.py +6 -6
- parsl/monitoring/monitoring.py +27 -33
- parsl/monitoring/radios.py +1 -3
- parsl/monitoring/router.py +11 -11
- parsl/providers/cluster_provider.py +1 -4
- parsl/providers/condor/condor.py +1 -4
- parsl/providers/grid_engine/grid_engine.py +1 -4
- parsl/providers/lsf/lsf.py +1 -4
- parsl/providers/pbspro/pbspro.py +1 -4
- parsl/providers/slurm/slurm.py +26 -7
- parsl/providers/torque/torque.py +1 -4
- parsl/tests/configs/user_opts.py +0 -7
- parsl/tests/conftest.py +0 -4
- parsl/tests/test_channels/test_local_channel.py +0 -19
- parsl/tests/test_providers/test_local_provider.py +0 -135
- parsl/tests/test_providers/test_pbspro_template.py +2 -1
- parsl/tests/test_providers/test_slurm_template.py +2 -1
- parsl/version.py +1 -1
- {parsl-2024.11.4.dist-info → parsl-2024.11.18.dist-info}/METADATA +2 -8
- {parsl-2024.11.4.dist-info → parsl-2024.11.18.dist-info}/RECORD +34 -45
- {parsl-2024.11.4.dist-info → parsl-2024.11.18.dist-info}/WHEEL +1 -1
- parsl/channels/oauth_ssh/__init__.py +0 -0
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -119
- parsl/channels/ssh/__init__.py +0 -0
- parsl/channels/ssh/ssh.py +0 -295
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -85
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -252
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/sites/test_local_adhoc.py +0 -62
- parsl/tests/test_channels/test_dfk_close.py +0 -26
- {parsl-2024.11.4.data → parsl-2024.11.18.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.11.4.data → parsl-2024.11.18.data}/scripts/interchange.py +0 -0
- {parsl-2024.11.4.data → parsl-2024.11.18.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.11.4.data → parsl-2024.11.18.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.11.4.dist-info → parsl-2024.11.18.dist-info}/LICENSE +0 -0
- {parsl-2024.11.4.dist-info → parsl-2024.11.18.dist-info}/entry_points.txt +0 -0
- {parsl-2024.11.4.dist-info → parsl-2024.11.18.dist-info}/top_level.txt +0 -0
parsl/channels/base.py
CHANGED
@@ -1,5 +1,5 @@
 from abc import ABCMeta, abstractmethod, abstractproperty
-from typing import
+from typing import Tuple


 class Channel(metaclass=ABCMeta):
@@ -8,33 +8,27 @@ class Channel(metaclass=ABCMeta):

     For certain resources such as campus clusters or supercomputers at
     research laboratories, resource requirements may require authentication.
-    For instance some resources may allow access to their job schedulers from
-    only their login-nodes which require you to authenticate through SSH, or
-    require two factor authentication.

-    The
-    shell
+    The only remaining Channel, *LocalChannel*, executes commands locally in a
+    shell.

     Channels provide the ability to execute commands remotely, using the
     execute_wait method, and manipulate the remote file system using methods
     such as push_file, pull_file and makedirs.

     Channels should ensure that each launched command runs in a new process
-    group, so that providers (such as
-
+    group, so that providers (such as LocalProvider) which terminate long
+    running commands using process groups can do so.
     """

     @abstractmethod
-    def execute_wait(self, cmd: str, walltime: int = 0
+    def execute_wait(self, cmd: str, walltime: int = 0) -> Tuple[int, str, str]:
         ''' Executes the cmd, with a defined walltime.

         Args:
             - cmd (string): Command string to execute over the channel
             - walltime (int) : Timeout in seconds

-        KWargs:
-            - envs (Dict[str, str]) : Environment variables to push to the remote side
-
         Returns:
             - (exit_code, stdout, stderr) (int, string, string)
         '''
@@ -86,37 +80,3 @@ class Channel(metaclass=ABCMeta):
             destination_path (string)
         '''
         pass
-
-    @abstractmethod
-    def close(self) -> None:
-        ''' Closes the channel.
-        '''
-        pass
-
-    @abstractmethod
-    def makedirs(self, path: str, mode: int = 0o511, exist_ok: bool = False) -> None:
-        """Create a directory.
-
-        If intermediate directories do not exist, they will be created.
-
-        Parameters
-        ----------
-        path : str
-            Path of directory to create.
-        mode : int
-            Permissions (posix-style) for the newly-created directory.
-        exist_ok : bool
-            If False, raise an OSError if the target directory already exists.
-        """
-        pass
-
-    @abstractmethod
-    def isdir(self, path: str) -> bool:
-        """Return true if the path refers to an existing directory.
-
-        Parameters
-        ----------
-        path : str
-            Path of directory to check.
-        """
-        pass
parsl/channels/errors.py
CHANGED
@@ -17,73 +17,6 @@ class ChannelError(ParslError):
         return "Hostname:{0}, Reason:{1}".format(self.hostname, self.reason)


-class BadHostKeyException(ChannelError):
-    ''' SSH channel could not be created since server's host keys could not
-    be verified
-
-    Contains:
-    reason(string)
-    e (paramiko exception object)
-    hostname (string)
-    '''
-
-    def __init__(self, e: Exception, hostname: str) -> None:
-        super().__init__("SSH channel could not be created since server's host keys could not be "
-                         "verified", e, hostname)
-
-
-class BadScriptPath(ChannelError):
-    ''' An error raised during execution of an app.
-    What this exception contains depends entirely on context
-    Contains:
-    reason(string)
-    e (paramiko exception object)
-    hostname (string)
-    '''
-
-    def __init__(self, e: Exception, hostname: str) -> None:
-        super().__init__("Inaccessible remote script dir. Specify script_dir", e, hostname)
-
-
-class BadPermsScriptPath(ChannelError):
-    ''' User does not have permissions to access the script_dir on the remote site
-
-    Contains:
-    reason(string)
-    e (paramiko exception object)
-    hostname (string)
-    '''
-
-    def __init__(self, e: Exception, hostname: str) -> None:
-        super().__init__("User does not have permissions to access the script_dir", e, hostname)
-
-
-class AuthException(ChannelError):
-    ''' An error raised during execution of an app.
-    What this exception contains depends entirely on context
-    Contains:
-    reason(string)
-    e (paramiko exception object)
-    hostname (string)
-    '''
-
-    def __init__(self, e: Exception, hostname: str) -> None:
-        super().__init__("Authentication to remote server failed", e, hostname)
-
-
-class SSHException(ChannelError):
-    ''' if there was any other error connecting or establishing an SSH session
-
-    Contains:
-    reason(string)
-    e (paramiko exception object)
-    hostname (string)
-    '''
-
-    def __init__(self, e: Exception, hostname: str) -> None:
-        super().__init__("Error connecting or establishing an SSH session", e, hostname)
-
-
 class FileCopyException(ChannelError):
     ''' File copy operation failed

parsl/channels/local/local.py
CHANGED
@@ -1,4 +1,3 @@
-import copy
 import logging
 import os
 import shutil
@@ -16,49 +15,32 @@ class LocalChannel(Channel, RepresentationMixin):
     and done so infrequently that they do not need a persistent channel
     '''

-    def __init__(self
+    def __init__(self):
         ''' Initialize the local channel. script_dir is required by set to a default.

         KwArgs:
-            - userhome (string): (default='.') This is provided as a way to override and set a specific userhome
-            - envs (dict) : A dictionary of env variables to be set when launching the shell
             - script_dir (string): Directory to place scripts
         '''
-        self.
-
-
-        local_env = os.environ.copy()
-        self._envs = copy.deepcopy(local_env)
-        self._envs.update(envs)
-        self.script_dir = script_dir
-
-    def execute_wait(self, cmd, walltime=None, envs={}):
+        self.script_dir = None
+
+    def execute_wait(self, cmd, walltime=None):
         ''' Synchronously execute a commandline string on the shell.

         Args:
             - cmd (string) : Commandline string to execute
             - walltime (int) : walltime in seconds

-        Kwargs:
-            - envs (dict) : Dictionary of env variables. This will be used
-              to override the envs set at channel initialization.
-
         Returns:
             - retcode : Return code from the execution
             - stdout : stdout string
             - stderr : stderr string
         '''
-        current_env = copy.deepcopy(self._envs)
-        current_env.update(envs)
-
         try:
             logger.debug("Creating process with command '%s'", cmd)
             proc = subprocess.Popen(
                 cmd,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                cwd=self.userhome,
-                env=current_env,
                 shell=True,
                 preexec_fn=os.setpgrp
             )
@@ -99,7 +81,7 @@ class LocalChannel(Channel, RepresentationMixin):
             os.chmod(local_dest, 0o700)

         except OSError as e:
-            raise FileCopyException(e,
+            raise FileCopyException(e, "localhost")

         else:
             os.chmod(local_dest, 0o700)
@@ -109,39 +91,6 @@ class LocalChannel(Channel, RepresentationMixin):
     def pull_file(self, remote_source, local_dir):
         return self.push_file(remote_source, local_dir)

-    def close(self) -> None:
-        ''' There's nothing to close here, and so this doesn't do anything
-        '''
-        pass
-
-    def isdir(self, path):
-        """Return true if the path refers to an existing directory.
-
-        Parameters
-        ----------
-        path : str
-            Path of directory to check.
-        """
-
-        return os.path.isdir(path)
-
-    def makedirs(self, path, mode=0o700, exist_ok=False):
-        """Create a directory.
-
-        If intermediate directories do not exist, they will be created.
-
-        Parameters
-        ----------
-        path : str
-            Path of directory to create.
-        mode : int
-            Permissions (posix-style) for the newly-created directory.
-        exist_ok : bool
-            If False, raise an OSError if the target directory already exists.
-        """
-
-        return os.makedirs(path, mode, exist_ok)
-
     @property
     def script_dir(self):
         return self._script_dir
parsl/dataflow/dflow.py
CHANGED
@@ -6,7 +6,6 @@ import datetime
 import inspect
 import logging
 import os
-import pathlib
 import pickle
 import random
 import sys
@@ -25,7 +24,6 @@ from typeguard import typechecked
 import parsl
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.app.futures import DataFuture
-from parsl.channels import Channel
 from parsl.config import Config
 from parsl.data_provider.data_manager import DataManager
 from parsl.data_provider.files import File
@@ -49,7 +47,6 @@ from parsl.monitoring import MonitoringHub
 from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.remote import monitor_wrapper
 from parsl.process_loggers import wrap_with_logs
-from parsl.providers.base import ExecutionProvider
 from parsl.usage_tracking.usage import UsageTracker
 from parsl.utils import Timer, get_all_checkpoints, get_std_fname_mode, get_version

@@ -114,8 +111,6 @@ class DataFlowKernel:
         self.monitoring = config.monitoring

         if self.monitoring:
-            if self.monitoring.logdir is None:
-                self.monitoring.logdir = self.run_dir
             self.monitoring.start(self.run_dir, self.config.run_dir)

         self.time_began = datetime.datetime.now()
@@ -1143,36 +1138,6 @@ class DataFlowKernel:

         logger.info("End of summary")

-    def _create_remote_dirs_over_channel(self, provider: ExecutionProvider, channel: Channel) -> None:
-        """Create script directories across a channel
-
-        Parameters
-        ----------
-        provider: Provider obj
-            Provider for which scripts dirs are being created
-        channel: Channel obj
-            Channel over which the remote dirs are to be created
-        """
-        run_dir = self.run_dir
-        if channel.script_dir is None:
-
-            # This case will be detected as unreachable by mypy, because of
-            # the type of script_dir, which is str, not Optional[str].
-            # The type system doesn't represent the initialized/uninitialized
-            # state of a channel so cannot represent that a channel needs
-            # its script directory set or not.
-
-            channel.script_dir = os.path.join(run_dir, 'submit_scripts')  # type: ignore[unreachable]
-
-        # Only create dirs if we aren't on a shared-fs
-        if not channel.isdir(run_dir):
-            parent, child = pathlib.Path(run_dir).parts[-2:]
-            remote_run_dir = os.path.join(parent, child)
-            channel.script_dir = os.path.join(remote_run_dir, 'remote_submit_scripts')
-            provider.script_dir = os.path.join(run_dir, 'local_submit_scripts')
-
-        channel.makedirs(channel.script_dir, exist_ok=True)
-
     def add_executors(self, executors: Sequence[ParslExecutor]) -> None:
         for executor in executors:
             executor.run_id = self.run_id
@@ -1186,12 +1151,7 @@ class DataFlowKernel:
             executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
             os.makedirs(executor.provider.script_dir, exist_ok=True)

-
-            logger.debug("Creating script_dir across multiple channels")
-            for channel in executor.provider.channels:
-                self._create_remote_dirs_over_channel(executor.provider, channel)
-            else:
-                self._create_remote_dirs_over_channel(executor.provider, executor.provider.channel)
+            executor.provider.channel.script_dir = executor.provider.script_dir

         self.executors[executor.label] = executor
         executor.start()
@@ -1273,23 +1233,6 @@ class DataFlowKernel:
             executor.shutdown()
             logger.info(f"Shut down executor {executor.label}")

-            if hasattr(executor, 'provider'):
-                if hasattr(executor.provider, 'script_dir'):
-                    logger.info(f"Closing channel(s) for {executor.label}")
-
-                    if hasattr(executor.provider, 'channels'):
-                        for channel in executor.provider.channels:
-                            logger.info(f"Closing channel {channel}")
-                            channel.close()
-                            logger.info(f"Closed channel {channel}")
-                    else:
-                        assert hasattr(executor.provider, 'channel'), "If provider has no .channels, it must have .channel"
-                        logger.info(f"Closing channel {executor.provider.channel}")
-                        executor.provider.channel.close()
-                        logger.info(f"Closed channel {executor.provider.channel}")
-
-                    logger.info(f"Closed executor channel(s) for {executor.label}")
-
         logger.info("Terminated executors")
         self.time_completed = datetime.datetime.now()

parsl/executors/taskvine/manager.py
CHANGED
@@ -44,11 +44,17 @@ def _set_manager_attributes(m, config):
     # Enable peer transfer feature between workers if specified
     if config.enable_peer_transfers:
         m.enable_peer_transfers()
+    else:
+        m.disable_peer_transfers()

     # Set catalog report to parsl if project name exists
     if m.name:
         m.set_property("framework", "parsl")

+    if config.tune_parameters is not None:
+        for k, v in config.tune_parameters.items():
+            m.tune(k, v)
+

 def _prepare_environment_serverless(manager_config, env_cache_dir, poncho_create_script):
     # Return path to a packaged poncho environment
parsl/executors/taskvine/manager_config.py
CHANGED
@@ -156,6 +156,10 @@ class TaskVineManagerConfig:
         Directory to store TaskVine logging facilities.
         Default is None, in which all TaskVine logs will be contained
         in the Parsl logging directory.
+
+    tune_parameters: Optional[dict]
+        Extended vine_tune parameters, expressed in a dictionary
+        by { 'tune-parameter' : value }.
     """

     # Connection and communication settings
@@ -181,6 +185,7 @@ class TaskVineManagerConfig:
     autocategory: bool = True
     enable_peer_transfers: bool = True
     wait_for_workers: Optional[int] = None
+    tune_parameters: Optional[dict] = None

     # Logging settings
     vine_log_dir: Optional[str] = None
parsl/monitoring/db_manager.py
CHANGED
@@ -279,7 +279,7 @@ class Database:
 class DatabaseManager:
     def __init__(self,
                  db_url: str = 'sqlite:///runinfo/monitoring.db',
-
+                 run_dir: str = '.',
                  logging_level: int = logging.INFO,
                  batching_interval: float = 1,
                  batching_threshold: float = 99999,
@@ -287,12 +287,12 @@ class DatabaseManager:

         self.workflow_end = False
         self.workflow_start_message: Optional[MonitoringMessage] = None
-        self.
-        os.makedirs(self.
+        self.run_dir = run_dir
+        os.makedirs(self.run_dir, exist_ok=True)

         logger.propagate = False

-        set_file_logger("{}/database_manager.log"
+        set_file_logger(f"{self.run_dir}/database_manager.log", level=logging_level,
                         format_string="%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s] [%(threadName)s %(thread)d] %(message)s",
                         name="database_manager")

@@ -681,7 +681,7 @@
 def dbm_starter(exception_q: mpq.Queue,
                 resource_msgs: mpq.Queue,
                 db_url: str,
-
+                run_dir: str,
                 logging_level: int) -> None:
     """Start the database manager process

@@ -692,7 +692,7 @@ def dbm_starter(exception_q: mpq.Queue,

     try:
         dbm = DatabaseManager(db_url=db_url,
-
+                              run_dir=run_dir,
                               logging_level=logging_level)
         logger.info("Starting dbm in dbm starter")
         dbm.start(resource_msgs)
parsl/monitoring/monitoring.py
CHANGED
@@ -3,9 +3,10 @@ from __future__ import annotations
 import logging
 import multiprocessing.synchronize as ms
 import os
+import pickle
 import queue
 import time
-from multiprocessing import Event
+from multiprocessing import Event
 from multiprocessing.queues import Queue
 from typing import TYPE_CHECKING, Literal, Optional, Tuple, Union, cast

@@ -18,7 +19,6 @@ from parsl.monitoring.router import router_starter
 from parsl.monitoring.types import TaggedMonitoringMessage
 from parsl.multiprocessing import ForkProcess, SizedQueue
 from parsl.process_loggers import wrap_with_logs
-from parsl.serialize import deserialize
 from parsl.utils import RepresentationMixin, setproctitle

 _db_manager_excepts: Optional[Exception]
@@ -44,7 +44,6 @@ class MonitoringHub(RepresentationMixin):
                  workflow_name: Optional[str] = None,
                  workflow_version: Optional[str] = None,
                  logging_endpoint: Optional[str] = None,
-                 logdir: Optional[str] = None,
                  monitoring_debug: bool = False,
                  resource_monitoring_enabled: bool = True,
                  resource_monitoring_interval: float = 30):  # in seconds
@@ -73,8 +72,6 @@ class MonitoringHub(RepresentationMixin):
             The database connection url for monitoring to log the information.
             These URLs follow RFC-1738, and can include username, password, hostname, database name.
             Default: sqlite, in the configured run_dir.
-        logdir : str
-            Parsl log directory paths. Logs and temp files go here. Default: '.'
         monitoring_debug : Bool
             Enable monitoring debug logging. Default: False
         resource_monitoring_enabled : boolean
@@ -96,7 +93,6 @@ class MonitoringHub(RepresentationMixin):
         self.hub_port_range = hub_port_range

         self.logging_endpoint = logging_endpoint
-        self.logdir = logdir
         self.monitoring_debug = monitoring_debug

         self.workflow_name = workflow_name
@@ -109,13 +105,10 @@ class MonitoringHub(RepresentationMixin):

         logger.debug("Starting MonitoringHub")

-        if self.logdir is None:
-            self.logdir = "."
-
         if self.logging_endpoint is None:
             self.logging_endpoint = f"sqlite:///{os.fspath(config_run_dir)}/monitoring.db"

-        os.makedirs(
+        os.makedirs(dfk_run_dir, exist_ok=True)

         self.monitoring_hub_active = True

@@ -151,7 +144,7 @@ class MonitoringHub(RepresentationMixin):
                 "hub_address": self.hub_address,
                 "udp_port": self.hub_port,
                 "zmq_port_range": self.hub_port_range,
-                "
+                "run_dir": dfk_run_dir,
                 "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
             },
             name="Monitoring-Router-Process",
@@ -161,7 +154,7 @@ class MonitoringHub(RepresentationMixin):

         self.dbm_proc = ForkProcess(target=dbm_starter,
                                     args=(self.exception_q, self.resource_msgs,),
-                                    kwargs={"
+                                    kwargs={"run_dir": dfk_run_dir,
                                             "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
                                             "db_url": self.logging_endpoint,
                                             },
@@ -169,15 +162,15 @@ class MonitoringHub(RepresentationMixin):
                                     daemon=True,
                                     )
         self.dbm_proc.start()
-        logger.info("Started the router process
+        logger.info("Started the router process %s and DBM process %s", self.router_proc.pid, self.dbm_proc.pid)

-        self.filesystem_proc =
-
-
-
-
+        self.filesystem_proc = ForkProcess(target=filesystem_receiver,
+                                           args=(self.resource_msgs, dfk_run_dir),
+                                           name="Monitoring-Filesystem-Process",
+                                           daemon=True
+                                           )
         self.filesystem_proc.start()
-        logger.info(
+        logger.info("Started filesystem radio receiver process %s", self.filesystem_proc.pid)

         self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)

@@ -190,7 +183,7 @@ class MonitoringHub(RepresentationMixin):
             raise MonitoringHubStartError()

         if isinstance(comm_q_result, str):
-            logger.error(
+            logger.error("MonitoringRouter sent an error message: %s", comm_q_result)
             raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")

         udp_port, zmq_port = comm_q_result
@@ -202,7 +195,7 @@ class MonitoringHub(RepresentationMixin):
         self.hub_zmq_port = zmq_port

     def send(self, message: TaggedMonitoringMessage) -> None:
-        logger.debug("Sending message type
+        logger.debug("Sending message type %s", message[0])
         self.radio.send(message)

     def close(self) -> None:
@@ -219,10 +212,9 @@ class MonitoringHub(RepresentationMixin):
         if exception_msgs:
             for exception_msg in exception_msgs:
                 logger.error(
-                    "
-
-
-                )
+                    "%s process delivered an exception: %s. Terminating all monitoring processes immediately.",
+                    exception_msg[0],
+                    exception_msg[1]
                 )
             self.router_proc.terminate()
             self.dbm_proc.terminate()
@@ -259,8 +251,8 @@ class MonitoringHub(RepresentationMixin):


 @wrap_with_logs
-def filesystem_receiver(
-    logger = set_file_logger("{}/monitoring_filesystem_radio.log"
+def filesystem_receiver(q: Queue[TaggedMonitoringMessage], run_dir: str) -> None:
+    logger = set_file_logger(f"{run_dir}/monitoring_filesystem_radio.log",
                             name="monitoring_filesystem_radio",
                             level=logging.INFO)

@@ -269,7 +261,9 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[TaggedMonitoringMessage]",
     base_path = f"{run_dir}/monitor-fs-radio/"
     tmp_dir = f"{base_path}/tmp/"
     new_dir = f"{base_path}/new/"
-    logger.debug(
+    logger.debug("Creating new and tmp paths under %s", base_path)
+
+    target_radio = MultiprocessingQueueRadioSender(q)

     os.makedirs(tmp_dir, exist_ok=True)
     os.makedirs(new_dir, exist_ok=True)
@@ -280,15 +274,15 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[TaggedMonitoringMessage]",
         # iterate over files in new_dir
         for filename in os.listdir(new_dir):
             try:
-                logger.info(
+                logger.info("Processing filesystem radio file %s", filename)
                 full_path_filename = f"{new_dir}/{filename}"
                 with open(full_path_filename, "rb") as f:
-                    message =
-                    logger.debug(
+                    message = pickle.load(f)
+                    logger.debug("Message received is: %s", message)
                 assert isinstance(message, tuple)
-
+                target_radio.send(cast(TaggedMonitoringMessage, message))
                 os.remove(full_path_filename)
             except Exception:
-                logger.exception(
+                logger.exception("Exception processing %s - probably will be retried next iteration", filename)

         time.sleep(1)  # whats a good time for this poll?
parsl/monitoring/radios.py
CHANGED
@@ -8,8 +8,6 @@ from multiprocessing.queues import Queue

 import zmq

-from parsl.serialize import serialize
-
 logger = logging.getLogger(__name__)


@@ -59,7 +57,7 @@ class FilesystemRadioSender(MonitoringRadioSender):
         # move it into new/, so that a partially written
         # file will never be observed in new/
         with open(tmp_filename, "wb") as f:
-
+            pickle.dump(buffer, f)
         os.rename(tmp_filename, new_filename)

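The sender-side change mirrors the receiver change in monitoring.py: messages are now written with plain `pickle` instead of parsl's serialize machinery. The surrounding write-to-tmp-then-rename pattern is what keeps the radio safe, sketched standalone below (the helper name and file naming are simplified illustrations, not the real sender's scheme):

```python
# Standalone sketch of the filesystem radio's write-then-rename pattern:
# pickle the message into tmp/, then os.rename() it into new/, so a
# receiver polling new/ never observes a partially written file.
import os
import pickle
import uuid

def send_message(message, base_path: str) -> None:
    unique = uuid.uuid4().hex  # simplified; not the real naming scheme
    tmp_filename = f"{base_path}/tmp/{unique}"
    new_filename = f"{base_path}/new/{unique}"
    with open(tmp_filename, "wb") as f:
        pickle.dump(message, f)
    os.rename(tmp_filename, new_filename)  # atomic within one filesystem
```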