parsl 2024.8.5__py3-none-any.whl → 2024.8.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/channels/__init__.py +1 -4
- parsl/channels/oauth_ssh/oauth_ssh.py +12 -4
- parsl/channels/ssh/ssh.py +17 -7
- parsl/channels/ssh_il/ssh_il.py +13 -3
- parsl/dataflow/dflow.py +1 -1
- parsl/executors/high_throughput/executor.py +18 -27
- parsl/executors/high_throughput/interchange.py +4 -0
- parsl/executors/high_throughput/mpi_executor.py +23 -2
- parsl/executors/high_throughput/mpi_prefix_composer.py +5 -4
- parsl/executors/taskvine/executor.py +2 -0
- parsl/executors/workqueue/executor.py +2 -0
- parsl/monitoring/db_manager.py +36 -49
- parsl/monitoring/monitoring.py +9 -5
- parsl/monitoring/remote.py +4 -4
- parsl/monitoring/router.py +16 -18
- parsl/providers/__init__.py +0 -4
- parsl/providers/ad_hoc/ad_hoc.py +6 -2
- parsl/tests/configs/local_adhoc.py +2 -2
- parsl/tests/test_htex/test_resource_spec_validation.py +40 -0
- parsl/tests/test_htex/test_zmq_binding.py +2 -1
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +29 -14
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +16 -8
- parsl/tests/test_mpi_apps/test_mpiex.py +2 -3
- parsl/tests/test_mpi_apps/test_resource_spec.py +39 -41
- parsl/tests/test_providers/test_local_provider.py +6 -5
- parsl/version.py +1 -1
- {parsl-2024.8.5.data → parsl-2024.8.19.data}/scripts/interchange.py +4 -0
- {parsl-2024.8.5.dist-info → parsl-2024.8.19.dist-info}/METADATA +5 -3
- {parsl-2024.8.5.dist-info → parsl-2024.8.19.dist-info}/RECORD +36 -47
- parsl/configs/ad_hoc.py +0 -38
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -26
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -49
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -47
- {parsl-2024.8.5.data → parsl-2024.8.19.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.8.5.data → parsl-2024.8.19.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.8.5.data → parsl-2024.8.19.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.8.5.dist-info → parsl-2024.8.19.dist-info}/LICENSE +0 -0
- {parsl-2024.8.5.dist-info → parsl-2024.8.19.dist-info}/WHEEL +0 -0
- {parsl-2024.8.5.dist-info → parsl-2024.8.19.dist-info}/entry_points.txt +0 -0
- {parsl-2024.8.5.dist-info → parsl-2024.8.19.dist-info}/top_level.txt +0 -0
parsl/channels/__init__.py
CHANGED
@@ -1,7 +1,4 @@
 from parsl.channels.base import Channel
 from parsl.channels.local.local import LocalChannel
-from parsl.channels.oauth_ssh.oauth_ssh import OAuthSSHChannel
-from parsl.channels.ssh.ssh import SSHChannel
-from parsl.channels.ssh_il.ssh_il import SSHInteractiveLoginChannel

-__all__ = ['Channel', 'SSHChannel', 'LocalChannel', 'SSHInteractiveLoginChannel', 'OAuthSSHChannel']
+__all__ = ['Channel', 'LocalChannel']
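Note: code that previously imported the SSH channels from the parsl.channels package will now fail with ImportError; the classes survive only under deprecated names at their full module paths. A minimal compatibility shim for user code, assuming the 2024.8.19 layout shown above (the shim itself is illustrative, not part of parsl):

    # Hypothetical migration shim based on the export change above.
    try:
        from parsl.channels import SSHChannel  # parsl <= 2024.8.5
    except ImportError:
        # parsl 2024.8.19 re-exports only Channel and LocalChannel
        from parsl.channels.ssh.ssh import DeprecatedSSHChannel as SSHChannel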
parsl/channels/oauth_ssh/oauth_ssh.py
CHANGED
@@ -1,11 +1,15 @@
 import logging
 import socket

-import paramiko
-
-from parsl.channels.ssh.ssh import SSHChannel
+from parsl.channels.ssh.ssh import DeprecatedSSHChannel
 from parsl.errors import OptionalModuleMissing

+try:
+    import paramiko
+    _ssh_enabled = True
+except (ImportError, NameError, FileNotFoundError):
+    _ssh_enabled = False
+
 try:
     from oauth_ssh.oauth_ssh_token import find_access_token
     from oauth_ssh.ssh_service import SSHService
@@ -17,7 +21,7 @@ except (ImportError, NameError):
 logger = logging.getLogger(__name__)


-class OAuthSSHChannel(SSHChannel):
+class DeprecatedOAuthSSHChannel(DeprecatedSSHChannel):
     """SSH persistent channel. This enables remote execution on sites
     accessible via ssh. This channel uses Globus based OAuth tokens for authentication.
     """
@@ -38,6 +42,10 @@ class OAuthSSHChannel(SSHChannel):

         Raises:
         '''
+        if not _ssh_enabled:
+            raise OptionalModuleMissing(['ssh'],
+                                        "OauthSSHChannel requires the ssh module and config.")
+
         if not _oauth_ssh_enabled:
             raise OptionalModuleMissing(['oauth_ssh'],
                                         "OauthSSHChannel requires oauth_ssh module and config.")
parsl/channels/ssh/ssh.py
CHANGED
@@ -2,8 +2,6 @@ import errno
 import logging
 import os

-import paramiko
-
 from parsl.channels.base import Channel
 from parsl.channels.errors import (
     AuthException,
@@ -13,18 +11,27 @@ from parsl.channels.errors import (
     FileCopyException,
     SSHException,
 )
+from parsl.errors import OptionalModuleMissing
 from parsl.utils import RepresentationMixin

+try:
+    import paramiko
+    _ssh_enabled = True
+except (ImportError, NameError, FileNotFoundError):
+    _ssh_enabled = False
+
+
 logger = logging.getLogger(__name__)


-class NoAuthSSHClient(paramiko.SSHClient):
-    def _auth(self, username, *args):
-        self._transport.auth_none(username)
-        return
+if _ssh_enabled:
+    class NoAuthSSHClient(paramiko.SSHClient):
+        def _auth(self, username, *args):
+            self._transport.auth_none(username)
+            return


-class SSHChannel(Channel, RepresentationMixin):
+class DeprecatedSSHChannel(Channel, RepresentationMixin):
     ''' SSH persistent channel. This enables remote execution on sites
     accessible via ssh. It is assumed that the user has setup host keys
     so as to ssh to the remote host. Which goes to say that the following
@@ -53,6 +60,9 @@ class SSHChannel(Channel, RepresentationMixin):

         Raises:
         '''
+        if not _ssh_enabled:
+            raise OptionalModuleMissing(['ssh'],
+                                        "SSHChannel requires the ssh module and config.")

         self.hostname = hostname
         self.username = username
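All three channel modules now share the same optional-import guard: paramiko is imported once at module load, the outcome is recorded in a module-level flag, and each channel constructor raises OptionalModuleMissing if the dependency is absent. A standalone sketch of that pattern, with an invented ExampleChannel standing in for the real classes:

    from parsl.errors import OptionalModuleMissing

    try:
        import paramiko
        _ssh_enabled = True
    except (ImportError, NameError, FileNotFoundError):
        _ssh_enabled = False

    class ExampleChannel:
        """Illustrative only: the import failure is deferred from module
        load time to construction time, with a clearer error."""
        def __init__(self) -> None:
            if not _ssh_enabled:
                raise OptionalModuleMissing(
                    ['ssh'], "ExampleChannel requires the ssh module and config.")
            self._client = paramiko.SSHClient()

This keeps `import parsl.channels.ssh.ssh` safe on installations without paramiko, which is what allows the package-level re-exports above to be dropped.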
parsl/channels/ssh_il/ssh_il.py
CHANGED
@@ -1,14 +1,20 @@
 import getpass
 import logging

-import paramiko
+from parsl.channels.ssh.ssh import DeprecatedSSHChannel
+from parsl.errors import OptionalModuleMissing
+
+try:
+    import paramiko
+    _ssh_enabled = True
+except (ImportError, NameError, FileNotFoundError):
+    _ssh_enabled = False

-from parsl.channels.ssh.ssh import SSHChannel

 logger = logging.getLogger(__name__)


-class SSHInteractiveLoginChannel(SSHChannel):
+class DeprecatedSSHInteractiveLoginChannel(DeprecatedSSHChannel):
     """SSH persistent channel. This enables remote execution on sites
     accessible via ssh. This channel supports interactive login and is appropriate when
     keys are not set up.
@@ -30,6 +36,10 @@ class SSHInteractiveLoginChannel(SSHChannel):

         Raises:
         '''
+        if not _ssh_enabled:
+            raise OptionalModuleMissing(['ssh'],
+                                        "SSHInteractiveLoginChannel requires the ssh module and config.")
+
         self.hostname = hostname
         self.username = username
         self.password = password
parsl/dataflow/dflow.py
CHANGED
@@ -116,7 +116,7 @@ class DataFlowKernel:
         if self.monitoring:
             if self.monitoring.logdir is None:
                 self.monitoring.logdir = self.run_dir
-            self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)
+            self.monitoring.start(self.run_dir, self.config.run_dir)

         self.time_began = datetime.datetime.now()
         self.time_completed: Optional[datetime.datetime] = None
parsl/executors/high_throughput/executor.py
CHANGED
@@ -12,7 +12,6 @@ from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union

 import typeguard

-import parsl.launchers
 from parsl import curvezmq
 from parsl.addresses import get_all_addresses
 from parsl.app.errors import RemoteExceptionWrapper
@@ -25,8 +24,7 @@ from parsl.executors.high_throughput.manager_selector import (
     RandomManagerSelector,
 )
 from parsl.executors.high_throughput.mpi_prefix_composer import (
-    VALID_LAUNCHERS,
-    validate_resource_spec,
+    InvalidResourceSpecification,
 )
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
@@ -224,17 +222,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         Parsl will create names as integers starting with 0.

         default: empty list
-
-    enable_mpi_mode: bool
-        If enabled, MPI launch prefixes will be composed for the batch scheduler based on
-        the nodes available in each batch job and the resource_specification dict passed
-        from the app. This is an experimental feature, please refer to the following doc section
-        before use: https://parsl.readthedocs.io/en/stable/userguide/mpi_apps.html
-
-    mpi_launcher: str
-        This field is only used if enable_mpi_mode is set. Select one from the
-        list of supported MPI launchers = ("srun", "aprun", "mpiexec").
-        default: "mpiexec"
     """

@@ -263,8 +250,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
                 poll_period: int = 10,
                 address_probe_timeout: Optional[int] = None,
                 worker_logdir_root: Optional[str] = None,
-                enable_mpi_mode: bool = False,
-                mpi_launcher: str = "mpiexec",
                 manager_selector: ManagerSelector = RandomManagerSelector(),
                 block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
                 encrypted: bool = False):
@@ -330,15 +315,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         self.encrypted = encrypted
         self.cert_dir = None

-        self.enable_mpi_mode = enable_mpi_mode
-        assert mpi_launcher in VALID_LAUNCHERS, \
-            f"mpi_launcher must be set to one of {VALID_LAUNCHERS}"
-        if self.enable_mpi_mode:
-            assert isinstance(self.provider.launcher, parsl.launchers.SimpleLauncher), \
-                "mpi_mode requires the provider to be configured to use a SimpleLauncher"
-
-        self.mpi_launcher = mpi_launcher
-
         if not launch_cmd:
             launch_cmd = DEFAULT_LAUNCH_CMD
         self.launch_cmd = launch_cmd
@@ -348,6 +324,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         self.interchange_launch_cmd = interchange_launch_cmd

     radio_mode = "htex"
+    enable_mpi_mode: bool = False
+    mpi_launcher: str = "mpiexec"

     def _warn_deprecated(self, old: str, new: str):
         warnings.warn(
@@ -377,6 +355,18 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
             return "{}/{}".format(self.worker_logdir_root, self.label)
         return self.logdir

+    def validate_resource_spec(self, resource_specification: dict):
+        """HTEX does not support *any* resource_specification options and
+        will raise InvalidResourceSpecification is any are passed to it"""
+        if resource_specification:
+            raise InvalidResourceSpecification(
+                set(resource_specification.keys()),
+                ("HTEX does not support the supplied resource_specifications."
+                 "For MPI applications consider using the MPIExecutor. "
+                 "For specifications for core count/memory/walltime, consider using WorkQueueExecutor. ")
+            )
+        return
+
     def initialize_scaling(self):
         """Compose the launch command and scale out the initial blocks.
         """
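A hedged sketch of what the new hook means in practice: a non-empty parsl_resource_specification passed to a plain HighThroughputExecutor task is now rejected at submit time instead of being accepted and then mishandled. The resource key below is arbitrary:

    from parsl.executors import HighThroughputExecutor
    from parsl.executors.high_throughput.mpi_prefix_composer import (
        InvalidResourceSpecification,
    )

    htex = HighThroughputExecutor(label="htex")
    try:
        htex.validate_resource_spec({"num_nodes": 2})  # any key triggers this
    except InvalidResourceSpecification as err:
        print(err)  # names the offending keys, points at MPIExecutor/WorkQueueExecutor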
@@ -551,6 +541,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
                               "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
                               "cert_dir": self.cert_dir,
                               "manager_selector": self.manager_selector,
+                              "run_id": self.run_id,
                               }

         config_pickle = pickle.dumps(interchange_config)
@@ -659,7 +650,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
             Future
         """

-        validate_resource_spec(resource_specification, self.enable_mpi_mode)
+        self.validate_resource_spec(resource_specification)

         if self.bad_state_is_set:
             raise self.executor_exception
@@ -831,7 +822,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         try:
             self.interchange_proc.wait(timeout=timeout)
         except subprocess.TimeoutExpired:
-            logger.info("Unable to terminate Interchange process; sending SIGKILL")
+            logger.warning("Unable to terminate Interchange process; sending SIGKILL")
             self.interchange_proc.kill()

         logger.info("Closing ZMQ pipes")
parsl/executors/high_throughput/interchange.py
CHANGED
@@ -55,6 +55,7 @@ class Interchange:
                  poll_period: int,
                  cert_dir: Optional[str],
                  manager_selector: ManagerSelector,
+                 run_id: str,
                  ) -> None:
         """
         Parameters
@@ -125,6 +126,8 @@ class Interchange:
         self.command_channel.connect("tcp://{}:{}".format(client_address, client_ports[2]))
         logger.info("Connected to client")

+        self.run_id = run_id
+
         self.hub_address = hub_address
         self.hub_zmq_port = hub_zmq_port

@@ -227,6 +230,7 @@ class Interchange:
                 d: Dict = cast(Dict, manager.copy())
                 d['timestamp'] = datetime.datetime.now()
                 d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])
+                d['run_id'] = self.run_id

                 monitoring_radio.send((MessageType.NODE_INFO, d))

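For monitoring consumers, the visible effect is that every NODE_INFO message now carries the workflow run id, so node records can be joined to their workflow directly. A reduced stand-in for the enrichment step (the manager dict and function are invented; only the field names come from the hunk above):

    import datetime

    def enrich(manager: dict, run_id: str) -> dict:
        d = manager.copy()
        d['timestamp'] = datetime.datetime.now()
        d['run_id'] = run_id  # new in 2024.8.19
        return d

    print(enrich({'manager': 'm-0', 'last_heartbeat': 0.0}, run_id='run-42'))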
parsl/executors/high_throughput/mpi_executor.py
CHANGED
@@ -8,8 +8,13 @@ from parsl.executors.high_throughput.executor import (
     GENERAL_HTEX_PARAM_DOCS,
     HighThroughputExecutor,
 )
+from parsl.executors.high_throughput.mpi_prefix_composer import (
+    VALID_LAUNCHERS,
+    validate_resource_spec,
+)
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import JobStatus
+from parsl.launchers import SimpleLauncher
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider

@@ -30,6 +35,11 @@ class MPIExecutor(HighThroughputExecutor):
     max_workers_per_block: int
         Maximum number of MPI applications to run at once per block

+    mpi_launcher: str
+        Select one from the list of supported MPI launchers:
+        ("srun", "aprun", "mpiexec").
+        default: "mpiexec"
+
     {GENERAL_HTEX_PARAM_DOCS}
     """

@@ -60,7 +70,6 @@ class MPIExecutor(HighThroughputExecutor):
         super().__init__(
             # Hard-coded settings
             cores_per_worker=1e-9,  # Ensures there will be at least an absurd number of workers
-            enable_mpi_mode=True,
             max_workers_per_node=max_workers_per_block,

             # Everything else
@@ -82,9 +91,21 @@ class MPIExecutor(HighThroughputExecutor):
             poll_period=poll_period,
             address_probe_timeout=address_probe_timeout,
             worker_logdir_root=worker_logdir_root,
-            mpi_launcher=mpi_launcher,
             block_error_handler=block_error_handler,
             encrypted=encrypted
         )
+        self.enable_mpi_mode = True
+        self.mpi_launcher = mpi_launcher

         self.max_workers_per_block = max_workers_per_block
+
+        if not isinstance(self.provider.launcher, SimpleLauncher):
+            raise TypeError("mpi_mode requires the provider to be configured to use a SimpleLauncher")
+
+        if mpi_launcher not in VALID_LAUNCHERS:
+            raise ValueError(f"mpi_launcher set to:{mpi_launcher} must be set to one of {VALID_LAUNCHERS}")
+
+        self.mpi_launcher = mpi_launcher
+
+    def validate_resource_spec(self, resource_specification: dict):
+        return validate_resource_spec(resource_specification)
parsl/executors/high_throughput/mpi_prefix_composer.py
CHANGED
@@ -21,14 +21,15 @@ class MissingResourceSpecification(Exception):
 class InvalidResourceSpecification(Exception):
     """Exception raised when Invalid input is supplied via resource specification"""

-    def __init__(self, invalid_keys: Set[str]):
+    def __init__(self, invalid_keys: Set[str], message: str = ''):
         self.invalid_keys = invalid_keys
+        self.message = message

     def __str__(self):
-        return f"Invalid resource specification options supplied: {self.invalid_keys}"
+        return f"Invalid resource specification options supplied: {self.invalid_keys} {self.message}"


-def validate_resource_spec(resource_spec: Dict[str, str], is_mpi_enabled: bool):
+def validate_resource_spec(resource_spec: Dict[str, str]):
     """Basic validation of keys in the resource_spec

     Raises: InvalidResourceSpecification if the resource_spec
@@ -38,7 +39,7 @@ def validate_resource_spec(resource_spec: Dict[str, str], is_mpi_enabled: bool):

     # empty resource_spec when mpi_mode is set causes parsl to hang
     # ref issue #3427
-    if is_mpi_enabled and len(user_keys) == 0:
+    if len(user_keys) == 0:
         raise MissingResourceSpecification('MPI mode requires optional parsl_resource_specification keyword argument to be configured')

     legal_keys = set(("ranks_per_node",
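The extra constructor argument lets callers, such as the HTEX hook above, attach context to the generic key listing. For example (the message text here is invented):

    from parsl.executors.high_throughput.mpi_prefix_composer import (
        InvalidResourceSpecification,
    )

    err = InvalidResourceSpecification({'num_nodes'}, '(example context)')
    print(err)
    # Invalid resource specification options supplied: {'num_nodes'} (example context)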
parsl/executors/taskvine/executor.py
CHANGED
@@ -589,11 +589,13 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         # Join all processes before exiting
         logger.debug("Joining on submit process")
         self._submit_process.join()
+        self._submit_process.close()
         logger.debug("Joining on collector thread")
         self._collector_thread.join()
         if self.worker_launch_method == 'factory':
             logger.debug("Joining on factory process")
             self._factory_process.join()
+            self._factory_process.close()

         # Shutdown multiprocessing queues
         self._ready_task_queue.close()
parsl/executors/workqueue/executor.py
CHANGED
@@ -704,6 +704,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):

         logger.debug("Joining on submit process")
         self.submit_process.join()
+        self.submit_process.close()
+
         logger.debug("Joining on collector thread")
         self.collector_thread.join()
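Both executors now follow join() with close(), which releases the multiprocessing.Process handle immediately rather than leaving it to the garbage collector (Process.close() exists since Python 3.7). The pattern in isolation:

    import multiprocessing

    def work() -> None:
        print("working")

    if __name__ == "__main__":
        p = multiprocessing.Process(target=work)
        p.start()
        p.join()   # wait for the child to exit
        p.close()  # free the handle now; any later use raises ValueError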
parsl/monitoring/db_manager.py
CHANGED
@@ -1,11 +1,14 @@
 import datetime
 import logging
+import multiprocessing.queues as mpq
 import os
 import queue
 import threading
 import time
 from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, cast

+import typeguard
+
 from parsl.dataflow.states import States
 from parsl.errors import OptionalModuleMissing
 from parsl.log_utils import set_file_logger
@@ -283,7 +286,7 @@ class DatabaseManager:
                  ):

         self.workflow_end = False
-        self.workflow_start_message = None
+        self.workflow_start_message: Optional[MonitoringMessage] = None
         self.logdir = logdir
         os.makedirs(self.logdir, exist_ok=True)
@@ -299,21 +302,21 @@ class DatabaseManager:
         self.batching_interval = batching_interval
         self.batching_threshold = batching_threshold

-        self.pending_priority_queue = queue.Queue()
-        self.pending_node_queue = queue.Queue()
-        self.pending_block_queue = queue.Queue()
-        self.pending_resource_queue = queue.Queue()
+        self.pending_priority_queue: queue.Queue[TaggedMonitoringMessage] = queue.Queue()
+        self.pending_node_queue: queue.Queue[MonitoringMessage] = queue.Queue()
+        self.pending_block_queue: queue.Queue[MonitoringMessage] = queue.Queue()
+        self.pending_resource_queue: queue.Queue[MonitoringMessage] = queue.Queue()

     def start(self,
-              priority_queue:
-              node_queue:
-              block_queue:
-              resource_queue:
+              priority_queue: mpq.Queue,
+              node_queue: mpq.Queue,
+              block_queue: mpq.Queue,
+              resource_queue: mpq.Queue) -> None:

         self._kill_event = threading.Event()
         self._priority_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
                                                             args=(
-                                                                priority_queue, 'priority', self._kill_event,),
+                                                                priority_queue, self._kill_event,),
                                                             name="Monitoring-migrate-priority",
                                                             daemon=True,
                                                             )
@@ -321,7 +324,7 @@ class DatabaseManager:

         self._node_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
                                                         args=(
-                                                            node_queue, 'node', self._kill_event,),
+                                                            node_queue, self._kill_event,),
                                                         name="Monitoring-migrate-node",
                                                         daemon=True,
                                                         )
@@ -329,7 +332,7 @@ class DatabaseManager:

         self._block_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
                                                          args=(
-                                                             block_queue, 'block', self._kill_event,),
+                                                             block_queue, self._kill_event,),
                                                          name="Monitoring-migrate-block",
                                                          daemon=True,
                                                          )
@@ -337,7 +340,7 @@ class DatabaseManager:

         self._resource_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
                                                             args=(
-                                                                resource_queue, 'resource', self._kill_event,),
+                                                                resource_queue, self._kill_event,),
                                                             name="Monitoring-migrate-resource",
                                                             daemon=True,
                                                             )
@@ -351,18 +354,18 @@ class DatabaseManager:
         If that happens, the message will be added to deferred_resource_messages and processed later.

         """
-        inserted_tasks = set()
+        inserted_tasks: Set[object] = set()

         """
         like inserted_tasks but for task,try tuples
         """
-        inserted_tries = set()
+        inserted_tries: Set[Any] = set()

         # for any task ID, we can defer exactly one message, which is the
         # assumed-to-be-unique first message (with first message flag set).
         # The code prior to this patch will discard previous message in
         # the case of multiple messages to defer.
-        deferred_resource_messages = {}
+        deferred_resource_messages: MonitoringMessage = {}

         exception_happened = False
@@ -505,7 +508,7 @@ class DatabaseManager:
                     "Got {} messages from block queue".format(len(block_info_messages)))
                 # block_info_messages is possibly a nested list of dict (at different polling times)
                 # Each dict refers to the info of a job/block at one polling time
-                block_messages_to_insert = []
+                block_messages_to_insert: List[Any] = []
                 for block_msg in block_info_messages:
                     block_messages_to_insert.extend(block_msg)
                 self._insert(table=BLOCK, messages=block_messages_to_insert)
@@ -574,43 +577,26 @@ class DatabaseManager:
         raise RuntimeError("An exception happened sometime during database processing and should have been logged in database_manager.log")

     @wrap_with_logs(target="database_manager")
-    def _migrate_logs_to_internal(self, logs_queue: queue.Queue, queue_tag: str, kill_event: threading.Event) -> None:
-        logger.info("Starting processing for queue {}".format(queue_tag))
+    def _migrate_logs_to_internal(self, logs_queue: queue.Queue, kill_event: threading.Event) -> None:
+        logger.info("Starting _migrate_logs_to_internal")

         while not kill_event.is_set() or logs_queue.qsize() != 0:
-            logger.debug("
-
+            logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s",
+                         kill_event.is_set(), logs_queue.qsize() != 0)
             try:
                 x, addr = logs_queue.get(timeout=0.1)
             except queue.Empty:
                 continue
             else:
-                if queue_tag == 'priority' and x == 'STOP':
+                if x == 'STOP':
                     self.close()
-                elif queue_tag == 'priority':  # implicitly not 'STOP'
-                    assert isinstance(x, tuple)
-                    assert len(x) == 2
-                    assert x[0] in [MessageType.WORKFLOW_INFO, MessageType.TASK_INFO], \
-                        "_migrate_logs_to_internal can only migrate WORKFLOW_,TASK_INFO message from priority queue, got x[0] == {}".format(x[0])
-                    self._dispatch_to_internal(x)
-                elif queue_tag == 'resource':
-                    assert isinstance(x, tuple), "_migrate_logs_to_internal was expecting a tuple, got {}".format(x)
-                    assert x[0] == MessageType.RESOURCE_INFO, (
-                        "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue, "
-                        "got tag {}, message {}".format(x[0], x)
-                    )
-                    self._dispatch_to_internal(x)
-                elif queue_tag == 'node':
-                    assert len(x) == 2, "expected message tuple to have exactly two elements"
-                    assert x[0] == MessageType.NODE_INFO, "_migrate_logs_to_internal can only migrate NODE_INFO messages from node queue"
-
-                    self._dispatch_to_internal(x)
-                elif queue_tag == "block":
-                    self._dispatch_to_internal(x)
                 else:
-
+                    self._dispatch_to_internal(x)

     def _dispatch_to_internal(self, x: Tuple) -> None:
+        assert isinstance(x, tuple)
+        assert len(x) == 2, "expected message tuple to have exactly two elements"
+
         if x[0] in [MessageType.WORKFLOW_INFO, MessageType.TASK_INFO]:
             self.pending_priority_queue.put(cast(Any, x))
         elif x[0] == MessageType.RESOURCE_INFO:
@@ -686,7 +672,7 @@ class DatabaseManager:
             logger.exception("Rollback failed")

     def _get_messages_in_batch(self, msg_queue: "queue.Queue[X]") -> List[X]:
-        messages = []
+        messages: List[X] = []
         start = time.time()
         while True:
             if time.time() - start >= self.batching_interval or len(messages) >= self.batching_threshold:
@@ -719,11 +705,12 @@ class DatabaseManager:


 @wrap_with_logs(target="database_manager")
-def dbm_starter(exception_q:
-                priority_msgs:
-                node_msgs:
-                block_msgs:
-                resource_msgs:
+@typeguard.typechecked
+def dbm_starter(exception_q: mpq.Queue,
+                priority_msgs: mpq.Queue,
+                node_msgs: mpq.Queue,
+                block_msgs: mpq.Queue,
+                resource_msgs: mpq.Queue,
                 db_url: str,
                 logdir: str,
                 logging_level: int) -> None:
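The per-queue assertion blocks collapse because the queue tag no longer decides anything: every message is a (MessageType, body) tuple, so one shape check followed by routing on the tag suffices. A reduced sketch, where MessageType stands in for parsl.monitoring.message_type.MessageType:

    from enum import Enum, auto

    class MessageType(Enum):  # stand-in, not the parsl enum
        WORKFLOW_INFO = auto()
        TASK_INFO = auto()
        RESOURCE_INFO = auto()
        NODE_INFO = auto()
        BLOCK_INFO = auto()

    def dispatch(x: tuple) -> str:
        assert isinstance(x, tuple)
        assert len(x) == 2, "expected message tuple to have exactly two elements"
        tag, _body = x
        return f"queued {tag.name}"  # route on the tag alone

    print(dispatch((MessageType.NODE_INFO, {'run_id': 'run-42'})))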
parsl/monitoring/monitoring.py
CHANGED
@@ -106,7 +106,7 @@ class MonitoringHub(RepresentationMixin):
         self.resource_monitoring_enabled = resource_monitoring_enabled
         self.resource_monitoring_interval = resource_monitoring_interval

-    def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:
+    def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:

         logger.debug("Starting MonitoringHub")

@@ -154,14 +154,18 @@ class MonitoringHub(RepresentationMixin):
         self.router_exit_event = Event()

         self.router_proc = ForkProcess(target=router_starter,
-                                       args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs,
-                                             self.block_msgs, self.resource_msgs, self.router_exit_event),
-                                       kwargs={"hub_address": self.hub_address,
+                                       kwargs={"comm_q": comm_q,
+                                               "exception_q": self.exception_q,
+                                               "priority_msgs": self.priority_msgs,
+                                               "node_msgs": self.node_msgs,
+                                               "block_msgs": self.block_msgs,
+                                               "resource_msgs": self.resource_msgs,
+                                               "exit_event": self.router_exit_event,
+                                               "hub_address": self.hub_address,
                                                "udp_port": self.hub_port,
                                                "zmq_port_range": self.hub_port_range,
                                                "logdir": self.logdir,
                                                "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
-                                               "run_id": run_id
                                                },
                                        name="Monitoring-Router-Process",
                                        daemon=True,
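Passing everything to router_starter by keyword rather than a mixed args/kwargs split makes the call self-describing and lets a typeguard.typechecked decorator (added to dbm_starter in this same release) verify each argument by name. The shape of the change, with an invented two-parameter target:

    from multiprocessing import Process

    def starter(*, comm_q, exit_event) -> None:  # hypothetical reduced signature
        pass

    # Before: Process(target=starter, args=(comm_q, exit_event))
    # After: every argument is named at the call site
    p = Process(target=starter,
                kwargs={"comm_q": None, "exit_event": None})  # not started; shape only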
parsl/monitoring/remote.py
CHANGED
@@ -199,10 +199,10 @@ def monitor(pid: int,

     pm = psutil.Process(pid)

-    children_user_time = {}
-    children_system_time = {}
-    children_num_ctx_switches_voluntary = {}
-    children_num_ctx_switches_involuntary = {}
+    children_user_time: Dict[int, float] = {}
+    children_system_time: Dict[int, float] = {}
+    children_num_ctx_switches_voluntary: Dict[int, float] = {}
+    children_num_ctx_switches_involuntary: Dict[int, float] = {}

     def accumulate_and_prepare() -> Dict[str, Any]:
         d = {"psutil_process_" + str(k): v for k, v in pm.as_dict().items() if k in simple}
|