parsl 2024.8.5__py3-none-any.whl → 2024.8.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. parsl/channels/__init__.py +1 -4
  2. parsl/channels/oauth_ssh/oauth_ssh.py +12 -4
  3. parsl/channels/ssh/ssh.py +17 -7
  4. parsl/channels/ssh_il/ssh_il.py +13 -3
  5. parsl/dataflow/dflow.py +1 -1
  6. parsl/executors/high_throughput/executor.py +18 -27
  7. parsl/executors/high_throughput/interchange.py +4 -0
  8. parsl/executors/high_throughput/mpi_executor.py +23 -2
  9. parsl/executors/high_throughput/mpi_prefix_composer.py +5 -4
  10. parsl/executors/taskvine/executor.py +2 -0
  11. parsl/executors/workqueue/executor.py +2 -0
  12. parsl/monitoring/db_manager.py +36 -49
  13. parsl/monitoring/monitoring.py +9 -5
  14. parsl/monitoring/remote.py +4 -4
  15. parsl/monitoring/router.py +16 -18
  16. parsl/providers/__init__.py +0 -4
  17. parsl/providers/ad_hoc/ad_hoc.py +6 -2
  18. parsl/tests/configs/local_adhoc.py +2 -2
  19. parsl/tests/test_htex/test_resource_spec_validation.py +40 -0
  20. parsl/tests/test_htex/test_zmq_binding.py +2 -1
  21. parsl/tests/test_mpi_apps/test_bad_mpi_config.py +29 -14
  22. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +16 -8
  23. parsl/tests/test_mpi_apps/test_mpiex.py +2 -3
  24. parsl/tests/test_mpi_apps/test_resource_spec.py +39 -41
  25. parsl/tests/test_providers/test_local_provider.py +6 -5
  26. parsl/version.py +1 -1
  27. {parsl-2024.8.5.data → parsl-2024.8.19.data}/scripts/interchange.py +4 -0
  28. {parsl-2024.8.5.dist-info → parsl-2024.8.19.dist-info}/METADATA +5 -3
  29. {parsl-2024.8.5.dist-info → parsl-2024.8.19.dist-info}/RECORD +36 -47
  30. parsl/configs/ad_hoc.py +0 -38
  31. parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
  32. parsl/tests/configs/htex_ad_hoc_cluster.py +0 -26
  33. parsl/tests/configs/swan_htex.py +0 -43
  34. parsl/tests/integration/test_channels/test_scp_1.py +0 -45
  35. parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
  36. parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
  37. parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
  38. parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
  39. parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -49
  40. parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
  41. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -47
  42. {parsl-2024.8.5.data → parsl-2024.8.19.data}/scripts/exec_parsl_function.py +0 -0
  43. {parsl-2024.8.5.data → parsl-2024.8.19.data}/scripts/parsl_coprocess.py +0 -0
  44. {parsl-2024.8.5.data → parsl-2024.8.19.data}/scripts/process_worker_pool.py +0 -0
  45. {parsl-2024.8.5.dist-info → parsl-2024.8.19.dist-info}/LICENSE +0 -0
  46. {parsl-2024.8.5.dist-info → parsl-2024.8.19.dist-info}/WHEEL +0 -0
  47. {parsl-2024.8.5.dist-info → parsl-2024.8.19.dist-info}/entry_points.txt +0 -0
  48. {parsl-2024.8.5.dist-info → parsl-2024.8.19.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,4 @@
1
1
  from parsl.channels.base import Channel
2
2
  from parsl.channels.local.local import LocalChannel
3
- from parsl.channels.oauth_ssh.oauth_ssh import OAuthSSHChannel
4
- from parsl.channels.ssh.ssh import SSHChannel
5
- from parsl.channels.ssh_il.ssh_il import SSHInteractiveLoginChannel
6
3
 
7
- __all__ = ['Channel', 'SSHChannel', 'LocalChannel', 'SSHInteractiveLoginChannel', 'OAuthSSHChannel']
4
+ __all__ = ['Channel', 'LocalChannel']
@@ -1,11 +1,15 @@
1
1
  import logging
2
2
  import socket
3
3
 
4
- import paramiko
5
-
6
- from parsl.channels.ssh.ssh import SSHChannel
4
+ from parsl.channels.ssh.ssh import DeprecatedSSHChannel
7
5
  from parsl.errors import OptionalModuleMissing
8
6
 
7
+ try:
8
+ import paramiko
9
+ _ssh_enabled = True
10
+ except (ImportError, NameError, FileNotFoundError):
11
+ _ssh_enabled = False
12
+
9
13
  try:
10
14
  from oauth_ssh.oauth_ssh_token import find_access_token
11
15
  from oauth_ssh.ssh_service import SSHService
@@ -17,7 +21,7 @@ except (ImportError, NameError):
17
21
  logger = logging.getLogger(__name__)
18
22
 
19
23
 
20
- class OAuthSSHChannel(SSHChannel):
24
+ class DeprecatedOAuthSSHChannel(DeprecatedSSHChannel):
21
25
  """SSH persistent channel. This enables remote execution on sites
22
26
  accessible via ssh. This channel uses Globus based OAuth tokens for authentication.
23
27
  """
@@ -38,6 +42,10 @@ class OAuthSSHChannel(SSHChannel):
38
42
 
39
43
  Raises:
40
44
  '''
45
+ if not _ssh_enabled:
46
+ raise OptionalModuleMissing(['ssh'],
47
+ "OauthSSHChannel requires the ssh module and config.")
48
+
41
49
  if not _oauth_ssh_enabled:
42
50
  raise OptionalModuleMissing(['oauth_ssh'],
43
51
  "OauthSSHChannel requires oauth_ssh module and config.")
parsl/channels/ssh/ssh.py CHANGED
@@ -2,8 +2,6 @@ import errno
2
2
  import logging
3
3
  import os
4
4
 
5
- import paramiko
6
-
7
5
  from parsl.channels.base import Channel
8
6
  from parsl.channels.errors import (
9
7
  AuthException,
@@ -13,18 +11,27 @@ from parsl.channels.errors import (
13
11
  FileCopyException,
14
12
  SSHException,
15
13
  )
14
+ from parsl.errors import OptionalModuleMissing
16
15
  from parsl.utils import RepresentationMixin
17
16
 
17
+ try:
18
+ import paramiko
19
+ _ssh_enabled = True
20
+ except (ImportError, NameError, FileNotFoundError):
21
+ _ssh_enabled = False
22
+
23
+
18
24
  logger = logging.getLogger(__name__)
19
25
 
20
26
 
21
- class NoAuthSSHClient(paramiko.SSHClient):
22
- def _auth(self, username, *args):
23
- self._transport.auth_none(username)
24
- return
27
+ if _ssh_enabled:
28
+ class NoAuthSSHClient(paramiko.SSHClient):
29
+ def _auth(self, username, *args):
30
+ self._transport.auth_none(username)
31
+ return
25
32
 
26
33
 
27
- class SSHChannel(Channel, RepresentationMixin):
34
+ class DeprecatedSSHChannel(Channel, RepresentationMixin):
28
35
  ''' SSH persistent channel. This enables remote execution on sites
29
36
  accessible via ssh. It is assumed that the user has setup host keys
30
37
  so as to ssh to the remote host. Which goes to say that the following
@@ -53,6 +60,9 @@ class SSHChannel(Channel, RepresentationMixin):
53
60
 
54
61
  Raises:
55
62
  '''
63
+ if not _ssh_enabled:
64
+ raise OptionalModuleMissing(['ssh'],
65
+ "SSHChannel requires the ssh module and config.")
56
66
 
57
67
  self.hostname = hostname
58
68
  self.username = username
@@ -1,14 +1,20 @@
1
1
  import getpass
2
2
  import logging
3
3
 
4
- import paramiko
4
+ from parsl.channels.ssh.ssh import DeprecatedSSHChannel
5
+ from parsl.errors import OptionalModuleMissing
6
+
7
+ try:
8
+ import paramiko
9
+ _ssh_enabled = True
10
+ except (ImportError, NameError, FileNotFoundError):
11
+ _ssh_enabled = False
5
12
 
6
- from parsl.channels.ssh.ssh import SSHChannel
7
13
 
8
14
  logger = logging.getLogger(__name__)
9
15
 
10
16
 
11
- class SSHInteractiveLoginChannel(SSHChannel):
17
+ class DeprecatedSSHInteractiveLoginChannel(DeprecatedSSHChannel):
12
18
  """SSH persistent channel. This enables remote execution on sites
13
19
  accessible via ssh. This channel supports interactive login and is appropriate when
14
20
  keys are not set up.
@@ -30,6 +36,10 @@ class SSHInteractiveLoginChannel(SSHChannel):
30
36
 
31
37
  Raises:
32
38
  '''
39
+ if not _ssh_enabled:
40
+ raise OptionalModuleMissing(['ssh'],
41
+ "SSHInteractiveLoginChannel requires the ssh module and config.")
42
+
33
43
  self.hostname = hostname
34
44
  self.username = username
35
45
  self.password = password
parsl/dataflow/dflow.py CHANGED
@@ -116,7 +116,7 @@ class DataFlowKernel:
116
116
  if self.monitoring:
117
117
  if self.monitoring.logdir is None:
118
118
  self.monitoring.logdir = self.run_dir
119
- self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)
119
+ self.monitoring.start(self.run_dir, self.config.run_dir)
120
120
 
121
121
  self.time_began = datetime.datetime.now()
122
122
  self.time_completed: Optional[datetime.datetime] = None
@@ -12,7 +12,6 @@ from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
12
12
 
13
13
  import typeguard
14
14
 
15
- import parsl.launchers
16
15
  from parsl import curvezmq
17
16
  from parsl.addresses import get_all_addresses
18
17
  from parsl.app.errors import RemoteExceptionWrapper
@@ -25,8 +24,7 @@ from parsl.executors.high_throughput.manager_selector import (
25
24
  RandomManagerSelector,
26
25
  )
27
26
  from parsl.executors.high_throughput.mpi_prefix_composer import (
28
- VALID_LAUNCHERS,
29
- validate_resource_spec,
27
+ InvalidResourceSpecification,
30
28
  )
31
29
  from parsl.executors.status_handling import BlockProviderExecutor
32
30
  from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
@@ -224,17 +222,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
224
222
  Parsl will create names as integers starting with 0.
225
223
 
226
224
  default: empty list
227
-
228
- enable_mpi_mode: bool
229
- If enabled, MPI launch prefixes will be composed for the batch scheduler based on
230
- the nodes available in each batch job and the resource_specification dict passed
231
- from the app. This is an experimental feature, please refer to the following doc section
232
- before use: https://parsl.readthedocs.io/en/stable/userguide/mpi_apps.html
233
-
234
- mpi_launcher: str
235
- This field is only used if enable_mpi_mode is set. Select one from the
236
- list of supported MPI launchers = ("srun", "aprun", "mpiexec").
237
- default: "mpiexec"
238
225
  """
239
226
 
240
227
  @typeguard.typechecked
@@ -263,8 +250,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
263
250
  poll_period: int = 10,
264
251
  address_probe_timeout: Optional[int] = None,
265
252
  worker_logdir_root: Optional[str] = None,
266
- enable_mpi_mode: bool = False,
267
- mpi_launcher: str = "mpiexec",
268
253
  manager_selector: ManagerSelector = RandomManagerSelector(),
269
254
  block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
270
255
  encrypted: bool = False):
@@ -330,15 +315,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
330
315
  self.encrypted = encrypted
331
316
  self.cert_dir = None
332
317
 
333
- self.enable_mpi_mode = enable_mpi_mode
334
- assert mpi_launcher in VALID_LAUNCHERS, \
335
- f"mpi_launcher must be set to one of {VALID_LAUNCHERS}"
336
- if self.enable_mpi_mode:
337
- assert isinstance(self.provider.launcher, parsl.launchers.SimpleLauncher), \
338
- "mpi_mode requires the provider to be configured to use a SimpleLauncher"
339
-
340
- self.mpi_launcher = mpi_launcher
341
-
342
318
  if not launch_cmd:
343
319
  launch_cmd = DEFAULT_LAUNCH_CMD
344
320
  self.launch_cmd = launch_cmd
@@ -348,6 +324,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
348
324
  self.interchange_launch_cmd = interchange_launch_cmd
349
325
 
350
326
  radio_mode = "htex"
327
+ enable_mpi_mode: bool = False
328
+ mpi_launcher: str = "mpiexec"
351
329
 
352
330
  def _warn_deprecated(self, old: str, new: str):
353
331
  warnings.warn(
@@ -377,6 +355,18 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
377
355
  return "{}/{}".format(self.worker_logdir_root, self.label)
378
356
  return self.logdir
379
357
 
358
+ def validate_resource_spec(self, resource_specification: dict):
359
+ """HTEX does not support *any* resource_specification options and
360
+ will raise InvalidResourceSpecification if any are passed to it"""
361
+ if resource_specification:
362
+ raise InvalidResourceSpecification(
363
+ set(resource_specification.keys()),
364
+ ("HTEX does not support the supplied resource_specifications."
365
+ "For MPI applications consider using the MPIExecutor. "
366
+ "For specifications for core count/memory/walltime, consider using WorkQueueExecutor. ")
367
+ )
368
+ return
369
+
380
370
  def initialize_scaling(self):
381
371
  """Compose the launch command and scale out the initial blocks.
382
372
  """
@@ -551,6 +541,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
551
541
  "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
552
542
  "cert_dir": self.cert_dir,
553
543
  "manager_selector": self.manager_selector,
544
+ "run_id": self.run_id,
554
545
  }
555
546
 
556
547
  config_pickle = pickle.dumps(interchange_config)
@@ -659,7 +650,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
659
650
  Future
660
651
  """
661
652
 
662
- validate_resource_spec(resource_specification, self.enable_mpi_mode)
653
+ self.validate_resource_spec(resource_specification)
663
654
 
664
655
  if self.bad_state_is_set:
665
656
  raise self.executor_exception
@@ -831,7 +822,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
831
822
  try:
832
823
  self.interchange_proc.wait(timeout=timeout)
833
824
  except subprocess.TimeoutExpired:
834
- logger.info("Unable to terminate Interchange process; sending SIGKILL")
825
+ logger.warning("Unable to terminate Interchange process; sending SIGKILL")
835
826
  self.interchange_proc.kill()
836
827
 
837
828
  logger.info("Closing ZMQ pipes")
@@ -55,6 +55,7 @@ class Interchange:
55
55
  poll_period: int,
56
56
  cert_dir: Optional[str],
57
57
  manager_selector: ManagerSelector,
58
+ run_id: str,
58
59
  ) -> None:
59
60
  """
60
61
  Parameters
@@ -125,6 +126,8 @@ class Interchange:
125
126
  self.command_channel.connect("tcp://{}:{}".format(client_address, client_ports[2]))
126
127
  logger.info("Connected to client")
127
128
 
129
+ self.run_id = run_id
130
+
128
131
  self.hub_address = hub_address
129
132
  self.hub_zmq_port = hub_zmq_port
130
133
 
@@ -227,6 +230,7 @@ class Interchange:
227
230
  d: Dict = cast(Dict, manager.copy())
228
231
  d['timestamp'] = datetime.datetime.now()
229
232
  d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])
233
+ d['run_id'] = self.run_id
230
234
 
231
235
  monitoring_radio.send((MessageType.NODE_INFO, d))
232
236
 
@@ -8,8 +8,13 @@ from parsl.executors.high_throughput.executor import (
8
8
  GENERAL_HTEX_PARAM_DOCS,
9
9
  HighThroughputExecutor,
10
10
  )
11
+ from parsl.executors.high_throughput.mpi_prefix_composer import (
12
+ VALID_LAUNCHERS,
13
+ validate_resource_spec,
14
+ )
11
15
  from parsl.executors.status_handling import BlockProviderExecutor
12
16
  from parsl.jobs.states import JobStatus
17
+ from parsl.launchers import SimpleLauncher
13
18
  from parsl.providers import LocalProvider
14
19
  from parsl.providers.base import ExecutionProvider
15
20
 
@@ -30,6 +35,11 @@ class MPIExecutor(HighThroughputExecutor):
30
35
  max_workers_per_block: int
31
36
  Maximum number of MPI applications to run at once per block
32
37
 
38
+ mpi_launcher: str
39
+ Select one from the list of supported MPI launchers:
40
+ ("srun", "aprun", "mpiexec").
41
+ default: "mpiexec"
42
+
33
43
  {GENERAL_HTEX_PARAM_DOCS}
34
44
  """
35
45
 
@@ -60,7 +70,6 @@ class MPIExecutor(HighThroughputExecutor):
60
70
  super().__init__(
61
71
  # Hard-coded settings
62
72
  cores_per_worker=1e-9, # Ensures there will be at least an absurd number of workers
63
- enable_mpi_mode=True,
64
73
  max_workers_per_node=max_workers_per_block,
65
74
 
66
75
  # Everything else
@@ -82,9 +91,21 @@ class MPIExecutor(HighThroughputExecutor):
82
91
  poll_period=poll_period,
83
92
  address_probe_timeout=address_probe_timeout,
84
93
  worker_logdir_root=worker_logdir_root,
85
- mpi_launcher=mpi_launcher,
86
94
  block_error_handler=block_error_handler,
87
95
  encrypted=encrypted
88
96
  )
97
+ self.enable_mpi_mode = True
98
+ self.mpi_launcher = mpi_launcher
89
99
 
90
100
  self.max_workers_per_block = max_workers_per_block
101
+
102
+ if not isinstance(self.provider.launcher, SimpleLauncher):
103
+ raise TypeError("mpi_mode requires the provider to be configured to use a SimpleLauncher")
104
+
105
+ if mpi_launcher not in VALID_LAUNCHERS:
106
+ raise ValueError(f"mpi_launcher set to:{mpi_launcher} must be set to one of {VALID_LAUNCHERS}")
107
+
108
+ self.mpi_launcher = mpi_launcher
109
+
110
+ def validate_resource_spec(self, resource_specification: dict):
111
+ return validate_resource_spec(resource_specification)
@@ -21,14 +21,15 @@ class MissingResourceSpecification(Exception):
21
21
  class InvalidResourceSpecification(Exception):
22
22
  """Exception raised when Invalid input is supplied via resource specification"""
23
23
 
24
- def __init__(self, invalid_keys: Set[str]):
24
+ def __init__(self, invalid_keys: Set[str], message: str = ''):
25
25
  self.invalid_keys = invalid_keys
26
+ self.message = message
26
27
 
27
28
  def __str__(self):
28
- return f"Invalid resource specification options supplied: {self.invalid_keys}"
29
+ return f"Invalid resource specification options supplied: {self.invalid_keys} {self.message}"
29
30
 
30
31
 
31
- def validate_resource_spec(resource_spec: Dict[str, str], is_mpi_enabled: bool):
32
+ def validate_resource_spec(resource_spec: Dict[str, str]):
32
33
  """Basic validation of keys in the resource_spec
33
34
 
34
35
  Raises: InvalidResourceSpecification if the resource_spec
@@ -38,7 +39,7 @@ def validate_resource_spec(resource_spec: Dict[str, str], is_mpi_enabled: bool):
38
39
 
39
40
  # empty resource_spec when mpi_mode is set causes parsl to hang
40
41
  # ref issue #3427
41
- if is_mpi_enabled and len(user_keys) == 0:
42
+ if len(user_keys) == 0:
42
43
  raise MissingResourceSpecification('MPI mode requires optional parsl_resource_specification keyword argument to be configured')
43
44
 
44
45
  legal_keys = set(("ranks_per_node",
@@ -589,11 +589,13 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
589
589
  # Join all processes before exiting
590
590
  logger.debug("Joining on submit process")
591
591
  self._submit_process.join()
592
+ self._submit_process.close()
592
593
  logger.debug("Joining on collector thread")
593
594
  self._collector_thread.join()
594
595
  if self.worker_launch_method == 'factory':
595
596
  logger.debug("Joining on factory process")
596
597
  self._factory_process.join()
598
+ self._factory_process.close()
597
599
 
598
600
  # Shutdown multiprocessing queues
599
601
  self._ready_task_queue.close()
@@ -704,6 +704,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
704
704
 
705
705
  logger.debug("Joining on submit process")
706
706
  self.submit_process.join()
707
+ self.submit_process.close()
708
+
707
709
  logger.debug("Joining on collector thread")
708
710
  self.collector_thread.join()
709
711
 
@@ -1,11 +1,14 @@
1
1
  import datetime
2
2
  import logging
3
+ import multiprocessing.queues as mpq
3
4
  import os
4
5
  import queue
5
6
  import threading
6
7
  import time
7
8
  from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, cast
8
9
 
10
+ import typeguard
11
+
9
12
  from parsl.dataflow.states import States
10
13
  from parsl.errors import OptionalModuleMissing
11
14
  from parsl.log_utils import set_file_logger
@@ -283,7 +286,7 @@ class DatabaseManager:
283
286
  ):
284
287
 
285
288
  self.workflow_end = False
286
- self.workflow_start_message = None # type: Optional[MonitoringMessage]
289
+ self.workflow_start_message: Optional[MonitoringMessage] = None
287
290
  self.logdir = logdir
288
291
  os.makedirs(self.logdir, exist_ok=True)
289
292
 
@@ -299,21 +302,21 @@ class DatabaseManager:
299
302
  self.batching_interval = batching_interval
300
303
  self.batching_threshold = batching_threshold
301
304
 
302
- self.pending_priority_queue = queue.Queue() # type: queue.Queue[TaggedMonitoringMessage]
303
- self.pending_node_queue = queue.Queue() # type: queue.Queue[MonitoringMessage]
304
- self.pending_block_queue = queue.Queue() # type: queue.Queue[MonitoringMessage]
305
- self.pending_resource_queue = queue.Queue() # type: queue.Queue[MonitoringMessage]
305
+ self.pending_priority_queue: queue.Queue[TaggedMonitoringMessage] = queue.Queue()
306
+ self.pending_node_queue: queue.Queue[MonitoringMessage] = queue.Queue()
307
+ self.pending_block_queue: queue.Queue[MonitoringMessage] = queue.Queue()
308
+ self.pending_resource_queue: queue.Queue[MonitoringMessage] = queue.Queue()
306
309
 
307
310
  def start(self,
308
- priority_queue: "queue.Queue[TaggedMonitoringMessage]",
309
- node_queue: "queue.Queue[MonitoringMessage]",
310
- block_queue: "queue.Queue[MonitoringMessage]",
311
- resource_queue: "queue.Queue[MonitoringMessage]") -> None:
311
+ priority_queue: mpq.Queue,
312
+ node_queue: mpq.Queue,
313
+ block_queue: mpq.Queue,
314
+ resource_queue: mpq.Queue) -> None:
312
315
 
313
316
  self._kill_event = threading.Event()
314
317
  self._priority_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
315
318
  args=(
316
- priority_queue, 'priority', self._kill_event,),
319
+ priority_queue, self._kill_event,),
317
320
  name="Monitoring-migrate-priority",
318
321
  daemon=True,
319
322
  )
@@ -321,7 +324,7 @@ class DatabaseManager:
321
324
 
322
325
  self._node_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
323
326
  args=(
324
- node_queue, 'node', self._kill_event,),
327
+ node_queue, self._kill_event,),
325
328
  name="Monitoring-migrate-node",
326
329
  daemon=True,
327
330
  )
@@ -329,7 +332,7 @@ class DatabaseManager:
329
332
 
330
333
  self._block_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
331
334
  args=(
332
- block_queue, 'block', self._kill_event,),
335
+ block_queue, self._kill_event,),
333
336
  name="Monitoring-migrate-block",
334
337
  daemon=True,
335
338
  )
@@ -337,7 +340,7 @@ class DatabaseManager:
337
340
 
338
341
  self._resource_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
339
342
  args=(
340
- resource_queue, 'resource', self._kill_event,),
343
+ resource_queue, self._kill_event,),
341
344
  name="Monitoring-migrate-resource",
342
345
  daemon=True,
343
346
  )
@@ -351,18 +354,18 @@ class DatabaseManager:
351
354
  If that happens, the message will be added to deferred_resource_messages and processed later.
352
355
 
353
356
  """
354
- inserted_tasks = set() # type: Set[object]
357
+ inserted_tasks: Set[object] = set()
355
358
 
356
359
  """
357
360
  like inserted_tasks but for task,try tuples
358
361
  """
359
- inserted_tries = set() # type: Set[Any]
362
+ inserted_tries: Set[Any] = set()
360
363
 
361
364
  # for any task ID, we can defer exactly one message, which is the
362
365
  # assumed-to-be-unique first message (with first message flag set).
363
366
  # The code prior to this patch will discard previous message in
364
367
  # the case of multiple messages to defer.
365
- deferred_resource_messages = {} # type: MonitoringMessage
368
+ deferred_resource_messages: MonitoringMessage = {}
366
369
 
367
370
  exception_happened = False
368
371
 
@@ -505,7 +508,7 @@ class DatabaseManager:
505
508
  "Got {} messages from block queue".format(len(block_info_messages)))
506
509
  # block_info_messages is possibly a nested list of dict (at different polling times)
507
510
  # Each dict refers to the info of a job/block at one polling time
508
- block_messages_to_insert = [] # type: List[Any]
511
+ block_messages_to_insert: List[Any] = []
509
512
  for block_msg in block_info_messages:
510
513
  block_messages_to_insert.extend(block_msg)
511
514
  self._insert(table=BLOCK, messages=block_messages_to_insert)
@@ -574,43 +577,26 @@ class DatabaseManager:
574
577
  raise RuntimeError("An exception happened sometime during database processing and should have been logged in database_manager.log")
575
578
 
576
579
  @wrap_with_logs(target="database_manager")
577
- def _migrate_logs_to_internal(self, logs_queue: queue.Queue, queue_tag: str, kill_event: threading.Event) -> None:
578
- logger.info("Starting processing for queue {}".format(queue_tag))
580
+ def _migrate_logs_to_internal(self, logs_queue: queue.Queue, kill_event: threading.Event) -> None:
581
+ logger.info("Starting _migrate_logs_to_internal")
579
582
 
580
583
  while not kill_event.is_set() or logs_queue.qsize() != 0:
581
- logger.debug("""Checking STOP conditions for {} threads: {}, {}"""
582
- .format(queue_tag, kill_event.is_set(), logs_queue.qsize() != 0))
584
+ logger.debug("Checking STOP conditions: kill event: %s, queue has entries: %s",
585
+ kill_event.is_set(), logs_queue.qsize() != 0)
583
586
  try:
584
587
  x, addr = logs_queue.get(timeout=0.1)
585
588
  except queue.Empty:
586
589
  continue
587
590
  else:
588
- if queue_tag == 'priority' and x == 'STOP':
591
+ if x == 'STOP':
589
592
  self.close()
590
- elif queue_tag == 'priority': # implicitly not 'STOP'
591
- assert isinstance(x, tuple)
592
- assert len(x) == 2
593
- assert x[0] in [MessageType.WORKFLOW_INFO, MessageType.TASK_INFO], \
594
- "_migrate_logs_to_internal can only migrate WORKFLOW_,TASK_INFO message from priority queue, got x[0] == {}".format(x[0])
595
- self._dispatch_to_internal(x)
596
- elif queue_tag == 'resource':
597
- assert isinstance(x, tuple), "_migrate_logs_to_internal was expecting a tuple, got {}".format(x)
598
- assert x[0] == MessageType.RESOURCE_INFO, (
599
- "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue, "
600
- "got tag {}, message {}".format(x[0], x)
601
- )
602
- self._dispatch_to_internal(x)
603
- elif queue_tag == 'node':
604
- assert len(x) == 2, "expected message tuple to have exactly two elements"
605
- assert x[0] == MessageType.NODE_INFO, "_migrate_logs_to_internal can only migrate NODE_INFO messages from node queue"
606
-
607
- self._dispatch_to_internal(x)
608
- elif queue_tag == "block":
609
- self._dispatch_to_internal(x)
610
593
  else:
611
- logger.error(f"Discarding because unknown queue tag '{queue_tag}', message: {x}")
594
+ self._dispatch_to_internal(x)
612
595
 
613
596
  def _dispatch_to_internal(self, x: Tuple) -> None:
597
+ assert isinstance(x, tuple)
598
+ assert len(x) == 2, "expected message tuple to have exactly two elements"
599
+
614
600
  if x[0] in [MessageType.WORKFLOW_INFO, MessageType.TASK_INFO]:
615
601
  self.pending_priority_queue.put(cast(Any, x))
616
602
  elif x[0] == MessageType.RESOURCE_INFO:
@@ -686,7 +672,7 @@ class DatabaseManager:
686
672
  logger.exception("Rollback failed")
687
673
 
688
674
  def _get_messages_in_batch(self, msg_queue: "queue.Queue[X]") -> List[X]:
689
- messages = [] # type: List[X]
675
+ messages: List[X] = []
690
676
  start = time.time()
691
677
  while True:
692
678
  if time.time() - start >= self.batching_interval or len(messages) >= self.batching_threshold:
@@ -719,11 +705,12 @@ class DatabaseManager:
719
705
 
720
706
 
721
707
  @wrap_with_logs(target="database_manager")
722
- def dbm_starter(exception_q: "queue.Queue[Tuple[str, str]]",
723
- priority_msgs: "queue.Queue[TaggedMonitoringMessage]",
724
- node_msgs: "queue.Queue[MonitoringMessage]",
725
- block_msgs: "queue.Queue[MonitoringMessage]",
726
- resource_msgs: "queue.Queue[MonitoringMessage]",
708
+ @typeguard.typechecked
709
+ def dbm_starter(exception_q: mpq.Queue,
710
+ priority_msgs: mpq.Queue,
711
+ node_msgs: mpq.Queue,
712
+ block_msgs: mpq.Queue,
713
+ resource_msgs: mpq.Queue,
727
714
  db_url: str,
728
715
  logdir: str,
729
716
  logging_level: int) -> None:
@@ -106,7 +106,7 @@ class MonitoringHub(RepresentationMixin):
106
106
  self.resource_monitoring_enabled = resource_monitoring_enabled
107
107
  self.resource_monitoring_interval = resource_monitoring_interval
108
108
 
109
- def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:
109
+ def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:
110
110
 
111
111
  logger.debug("Starting MonitoringHub")
112
112
 
@@ -154,14 +154,18 @@ class MonitoringHub(RepresentationMixin):
154
154
  self.router_exit_event = Event()
155
155
 
156
156
  self.router_proc = ForkProcess(target=router_starter,
157
- args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs,
158
- self.block_msgs, self.resource_msgs, self.router_exit_event),
159
- kwargs={"hub_address": self.hub_address,
157
+ kwargs={"comm_q": comm_q,
158
+ "exception_q": self.exception_q,
159
+ "priority_msgs": self.priority_msgs,
160
+ "node_msgs": self.node_msgs,
161
+ "block_msgs": self.block_msgs,
162
+ "resource_msgs": self.resource_msgs,
163
+ "exit_event": self.router_exit_event,
164
+ "hub_address": self.hub_address,
160
165
  "udp_port": self.hub_port,
161
166
  "zmq_port_range": self.hub_port_range,
162
167
  "logdir": self.logdir,
163
168
  "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
164
- "run_id": run_id
165
169
  },
166
170
  name="Monitoring-Router-Process",
167
171
  daemon=True,
@@ -199,10 +199,10 @@ def monitor(pid: int,
199
199
 
200
200
  pm = psutil.Process(pid)
201
201
 
202
- children_user_time = {} # type: Dict[int, float]
203
- children_system_time = {} # type: Dict[int, float]
204
- children_num_ctx_switches_voluntary = {} # type: Dict[int, float]
205
- children_num_ctx_switches_involuntary = {} # type: Dict[int, float]
202
+ children_user_time: Dict[int, float] = {}
203
+ children_system_time: Dict[int, float] = {}
204
+ children_num_ctx_switches_voluntary: Dict[int, float] = {}
205
+ children_num_ctx_switches_involuntary: Dict[int, float] = {}
206
206
 
207
207
  def accumulate_and_prepare() -> Dict[str, Any]:
208
208
  d = {"psutil_process_" + str(k): v for k, v in pm.as_dict().items() if k in simple}