parsl 2024.2.26__py3-none-any.whl → 2024.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/addresses.py +1 -1
- parsl/configs/ASPIRE1.py +1 -1
- parsl/configs/ad_hoc.py +1 -1
- parsl/configs/bridges.py +1 -1
- parsl/configs/cc_in2p3.py +1 -1
- parsl/configs/expanse.py +1 -1
- parsl/configs/frontera.py +1 -1
- parsl/configs/kubernetes.py +1 -1
- parsl/configs/midway.py +1 -1
- parsl/configs/osg.py +1 -1
- parsl/configs/stampede2.py +1 -1
- parsl/dataflow/dflow.py +11 -6
- parsl/dataflow/taskrecord.py +3 -1
- parsl/executors/high_throughput/executor.py +69 -37
- parsl/executors/high_throughput/interchange.py +78 -59
- parsl/executors/high_throughput/process_worker_pool.py +40 -28
- parsl/executors/taskvine/executor.py +3 -1
- parsl/executors/workqueue/executor.py +5 -2
- parsl/executors/workqueue/parsl_coprocess.py +107 -95
- parsl/jobs/job_status_poller.py +9 -3
- parsl/jobs/strategy.py +4 -3
- parsl/monitoring/db_manager.py +25 -5
- parsl/monitoring/monitoring.py +6 -2
- parsl/monitoring/remote.py +29 -0
- parsl/monitoring/visualization/models.py +7 -0
- parsl/providers/slurm/slurm.py +13 -2
- parsl/tests/configs/ad_hoc_cluster_htex.py +1 -1
- parsl/tests/configs/bluewaters.py +1 -1
- parsl/tests/configs/bridges.py +1 -1
- parsl/tests/configs/cc_in2p3.py +1 -1
- parsl/tests/configs/comet.py +1 -1
- parsl/tests/configs/frontera.py +1 -1
- parsl/tests/configs/midway.py +1 -1
- parsl/tests/configs/nscc_singapore.py +1 -1
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +1 -1
- parsl/tests/configs/summit.py +1 -1
- parsl/tests/configs/theta.py +1 -1
- parsl/tests/configs/user_opts.py +3 -1
- parsl/tests/manual_tests/test_ad_hoc_htex.py +1 -1
- parsl/tests/scaling_tests/htex_local.py +1 -1
- parsl/tests/sites/test_affinity.py +1 -1
- parsl/tests/sites/test_concurrent.py +1 -1
- parsl/tests/sites/test_dynamic_executor.py +1 -1
- parsl/tests/sites/test_worker_info.py +1 -1
- parsl/tests/test_htex/test_basic.py +1 -1
- parsl/tests/test_htex/test_connected_blocks.py +1 -1
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +1 -1
- parsl/tests/test_htex/test_disconnected_blocks.py +1 -1
- parsl/tests/test_htex/test_htex.py +13 -0
- parsl/tests/test_htex/test_manager_failure.py +1 -1
- parsl/tests/test_htex/test_missing_worker.py +1 -1
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +1 -1
- parsl/tests/test_htex/test_worker_failure.py +1 -1
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +1 -1
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +1 -1
- parsl/tests/test_mpi_apps/test_resource_spec.py +1 -1
- parsl/tests/test_scaling/test_scale_down.py +2 -2
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +159 -0
- parsl/usage_tracking/usage.py +5 -9
- parsl/version.py +1 -1
- parsl-2024.3.11.data/scripts/parsl_coprocess.py +166 -0
- {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/process_worker_pool.py +40 -28
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/METADATA +2 -2
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/RECORD +70 -70
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/WHEEL +1 -1
- parsl/configs/bluewaters.py +0 -28
- parsl-2024.2.26.data/scripts/parsl_coprocess.py +0 -154
- {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/LICENSE +0 -0
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/entry_points.txt +0 -0
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/top_level.txt +0 -0
parsl/addresses.py
CHANGED
@@ -81,7 +81,7 @@ def address_by_hostname() -> str:
 def address_by_interface(ifname: str) -> str:
     """Returns the IP address of the given interface name, e.g. 'eth0'

-    This is
+    This is from a Stack Overflow answer: https://stackoverflow.com/questions/24196932/how-can-i-get-the-ip-address-of-eth0-in-python#24196955

     Parameters
     ----------
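The docstring being edited belongs to a small helper; a minimal usage sketch (the interface name is a placeholder and must exist on the host, and the printed address is illustrative):

    from parsl.addresses import address_by_interface

    # Look up the IPv4 address bound to a named network interface.
    print(address_by_interface('eth0'))  # e.g. '10.0.0.5'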
parsl/configs/ASPIRE1.py
CHANGED
parsl/configs/ad_hoc.py
CHANGED
@@ -17,7 +17,7 @@ config = Config(
     executors=[
         HighThroughputExecutor(
             label='remote_htex',
-            max_workers=2,
+            max_workers_per_node=2,
             worker_logdir_root=user_opts['adhoc']['script_dir'],
             provider=AdHocProvider(
                 # Command to be run before starting a worker, such as:
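This and the following config diffs all apply the same one-line rename. As a sketch, a user config migrates like this (the executor label and worker count are illustrative; the old name still works but now emits a DeprecationWarning, per the executor.py hunks below):

    from parsl.config import Config
    from parsl.executors import HighThroughputExecutor

    # Before: max_workers=2
    # After:  max_workers_per_node=2
    config = Config(
        executors=[
            HighThroughputExecutor(
                label='htex',
                max_workers_per_node=2,
            )
        ],
    )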
parsl/configs/bridges.py
CHANGED
@@ -13,7 +13,7 @@ config = Config(
         HighThroughputExecutor(
             label='Bridges_HTEX_multinode',
             address=address_by_interface('ens3f0'),
-            max_workers=1,
+            max_workers_per_node=1,
             provider=SlurmProvider(
                 'YOUR_PARTITION_NAME',  # Specify Partition / QOS, for eg. RM-small
                 nodes_per_block=2,
parsl/configs/cc_in2p3.py
CHANGED
parsl/configs/expanse.py
CHANGED
parsl/configs/frontera.py
CHANGED
@@ -12,7 +12,7 @@ config = Config(
     executors=[
         HighThroughputExecutor(
             label="frontera_htex",
-            max_workers=1,  # Set number of workers per node
+            max_workers_per_node=1,  # Set number of workers per node
             provider=SlurmProvider(
                 cmd_timeout=60,  # Add extra time for slow scheduler responses
                 channel=LocalChannel(),
parsl/configs/kubernetes.py
CHANGED
parsl/configs/midway.py
CHANGED
@@ -10,7 +10,7 @@ config = Config(
             label='Midway_HTEX_multinode',
             address=address_by_interface('bond0'),
             worker_debug=False,
-            max_workers=2,
+            max_workers_per_node=2,
             provider=SlurmProvider(
                 'YOUR_PARTITION',  # Partition name, e.g 'broadwl'
                 launcher=SrunLauncher(),
parsl/configs/osg.py
CHANGED
parsl/configs/stampede2.py
CHANGED
parsl/dataflow/dflow.py
CHANGED
@@ -95,7 +95,7 @@ class DataFlowKernel:
        self.checkpoint_lock = threading.Lock()

        self.usage_tracker = UsageTracker(self)
-        self.usage_tracker.send_message()
+        self.usage_tracker.send_start_message()

        self.task_state_counts_lock = threading.Lock()
        self.task_state_counts = {state: 0 for state in States}
@@ -722,7 +722,10 @@ class DataFlowKernel:
        self._send_task_log_info(task_record)

        if hasattr(exec_fu, "parsl_executor_task_id"):
-            logger.info(
+            logger.info(
+                f"Parsl task {task_id} try {try_id} launched on executor {executor.label} "
+                f"with executor id {exec_fu.parsl_executor_task_id}")
+
        else:
            logger.info(f"Parsl task {task_id} try {try_id} launched on executor {executor.label}")

@@ -730,7 +733,8 @@ class DataFlowKernel:

        return exec_fu

-    def _add_input_deps(self, executor: str, args: Sequence[Any], kwargs: Dict[str, Any], func: Callable) -> Tuple[Sequence[Any], Dict[str, Any], Callable]:
+    def _add_input_deps(self, executor: str, args: Sequence[Any], kwargs: Dict[str, Any], func: Callable) -> Tuple[Sequence[Any], Dict[str, Any],
+                                                                                                                   Callable]:
        """Look for inputs of the app that are files. Give the data manager
        the opportunity to replace a file with a data future for that file,
        for example wrapping the result of a staging action.
@@ -1142,8 +1146,9 @@ class DataFlowKernel:

    def atexit_cleanup(self) -> None:
        if not self.cleanup_called:
-            logger.
-
+            logger.warning("Python is exiting with a DFK still running. "
+                           "You should call parsl.dfk().cleanup() before "
+                           "exiting to release any resources")
        else:
            logger.info("python process is exiting, but DFK has already been cleaned up")

@@ -1200,7 +1205,7 @@ class DataFlowKernel:
            self._checkpoint_timer.close()

        # Send final stats
-        self.usage_tracker.send_message()
+        self.usage_tracker.send_end_message()
        self.usage_tracker.close()

        logger.info("Closing job status poller")
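The reworded atexit message points at an explicit shutdown pattern; a minimal sketch of it (the local-threads config is just a stand-in for any real config):

    import parsl
    from parsl.configs.local_threads import config

    parsl.load(config)
    try:
        pass  # define apps and invoke them here
    finally:
        # Explicit cleanup avoids the "Python is exiting with a DFK
        # still running" warning added in this diff.
        parsl.dfk().cleanup()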
parsl/dataflow/taskrecord.py
CHANGED
@@ -70,7 +70,9 @@ class TaskRecord(TypedDict, total=False):
     # these three could be more strongly typed perhaps but I'm not thinking about that now
     func: Callable
     fn_hash: str
-    args: Sequence[Any]
+    args: Sequence[Any]
+    # in some places we uses a Tuple[Any, ...] and in some places a List[Any].
+    # This is an attempt to correctly type both of those.
     kwargs: Dict[str, Any]

     time_invoked: Optional[datetime.datetime]
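The new comment in TaskRecord is about container typing; a standalone sketch of why Sequence[Any] covers both shapes (this snippet is illustrative, not parsl code):

    from typing import Any, Sequence

    # Sequence[Any] is satisfied by tuples and lists alike, so one
    # annotation types call sites that build args either way.
    def count_args(args: Sequence[Any]) -> int:
        return len(args)

    count_args((1, 2))  # Tuple[Any, ...]
    count_args([1, 2])  # List[Any]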
parsl/executors/high_throughput/executor.py
CHANGED
@@ -6,10 +6,12 @@ import threading
 import queue
 import datetime
 import pickle
+from dataclasses import dataclass
 from multiprocessing import Process, Queue
 from typing import Dict, Sequence
 from typing import List, Optional, Tuple, Union, Callable
 import math
+import warnings

 import parsl.launchers
 from parsl.serialize import pack_res_spec_apply_message, deserialize
@@ -39,7 +41,7 @@ from parsl.providers import LocalProvider

 logger = logging.getLogger(__name__)

-DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers} "
+DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
                      "-a {addresses} "
                      "-p {prefetch_capacity} "
                      "-c {cores_per_worker} "
@@ -154,7 +156,10 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
        the there's sufficient memory for each worker. Default: None

    max_workers : int
-
+        Deprecated. Please use max_workers_per_node instead.
+
+    max_workers_per_node : int
+        Caps the number of workers launched per node. Default: None

    cpu_affinity: string
        Whether or how each worker process sets thread affinity. Options include "none" to forgo
@@ -228,7 +233,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                 worker_debug: bool = False,
                 cores_per_worker: float = 1.0,
                 mem_per_worker: Optional[float] = None,
-                 max_workers: Union[int, float] = float('inf'),
+                 max_workers: Optional[Union[int, float]] = None,
+                 max_workers_per_node: Optional[Union[int, float]] = None,
                 cpu_affinity: str = 'none',
                 available_accelerators: Union[int, Sequence[str]] = (),
                 prefetch_capacity: int = 0,
@@ -251,7 +257,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
        self.working_dir = working_dir
        self.cores_per_worker = cores_per_worker
        self.mem_per_worker = mem_per_worker
-        self.max_workers = max_workers
        self.prefetch_capacity = prefetch_capacity
        self.address = address
        self.address_probe_timeout = address_probe_timeout
@@ -260,8 +265,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
        else:
            self.all_addresses = ','.join(get_all_addresses())

-        mem_slots = max_workers
-        cpu_slots = max_workers
+        if max_workers:
+            self._warn_deprecated("max_workers", "max_workers_per_node")
+        self.max_workers_per_node = max_workers_per_node or max_workers or float("inf")
+
+        mem_slots = self.max_workers_per_node
+        cpu_slots = self.max_workers_per_node
        if hasattr(self.provider, 'mem_per_node') and \
                self.provider.mem_per_node is not None and \
                mem_per_worker is not None and \
@@ -278,7 +287,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
        self.available_accelerators = list(available_accelerators)

        # Determine the number of workers per node
-        self._workers_per_node = min(max_workers, mem_slots, cpu_slots)
+        self._workers_per_node = min(self.max_workers_per_node, mem_slots, cpu_slots)
        if len(self.available_accelerators) > 0:
            self._workers_per_node = min(self._workers_per_node, len(available_accelerators))
        if self._workers_per_node == float('inf'):
@@ -316,6 +325,24 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):

    radio_mode = "htex"

+    def _warn_deprecated(self, old: str, new: str):
+        warnings.warn(
+            f"{old} is deprecated and will be removed in a future release. "
+            f"Please use {new} instead.",
+            DeprecationWarning,
+            stacklevel=2
+        )
+
+    @property
+    def max_workers(self):
+        self._warn_deprecated("max_workers", "max_workers_per_node")
+        return self.max_workers_per_node
+
+    @max_workers.setter
+    def max_workers(self, val: Union[int, float]):
+        self._warn_deprecated("max_workers", "max_workers_per_node")
+        self.max_workers_per_node = val
+
    @property
    def logdir(self):
        return "{}/{}".format(self.run_dir, self.label)
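Together these hunks keep the old max_workers name alive as a property that forwards to max_workers_per_node while warning. A sketch of what callers observe (the label and values are illustrative):

    import warnings
    from parsl.executors import HighThroughputExecutor

    htex = HighThroughputExecutor(label='htex', max_workers_per_node=2)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        htex.max_workers = 4       # old spelling still works...
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
    assert htex.max_workers_per_node == 4   # ...but lands on the new field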
@@ -330,7 +357,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
        """Compose the launch command and scale out the initial blocks.
        """
        debug_opts = "--debug" if self.worker_debug else ""
-        max_workers = "" if self.max_workers == float('inf') else "--max_workers={}".format(self.max_workers)
+        max_workers_per_node = "" if self.max_workers_per_node == float('inf') else "--max_workers_per_node={}".format(self.max_workers_per_node)
        enable_mpi_opts = "--enable_mpi_mode " if self.enable_mpi_mode else ""

        address_probe_timeout_string = ""
@@ -345,7 +372,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                       result_port=self.worker_result_port,
                       cores_per_worker=self.cores_per_worker,
                       mem_per_worker=self.mem_per_worker,
-                       max_workers=max_workers,
+                       max_workers_per_node=max_workers_per_node,
                       nodes_per_block=self.provider.nodes_per_block,
                       heartbeat_period=self.heartbeat_period,
                       heartbeat_threshold=self.heartbeat_threshold,
@@ -602,8 +629,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
        """Submits work to the outgoing_q.

        The outgoing_q is an external process listens on this
-        queue for new work. This method behaves like a
-        submit call as described here `Python docs: <https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor>`_
+        queue for new work. This method behaves like a submit call as described here `Python docs: <https://docs.python.org/3/
+        library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor>`_

        Args:
            - func (callable) : Callable function
@@ -668,7 +695,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
    def workers_per_node(self) -> Union[int, float]:
        return self._workers_per_node

-    def scale_in(self, blocks
+    def scale_in(self, blocks: int, max_idletime: Optional[float] = None) -> List[str]:
        """Scale in the number of active blocks by specified amount.

        The scale in method here is very rude. It doesn't give the workers
@@ -681,49 +708,54 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
        blocks : int
            Number of blocks to terminate and scale_in by

-
-
-
-            When force = True, we will kill blocks regardless of the blocks being busy
+        max_idletime: float
+            A time to indicate how long a block should be idle to be a
+            candidate for scaling in.

-
-            number of idle blocks < ``blocks``, then fewer than ``blocks``
-            blocks will be terminated.
+            If None then blocks will be force scaled in even if they are busy.

-
-
-            Used along with force = False to kill blocks that have been idle for that long.
+            If a float, then only idle blocks will be terminated, which may be less than
+            the requested number.

        Returns
        -------
        List of block IDs scaled in
        """
        logger.debug(f"Scale in called, blocks={blocks}")
+
+        @dataclass
+        class BlockInfo:
+            tasks: int  # sum of tasks in this block
+            idle: float  # shortest idle time of any manager in this block
+
        managers = self.connected_managers()
-        block_info = {}
+        block_info: Dict[str, BlockInfo] = {}
        for manager in managers:
            if not manager['active']:
                continue
            b_id = manager['block_id']
            if b_id not in block_info:
-                block_info[b_id] =
-            block_info[b_id]
-            block_info[b_id]
+                block_info[b_id] = BlockInfo(tasks=0, idle=float('inf'))
+            block_info[b_id].tasks += manager['tasks']
+            block_info[b_id].idle = min(block_info[b_id].idle, manager['idle_duration'])
+
+        # The scaling policy is that longest idle blocks should be scaled down
+        # in preference to least idle (most recently used) blocks.
+        # Other policies could be implemented here.
+
+        sorted_blocks = sorted(block_info.items(), key=lambda item: (-item[1].idle, item[1].tasks))

-        sorted_blocks = sorted(block_info.items(), key=lambda item: (item[1][1], item[1][0]))
        logger.debug(f"Scale in selecting from {len(sorted_blocks)} blocks")
-        if
+        if max_idletime is None:
            block_ids_to_kill = [x[0] for x in sorted_blocks[:blocks]]
        else:
-
-
-
-
-
-
-
-                if len(block_ids_to_kill) == blocks:
-                    break
+            block_ids_to_kill = []
+            for x in sorted_blocks:
+                if x[1].idle > max_idletime and x[1].tasks == 0:
+                    block_ids_to_kill.append(x[0])
+                    if len(block_ids_to_kill) == blocks:
+                        break

        logger.debug("Selected idle block ids to kill: {}".format(
            block_ids_to_kill))
        if len(block_ids_to_kill) < blocks:
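The replacement selection logic sorts blocks so the most idle, least loaded ones are killed first. A standalone sketch of that ordering with made-up block names and numbers:

    from dataclasses import dataclass

    @dataclass
    class BlockInfo:
        tasks: int   # sum of tasks in this block
        idle: float  # shortest idle time of any manager in this block

    blocks = {
        'b1': BlockInfo(tasks=5, idle=2.0),
        'b2': BlockInfo(tasks=0, idle=90.0),
        'b3': BlockInfo(tasks=0, idle=30.0),
    }

    # Same key as the diff: most idle first, ties broken by fewest tasks.
    ordered = sorted(blocks.items(), key=lambda item: (-item[1].idle, item[1].tasks))
    assert [name for name, _ in ordered] == ['b2', 'b3', 'b1']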
parsl/executors/high_throughput/interchange.py
CHANGED
@@ -27,7 +27,6 @@ from parsl.monitoring.message_type import MessageType
 from parsl.process_loggers import wrap_with_logs


-HEARTBEAT_CODE = (2 ** 32) - 1
 PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)

 LOGGER_NAME = "interchange"
@@ -393,71 +392,85 @@ class Interchange:
        logger.info("Processed {} tasks in {} seconds".format(self.count, delta))
        logger.warning("Exiting")

-    def process_task_outgoing_incoming(
-
+    def process_task_outgoing_incoming(
+            self,
+            interesting_managers: Set[bytes],
+            hub_channel: Optional[zmq.Socket],
+            kill_event: threading.Event
+    ) -> None:
+        """Process one message from manager on the task_outgoing channel.
+        Note that this message flow is in contradiction to the name of the
+        channel - it is not an outgoing message and it is not a task.
+        """
        if self.task_outgoing in self.socks and self.socks[self.task_outgoing] == zmq.POLLIN:
            logger.debug("starting task_outgoing section")
            message = self.task_outgoing.recv_multipart()
            manager_id = message[0]

-
-
+            try:
+                msg = json.loads(message[1].decode('utf-8'))
+            except Exception:
+                logger.warning("Got Exception reading message from manager: {!r}".format(
+                    manager_id), exc_info=True)
+                logger.debug("Message: \n{!r}\n".format(message[1]))
+                return
+
+            # perform a bit of validation on the structure of the deserialized
+            # object, at least enough to behave like a deserialization error
+            # in obviously malformed cases
+            if not isinstance(msg, dict) or 'type' not in msg:
+                logger.error(f"JSON message was not correctly formatted from manager: {manager_id!r}")
+                logger.debug("Message: \n{!r}\n".format(message[1]))
+                return
+
+            if msg['type'] == 'registration':
+                # We set up an entry only if registration works correctly
+                self._ready_managers[manager_id] = {'last_heartbeat': time.time(),
+                                                    'idle_since': time.time(),
+                                                    'block_id': None,
+                                                    'max_capacity': 0,
+                                                    'worker_count': 0,
+                                                    'active': True,
+                                                    'tasks': []}
+                self.connected_block_history.append(msg['block_id'])
+
+                interesting_managers.add(manager_id)
+                logger.info("Adding manager: {!r} to ready queue".format(manager_id))
+                m = self._ready_managers[manager_id]

-
-
-
-
-
-                    manager_id), exc_info=True)
-                logger.debug("Message: \n{!r}\n".format(message[1]))
-            else:
-                # We set up an entry only if registration works correctly
-                self._ready_managers[manager_id] = {'last_heartbeat': time.time(),
-                                                    'idle_since': time.time(),
-                                                    'block_id': None,
-                                                    'max_capacity': 0,
-                                                    'worker_count': 0,
-                                                    'active': True,
-                                                    'tasks': []}
-                self.connected_block_history.append(msg['block_id'])
-                if reg_flag is True:
-                    interesting_managers.add(manager_id)
-                    logger.info("Adding manager: {!r} to ready queue".format(manager_id))
-                    m = self._ready_managers[manager_id]
-                    m.update(msg)
-                    logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
-                    self._send_monitoring_info(hub_channel, m)
-
-                    if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
-                            msg['parsl_v'] != self.current_platform['parsl_v']):
-                        logger.error("Manager {!r} has incompatible version info with the interchange".format(manager_id))
-                        logger.debug("Setting kill event")
-                        kill_event.set()
-                        e = VersionMismatch("py.v={} parsl.v={}".format(self.current_platform['python_v'].rsplit(".", 1)[0],
-                                                                        self.current_platform['parsl_v']),
-                                            "py.v={} parsl.v={}".format(msg['python_v'].rsplit(".", 1)[0],
-                                                                        msg['parsl_v'])
-                                            )
-                        result_package = {'type': 'result', 'task_id': -1, 'exception': serialize_object(e)}
-                        pkl_package = pickle.dumps(result_package)
-                        self.results_outgoing.send(pkl_package)
-                        logger.error("Sent failure reports, shutting down interchange")
-                    else:
-                        logger.info("Manager {!r} has compatible Parsl version {}".format(manager_id, msg['parsl_v']))
-                        logger.info("Manager {!r} has compatible Python version {}".format(manager_id,
-                                                                                           msg['python_v'].rsplit(".", 1)[0]))
-                else:
-                    # Registration has failed.
-                    logger.debug("Suppressing bad registration from manager: {!r}".format(manager_id))
+                # m is a ManagerRecord, but msg is a dict[Any,Any] and so can
+                # contain arbitrary fields beyond those in ManagerRecord (and
+                # indeed does - for example, python_v) which are then ignored
+                # later.
+                m.update(msg)  # type: ignore[typeddict-item]

-
-
-
-            if
-
-
+                logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
+                self._send_monitoring_info(hub_channel, m)
+
+                if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
+                        msg['parsl_v'] != self.current_platform['parsl_v']):
+                    logger.error("Manager {!r} has incompatible version info with the interchange".format(manager_id))
+                    logger.debug("Setting kill event")
+                    kill_event.set()
+                    e = VersionMismatch("py.v={} parsl.v={}".format(self.current_platform['python_v'].rsplit(".", 1)[0],
+                                                                    self.current_platform['parsl_v']),
+                                        "py.v={} parsl.v={}".format(msg['python_v'].rsplit(".", 1)[0],
+                                                                    msg['parsl_v'])
+                                        )
+                    result_package = {'type': 'result', 'task_id': -1, 'exception': serialize_object(e)}
+                    pkl_package = pickle.dumps(result_package)
+                    self.results_outgoing.send(pkl_package)
+                    logger.error("Sent failure reports, shutting down interchange")
                else:
-                logger.
+                    logger.info("Manager {!r} has compatible Parsl version {}".format(manager_id, msg['parsl_v']))
+                    logger.info("Manager {!r} has compatible Python version {}".format(manager_id,
+                                                                                       msg['python_v'].rsplit(".", 1)[0]))
+            elif msg['type'] == 'heartbeat':
+                self._ready_managers[manager_id]['last_heartbeat'] = time.time()
+                logger.debug("Manager {!r} sent heartbeat via tasks connection".format(manager_id))
+                self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+            else:
+                logger.error(f"Unexpected message type received from manager: {msg['type']}")
            logger.debug("leaving task_outgoing section")

    def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
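The rewritten handler decodes, validates, and then dispatches on msg['type']. A minimal sketch of just that validation path, outside the interchange (the helper name is made up):

    import json

    def classify(raw: bytes) -> str:
        try:
            msg = json.loads(raw.decode('utf-8'))
        except Exception:
            return 'undecodable'       # logged and dropped, as in the diff
        if not isinstance(msg, dict) or 'type' not in msg:
            return 'malformed'         # logged and dropped, as in the diff
        return msg['type']             # 'registration', 'heartbeat', or unexpected

    assert classify(b'{"type": "heartbeat"}') == 'heartbeat'
    assert classify(b'not json') == 'undecodable'
    assert classify(b'[1, 2]') == 'malformed'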
@@ -613,7 +626,13 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:
        None.
    """
    if format_string is None:
-        format_string =
+        format_string = (
+
+            "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d "
+            "%(processName)s(%(process)d) %(threadName)s "
+            "%(funcName)s [%(levelname)s] %(message)s"
+
+        )

    global logger
    logger = logging.getLogger(LOGGER_NAME)
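For reference, the new multi-line format string renders records like the comment below shows; a minimal sketch (the logger name, line number, and message are illustrative):

    import logging

    fmt = ("%(asctime)s.%(msecs)03d %(name)s:%(lineno)d "
           "%(processName)s(%(process)d) %(threadName)s "
           "%(funcName)s [%(levelname)s] %(message)s")

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(fmt, datefmt="%Y-%m-%d %H:%M:%S"))

    log = logging.getLogger("interchange")
    log.addHandler(handler)
    log.setLevel(logging.DEBUG)
    log.info("Processed 10 tasks")
    # e.g. 2024-03-11 09:15:00.123 interchange:13 MainProcess(4242) MainThread <module> [INFO] Processed 10 tasks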