parsl 2025.3.17__py3-none-any.whl → 2025.3.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/dataflow/dflow.py +18 -15
- parsl/executors/base.py +13 -37
- parsl/executors/flux/executor.py +1 -0
- parsl/executors/globus_compute.py +13 -2
- parsl/executors/high_throughput/executor.py +18 -0
- parsl/executors/high_throughput/interchange.py +26 -36
- parsl/executors/radical/executor.py +1 -0
- parsl/executors/status_handling.py +20 -12
- parsl/executors/taskvine/executor.py +13 -11
- parsl/executors/workqueue/executor.py +9 -7
- parsl/monitoring/errors.py +5 -0
- parsl/monitoring/monitoring.py +55 -122
- parsl/monitoring/radios/zmq_router.py +80 -18
- parsl/multiprocessing.py +42 -2
- parsl/tests/test_monitoring/test_basic.py +1 -1
- parsl/tests/test_monitoring/test_exit_helper.py +6 -7
- parsl/tests/test_monitoring/test_fuzz_zmq.py +1 -1
- parsl/tests/test_monitoring/test_radio_zmq.py +27 -0
- parsl/tests/test_monitoring/test_stdouterr.py +3 -0
- parsl/tests/test_shutdown/test_kill_monitoring.py +1 -1
- parsl/usage_tracking/usage.py +2 -2
- parsl/version.py +1 -1
- {parsl-2025.3.17.data → parsl-2025.3.31.data}/scripts/interchange.py +26 -36
- {parsl-2025.3.17.dist-info → parsl-2025.3.31.dist-info}/METADATA +2 -2
- {parsl-2025.3.17.dist-info → parsl-2025.3.31.dist-info}/RECORD +32 -31
- {parsl-2025.3.17.data → parsl-2025.3.31.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.3.17.data → parsl-2025.3.31.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.3.17.data → parsl-2025.3.31.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2025.3.17.dist-info → parsl-2025.3.31.dist-info}/LICENSE +0 -0
- {parsl-2025.3.17.dist-info → parsl-2025.3.31.dist-info}/WHEEL +0 -0
- {parsl-2025.3.17.dist-info → parsl-2025.3.31.dist-info}/entry_points.txt +0 -0
- {parsl-2025.3.17.dist-info → parsl-2025.3.31.dist-info}/top_level.txt +0 -0
parsl/dataflow/dflow.py
CHANGED
@@ -45,6 +45,7 @@ from parsl.executors.threads import ThreadPoolExecutor
|
|
45
45
|
from parsl.jobs.job_status_poller import JobStatusPoller
|
46
46
|
from parsl.monitoring import MonitoringHub
|
47
47
|
from parsl.monitoring.message_type import MessageType
|
48
|
+
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
48
49
|
from parsl.monitoring.remote import monitor_wrapper
|
49
50
|
from parsl.process_loggers import wrap_with_logs
|
50
51
|
from parsl.usage_tracking.usage import UsageTracker
|
@@ -110,8 +111,11 @@ class DataFlowKernel:
|
|
110
111
|
self.monitoring: Optional[MonitoringHub]
|
111
112
|
self.monitoring = config.monitoring
|
112
113
|
|
114
|
+
self.monitoring_radio = None
|
115
|
+
|
113
116
|
if self.monitoring:
|
114
117
|
self.monitoring.start(self.run_dir, self.config.run_dir)
|
118
|
+
self.monitoring_radio = MultiprocessingQueueRadioSender(self.monitoring.resource_msgs)
|
115
119
|
|
116
120
|
self.time_began = datetime.datetime.now()
|
117
121
|
self.time_completed: Optional[datetime.datetime] = None
|
@@ -156,9 +160,9 @@ class DataFlowKernel:
|
|
156
160
|
'host': gethostname(),
|
157
161
|
}
|
158
162
|
|
159
|
-
if self.
|
160
|
-
self.
|
161
|
-
|
163
|
+
if self.monitoring_radio:
|
164
|
+
self.monitoring_radio.send((MessageType.WORKFLOW_INFO,
|
165
|
+
workflow_info))
|
162
166
|
|
163
167
|
if config.checkpoint_files is not None:
|
164
168
|
checkpoint_files = config.checkpoint_files
|
@@ -231,9 +235,9 @@ class DataFlowKernel:
|
|
231
235
|
raise InternalConsistencyError(f"Exit case for {mode} should be unreachable, validated by typeguard on Config()")
|
232
236
|
|
233
237
|
def _send_task_log_info(self, task_record: TaskRecord) -> None:
|
234
|
-
if self.
|
238
|
+
if self.monitoring_radio:
|
235
239
|
task_log_info = self._create_task_log_info(task_record)
|
236
|
-
self.
|
240
|
+
self.monitoring_radio.send((MessageType.TASK_INFO, task_log_info))
|
237
241
|
|
238
242
|
def _create_task_log_info(self, task_record: TaskRecord) -> Dict[str, Any]:
|
239
243
|
"""
|
@@ -1128,9 +1132,7 @@ class DataFlowKernel:
|
|
1128
1132
|
executor.run_id = self.run_id
|
1129
1133
|
executor.run_dir = self.run_dir
|
1130
1134
|
if self.monitoring:
|
1131
|
-
executor.
|
1132
|
-
executor.hub_zmq_port = self.monitoring.hub_zmq_port
|
1133
|
-
executor.submit_monitoring_radio = self.monitoring.radio
|
1135
|
+
executor.monitoring_messages = self.monitoring.resource_msgs
|
1134
1136
|
if hasattr(executor, 'provider'):
|
1135
1137
|
if hasattr(executor.provider, 'script_dir'):
|
1136
1138
|
executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
|
@@ -1217,15 +1219,16 @@ class DataFlowKernel:
|
|
1217
1219
|
logger.info("Terminated executors")
|
1218
1220
|
self.time_completed = datetime.datetime.now()
|
1219
1221
|
|
1220
|
-
if self.
|
1222
|
+
if self.monitoring_radio:
|
1221
1223
|
logger.info("Sending final monitoring message")
|
1222
|
-
self.
|
1223
|
-
|
1224
|
-
|
1225
|
-
|
1226
|
-
|
1227
|
-
|
1224
|
+
self.monitoring_radio.send((MessageType.WORKFLOW_INFO,
|
1225
|
+
{'tasks_failed_count': self.task_state_counts[States.failed],
|
1226
|
+
'tasks_completed_count': self.task_state_counts[States.exec_done],
|
1227
|
+
"time_began": self.time_began,
|
1228
|
+
'time_completed': self.time_completed,
|
1229
|
+
'run_id': self.run_id, 'rundir': self.run_dir}))
|
1228
1230
|
|
1231
|
+
if self.monitoring:
|
1229
1232
|
logger.info("Terminating monitoring")
|
1230
1233
|
self.monitoring.close()
|
1231
1234
|
logger.info("Terminated monitoring")
|
parsl/executors/base.py
CHANGED
@@ -1,11 +1,14 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
import os
|
2
4
|
from abc import ABCMeta, abstractmethod
|
3
5
|
from concurrent.futures import Future
|
6
|
+
from multiprocessing.queues import Queue
|
4
7
|
from typing import Any, Callable, Dict, Optional
|
5
8
|
|
6
9
|
from typing_extensions import Literal, Self
|
7
10
|
|
8
|
-
from parsl.monitoring.
|
11
|
+
from parsl.monitoring.types import TaggedMonitoringMessage
|
9
12
|
|
10
13
|
|
11
14
|
class ParslExecutor(metaclass=ABCMeta):
|
@@ -42,6 +45,13 @@ class ParslExecutor(metaclass=ABCMeta):
|
|
42
45
|
invariant, not co-variant, and it looks like @typeguard cannot be
|
43
46
|
persuaded otherwise. So if you're implementing an executor and want to
|
44
47
|
@typeguard the constructor, you'll have to use List[Any] here.
|
48
|
+
|
49
|
+
The DataFlowKernel will set this attribute before calling .start(),
|
50
|
+
if monitoring is enabled:
|
51
|
+
|
52
|
+
monitoring_messages: Optional[Queue[TaggedMonitoringMessage]] - an executor
|
53
|
+
can send messages to the monitoring hub by putting them into
|
54
|
+
this queue.
|
45
55
|
"""
|
46
56
|
|
47
57
|
label: str = "undefined"
|
@@ -50,15 +60,11 @@ class ParslExecutor(metaclass=ABCMeta):
|
|
50
60
|
def __init__(
|
51
61
|
self,
|
52
62
|
*,
|
53
|
-
|
54
|
-
hub_zmq_port: Optional[int] = None,
|
55
|
-
submit_monitoring_radio: Optional[MonitoringRadioSender] = None,
|
63
|
+
monitoring_messages: Optional[Queue[TaggedMonitoringMessage]] = None,
|
56
64
|
run_dir: str = ".",
|
57
65
|
run_id: Optional[str] = None,
|
58
66
|
):
|
59
|
-
self.
|
60
|
-
self.hub_zmq_port = hub_zmq_port
|
61
|
-
self.submit_monitoring_radio = submit_monitoring_radio
|
67
|
+
self.monitoring_messages = monitoring_messages
|
62
68
|
self.run_dir = os.path.abspath(run_dir)
|
63
69
|
self.run_id = run_id
|
64
70
|
|
@@ -125,33 +131,3 @@ class ParslExecutor(metaclass=ABCMeta):
|
|
125
131
|
@run_id.setter
|
126
132
|
def run_id(self, value: Optional[str]) -> None:
|
127
133
|
self._run_id = value
|
128
|
-
|
129
|
-
@property
|
130
|
-
def hub_address(self) -> Optional[str]:
|
131
|
-
"""Address to the Hub for monitoring.
|
132
|
-
"""
|
133
|
-
return self._hub_address
|
134
|
-
|
135
|
-
@hub_address.setter
|
136
|
-
def hub_address(self, value: Optional[str]) -> None:
|
137
|
-
self._hub_address = value
|
138
|
-
|
139
|
-
@property
|
140
|
-
def hub_zmq_port(self) -> Optional[int]:
|
141
|
-
"""Port to the Hub for monitoring.
|
142
|
-
"""
|
143
|
-
return self._hub_zmq_port
|
144
|
-
|
145
|
-
@hub_zmq_port.setter
|
146
|
-
def hub_zmq_port(self, value: Optional[int]) -> None:
|
147
|
-
self._hub_zmq_port = value
|
148
|
-
|
149
|
-
@property
|
150
|
-
def submit_monitoring_radio(self) -> Optional[MonitoringRadioSender]:
|
151
|
-
"""Local radio for sending monitoring messages
|
152
|
-
"""
|
153
|
-
return self._submit_monitoring_radio
|
154
|
-
|
155
|
-
@submit_monitoring_radio.setter
|
156
|
-
def submit_monitoring_radio(self, value: Optional[MonitoringRadioSender]) -> None:
|
157
|
-
self._submit_monitoring_radio = value
|
parsl/executors/flux/executor.py
CHANGED
@@ -231,6 +231,7 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
|
|
231
231
|
|
232
232
|
def start(self):
|
233
233
|
"""Called when DFK starts the executor when the config is loaded."""
|
234
|
+
super().start()
|
234
235
|
os.makedirs(self.working_dir, exist_ok=True)
|
235
236
|
self._submission_thread.start()
|
236
237
|
|
@@ -2,10 +2,11 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import copy
|
4
4
|
from concurrent.futures import Future
|
5
|
-
from typing import Any, Callable, Dict
|
5
|
+
from typing import Any, Callable, Dict, List, Optional
|
6
6
|
|
7
7
|
import typeguard
|
8
8
|
|
9
|
+
from parsl.data_provider.staging import Staging
|
9
10
|
from parsl.errors import OptionalModuleMissing
|
10
11
|
from parsl.executors.base import ParslExecutor
|
11
12
|
from parsl.utils import RepresentationMixin
|
@@ -40,6 +41,8 @@ class GlobusComputeExecutor(ParslExecutor, RepresentationMixin):
|
|
40
41
|
self,
|
41
42
|
executor: Executor,
|
42
43
|
label: str = 'GlobusComputeExecutor',
|
44
|
+
storage_access: Optional[List[Staging]] = None,
|
45
|
+
working_dir: Optional[str] = None,
|
43
46
|
):
|
44
47
|
"""
|
45
48
|
Parameters
|
@@ -52,6 +55,12 @@ class GlobusComputeExecutor(ParslExecutor, RepresentationMixin):
|
|
52
55
|
|
53
56
|
label:
|
54
57
|
a label to name the executor
|
58
|
+
|
59
|
+
storage_access:
|
60
|
+
a list of staging providers that will be used for file staging
|
61
|
+
|
62
|
+
working_dir:
|
63
|
+
The working dir to be used for file staging
|
55
64
|
"""
|
56
65
|
if not _globus_compute_enabled:
|
57
66
|
raise OptionalModuleMissing(
|
@@ -64,10 +73,12 @@ class GlobusComputeExecutor(ParslExecutor, RepresentationMixin):
|
|
64
73
|
self.resource_specification = self.executor.resource_specification
|
65
74
|
self.user_endpoint_config = self.executor.user_endpoint_config
|
66
75
|
self.label = label
|
76
|
+
self.storage_access = storage_access
|
77
|
+
self.working_dir = working_dir
|
67
78
|
|
68
79
|
def start(self) -> None:
|
69
80
|
""" Start the Globus Compute Executor """
|
70
|
-
|
81
|
+
super().start()
|
71
82
|
|
72
83
|
def submit(self, func: Callable, resource_specification: Dict[str, Any], *args: Any, **kwargs: Any) -> Future:
|
73
84
|
""" Submit func to globus-compute
|
@@ -29,6 +29,7 @@ from parsl.executors.high_throughput.manager_selector import (
|
|
29
29
|
)
|
30
30
|
from parsl.executors.status_handling import BlockProviderExecutor
|
31
31
|
from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
|
32
|
+
from parsl.monitoring.radios.zmq_router import ZMQRadioReceiver, start_zmq_receiver
|
32
33
|
from parsl.process_loggers import wrap_with_logs
|
33
34
|
from parsl.providers import LocalProvider
|
34
35
|
from parsl.providers.base import ExecutionProvider
|
@@ -334,6 +335,10 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
334
335
|
self._result_queue_thread_exit = threading.Event()
|
335
336
|
self._result_queue_thread: Optional[threading.Thread] = None
|
336
337
|
|
338
|
+
self.zmq_monitoring: Optional[ZMQRadioReceiver]
|
339
|
+
self.zmq_monitoring = None
|
340
|
+
self.hub_zmq_port = None
|
341
|
+
|
337
342
|
radio_mode = "htex"
|
338
343
|
enable_mpi_mode: bool = False
|
339
344
|
mpi_launcher: str = "mpiexec"
|
@@ -407,6 +412,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
407
412
|
def start(self):
|
408
413
|
"""Create the Interchange process and connect to it.
|
409
414
|
"""
|
415
|
+
super().start()
|
410
416
|
if self.encrypted and self.cert_dir is None:
|
411
417
|
logger.debug("Creating CurveZMQ certificates")
|
412
418
|
self.cert_dir = curvezmq.create_certificates(self.logdir)
|
@@ -427,6 +433,15 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
427
433
|
self.loopback_address, self.interchange_port_range, self.cert_dir
|
428
434
|
)
|
429
435
|
|
436
|
+
if self.monitoring_messages is not None:
|
437
|
+
self.zmq_monitoring = start_zmq_receiver(monitoring_messages=self.monitoring_messages,
|
438
|
+
loopback_address=self.loopback_address,
|
439
|
+
port_range=self.interchange_port_range,
|
440
|
+
logdir=self.logdir,
|
441
|
+
worker_debug=self.worker_debug,
|
442
|
+
)
|
443
|
+
self.hub_zmq_port = self.zmq_monitoring.port
|
444
|
+
|
430
445
|
self._result_queue_thread = None
|
431
446
|
self._start_result_queue_thread()
|
432
447
|
self._start_local_interchange_process()
|
@@ -861,6 +876,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
861
876
|
if self._result_queue_thread:
|
862
877
|
self._result_queue_thread.join()
|
863
878
|
|
879
|
+
if self.zmq_monitoring:
|
880
|
+
self.zmq_monitoring.close()
|
881
|
+
|
864
882
|
logger.info("Finished HighThroughputExecutor shutdown attempt")
|
865
883
|
|
866
884
|
def get_usage_information(self):
|
@@ -328,7 +328,7 @@ class Interchange:
|
|
328
328
|
self.process_results_incoming(interesting_managers, monitoring_radio)
|
329
329
|
self.expire_bad_managers(interesting_managers, monitoring_radio)
|
330
330
|
self.expire_drained_managers(interesting_managers, monitoring_radio)
|
331
|
-
self.process_tasks_to_send(interesting_managers)
|
331
|
+
self.process_tasks_to_send(interesting_managers, monitoring_radio)
|
332
332
|
|
333
333
|
self.zmq_context.destroy()
|
334
334
|
delta = time.time() - start
|
@@ -452,7 +452,7 @@ class Interchange:
|
|
452
452
|
m['active'] = False
|
453
453
|
self._send_monitoring_info(monitoring_radio, m)
|
454
454
|
|
455
|
-
def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
|
455
|
+
def process_tasks_to_send(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
|
456
456
|
# Check if there are tasks that could be sent to managers
|
457
457
|
|
458
458
|
logger.debug(
|
@@ -481,13 +481,14 @@ class Interchange:
|
|
481
481
|
m['idle_since'] = None
|
482
482
|
logger.debug("Sent tasks: %s to manager %r", tids, manager_id)
|
483
483
|
# recompute real_capacity after sending tasks
|
484
|
-
real_capacity
|
484
|
+
real_capacity -= task_count
|
485
485
|
if real_capacity > 0:
|
486
486
|
logger.debug("Manager %r has free capacity %s", manager_id, real_capacity)
|
487
487
|
# ... so keep it in the interesting_managers list
|
488
488
|
else:
|
489
489
|
logger.debug("Manager %r is now saturated", manager_id)
|
490
490
|
interesting_managers.remove(manager_id)
|
491
|
+
self._send_monitoring_info(monitoring_radio, m)
|
491
492
|
else:
|
492
493
|
interesting_managers.remove(manager_id)
|
493
494
|
# logger.debug("Nothing to send to manager {}".format(manager_id))
|
@@ -505,13 +506,24 @@ class Interchange:
|
|
505
506
|
else:
|
506
507
|
logger.debug("Got %s result items in batch from manager %r", len(all_messages), manager_id)
|
507
508
|
|
508
|
-
|
509
|
+
m = self._ready_managers[manager_id]
|
510
|
+
b_messages_to_send = []
|
509
511
|
|
510
512
|
for p_message in all_messages:
|
511
513
|
r = pickle.loads(p_message)
|
512
514
|
if r['type'] == 'result':
|
513
515
|
# process this for task ID and forward to executor
|
514
|
-
|
516
|
+
logger.debug("Removing task %s from manager record %r", r["task_id"], manager_id)
|
517
|
+
try:
|
518
|
+
m['tasks'].remove(r['task_id'])
|
519
|
+
b_messages_to_send.append(p_message)
|
520
|
+
except Exception:
|
521
|
+
logger.exception(
|
522
|
+
"Ignoring exception removing task_id %s for manager %r with task list %s",
|
523
|
+
r['task_id'],
|
524
|
+
manager_id,
|
525
|
+
m["tasks"]
|
526
|
+
)
|
515
527
|
elif r['type'] == 'monitoring':
|
516
528
|
# the monitoring code makes the assumption that no
|
517
529
|
# monitoring messages will be received if monitoring
|
@@ -525,43 +537,21 @@ class Interchange:
|
|
525
537
|
else:
|
526
538
|
logger.error("Interchange discarding result_queue message of unknown type: %s", r["type"])
|
527
539
|
|
528
|
-
got_result = False
|
529
|
-
m = self._ready_managers[manager_id]
|
530
|
-
for (_, r) in b_messages:
|
531
|
-
assert 'type' in r, f"Message is missing type entry: {r}"
|
532
|
-
if r['type'] == 'result':
|
533
|
-
got_result = True
|
534
|
-
try:
|
535
|
-
logger.debug("Removing task %s from manager record %r", r["task_id"], manager_id)
|
536
|
-
m['tasks'].remove(r['task_id'])
|
537
|
-
except Exception:
|
538
|
-
# If we reach here, there's something very wrong.
|
539
|
-
logger.exception(
|
540
|
-
"Ignoring exception removing task_id %s for manager %r with task list %s",
|
541
|
-
r['task_id'],
|
542
|
-
manager_id,
|
543
|
-
m["tasks"]
|
544
|
-
)
|
545
|
-
|
546
|
-
b_messages_to_send = []
|
547
|
-
for (b_message, _) in b_messages:
|
548
|
-
b_messages_to_send.append(b_message)
|
549
|
-
|
550
540
|
if b_messages_to_send:
|
551
541
|
logger.debug("Sending messages on results_outgoing")
|
552
542
|
self.results_outgoing.send_multipart(b_messages_to_send)
|
553
543
|
logger.debug("Sent messages on results_outgoing")
|
554
544
|
|
555
|
-
|
556
|
-
if len(m['tasks']) == 0 and m['idle_since'] is None:
|
557
|
-
m['idle_since'] = time.time()
|
558
|
-
|
559
|
-
# A manager is only made interesting here if a result was
|
560
|
-
# received, which means there should be capacity for a new
|
561
|
-
# task now. Heartbeats and monitoring messages do not make a
|
562
|
-
# manager become interesting.
|
563
|
-
if got_result:
|
545
|
+
# At least one result received, so manager now has idle capacity
|
564
546
|
interesting_managers.add(manager_id)
|
547
|
+
|
548
|
+
if len(m['tasks']) == 0 and m['idle_since'] is None:
|
549
|
+
m['idle_since'] = time.time()
|
550
|
+
|
551
|
+
self._send_monitoring_info(monitoring_radio, m)
|
552
|
+
|
553
|
+
logger.debug("Current tasks on manager %r: %s", manager_id, m["tasks"])
|
554
|
+
|
565
555
|
logger.debug("leaving results_incoming section")
|
566
556
|
|
567
557
|
def expire_bad_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
|
@@ -215,6 +215,7 @@ class RadicalPilotExecutor(ParslExecutor, RepresentationMixin):
|
|
215
215
|
"""Create the Pilot component and pass it.
|
216
216
|
"""
|
217
217
|
logger.info("starting RadicalPilotExecutor")
|
218
|
+
super().start()
|
218
219
|
logger.info('Parsl: {0}'.format(parsl.__version__))
|
219
220
|
logger.info('RADICAL pilot: {0}'.format(rp.version))
|
220
221
|
self.session = rp.Session(cfg={'base': self.run_dir},
|
@@ -14,6 +14,7 @@ from parsl.executors.errors import BadStateException, ScalingFailed
|
|
14
14
|
from parsl.jobs.error_handlers import noop_error_handler, simple_error_handler
|
15
15
|
from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
|
16
16
|
from parsl.monitoring.message_type import MessageType
|
17
|
+
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
17
18
|
from parsl.providers.base import ExecutionProvider
|
18
19
|
from parsl.utils import AtomicIDCounter
|
19
20
|
|
@@ -83,6 +84,13 @@ class BlockProviderExecutor(ParslExecutor):
|
|
83
84
|
# of pending, active and recently terminated blocks
|
84
85
|
self._status = {} # type: Dict[str, JobStatus]
|
85
86
|
|
87
|
+
self.submit_monitoring_radio: Optional[MultiprocessingQueueRadioSender] = None
|
88
|
+
|
89
|
+
def start(self):
|
90
|
+
super().start()
|
91
|
+
if self.monitoring_messages:
|
92
|
+
self.submit_monitoring_radio = MultiprocessingQueueRadioSender(self.monitoring_messages)
|
93
|
+
|
86
94
|
def _make_status_dict(self, block_ids: List[str], status_list: List[JobStatus]) -> Dict[str, JobStatus]:
|
87
95
|
"""Given a list of block ids and a list of corresponding status strings,
|
88
96
|
returns a dictionary mapping each block id to the corresponding status
|
@@ -281,20 +289,20 @@ class BlockProviderExecutor(ParslExecutor):
|
|
281
289
|
logger.debug("Sending block monitoring message: %r", msg)
|
282
290
|
self.submit_monitoring_radio.send((MessageType.BLOCK_INFO, msg))
|
283
291
|
|
284
|
-
def create_monitoring_info(self, status: Dict[str, JobStatus]) -> Sequence[
|
292
|
+
def create_monitoring_info(self, status: Dict[str, JobStatus]) -> Sequence[Dict[str, Any]]:
|
285
293
|
"""Create a monitoring message for each block based on the poll status.
|
286
294
|
"""
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
295
|
+
return [
|
296
|
+
{
|
297
|
+
"run_id": self.run_id,
|
298
|
+
"status": s.status_name,
|
299
|
+
"timestamp": datetime.datetime.now(),
|
300
|
+
"executor_label": self.label,
|
301
|
+
"job_id": self.blocks_to_job_id.get(bid, None),
|
302
|
+
"block_id": bid
|
303
|
+
}
|
304
|
+
for bid, s in status.items()
|
305
|
+
]
|
298
306
|
|
299
307
|
def poll_facade(self) -> None:
|
300
308
|
now = time.time()
|
@@ -40,6 +40,7 @@ from parsl.executors.taskvine.factory_config import TaskVineFactoryConfig
|
|
40
40
|
from parsl.executors.taskvine.manager import _taskvine_submit_wait
|
41
41
|
from parsl.executors.taskvine.manager_config import TaskVineManagerConfig
|
42
42
|
from parsl.executors.taskvine.utils import ParslFileToVine, ParslTaskToVine
|
43
|
+
from parsl.multiprocessing import SpawnContext
|
43
44
|
from parsl.process_loggers import wrap_with_logs
|
44
45
|
from parsl.providers import CondorProvider, LocalProvider
|
45
46
|
from parsl.providers.base import ExecutionProvider
|
@@ -134,13 +135,13 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
134
135
|
self.storage_access = storage_access
|
135
136
|
|
136
137
|
# Queue to send ready tasks from TaskVine executor process to TaskVine manager process
|
137
|
-
self._ready_task_queue: multiprocessing.Queue =
|
138
|
+
self._ready_task_queue: multiprocessing.Queue = SpawnContext.Queue()
|
138
139
|
|
139
140
|
# Queue to send finished tasks from TaskVine manager process to TaskVine executor process
|
140
|
-
self._finished_task_queue: multiprocessing.Queue =
|
141
|
+
self._finished_task_queue: multiprocessing.Queue = SpawnContext.Queue()
|
141
142
|
|
142
143
|
# Event to signal whether the manager and factory processes should stop running
|
143
|
-
self._should_stop =
|
144
|
+
self._should_stop = SpawnContext.Event()
|
144
145
|
|
145
146
|
# TaskVine manager process
|
146
147
|
self._submit_process = None
|
@@ -239,6 +240,7 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
239
240
|
retrieve Parsl tasks within the TaskVine system.
|
240
241
|
"""
|
241
242
|
|
243
|
+
super().start()
|
242
244
|
# Synchronize connection and communication settings between the manager and factory
|
243
245
|
self.__synchronize_manager_factory_comm_settings()
|
244
246
|
|
@@ -252,17 +254,17 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
252
254
|
"finished_task_queue": self._finished_task_queue,
|
253
255
|
"should_stop": self._should_stop,
|
254
256
|
"manager_config": self.manager_config}
|
255
|
-
self._submit_process =
|
256
|
-
|
257
|
-
|
257
|
+
self._submit_process = SpawnContext.Process(target=_taskvine_submit_wait,
|
258
|
+
name="TaskVine-Submit-Process",
|
259
|
+
kwargs=submit_process_kwargs)
|
258
260
|
|
259
261
|
# Create a process to run the TaskVine factory if enabled.
|
260
262
|
if self.worker_launch_method == 'factory':
|
261
263
|
factory_process_kwargs = {"should_stop": self._should_stop,
|
262
264
|
"factory_config": self.factory_config}
|
263
|
-
self._factory_process =
|
264
|
-
|
265
|
-
|
265
|
+
self._factory_process = SpawnContext.Process(target=_taskvine_factory,
|
266
|
+
name="TaskVine-Factory-Process",
|
267
|
+
kwargs=factory_process_kwargs)
|
266
268
|
|
267
269
|
# Run thread to collect results and set tasks' futures.
|
268
270
|
self._collector_thread = threading.Thread(target=self._collect_taskvine_results,
|
@@ -621,8 +623,8 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
621
623
|
with self._tasks_lock:
|
622
624
|
future = self.tasks.pop(task_report.executor_id)
|
623
625
|
|
624
|
-
logger.debug(f'Updating Future for Parsl Task: {task_report.executor_id}.
|
625
|
-
|
626
|
+
logger.debug(f'Updating Future for Parsl Task: {task_report.executor_id}. '
|
627
|
+
f'Task {task_report.executor_id} has result_received set to {task_report.result_received}')
|
626
628
|
if task_report.result_received:
|
627
629
|
try:
|
628
630
|
with open(task_report.result_file, 'rb') as f_in:
|
@@ -31,6 +31,7 @@ from parsl.errors import OptionalModuleMissing
|
|
31
31
|
from parsl.executors.errors import ExecutorError, InvalidResourceSpecification
|
32
32
|
from parsl.executors.status_handling import BlockProviderExecutor
|
33
33
|
from parsl.executors.workqueue import exec_parsl_function
|
34
|
+
from parsl.multiprocessing import SpawnContext, SpawnProcess
|
34
35
|
from parsl.process_loggers import wrap_with_logs
|
35
36
|
from parsl.providers import CondorProvider, LocalProvider
|
36
37
|
from parsl.providers.base import ExecutionProvider
|
@@ -260,8 +261,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
260
261
|
|
261
262
|
self.scaling_cores_per_worker = scaling_cores_per_worker
|
262
263
|
self.label = label
|
263
|
-
self.task_queue
|
264
|
-
self.collector_queue
|
264
|
+
self.task_queue: multiprocessing.Queue = SpawnContext.Queue()
|
265
|
+
self.collector_queue: multiprocessing.Queue = SpawnContext.Queue()
|
265
266
|
self.address = address
|
266
267
|
self.port = port
|
267
268
|
self.executor_task_counter = -1
|
@@ -282,7 +283,7 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
282
283
|
self.autolabel_window = autolabel_window
|
283
284
|
self.autocategory = autocategory
|
284
285
|
self.max_retries = max_retries
|
285
|
-
self.should_stop =
|
286
|
+
self.should_stop = SpawnContext.Value(c_bool, False)
|
286
287
|
self.cached_envs = {} # type: Dict[int, str]
|
287
288
|
self.worker_options = worker_options
|
288
289
|
self.worker_executable = worker_executable
|
@@ -314,6 +315,7 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
314
315
|
"""Create submit process and collector thread to create, send, and
|
315
316
|
retrieve Parsl tasks within the Work Queue system.
|
316
317
|
"""
|
318
|
+
super().start()
|
317
319
|
self.tasks_lock = threading.Lock()
|
318
320
|
|
319
321
|
# Create directories for data and results
|
@@ -333,7 +335,7 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
333
335
|
|
334
336
|
logger.debug("Starting WorkQueueExecutor")
|
335
337
|
|
336
|
-
port_mailbox =
|
338
|
+
port_mailbox = SpawnContext.Queue()
|
337
339
|
|
338
340
|
# Create a Process to perform WorkQueue submissions
|
339
341
|
submit_process_kwargs = {"task_queue": self.task_queue,
|
@@ -354,9 +356,9 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
354
356
|
"port_mailbox": port_mailbox,
|
355
357
|
"coprocess": self.coprocess
|
356
358
|
}
|
357
|
-
self.submit_process =
|
358
|
-
|
359
|
-
|
359
|
+
self.submit_process = SpawnProcess(target=_work_queue_submit_wait,
|
360
|
+
name="WorkQueue-Submit-Process",
|
361
|
+
kwargs=submit_process_kwargs)
|
360
362
|
|
361
363
|
self.collector_thread = threading.Thread(target=self._collect_work_queue_results,
|
362
364
|
name="WorkQueue-collector-thread")
|
parsl/monitoring/errors.py
CHANGED