parsl 2024.3.11__py3-none-any.whl → 2024.3.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/addresses.py +3 -1
- parsl/config.py +4 -0
- parsl/dataflow/dflow.py +14 -5
- parsl/executors/base.py +10 -0
- parsl/executors/high_throughput/executor.py +12 -0
- parsl/executors/high_throughput/interchange.py +30 -8
- parsl/executors/high_throughput/manager_record.py +1 -0
- parsl/executors/high_throughput/process_worker_pool.py +41 -5
- parsl/executors/status_handling.py +2 -9
- parsl/executors/taskvine/executor.py +24 -3
- parsl/executors/taskvine/manager.py +1 -0
- parsl/executors/taskvine/manager_config.py +3 -4
- parsl/executors/workqueue/executor.py +19 -0
- parsl/jobs/error_handlers.py +1 -1
- parsl/jobs/job_status_poller.py +8 -7
- parsl/launchers/launchers.py +6 -6
- parsl/log_utils.py +8 -4
- parsl/monitoring/db_manager.py +4 -2
- parsl/monitoring/monitoring.py +30 -264
- parsl/monitoring/router.py +208 -0
- parsl/monitoring/visualization/plots/default/workflow_plots.py +3 -0
- parsl/monitoring/visualization/views.py +2 -1
- parsl/providers/cluster_provider.py +1 -3
- parsl/tests/configs/user_opts.py +2 -1
- parsl/tests/test_htex/test_drain.py +78 -0
- parsl/tests/test_monitoring/test_app_names.py +86 -0
- parsl/tests/test_monitoring/test_fuzz_zmq.py +2 -2
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +82 -0
- parsl/tests/test_python_apps/test_context_manager.py +40 -0
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +1 -10
- parsl/tests/test_shutdown/__init__.py +0 -0
- parsl/tests/test_shutdown/test_kill_monitoring.py +65 -0
- parsl/utils.py +2 -2
- parsl/version.py +1 -1
- {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/process_worker_pool.py +41 -5
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/METADATA +4 -4
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/RECORD +43 -36
- {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.3.11.data → parsl-2024.3.25.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/LICENSE +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/WHEEL +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.11.dist-info → parsl-2024.3.25.dist-info}/top_level.txt +0 -0
parsl/addresses.py
CHANGED
@@ -81,7 +81,9 @@ def address_by_hostname() -> str:
 def address_by_interface(ifname: str) -> str:
     """Returns the IP address of the given interface name, e.g. 'eth0'
 
-    This is from a Stack Overflow answer: https://stackoverflow.com/questions/24196932/how-can-i-get-the-ip-address-of-eth0-in-python#24196955
+    This is taken from a Stack Overflow answer:
+    https://stackoverflow.com/questions/24196932/how-can-i-get-the-ip-address-of-eth0-in-python#24196955
+
 
     Parameters
     ----------
parsl/config.py
CHANGED
@@ -55,6 +55,8 @@ class Config(RepresentationMixin):
         or `None`.
         If 'none' or `None`, dynamic scaling will be disabled. Default is 'simple'. The literal value `None` is
         deprecated.
+    strategy_period : float or int, optional
+        How often the scaling strategy should be executed. Default is 5 seconds.
     max_idletime : float, optional
         The maximum idle time allowed for an executor before strategy could shut down unused blocks. Default is 120.0 seconds.
     usage_tracking : bool, optional
@@ -88,6 +90,7 @@ class Config(RepresentationMixin):
                  retry_handler: Optional[Callable[[Exception, TaskRecord], float]] = None,
                  run_dir: str = 'runinfo',
                  strategy: Optional[str] = 'simple',
+                 strategy_period: Union[float, int] = 5,
                  max_idletime: float = 120.0,
                  monitoring: Optional[MonitoringHub] = None,
                  usage_tracking: bool = False,
@@ -121,6 +124,7 @@ class Config(RepresentationMixin):
         self.retry_handler = retry_handler
         self.run_dir = run_dir
         self.strategy = strategy
+        self.strategy_period = strategy_period
         self.max_idletime = max_idletime
         self.usage_tracking = usage_tracking
         self.initialize_logging = initialize_logging
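The docstring above documents the new `strategy_period` option. A minimal usage sketch, assuming a local HighThroughputExecutor; the one-second period is illustrative only:

```python
import parsl
from parsl.config import Config
from parsl.executors import HighThroughputExecutor

# Run the 'simple' scaling strategy every second instead of the default 5 s.
config = Config(
    executors=[HighThroughputExecutor(label="htex_local")],
    strategy='simple',
    strategy_period=1,
)
parsl.load(config)
```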
parsl/dataflow/dflow.py
CHANGED
@@ -108,12 +108,12 @@ class DataFlowKernel:
 
         # hub address and port for interchange to connect
         self.hub_address = None  # type: Optional[str]
-        self.
+        self.hub_zmq_port = None  # type: Optional[int]
         if self.monitoring:
             if self.monitoring.logdir is None:
                 self.monitoring.logdir = self.run_dir
             self.hub_address = self.monitoring.hub_address
-            self.
+            self.hub_zmq_port = self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)
 
         self.time_began = datetime.datetime.now()
         self.time_completed: Optional[datetime.datetime] = None
@@ -178,6 +178,7 @@ class DataFlowKernel:
         # this must be set before executors are added since add_executors calls
         # job_status_poller.add_executors.
         self.job_status_poller = JobStatusPoller(strategy=self.config.strategy,
+                                                 strategy_period=self.config.strategy_period,
                                                  max_idletime=self.config.max_idletime,
                                                  dfk=self)
 
@@ -205,6 +206,13 @@ class DataFlowKernel:
 
         atexit.register(self.atexit_cleanup)
 
+    def __enter__(self):
+        pass
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        logger.debug("Exiting the context manager, calling cleanup for DFK")
+        self.cleanup()
+
     def _send_task_log_info(self, task_record: TaskRecord) -> None:
         if self.monitoring:
             task_log_info = self._create_task_log_info(task_record)
@@ -1114,12 +1122,12 @@ class DataFlowKernel:
 
             channel.makedirs(channel.script_dir, exist_ok=True)
 
-    def add_executors(self, executors):
+    def add_executors(self, executors: Sequence[ParslExecutor]) -> None:
         for executor in executors:
             executor.run_id = self.run_id
             executor.run_dir = self.run_dir
             executor.hub_address = self.hub_address
-            executor.hub_port = self.
+            executor.hub_port = self.hub_zmq_port
             if hasattr(executor, 'provider'):
                 if hasattr(executor.provider, 'script_dir'):
                     executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
@@ -1170,7 +1178,8 @@ class DataFlowKernel:
             fut = task_record['app_fu']
             if not fut.done():
                 fut.exception()
-            # now app future is done, poll until DFK state is final: a
+            # now app future is done, poll until DFK state is final: a
+            # DFK state being final and the app future being done do not imply each other.
            while task_record['status'] not in FINAL_STATES:
                 time.sleep(0.1)
 
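The new `__enter__`/`__exit__` pair lets the DataFlowKernel be used as a context manager, with `cleanup()` called on exit (the new test_context_manager.py exercises this). A minimal sketch, assuming `parsl.load()` returns the DFK as usual; the app and executor choice are illustrative:

```python
import parsl
from parsl import python_app
from parsl.config import Config
from parsl.executors import ThreadPoolExecutor

@python_app
def double(x):
    return 2 * x

# Leaving the with-block triggers DataFlowKernel.__exit__, which calls cleanup().
with parsl.load(Config(executors=[ThreadPoolExecutor()])):
    assert double(21).result() == 42
```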
parsl/executors/base.py
CHANGED
@@ -106,6 +106,16 @@ class ParslExecutor(metaclass=ABCMeta):
     def run_dir(self, value: str) -> None:
         self._run_dir = value
 
+    @property
+    def run_id(self) -> Optional[str]:
+        """UUID for the enclosing DFK.
+        """
+        return self._run_id
+
+    @run_id.setter
+    def run_id(self, value: Optional[str]) -> None:
+        self._run_id = value
+
     @property
     def hub_address(self) -> Optional[str]:
         """Address to the Hub for monitoring.
parsl/executors/high_throughput/executor.py
CHANGED
@@ -55,6 +55,7 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
                       "--hb_period={heartbeat_period} "
                       "{address_probe_timeout_string} "
                       "--hb_threshold={heartbeat_threshold} "
+                      "--drain_period={drain_period} "
                       "--cpu-affinity {cpu_affinity} "
                       "{enable_mpi_mode} "
                       "--mpi-launcher={mpi_launcher} "
@@ -201,6 +202,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         Timeout period to be used by the executor components in milliseconds. Increasing poll_periods
         trades performance for cpu efficiency. Default: 10ms
 
+    drain_period : int
+        The number of seconds after start when workers will begin to drain
+        and then exit. Set this to a time that is slightly less than the
+        maximum walltime of batch jobs to avoid killing tasks while they
+        execute. For example, you could set this to the walltime minus a grace
+        period for the batch job to start the workers, minus the expected
+        maximum length of an individual task.
+
     worker_logdir_root : string
         In case of a remote file system, specify the path to where logs will be kept.
 
@@ -240,6 +249,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                  prefetch_capacity: int = 0,
                  heartbeat_threshold: int = 120,
                  heartbeat_period: int = 30,
+                 drain_period: Optional[int] = None,
                  poll_period: int = 10,
                  address_probe_timeout: Optional[int] = None,
                  worker_logdir_root: Optional[str] = None,
@@ -303,6 +313,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         self.interchange_port_range = interchange_port_range
         self.heartbeat_threshold = heartbeat_threshold
         self.heartbeat_period = heartbeat_period
+        self.drain_period = drain_period
         self.poll_period = poll_period
         self.run_dir = '.'
         self.worker_logdir_root = worker_logdir_root
@@ -376,6 +387,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                            nodes_per_block=self.provider.nodes_per_block,
                            heartbeat_period=self.heartbeat_period,
                            heartbeat_threshold=self.heartbeat_threshold,
+                           drain_period=self.drain_period,
                            poll_period=self.poll_period,
                            cert_dir=self.cert_dir,
                            logdir=self.worker_logdir,
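A hedged sketch of how the new `drain_period` knob might be combined with a walltime-limited provider; the SlurmProvider settings and timings below are illustrative assumptions, not part of the diff:

```python
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.providers import SlurmProvider

htex = HighThroughputExecutor(
    label="htex_slurm",
    provider=SlurmProvider(walltime="00:30:00"),
    # 30 min walltime, minus a few minutes for startup and for the longest
    # expected task, so workers stop accepting work before the job is killed.
    drain_period=25 * 60,
)
config = Config(executors=[htex])
```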
parsl/executors/high_throughput/interchange.py
CHANGED
@@ -28,6 +28,7 @@ from parsl.process_loggers import wrap_with_logs
 
 
 PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)
+PKL_DRAINED_CODE = pickle.dumps((2 ** 32) - 2)
 
 LOGGER_NAME = "interchange"
 logger = logging.getLogger(LOGGER_NAME)
@@ -101,12 +102,12 @@ class Interchange:
         This is overridden when the worker_ports option is set. Default: (54000, 55000)
 
     hub_address : str
-        The
-
+        The IP address at which the interchange can send info about managers to when monitoring is enabled.
+        Default: None (meaning monitoring disabled)
 
     hub_port : str
         The port at which the interchange can send info about managers to when monitoring is enabled.
-
+        Default: None (meaning monitoring disabled)
 
     heartbeat_threshold : int
         Number of seconds since the last heartbeat after which worker is considered lost.
@@ -244,19 +245,19 @@ class Interchange:
 
     def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
         if self.hub_address and self.hub_port:
-            logger.info("Connecting to
+            logger.info("Connecting to MonitoringHub")
             # This is a one-off because monitoring is unencrypted
             hub_channel = zmq.Context().socket(zmq.DEALER)
             hub_channel.set_hwm(0)
             hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_port))
-            logger.info("
+            logger.info("Connected to MonitoringHub")
             return hub_channel
         else:
             return None
 
     def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
         if hub_channel:
-            logger.info("Sending message {} to
+            logger.info("Sending message {} to MonitoringHub".format(manager))
 
             d: Dict = cast(Dict, manager.copy())
             d['timestamp'] = datetime.datetime.now()
@@ -308,7 +309,8 @@ class Interchange:
                             'worker_count': m['worker_count'],
                             'tasks': len(m['tasks']),
                             'idle_duration': idle_duration,
-                            'active': m['active']
+                            'active': m['active'],
+                            'draining': m['draining']}
                     reply.append(resp)
 
             elif command_req.startswith("HOLD_WORKER"):
@@ -385,6 +387,7 @@ class Interchange:
             self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event)
             self.process_results_incoming(interesting_managers, hub_channel)
             self.expire_bad_managers(interesting_managers, hub_channel)
+            self.expire_drained_managers(interesting_managers, hub_channel)
             self.process_tasks_to_send(interesting_managers)
 
         self.zmq_context.destroy()
@@ -431,6 +434,7 @@ class Interchange:
                     'max_capacity': 0,
                     'worker_count': 0,
                     'active': True,
+                    'draining': False,
                     'tasks': []}
                 self.connected_block_history.append(msg['block_id'])
 
@@ -469,10 +473,28 @@ class Interchange:
                 self._ready_managers[manager_id]['last_heartbeat'] = time.time()
                 logger.debug("Manager {!r} sent heartbeat via tasks connection".format(manager_id))
                 self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+            elif msg['type'] == 'drain':
+                self._ready_managers[manager_id]['draining'] = True
+                logger.debug(f"Manager {manager_id!r} requested drain")
             else:
                 logger.error(f"Unexpected message type received from manager: {msg['type']}")
         logger.debug("leaving task_outgoing section")
 
+    def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+
+        for manager_id in list(interesting_managers):
+            # is it always true that a draining manager will be in interesting managers?
+            # i think so because it will have outstanding capacity?
+            m = self._ready_managers[manager_id]
+            if m['draining'] and len(m['tasks']) == 0:
+                logger.info(f"Manager {manager_id!r} is drained - sending drained message to manager")
+                self.task_outgoing.send_multipart([manager_id, b'', PKL_DRAINED_CODE])
+                interesting_managers.remove(manager_id)
+                self._ready_managers.pop(manager_id)
+
+                m['active'] = False
+                self._send_monitoring_info(hub_channel, m)
+
     def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
         # Check if there are tasks that could be sent to managers
 
@@ -490,7 +512,7 @@ class Interchange:
                 tasks_inflight = len(m['tasks'])
                 real_capacity = m['max_capacity'] - tasks_inflight
 
-                if (real_capacity and m['active']):
+                if (real_capacity and m['active'] and not m['draining']):
                     tasks = self.get_tasks(real_capacity)
                     if tasks:
                         self.task_outgoing.send_multipart([manager_id, b'', pickle.dumps(tasks)])
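The `draining` field added to the MANAGERS reply can be observed from the executor side. A hedged sketch, assuming `htex` is a started HighThroughputExecutor with `drain_period` set and that `connected_managers()` (which issues the MANAGERS command) is available:

```python
# Each entry mirrors the reply dict built by the interchange above.
for m in htex.connected_managers():
    print(m['worker_count'], m['active'], m['draining'])
```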
parsl/executors/high_throughput/process_worker_pool.py
CHANGED
@@ -36,6 +36,7 @@ from parsl.executors.high_throughput.mpi_resource_management import (
 from parsl.executors.high_throughput.mpi_prefix_composer import compose_all, VALID_LAUNCHERS
 
 HEARTBEAT_CODE = (2 ** 32) - 1
+DRAINED_CODE = (2 ** 32) - 2
 
 
 class Manager:
@@ -73,7 +74,8 @@ class Manager:
                  enable_mpi_mode: bool = False,
                  mpi_launcher: str = "mpiexec",
                  available_accelerators: Sequence[str],
-                 cert_dir: Optional[str]
+                 cert_dir: Optional[str],
+                 drain_period: Optional[int]):
         """
         Parameters
         ----------
@@ -138,6 +140,9 @@ class Manager:
 
         cert_dir : str | None
             Path to the certificate directory.
+
+        drain_period: int | None
+            Number of seconds to drain after TODO: could be a nicer timespec involving m,s,h qualifiers for user friendliness?
         """
 
         logger.info("Manager initializing")
@@ -227,6 +232,14 @@ class Manager:
         self.heartbeat_period = heartbeat_period
         self.heartbeat_threshold = heartbeat_threshold
         self.poll_period = poll_period
+
+        self.drain_time: float
+        if drain_period:
+            self.drain_time = self._start_time + drain_period
+            logger.info(f"Will request drain at {self.drain_time}")
+        else:
+            self.drain_time = float('inf')
+
         self.cpu_affinity = cpu_affinity
 
         # Define accelerator available, adjust worker count accordingly
@@ -262,10 +275,19 @@ class Manager:
         """ Send heartbeat to the incoming task queue
         """
         msg = {'type': 'heartbeat'}
+        # don't need to dumps and encode this every time - could do as a global on import?
         b_msg = json.dumps(msg).encode('utf-8')
         self.task_incoming.send(b_msg)
         logger.debug("Sent heartbeat")
 
+    def drain_to_incoming(self):
+        """ Send heartbeat to the incoming task queue
+        """
+        msg = {'type': 'drain'}
+        b_msg = json.dumps(msg).encode('utf-8')
+        self.task_incoming.send(b_msg)
+        logger.debug("Sent drain")
+
     @wrap_with_logs
     def pull_tasks(self, kill_event):
         """ Pull tasks from the incoming tasks zmq pipe onto the internal
@@ -298,6 +320,7 @@ class Manager:
             # time here are correctly copy-pasted from the relevant if
             # statements.
             next_interesting_event_time = min(last_beat + self.heartbeat_period,
+                                              self.drain_time,
                                               last_interchange_contact + self.heartbeat_threshold)
             try:
                 pending_task_count = self.pending_task_queue.qsize()
@@ -312,6 +335,14 @@ class Manager:
                 self.heartbeat_to_incoming()
                 last_beat = time.time()
 
+            if self.drain_time and time.time() > self.drain_time:
+                logger.info("Requesting drain")
+                self.drain_to_incoming()
+                self.drain_time = None
+                # This will start the pool draining...
+                # Drained exit behaviour does not happen here. It will be
+                # driven by the interchange sending a DRAINED_CODE message.
+
             poll_duration_s = max(0, next_interesting_event_time - time.time())
             socks = dict(poller.poll(timeout=poll_duration_s * 1000))
 
@@ -322,7 +353,9 @@ class Manager:
 
                 if tasks == HEARTBEAT_CODE:
                     logger.debug("Got heartbeat from interchange")
-
+                elif tasks == DRAINED_CODE:
+                    logger.info("Got fulled drained message from interchange - setting kill flag")
+                    kill_event.set()
                 else:
                     task_recv_counter += len(tasks)
                     logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format([t['task_id'] for t in tasks], task_recv_counter))
@@ -490,9 +523,8 @@ class Manager:
         self._worker_watchdog_thread.start()
         self._monitoring_handler_thread.start()
 
-        logger.info("
+        logger.info("Manager threads started")
 
-        # TODO : Add mechanism in this loop to stop the worker pool
         # This might need a multiprocessing event to signal back.
         self._kill_event.wait()
         logger.critical("Received kill event, terminating worker processes")
@@ -804,6 +836,8 @@ if __name__ == "__main__":
                         help="Heartbeat period in seconds. Uses manager default unless set")
     parser.add_argument("--hb_threshold", default=120,
                         help="Heartbeat threshold in seconds. Uses manager default unless set")
+    parser.add_argument("--drain_period", default=None,
+                        help="Drain this pool after specified number of seconds. By default, does not drain.")
     parser.add_argument("--address_probe_timeout", default=30,
                         help="Timeout to probe for viable address to interchange. Default: 30s")
     parser.add_argument("--poll", default=10,
@@ -824,7 +858,7 @@ if __name__ == "__main__":
                         required=True,
                         help="Whether/how workers should control CPU affinity.")
     parser.add_argument("--available-accelerators", type=str, nargs="*",
-                        help="Names of available accelerators")
+                        help="Names of available accelerators, if not given assumed to be zero accelerators available", default=[])
     parser.add_argument("--enable_mpi_mode", action='store_true',
                         help="Enable MPI mode")
     parser.add_argument("--mpi-launcher", type=str, choices=VALID_LAUNCHERS,
@@ -856,6 +890,7 @@ if __name__ == "__main__":
     logger.info("Prefetch capacity: {}".format(args.prefetch_capacity))
     logger.info("Heartbeat threshold: {}".format(args.hb_threshold))
     logger.info("Heartbeat period: {}".format(args.hb_period))
+    logger.info("Drain period: {}".format(args.drain_period))
     logger.info("CPU affinity: {}".format(args.cpu_affinity))
     logger.info("Accelerators: {}".format(" ".join(args.available_accelerators)))
     logger.info("enable_mpi_mode: {}".format(args.enable_mpi_mode))
@@ -876,6 +911,7 @@ if __name__ == "__main__":
                       prefetch_capacity=int(args.prefetch_capacity),
                       heartbeat_threshold=int(args.hb_threshold),
                       heartbeat_period=int(args.hb_period),
+                      drain_period=None if args.drain_period == "None" else int(args.drain_period),
                       poll_period=int(args.poll),
                       cpu_affinity=args.cpu_affinity,
                       enable_mpi_mode=args.enable_mpi_mode,
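The drain logic above folds the drain deadline into the pool's existing poll timeout. A standalone sketch of that computation (variable names mirror the diff; this is an illustration, not the worker pool itself):

```python
import time

def next_wakeup(last_beat, heartbeat_period, drain_time,
                last_interchange_contact, heartbeat_threshold):
    # Earliest of: next heartbeat, the drain deadline, and the point at
    # which the interchange would be considered lost.
    return min(last_beat + heartbeat_period,
               drain_time,
               last_interchange_contact + heartbeat_threshold)

now = time.time()
print(next_wakeup(now, 30, float('inf'), now, 120) - now)  # ~30: drain disabled
print(next_wakeup(now, 30, now + 10, now, 120) - now)      # ~10: drain due first
```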
parsl/executors/status_handling.py
CHANGED
@@ -61,7 +61,7 @@ class BlockProviderExecutor(ParslExecutor):
         # errors can happen during the submit call to the provider; this is used
         # to keep track of such errors so that they can be handled in one place
         # together with errors reported by status()
-        self._simulated_status: Dict[
+        self._simulated_status: Dict[str, JobStatus] = {}
         self._executor_bad_state = threading.Event()
         self._executor_exception: Optional[Exception] = None
 
@@ -102,13 +102,10 @@ class BlockProviderExecutor(ParslExecutor):
         else:
             return self._provider.status_polling_interval
 
-    def _fail_job_async(self, block_id:
+    def _fail_job_async(self, block_id: str, message: str):
         """Marks a job that has failed to start but would not otherwise be included in status()
         as failed and report it in status()
         """
-        if block_id is None:
-            block_id = str(self._block_id_counter.get_id())
-            logger.info(f"Allocated block ID {block_id} for simulated failure")
         self._simulated_status[block_id] = JobStatus(JobState.FAILED, message)
 
     @abstractproperty
@@ -211,10 +208,6 @@ class BlockProviderExecutor(ParslExecutor):
 
         Cause the executor to reduce the number of blocks by count.
 
-        We should have the scale in method simply take resource object
-        which will have the scaling methods, scale_in itself should be a coroutine, since
-        scaling tasks can be slow.
-
         :return: A list of block ids corresponding to the blocks that were removed.
         """
         pass
parsl/executors/taskvine/executor.py
CHANGED
@@ -4,6 +4,7 @@ high-throughput system for delegating Parsl tasks to thousands of remote machines
 """
 
 # Import Python built-in libraries
+import atexit
 import threading
 import multiprocessing
 import logging
@@ -171,7 +172,7 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         # Path to directory that holds all tasks' data and results.
         self._function_data_dir = ""
 
-        #
+        # Helper scripts to prepare package tarballs for Parsl apps
         self._package_analyze_script = shutil.which("poncho_package_analyze")
         self._package_create_script = shutil.which("poncho_package_create")
         if self._package_analyze_script is None or self._package_create_script is None:
@@ -179,6 +180,18 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         else:
             self._poncho_available = True
 
+        # Register atexit handler to cleanup when Python shuts down
+        atexit.register(self.atexit_cleanup)
+
+        # Attribute indicating whether this executor was started to shut it down properly.
+        # This safeguards cases where an object of this executor is created but
+        # the executor never starts, so it shouldn't be shutdowned.
+        self._started = False
+
+    def atexit_cleanup(self):
+        # Calls this executor's shutdown method upon Python exiting the process.
+        self.shutdown()
+
     def _get_launch_command(self, block_id):
         # Implements BlockProviderExecutor's abstract method.
         # This executor uses different terminology for worker/launch
@@ -196,8 +209,9 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         if self.manager_config.port == 0 and self.manager_config.project_name is None:
             self.manager_config.project_name = "parsl-vine-" + str(uuid.uuid4())
 
-        # guess the host name if the project name is not given
-
+        # guess the host name if the project name is not given and none has been supplied
+        # explicitly in the manager config.
+        if not self.manager_config.project_name and self.manager_config.address is None:
             self.manager_config.address = get_any_address()
 
         # Factory communication settings are overridden by manager communication settings.
@@ -237,6 +251,9 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         retrieve Parsl tasks within the TaskVine system.
         """
 
+        # Mark this executor object as started
+        self._started = True
+
         # Synchronize connection and communication settings between the manager and factory
         self.__synchronize_manager_factory_comm_settings()
 
@@ -597,6 +614,10 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         """Shutdown the executor. Sets flag to cancel the submit process and
         collector thread, which shuts down the TaskVine system submission.
         """
+        if not self._started:
+            # Don't shutdown if the executor never starts.
+            return
+
         logger.debug("TaskVine shutdown started")
         self._should_stop.set()
 
parsl/executors/taskvine/manager.py
CHANGED
@@ -376,6 +376,7 @@ def _taskvine_submit_wait(ready_task_queue=None,
                     task_out_file = parsl_file_name_to_vine_file[spec.parsl_name]
                 else:
                     task_out_file = m.declare_file(spec.parsl_name, cache=spec.cache, peer_transfer=True)
+                    parsl_file_name_to_vine_file[spec.parsl_name] = task_out_file
                 t.add_output(task_out_file, spec.parsl_name)
 
             # Submit the task to the TaskVine object
parsl/executors/taskvine/manager_config.py
CHANGED
@@ -1,4 +1,3 @@
-import socket
 from dataclasses import dataclass
 from typing import Optional
 
@@ -23,9 +22,9 @@ class TaskVineManagerConfig:
         A value of 0 means TaskVine chooses any available port.
         Default is VINE_DEFAULT_PORT.
 
-    address: str
+    address: Optional[str]
         Address of the local machine.
-
+        If None, socket.gethostname() will be used to determine the address.
 
     project_name: Optional[str]
         If given, TaskVine will periodically report its status and performance
@@ -161,7 +160,7 @@ class TaskVineManagerConfig:
 
     # Connection and communication settings
     port: int = VINE_DEFAULT_PORT
-    address: str =
+    address: Optional[str] = None
    project_name: Optional[str] = None
    project_password_file: Optional[str] = None
 
parsl/executors/workqueue/executor.py
CHANGED
@@ -3,6 +3,7 @@ Cooperative Computing Lab (CCL) at Notre Dame to provide a fault-tolerant,
 high-throughput system for delegating Parsl tasks to thousands of remote machines
 """
 
+import atexit
 import threading
 import multiprocessing
 import logging
@@ -298,6 +299,18 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         if self.init_command != "":
             self.launch_cmd = self.init_command + "; " + self.launch_cmd
 
+        # register atexit handler to cleanup when Python shuts down
+        atexit.register(self.atexit_cleanup)
+
+        # Attribute indicating whether this executor was started to shut it down properly.
+        # This safeguards cases where an object of this executor is created but
+        # the executor never starts, so it shouldn't be shutdowned.
+        self.started = False
+
+    def atexit_cleanup(self):
+        # Calls this executor's shutdown method upon Python exiting the process.
+        self.shutdown()
+
     def _get_launch_command(self, block_id):
         # this executor uses different terminology for worker/launch
         # commands than in htex
@@ -307,6 +320,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         """Create submit process and collector thread to create, send, and
         retrieve Parsl tasks within the Work Queue system.
         """
+        # Mark this executor object as started
+        self.started = True
         self.tasks_lock = threading.Lock()
 
         # Create directories for data and results
@@ -695,6 +710,10 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         """Shutdown the executor. Sets flag to cancel the submit process and
         collector thread, which shuts down the Work Queue system submission.
         """
+        if not self.started:
+            # Don't shutdown if the executor never starts.
+            return
+
         logger.debug("Work Queue shutdown started")
         self.should_stop.value = True
 
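Both the TaskVine and Work Queue changes follow the same pattern: register an atexit hook at construction, but make shutdown a no-op until start() has run. A generic sketch of that pattern (the class and method bodies here are illustrative, not the executors' actual code):

```python
import atexit

class GuardedExecutor:
    def __init__(self):
        self.started = False
        # Ask Python to attempt cleanup when the interpreter exits.
        atexit.register(self.atexit_cleanup)

    def start(self):
        self.started = True
        # ... start submit process and collector thread ...

    def shutdown(self):
        if not self.started:
            # Nothing was started, so there is nothing to tear down.
            return
        # ... signal threads/processes to stop and join them ...

    def atexit_cleanup(self):
        self.shutdown()
```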
parsl/jobs/error_handlers.py
CHANGED
@@ -20,7 +20,7 @@ def simple_error_handler(executor: status_handling.BlockProviderExecutor, status
     executor.set_bad_state_and_fail_all(_get_error(status))
 
 
-def windowed_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3):
+def windowed_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3) -> None:
     sorted_status = [(key, status[key]) for key in sorted(status, key=lambda x: int(x))]
     current_window = dict(sorted_status[-threshold:])
     total, failed = _count_jobs(current_window)