parsl 2024.6.10__py3-none-any.whl → 2024.6.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. parsl/app/app.py +0 -2
  2. parsl/app/bash.py +2 -3
  3. parsl/channels/local/local.py +7 -2
  4. parsl/configs/ASPIRE1.py +3 -1
  5. parsl/configs/Azure.py +3 -1
  6. parsl/configs/ad_hoc.py +2 -0
  7. parsl/configs/bridges.py +3 -1
  8. parsl/configs/cc_in2p3.py +2 -0
  9. parsl/configs/ec2.py +2 -0
  10. parsl/configs/expanse.py +3 -1
  11. parsl/configs/frontera.py +2 -0
  12. parsl/configs/htex_local.py +2 -0
  13. parsl/configs/illinoiscluster.py +2 -0
  14. parsl/configs/kubernetes.py +3 -1
  15. parsl/configs/local_threads.py +5 -1
  16. parsl/configs/midway.py +2 -0
  17. parsl/configs/osg.py +3 -1
  18. parsl/configs/polaris.py +3 -1
  19. parsl/configs/stampede2.py +2 -0
  20. parsl/configs/summit.py +2 -0
  21. parsl/configs/toss3_llnl.py +3 -1
  22. parsl/configs/vineex_local.py +3 -1
  23. parsl/configs/wqex_local.py +3 -1
  24. parsl/executors/high_throughput/executor.py +36 -31
  25. parsl/executors/high_throughput/interchange.py +5 -8
  26. parsl/executors/workqueue/executor.py +25 -5
  27. parsl/providers/kubernetes/kube.py +3 -3
  28. parsl/tests/test_htex/test_htex.py +24 -7
  29. parsl/version.py +1 -1
  30. parsl-2024.6.24.data/scripts/interchange.py +681 -0
  31. {parsl-2024.6.10.dist-info → parsl-2024.6.24.dist-info}/METADATA +2 -2
  32. {parsl-2024.6.10.dist-info → parsl-2024.6.24.dist-info}/RECORD +39 -38
  33. {parsl-2024.6.10.data → parsl-2024.6.24.data}/scripts/exec_parsl_function.py +0 -0
  34. {parsl-2024.6.10.data → parsl-2024.6.24.data}/scripts/parsl_coprocess.py +0 -0
  35. {parsl-2024.6.10.data → parsl-2024.6.24.data}/scripts/process_worker_pool.py +0 -0
  36. {parsl-2024.6.10.dist-info → parsl-2024.6.24.dist-info}/LICENSE +0 -0
  37. {parsl-2024.6.10.dist-info → parsl-2024.6.24.dist-info}/WHEEL +0 -0
  38. {parsl-2024.6.10.dist-info → parsl-2024.6.24.dist-info}/entry_points.txt +0 -0
  39. {parsl-2024.6.10.dist-info → parsl-2024.6.24.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,681 @@
1
+ #!python
2
+ import datetime
3
+ import json
4
+ import logging
5
+ import os
6
+ import pickle
7
+ import platform
8
+ import queue
9
+ import random
10
+ import signal
11
+ import sys
12
+ import threading
13
+ import time
14
+ from typing import Any, Dict, List, NoReturn, Optional, Sequence, Set, Tuple, cast
15
+
16
+ import zmq
17
+
18
+ from parsl import curvezmq
19
+ from parsl.app.errors import RemoteExceptionWrapper
20
+ from parsl.executors.high_throughput.manager_record import ManagerRecord
21
+ from parsl.monitoring.message_type import MessageType
22
+ from parsl.process_loggers import wrap_with_logs
23
+ from parsl.serialize import serialize as serialize_object
24
+ from parsl.utils import setproctitle
25
+ from parsl.version import VERSION as PARSL_VERSION
26
+
27
# Pickled sentinel integers sent to managers on the task channel in place of
# a task batch: one is the reply to a manager heartbeat, the other tells a
# draining manager that it has no tasks left in flight.
PKL_HEARTBEAT_CODE = pickle.dumps(0xFFFFFFFF)
PKL_DRAINED_CODE = pickle.dumps(0xFFFFFFFE)

# Name of the logger shared by everything in this module.
LOGGER_NAME = "interchange"
logger = logging.getLogger(LOGGER_NAME)
32
+
33
+
34
class ManagerLost(Exception):
    """Raised on behalf of a task whose manager has been lost.

    A manager is treated as lost once it has missed multiple heartbeats.
    The time at which the exception object was created is kept in ``tstamp``.
    """

    def __init__(self, manager_id: bytes, hostname: str) -> None:
        self.tstamp = time.time()
        self.manager_id = manager_id
        self.hostname = hostname

    def __str__(self) -> str:
        manager_name = self.manager_id.decode()
        return f"Task failure due to loss of manager {manager_name} on host {self.hostname}"
45
+
46
+
47
class VersionMismatch(Exception):
    """Raised when a manager's version info differs from the interchange's."""

    def __init__(self, interchange_version: str, manager_version: str):
        self.manager_version = manager_version
        self.interchange_version = interchange_version

    def __str__(self) -> str:
        return (
            f"Manager version info {self.manager_version} does not match "
            f"interchange version info {self.interchange_version}, causing a critical failure"
        )
58
+
59
+
60
class Interchange:
    """ Interchange is a task orchestrator for distributed systems.

    1. Asynchronously queue large volume of tasks (>100K)
    2. Allow for workers to join and leave the union
    3. Detect workers that have failed using heartbeats

    Tasks arrive from the client on ``task_incoming`` and are buffered in
    ``pending_task_queue``; the main loop in :meth:`start` hands them out to
    registered managers over ``task_outgoing`` and forwards results received
    on ``results_incoming`` back to the client over ``results_outgoing``.
    """
    def __init__(self,
                 *,
                 client_address: str,
                 interchange_address: Optional[str],
                 client_ports: Tuple[int, int, int],
                 worker_ports: Optional[Tuple[int, int]],
                 worker_port_range: Tuple[int, int],
                 hub_address: Optional[str],
                 hub_zmq_port: Optional[int],
                 heartbeat_threshold: int,
                 logdir: str,
                 logging_level: int,
                 poll_period: int,
                 cert_dir: Optional[str],
                 ) -> None:
        """
        Parameters
        ----------
        client_address : str
             The ip address at which the parsl client can be reached.

        interchange_address : Optional str
             If specified the interchange will only listen on this address for connections from workers
             else, it binds to all addresses.

        client_ports : triple(int, int, int)
             The ports at which the client can be reached, in the order:
             task channel, result channel, command channel.

        worker_ports : tuple(int, int)
             The specific two ports (task port, result port) at which workers will connect to the Interchange.

        worker_port_range : tuple(int, int)
             The interchange picks ports at random from the range which will be used by workers.
             This is overridden when the worker_ports option is set.

        hub_address : str
             The IP address at which the interchange can send info about managers to when monitoring is enabled.
             When None, monitoring is disabled.

        hub_zmq_port : int
             The port at which the interchange can send info about managers to when monitoring is enabled.
             When None, monitoring is disabled.

        heartbeat_threshold : int
             Number of seconds since the last heartbeat after which worker is considered lost.

        logdir : str
             Parsl log directory paths. Logs and temp files go here.

        logging_level : int
             Logging level as defined in the logging module.

        poll_period : int
             The main thread polling period, in milliseconds.

        cert_dir : str | None
            Path to the certificate directory.
        """
        self.cert_dir = cert_dir
        self.logdir = logdir
        os.makedirs(self.logdir, exist_ok=True)

        start_file_logger("{}/interchange.log".format(self.logdir), level=logging_level)
        logger.propagate = False
        logger.debug("Initializing Interchange process")

        self.client_address = client_address
        # "*" makes the worker-facing sockets bind on all addresses.
        self.interchange_address: str = interchange_address or "*"
        self.poll_period = poll_period

        logger.info("Attempting connection to client at {} on ports: {},{},{}".format(
            client_address, client_ports[0], client_ports[1], client_ports[2]))
        self.zmq_context = curvezmq.ServerContext(self.cert_dir)
        self.task_incoming = self.zmq_context.socket(zmq.DEALER)
        self.task_incoming.set_hwm(0)
        self.task_incoming.connect("tcp://{}:{}".format(client_address, client_ports[0]))
        self.results_outgoing = self.zmq_context.socket(zmq.DEALER)
        self.results_outgoing.set_hwm(0)
        self.results_outgoing.connect("tcp://{}:{}".format(client_address, client_ports[1]))

        self.command_channel = self.zmq_context.socket(zmq.REP)
        self.command_channel.connect("tcp://{}:{}".format(client_address, client_ports[2]))
        logger.info("Connected to client")

        self.hub_address = hub_address
        self.hub_zmq_port = hub_zmq_port

        self.pending_task_queue: queue.Queue[Any] = queue.Queue(maxsize=10 ** 6)
        # Running total of tasks handed out to managers.
        self.count = 0

        self.worker_ports = worker_ports
        self.worker_port_range = worker_port_range

        self.task_outgoing = self.zmq_context.socket(zmq.ROUTER)
        self.task_outgoing.set_hwm(0)
        self.results_incoming = self.zmq_context.socket(zmq.ROUTER)
        self.results_incoming.set_hwm(0)

        if self.worker_ports:
            self.worker_task_port = self.worker_ports[0]
            self.worker_result_port = self.worker_ports[1]

            self.task_outgoing.bind(f"tcp://{self.interchange_address}:{self.worker_task_port}")
            self.results_incoming.bind(f"tcp://{self.interchange_address}:{self.worker_result_port}")

        else:
            self.worker_task_port = self.task_outgoing.bind_to_random_port(f"tcp://{self.interchange_address}",
                                                                           min_port=worker_port_range[0],
                                                                           max_port=worker_port_range[1], max_tries=100)
            self.worker_result_port = self.results_incoming.bind_to_random_port(f"tcp://{self.interchange_address}",
                                                                                min_port=worker_port_range[0],
                                                                                max_port=worker_port_range[1], max_tries=100)

        logger.info("Bound to ports {},{} for incoming worker connections".format(
            self.worker_task_port, self.worker_result_port))

        # Registered managers, keyed by the identity frame of their messages.
        self._ready_managers: Dict[bytes, ManagerRecord] = {}
        self.connected_block_history: List[str] = []

        self.heartbeat_threshold = heartbeat_threshold

        self.current_platform = {'parsl_v': PARSL_VERSION,
                                 'python_v': "{}.{}.{}".format(sys.version_info.major,
                                                               sys.version_info.minor,
                                                               sys.version_info.micro),
                                 'os': platform.system(),
                                 'hostname': platform.node(),
                                 'dir': os.getcwd()}

        logger.info("Platform info: {}".format(self.current_platform))

    def get_tasks(self, count: int) -> Sequence[dict]:
        """ Obtains a batch of tasks from the internal pending_task_queue

        Parameters
        ----------
        count: int
            Count of tasks to get from the queue

        Returns
        -------
        List of upto count tasks. May return fewer than count down to an empty list
            eg. [{'task_id':<x>, 'buffer':<buf>} ... ]
        """
        tasks = []
        for _ in range(0, count):
            try:
                x = self.pending_task_queue.get(block=False)
            except queue.Empty:
                break
            else:
                tasks.append(x)

        return tasks

    @wrap_with_logs(target="interchange")
    def task_puller(self) -> NoReturn:
        """Pull tasks from the incoming tasks zmq pipe onto the internal
        pending task queue

        Loops forever; started as a daemon thread by :meth:`start`.
        """
        logger.info("Starting")
        task_counter = 0

        while True:
            logger.debug("launching recv_pyobj")
            try:
                msg = self.task_incoming.recv_pyobj()
            except zmq.Again:
                # We just timed out while attempting to receive
                logger.debug("zmq.Again with {} tasks in internal queue".format(self.pending_task_queue.qsize()))
                continue

            logger.debug("putting message onto pending_task_queue")
            self.pending_task_queue.put(msg)
            task_counter += 1
            logger.debug(f"Fetched {task_counter} tasks so far")

    def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
        """Open a DEALER socket to the monitoring hub.

        Returns None when either ``hub_address`` or ``hub_zmq_port`` is unset.
        """
        if self.hub_address and self.hub_zmq_port:
            logger.info("Connecting to MonitoringHub")
            # This is a one-off because monitoring is unencrypted
            hub_channel = zmq.Context().socket(zmq.DEALER)
            hub_channel.set_hwm(0)
            hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_zmq_port))
            logger.info("Connected to MonitoringHub")
            return hub_channel
        else:
            return None

    def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
        """Send a NODE_INFO message describing ``manager`` to the monitoring hub.

        Does nothing when ``hub_channel`` is falsy. A copy of the record is
        sent, with a ``timestamp`` added and ``last_heartbeat`` converted to a
        datetime; the record itself is not modified.
        """
        if hub_channel:
            logger.info("Sending message {} to MonitoringHub".format(manager))

            d: Dict = cast(Dict, manager.copy())
            d['timestamp'] = datetime.datetime.now()
            d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])

            hub_channel.send_pyobj((MessageType.NODE_INFO, d))

    @wrap_with_logs(target="interchange")
    def _command_server(self) -> NoReturn:
        """ Command server to run async command to the interchange

        Serves requests from ``command_channel`` (a REP socket) forever,
        sending exactly one reply per request. Recognised requests are
        "OUTSTANDING_C", "CONNECTED_BLOCKS", "WORKERS", "MANAGERS",
        "HOLD_WORKER;<manager id>" and "WORKER_PORTS"; anything else is
        logged as an error and answered with None.
        """
        logger.debug("Command Server Starting")

        # Need to create a new ZMQ socket for command server thread
        hub_channel = self._create_monitoring_channel()

        reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)

        while True:
            try:
                command_req = self.command_channel.recv_pyobj()
                logger.debug("Received command request: {}".format(command_req))
                if command_req == "OUTSTANDING_C":
                    outstanding = self.pending_task_queue.qsize()
                    for manager in self._ready_managers.values():
                        outstanding += len(manager['tasks'])
                    reply = outstanding

                elif command_req == "CONNECTED_BLOCKS":
                    reply = self.connected_block_history

                elif command_req == "WORKERS":
                    num_workers = 0
                    for manager in self._ready_managers.values():
                        num_workers += manager['worker_count']
                    reply = num_workers

                elif command_req == "MANAGERS":
                    reply = []
                    for manager_id in self._ready_managers:
                        m = self._ready_managers[manager_id]
                        idle_since = m['idle_since']
                        if idle_since is not None:
                            idle_duration = time.time() - idle_since
                        else:
                            idle_duration = 0.0
                        resp = {'manager': manager_id.decode('utf-8'),
                                'block_id': m['block_id'],
                                'worker_count': m['worker_count'],
                                'tasks': len(m['tasks']),
                                'idle_duration': idle_duration,
                                'active': m['active'],
                                'parsl_version': m['parsl_version'],
                                'python_version': m['python_version'],
                                'draining': m['draining']}
                        reply.append(resp)

                # The isinstance guard keeps a non-string request from raising
                # AttributeError here; it falls through to the unknown-command
                # branch and still gets its reply.
                elif isinstance(command_req, str) and command_req.startswith("HOLD_WORKER"):
                    # partition never raises on a missing or repeated ';',
                    # unlike tuple-unpacking the result of split(';').
                    _, _, s_manager = command_req.partition(';')
                    manager_id = s_manager.encode('utf-8')
                    logger.info("Received HOLD_WORKER for {!r}".format(manager_id))
                    if manager_id in self._ready_managers:
                        m = self._ready_managers[manager_id]
                        m['active'] = False
                        self._send_monitoring_info(hub_channel, m)
                    else:
                        logger.warning("Worker to hold was not in ready managers list")

                    reply = None

                elif command_req == "WORKER_PORTS":
                    reply = (self.worker_task_port, self.worker_result_port)

                else:
                    logger.error(f"Received unknown command: {command_req}")
                    reply = None

                logger.debug("Reply: {}".format(reply))
                self.command_channel.send_pyobj(reply)

            except zmq.Again:
                logger.debug("Command thread is alive")
                continue

    @wrap_with_logs
    def start(self) -> None:
        """ Start the interchange

        Starts the task-puller and command-server daemon threads, then runs
        the main poll loop until the kill event is set (which happens on a
        manager version mismatch).
        """

        # If a user workflow has set its own signal handler for sigterm, that
        # handler will be inherited by the interchange process because it is
        # launched as a multiprocessing fork process.
        # That can interfere with the interchange shutdown mechanism, which is
        # to receive a SIGTERM and exit immediately.
        # See Parsl issue #2343 (Threads and multiprocessing cannot be
        # intermingled without deadlocks) which talks about other fork-related
        # parent-process-inheritance problems.
        signal.signal(signal.SIGTERM, signal.SIG_DFL)

        logger.info("Incoming ports bound")

        hub_channel = self._create_monitoring_channel()

        poll_period = self.poll_period

        start = time.time()

        self._task_puller_thread = threading.Thread(target=self.task_puller,
                                                    name="Interchange-Task-Puller",
                                                    daemon=True)
        self._task_puller_thread.start()

        self._command_thread = threading.Thread(target=self._command_server,
                                                name="Interchange-Command",
                                                daemon=True)
        self._command_thread.start()

        kill_event = threading.Event()

        poller = zmq.Poller()
        poller.register(self.task_outgoing, zmq.POLLIN)
        poller.register(self.results_incoming, zmq.POLLIN)

        # These are managers which we should examine in an iteration
        # for scheduling a job (or maybe any other attention?).
        # Anything altering the state of the manager should add it
        # onto this list.
        interesting_managers: Set[bytes] = set()

        while not kill_event.is_set():
            self.socks = dict(poller.poll(timeout=poll_period))

            self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event)
            self.process_results_incoming(interesting_managers, hub_channel)
            self.expire_bad_managers(interesting_managers, hub_channel)
            self.expire_drained_managers(interesting_managers, hub_channel)
            self.process_tasks_to_send(interesting_managers)

        self.zmq_context.destroy()
        delta = time.time() - start
        logger.info("Processed {} tasks in {} seconds".format(self.count, delta))
        logger.warning("Exiting")

    def process_task_outgoing_incoming(
            self,
            interesting_managers: Set[bytes],
            hub_channel: Optional[zmq.Socket],
            kill_event: threading.Event
    ) -> None:
        """Process one message from manager on the task_outgoing channel.
        Note that this message flow is in contradiction to the name of the
        channel - it is not an outgoing message and it is not a task.

        Handles 'registration', 'heartbeat' and 'drain' messages. A
        registration whose Python (major.minor) or Parsl version differs
        from the interchange's sets ``kill_event`` and reports a
        VersionMismatch to the client.
        """
        if self.task_outgoing in self.socks and self.socks[self.task_outgoing] == zmq.POLLIN:
            logger.debug("starting task_outgoing section")
            message = self.task_outgoing.recv_multipart()
            manager_id = message[0]

            try:
                msg = json.loads(message[1].decode('utf-8'))
            except Exception:
                logger.warning("Got Exception reading message from manager: {!r}".format(
                    manager_id), exc_info=True)
                logger.debug("Message: \n{!r}\n".format(message[1]))
                return

            # perform a bit of validation on the structure of the deserialized
            # object, at least enough to behave like a deserialization error
            # in obviously malformed cases
            if not isinstance(msg, dict) or 'type' not in msg:
                logger.error(f"JSON message was not correctly formatted from manager: {manager_id!r}")
                logger.debug("Message: \n{!r}\n".format(message[1]))
                return

            if msg['type'] == 'registration':
                # We set up an entry only if registration works correctly
                self._ready_managers[manager_id] = {'last_heartbeat': time.time(),
                                                    'idle_since': time.time(),
                                                    'block_id': None,
                                                    'max_capacity': 0,
                                                    'worker_count': 0,
                                                    'active': True,
                                                    'draining': False,
                                                    'parsl_version': msg['parsl_v'],
                                                    'python_version': msg['python_v'],
                                                    'tasks': []}
                self.connected_block_history.append(msg['block_id'])

                interesting_managers.add(manager_id)
                logger.info("Adding manager: {!r} to ready queue".format(manager_id))
                m = self._ready_managers[manager_id]

                # m is a ManagerRecord, but msg is a dict[Any,Any] and so can
                # contain arbitrary fields beyond those in ManagerRecord (and
                # indeed does - for example, python_v) which are then ignored
                # later.
                m.update(msg)  # type: ignore[typeddict-item]

                logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
                self._send_monitoring_info(hub_channel, m)

                if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
                    msg['parsl_v'] != self.current_platform['parsl_v']):
                    logger.error("Manager {!r} has incompatible version info with the interchange".format(manager_id))
                    logger.debug("Setting kill event")
                    kill_event.set()
                    e = VersionMismatch("py.v={} parsl.v={}".format(self.current_platform['python_v'].rsplit(".", 1)[0],
                                                                    self.current_platform['parsl_v']),
                                        "py.v={} parsl.v={}".format(msg['python_v'].rsplit(".", 1)[0],
                                                                    msg['parsl_v'])
                                        )
                    result_package = {'type': 'result', 'task_id': -1, 'exception': serialize_object(e)}
                    pkl_package = pickle.dumps(result_package)
                    self.results_outgoing.send(pkl_package)
                    logger.error("Sent failure reports, shutting down interchange")
                else:
                    logger.info("Manager {!r} has compatible Parsl version {}".format(manager_id, msg['parsl_v']))
                    logger.info("Manager {!r} has compatible Python version {}".format(manager_id,
                                                                                       msg['python_v'].rsplit(".", 1)[0]))
            elif msg['type'] == 'heartbeat':
                # A manager that was never registered, or has already been
                # expired, must not raise KeyError in the main loop.
                manager = self._ready_managers.get(manager_id)
                if manager is None:
                    logger.warning(f"Received heartbeat via tasks connection for not-registered manager {manager_id!r}")
                else:
                    manager['last_heartbeat'] = time.time()
                    logger.debug("Manager {!r} sent heartbeat via tasks connection".format(manager_id))
                    self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
            elif msg['type'] == 'drain':
                manager = self._ready_managers.get(manager_id)
                if manager is None:
                    logger.warning(f"Received drain request from not-registered manager {manager_id!r}")
                else:
                    manager['draining'] = True
                    logger.debug(f"Manager {manager_id!r} requested drain")
            else:
                logger.error(f"Unexpected message type received from manager: {msg['type']}")
            logger.debug("leaving task_outgoing section")

    def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
        """Retire interesting managers that are draining and have no tasks left.

        Each such manager is sent the drained code, removed from both
        ``interesting_managers`` and the ready-manager table, marked inactive
        and reported to monitoring.
        """

        # Iterate over a copy: the set is modified inside the loop.
        for manager_id in list(interesting_managers):
            # is it always true that a draining manager will be in interesting managers?
            # i think so because it will have outstanding capacity?
            m = self._ready_managers[manager_id]
            if m['draining'] and len(m['tasks']) == 0:
                logger.info(f"Manager {manager_id!r} is drained - sending drained message to manager")
                self.task_outgoing.send_multipart([manager_id, b'', PKL_DRAINED_CODE])
                interesting_managers.remove(manager_id)
                self._ready_managers.pop(manager_id)

                m['active'] = False
                self._send_monitoring_info(hub_channel, m)

    def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
        """Hand pending tasks to interesting managers, in random order.

        A manager is dropped from ``interesting_managers`` once it has no
        spare capacity, or is inactive or draining.
        """
        # Check if there are tasks that could be sent to managers

        logger.debug("Managers count (interesting/total): {interesting}/{total}".format(
            total=len(self._ready_managers),
            interesting=len(interesting_managers)))

        if interesting_managers and not self.pending_task_queue.empty():
            shuffled_managers = list(interesting_managers)
            random.shuffle(shuffled_managers)

            while shuffled_managers and not self.pending_task_queue.empty():  # cf. the if statement above...
                manager_id = shuffled_managers.pop()
                m = self._ready_managers[manager_id]
                tasks_inflight = len(m['tasks'])
                real_capacity = m['max_capacity'] - tasks_inflight

                if (real_capacity and m['active'] and not m['draining']):
                    tasks = self.get_tasks(real_capacity)
                    if tasks:
                        self.task_outgoing.send_multipart([manager_id, b'', pickle.dumps(tasks)])
                        task_count = len(tasks)
                        self.count += task_count
                        tids = [t['task_id'] for t in tasks]
                        m['tasks'].extend(tids)
                        m['idle_since'] = None
                        logger.debug("Sent tasks: {} to manager {!r}".format(tids, manager_id))
                        # recompute real_capacity after sending tasks:
                        # tasks_inflight was measured before the send, so
                        # subtract what was just dispatched.
                        real_capacity -= task_count
                        if real_capacity > 0:
                            logger.debug("Manager {!r} has free capacity {}".format(manager_id, real_capacity))
                            # ... so keep it in the interesting_managers list
                        else:
                            logger.debug("Manager {!r} is now saturated".format(manager_id))
                            interesting_managers.remove(manager_id)
                else:
                    interesting_managers.remove(manager_id)
                    # logger.debug("Nothing to send to manager {}".format(manager_id))
            logger.debug("leaving _ready_managers section, with {} managers still interesting".format(len(interesting_managers)))
        else:
            logger.debug("either no interesting managers or no tasks, so skipping manager pass")

    def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
        """Receive one batch of messages from a manager and route each item.

        'result' and 'heartbeat' items are forwarded to the client unchanged,
        'monitoring' payloads go to the monitoring hub, and completed task
        ids are removed from the manager's record. Batches from managers that
        are not registered are dropped with a warning.
        """
        # Receive any results and forward to client
        if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
            logger.debug("entering results_incoming section")
            manager_id, *all_messages = self.results_incoming.recv_multipart()
            if manager_id not in self._ready_managers:
                logger.warning("Received a result from a un-registered manager: {!r}".format(manager_id))
            else:
                logger.debug(f"Got {len(all_messages)} result items in batch from manager {manager_id!r}")

                b_messages = []

                for p_message in all_messages:
                    # NOTE(review): this unpickles bytes received from a
                    # manager over the network; pickle is only safe when the
                    # peer is trusted - confirm the deployment guarantees that.
                    r = pickle.loads(p_message)
                    if r['type'] == 'result':
                        # process this for task ID and forward to executor
                        b_messages.append((p_message, r))
                    elif r['type'] == 'monitoring':
                        # the monitoring code makes the assumption that no
                        # monitoring messages will be received if monitoring
                        # is not configured, and that hub_channel will only
                        # be None when monitoring is not configurated.
                        assert hub_channel is not None

                        hub_channel.send_pyobj(r['payload'])
                    elif r['type'] == 'heartbeat':
                        logger.debug(f"Manager {manager_id!r} sent heartbeat via results connection")
                        b_messages.append((p_message, r))
                    else:
                        logger.error("Interchange discarding result_queue message of unknown type: {}".format(r['type']))

                got_result = False
                m = self._ready_managers[manager_id]
                for (_, r) in b_messages:
                    assert 'type' in r, f"Message is missing type entry: {r}"
                    if r['type'] == 'result':
                        got_result = True
                        try:
                            logger.debug(f"Removing task {r['task_id']} from manager record {manager_id!r}")
                            m['tasks'].remove(r['task_id'])
                        except Exception:
                            # If we reach here, there's something very wrong.
                            logger.exception("Ignoring exception removing task_id {} for manager {!r} with task list {}".format(
                                r['task_id'],
                                manager_id,
                                m['tasks']))

                # Forward the original pickled frames, not the decoded objects.
                b_messages_to_send = []
                for (b_message, _) in b_messages:
                    b_messages_to_send.append(b_message)

                if b_messages_to_send:
                    logger.debug("Sending messages on results_outgoing")
                    self.results_outgoing.send_multipart(b_messages_to_send)
                    logger.debug("Sent messages on results_outgoing")

                logger.debug(f"Current tasks on manager {manager_id!r}: {m['tasks']}")
                if len(m['tasks']) == 0 and m['idle_since'] is None:
                    m['idle_since'] = time.time()

                # A manager is only made interesting here if a result was
                # received, which means there should be capacity for a new
                # task now. Heartbeats and monitoring messages do not make a
                # manager become interesting.
                if got_result:
                    interesting_managers.add(manager_id)
            logger.debug("leaving results_incoming section")

    def expire_bad_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
        """Remove managers whose last heartbeat is older than the threshold.

        Every task still assigned to such a manager is reported to the client
        as failed with a wrapped ManagerLost exception, after which the
        manager is dropped from the ready-manager table and from
        ``interesting_managers``.
        """
        bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if
                        time.time() - m['last_heartbeat'] > self.heartbeat_threshold]
        for (manager_id, m) in bad_managers:
            logger.debug("Last: {} Current: {}".format(m['last_heartbeat'], time.time()))
            logger.warning(f"Too many heartbeats missed for manager {manager_id!r} - removing manager")
            if m['active']:
                m['active'] = False
                self._send_monitoring_info(hub_channel, m)

            logger.warning(f"Cancelling htex tasks {m['tasks']} on removed manager")
            for tid in m['tasks']:
                # Raise and catch so that sys.exc_info() describes a live
                # exception for RemoteExceptionWrapper to capture.
                try:
                    raise ManagerLost(manager_id, m['hostname'])
                except Exception:
                    result_package = {'type': 'result', 'task_id': tid, 'exception': serialize_object(RemoteExceptionWrapper(*sys.exc_info()))}
                    pkl_package = pickle.dumps(result_package)
                    self.results_outgoing.send(pkl_package)
            logger.warning("Sent failure reports, unregistering manager")
            self._ready_managers.pop(manager_id, None)
            interesting_managers.discard(manager_id)
636
+
637
+
638
def start_file_logger(filename: str, level: int = logging.DEBUG, format_string: Optional[str] = None) -> None:
    """Attach a file handler to the interchange logger.

    Rebinds the module-level ``logger`` to the logger named by
    ``LOGGER_NAME``, sets its level, and adds a new ``logging.FileHandler``
    writing to ``filename``.

    Parameters
    ----------
    filename: string
        Name of the file to write logs to. Required.
    level: logging.LEVEL
        Level applied to both the logger and the new handler.
        Default=logging.DEBUG
    format_string: string
        Format string to use. When None, a default layout with timestamp,
        logger name, line number, process, thread, function name and level
        is used.

    Returns
    -------
    None.
    """
    global logger

    layout = format_string
    if layout is None:
        layout = ("%(asctime)s.%(msecs)03d %(name)s:%(lineno)d "
                  "%(processName)s(%(process)d) %(threadName)s "
                  "%(funcName)s [%(levelname)s] %(message)s")

    logger = logging.getLogger(LOGGER_NAME)
    logger.setLevel(level)

    file_handler = logging.FileHandler(filename)
    file_handler.setLevel(level)
    file_handler.setFormatter(logging.Formatter(layout, datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(file_handler)
673
+
674
+
675
# Script entry point: build an Interchange from a pickled keyword-argument
# mapping read from standard input, then run it.
if __name__ == "__main__":
    setproctitle("parsl: HTEX interchange")

    # NOTE(review): the configuration is unpickled straight from stdin;
    # presumably it is written by the process that launched this script -
    # confirm it can never come from an untrusted source.
    interchange_kwargs = pickle.load(sys.stdin.buffer)

    Interchange(**interchange_kwargs).start()