parsl 2024.8.19__py3-none-any.whl → 2024.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/configs/improv.py +34 -0
- parsl/executors/high_throughput/executor.py +2 -1
- parsl/executors/high_throughput/interchange.py +31 -29
- parsl/executors/status_handling.py +5 -2
- parsl/jobs/states.py +6 -1
- parsl/monitoring/db_manager.py +4 -35
- parsl/monitoring/monitoring.py +5 -23
- parsl/monitoring/router.py +3 -33
- parsl/providers/slurm/slurm.py +40 -10
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +3 -5
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +1 -1
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +85 -0
- parsl/version.py +1 -1
- {parsl-2024.8.19.data → parsl-2024.9.2.data}/scripts/interchange.py +31 -29
- {parsl-2024.8.19.dist-info → parsl-2024.9.2.dist-info}/METADATA +2 -2
- {parsl-2024.8.19.dist-info → parsl-2024.9.2.dist-info}/RECORD +23 -21
- {parsl-2024.8.19.data → parsl-2024.9.2.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.8.19.data → parsl-2024.9.2.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.8.19.data → parsl-2024.9.2.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.8.19.dist-info → parsl-2024.9.2.dist-info}/LICENSE +0 -0
- {parsl-2024.8.19.dist-info → parsl-2024.9.2.dist-info}/WHEEL +0 -0
- {parsl-2024.8.19.dist-info → parsl-2024.9.2.dist-info}/entry_points.txt +0 -0
- {parsl-2024.8.19.dist-info → parsl-2024.9.2.dist-info}/top_level.txt +0 -0
parsl/configs/improv.py
ADDED
@@ -0,0 +1,34 @@
+from parsl.config import Config
+from parsl.executors import HighThroughputExecutor
+from parsl.launchers import MpiRunLauncher
+from parsl.providers import PBSProProvider
+
+config = Config(
+    executors=[
+        HighThroughputExecutor(
+            label="Improv_multinode",
+            max_workers_per_node=32,
+            provider=PBSProProvider(
+                account="YOUR_ALLOCATION_ON_IMPROV",
+                # PBS directives (header lines), for example:
+                # scheduler_options='#PBS -l mem=4gb',
+                scheduler_options='',
+
+                queue="compute",
+
+                # Command to be run before starting a worker:
+                # **WARNING** Improv requires an openmpi module to be
+                # loaded for the MpiRunLauncher. Add additional env
+                # load commands to this multiline string.
+                worker_init='''
+module load gcc/13.2.0;
+module load openmpi/5.0.3-gcc-13.2.0; ''',
+                launcher=MpiRunLauncher(),
+
+                # number of compute nodes allocated for each block
+                nodes_per_block=2,
+                walltime='00:10:00'
+            ),
+        ),
+    ],
+)
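For orientation, a config module like this is normally imported and handed to parsl.load() from a workflow script. A minimal usage sketch, assuming the new module is importable as parsl.configs.improv and that account and worker_init have been adjusted for a real allocation:

```python
import parsl
from parsl.configs.improv import config  # the module added in this release

parsl.load(config)  # start the DataFlowKernel using the Improv/PBSPro configuration

@parsl.python_app
def hello():
    return "hello from an Improv compute node"

# Blocks until PBS starts a block, a worker registers, and the app runs.
print(hello().result())

parsl.dfk().cleanup()  # shut down the executor and cancel any outstanding blocks
```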
parsl/executors/high_throughput/executor.py
CHANGED
@@ -790,7 +790,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         connected_blocks = self.connected_blocks()
         for job_id in job_status:
             job_info = job_status[job_id]
-            if job_info.terminal and job_id not in connected_blocks:
+            if job_info.terminal and job_id not in connected_blocks and job_info.state != JobState.SCALED_IN:
+                logger.debug("Rewriting job %s from status %s to MISSING", job_id, job_info)
                 job_status[job_id].state = JobState.MISSING
                 if job_status[job_id].message is None:
                     job_status[job_id].message = (
parsl/executors/high_throughput/interchange.py
CHANGED
@@ -375,7 +375,7 @@ class Interchange:
 
         self.zmq_context.destroy()
         delta = time.time() - start
-        logger.info("Processed {} tasks in {} seconds"
+        logger.info(f"Processed {self.count} tasks in {delta} seconds")
         logger.warning("Exiting")
 
     def process_task_outgoing_incoming(
@@ -396,9 +396,8 @@ class Interchange:
         try:
             msg = json.loads(message[1].decode('utf-8'))
         except Exception:
-            logger.warning("Got Exception reading message from manager: {!r}"
-
-            logger.debug("Message: \n{!r}\n".format(message[1]))
+            logger.warning(f"Got Exception reading message from manager: {manager_id!r}", exc_info=True)
+            logger.debug("Message:\n %r\n", message[1])
             return
 
         # perform a bit of validation on the structure of the deserialized
@@ -406,7 +405,7 @@ class Interchange:
         # in obviously malformed cases
         if not isinstance(msg, dict) or 'type' not in msg:
             logger.error(f"JSON message was not correctly formatted from manager: {manager_id!r}")
-            logger.debug("Message
+            logger.debug("Message:\n %r\n", message[1])
             return
 
         if msg['type'] == 'registration':
@@ -425,7 +424,7 @@ class Interchange:
             self.connected_block_history.append(msg['block_id'])
 
             interesting_managers.add(manager_id)
-            logger.info("Adding manager: {!r} to ready queue"
+            logger.info(f"Adding manager: {manager_id!r} to ready queue")
             m = self._ready_managers[manager_id]
 
             # m is a ManagerRecord, but msg is a dict[Any,Any] and so can
@@ -434,12 +433,12 @@ class Interchange:
             # later.
             m.update(msg)  # type: ignore[typeddict-item]
 
-            logger.info("Registration info for manager {!r}: {}"
+            logger.info(f"Registration info for manager {manager_id!r}: {msg}")
             self._send_monitoring_info(monitoring_radio, m)
 
             if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
                     msg['parsl_v'] != self.current_platform['parsl_v']):
-                logger.error("Manager {!r} has incompatible version info with the interchange"
+                logger.error(f"Manager {manager_id!r} has incompatible version info with the interchange")
                 logger.debug("Setting kill event")
                 kill_event.set()
                 e = VersionMismatch("py.v={} parsl.v={}".format(self.current_platform['python_v'].rsplit(".", 1)[0],
@@ -452,16 +451,15 @@ class Interchange:
                 self.results_outgoing.send(pkl_package)
                 logger.error("Sent failure reports, shutting down interchange")
             else:
-                logger.info("Manager {!r} has compatible Parsl version {
-                logger.info("Manager {!r} has compatible Python version {
-                            msg['python_v'].rsplit(".", 1)[0]))
+                logger.info(f"Manager {manager_id!r} has compatible Parsl version {msg['parsl_v']}")
+                logger.info(f"Manager {manager_id!r} has compatible Python version {msg['python_v'].rsplit('.', 1)[0]}")
         elif msg['type'] == 'heartbeat':
             self._ready_managers[manager_id]['last_heartbeat'] = time.time()
-            logger.debug("Manager
+            logger.debug("Manager %r sent heartbeat via tasks connection", manager_id)
             self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
         elif msg['type'] == 'drain':
             self._ready_managers[manager_id]['draining'] = True
-            logger.debug(
+            logger.debug("Manager %r requested drain", manager_id)
         else:
             logger.error(f"Unexpected message type received from manager: {msg['type']}")
         logger.debug("leaving task_outgoing section")
@@ -484,9 +482,11 @@ class Interchange:
     def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
         # Check if there are tasks that could be sent to managers
 
-        logger.debug(
-            total
-
+        logger.debug(
+            "Managers count (interesting/total): %d/%d",
+            len(interesting_managers),
+            len(self._ready_managers)
+        )
 
         if interesting_managers and not self.pending_task_queue.empty():
             shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)
@@ -497,7 +497,7 @@ class Interchange:
                 tasks_inflight = len(m['tasks'])
                 real_capacity = m['max_capacity'] - tasks_inflight
 
-                if
+                if real_capacity and m["active"] and not m["draining"]:
                     tasks = self.get_tasks(real_capacity)
                     if tasks:
                         self.task_outgoing.send_multipart([manager_id, b'', pickle.dumps(tasks)])
@@ -506,19 +506,19 @@ class Interchange:
                         tids = [t['task_id'] for t in tasks]
                         m['tasks'].extend(tids)
                         m['idle_since'] = None
-                        logger.debug("Sent tasks:
+                        logger.debug("Sent tasks: %s to manager %r", tids, manager_id)
                         # recompute real_capacity after sending tasks
                         real_capacity = m['max_capacity'] - tasks_inflight
                         if real_capacity > 0:
-                            logger.debug("Manager
+                            logger.debug("Manager %r has free capacity %s", manager_id, real_capacity)
                             # ... so keep it in the interesting_managers list
                         else:
-                            logger.debug("Manager
+                            logger.debug("Manager %r is now saturated", manager_id)
                             interesting_managers.remove(manager_id)
                 else:
                     interesting_managers.remove(manager_id)
                     # logger.debug("Nothing to send to manager {}".format(manager_id))
-            logger.debug("leaving _ready_managers section, with
+            logger.debug("leaving _ready_managers section, with %s managers still interesting", len(interesting_managers))
         else:
             logger.debug("either no interesting managers or no tasks, so skipping manager pass")
 
@@ -528,9 +528,9 @@ class Interchange:
         logger.debug("entering results_incoming section")
         manager_id, *all_messages = self.results_incoming.recv_multipart()
         if manager_id not in self._ready_managers:
-            logger.warning("Received a result from a un-registered manager: {!r}"
+            logger.warning(f"Received a result from a un-registered manager: {manager_id!r}")
        else:
-            logger.debug(
+            logger.debug("Got %s result items in batch from manager %r", len(all_messages), manager_id)
 
             b_messages = []
 
@@ -548,10 +548,10 @@ class Interchange:
 
                     monitoring_radio.send(r['payload'])
                 elif r['type'] == 'heartbeat':
-                    logger.debug(
+                    logger.debug("Manager %r sent heartbeat via results connection", manager_id)
                     b_messages.append((p_message, r))
                 else:
-                    logger.error("Interchange discarding result_queue message of unknown type:
+                    logger.error("Interchange discarding result_queue message of unknown type: %s", r["type"])
 
             got_result = False
             m = self._ready_managers[manager_id]
@@ -560,14 +560,16 @@ class Interchange:
                 if r['type'] == 'result':
                     got_result = True
                     try:
-                        logger.debug(
+                        logger.debug("Removing task %s from manager record %r", r["task_id"], manager_id)
                         m['tasks'].remove(r['task_id'])
                     except Exception:
                         # If we reach here, there's something very wrong.
-                        logger.exception(
+                        logger.exception(
+                            "Ignoring exception removing task_id %s for manager %r with task list %s",
                             r['task_id'],
                             manager_id,
-                            m[
+                            m["tasks"]
+                        )
 
             b_messages_to_send = []
             for (b_message, _) in b_messages:
@@ -578,7 +580,7 @@ class Interchange:
                 self.results_outgoing.send_multipart(b_messages_to_send)
                 logger.debug("Sent messages on results_outgoing")
 
-            logger.debug(
+            logger.debug("Current tasks on manager %r: %s", manager_id, m["tasks"])
             if len(m['tasks']) == 0 and m['idle_since'] is None:
                 m['idle_since'] = time.time()
 
parsl/executors/status_handling.py
CHANGED
@@ -347,7 +347,10 @@ class BlockProviderExecutor(ParslExecutor):
         if block_ids is not None:
             new_status = {}
             for block_id in block_ids:
-
-
+                logger.debug("Marking block %s as SCALED_IN", block_id)
+                s = JobStatus(JobState.SCALED_IN)
+                new_status[block_id] = s
+                self._status[block_id] = s
+                self._simulated_status[block_id] = s
             self.send_monitoring_info(new_status)
             return block_ids
parsl/jobs/states.py
CHANGED
@@ -46,12 +46,17 @@ class JobState(IntEnum):
     bad worker environment or network connectivity issues.
     """
 
+    SCALED_IN = 9
+    """This job has been deliberately scaled in. Scaling code should not be concerned
+    that the job never ran (for example for error handling purposes).
+    """
+
     def __str__(self) -> str:
         return f"{self.__class__.__name__}.{self.name}"
 
 
 TERMINAL_STATES = [JobState.CANCELLED, JobState.COMPLETED, JobState.FAILED,
-                   JobState.TIMEOUT, JobState.MISSING]
+                   JobState.TIMEOUT, JobState.MISSING, JobState.SCALED_IN]
 
 
 class JobStatus:
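Together with the executor change above, the intent is that a deliberately scaled-in block is terminal but is no longer rewritten to MISSING and treated as a failure. A rough sketch of that distinction, using only names defined in this module (the helper function below is illustrative, not part of Parsl):

```python
from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus

def block_died_unexpectedly(status: JobStatus) -> bool:
    # SCALED_IN is in TERMINAL_STATES, so the block is finished, but it was an
    # intentional scale-in: error handlers should not count it as a failure.
    return status.state in TERMINAL_STATES and status.state != JobState.SCALED_IN

print(block_died_unexpectedly(JobStatus(JobState.SCALED_IN)))  # False
print(block_died_unexpectedly(JobStatus(JobState.FAILED)))     # True
```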
parsl/monitoring/db_manager.py
CHANGED
@@ -308,35 +308,9 @@ class DatabaseManager:
         self.pending_resource_queue: queue.Queue[MonitoringMessage] = queue.Queue()
 
     def start(self,
-              priority_queue: mpq.Queue,
-              node_queue: mpq.Queue,
-              block_queue: mpq.Queue,
               resource_queue: mpq.Queue) -> None:
 
         self._kill_event = threading.Event()
-        self._priority_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
-                                                            args=(
-                                                                priority_queue, self._kill_event,),
-                                                            name="Monitoring-migrate-priority",
-                                                            daemon=True,
-                                                            )
-        self._priority_queue_pull_thread.start()
-
-        self._node_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
-                                                        args=(
-                                                            node_queue, self._kill_event,),
-                                                        name="Monitoring-migrate-node",
-                                                        daemon=True,
-                                                        )
-        self._node_queue_pull_thread.start()
-
-        self._block_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
-                                                         args=(
-                                                             block_queue, self._kill_event,),
-                                                         name="Monitoring-migrate-block",
-                                                         daemon=True,
-                                                         )
-        self._block_queue_pull_thread.start()
 
         self._resource_queue_pull_thread = threading.Thread(target=self._migrate_logs_to_internal,
                                                             args=(
@@ -372,20 +346,18 @@ class DatabaseManager:
         while (not self._kill_event.is_set() or
                self.pending_priority_queue.qsize() != 0 or self.pending_resource_queue.qsize() != 0 or
                self.pending_node_queue.qsize() != 0 or self.pending_block_queue.qsize() != 0 or
-
-               node_queue.qsize() != 0 or block_queue.qsize() != 0):
+               resource_queue.qsize() != 0):
 
             """
             WORKFLOW_INFO and TASK_INFO messages (i.e. priority messages)
 
             """
             try:
-                logger.debug("""Checking STOP conditions: {}, {}, {}, {}, {}, {}
+                logger.debug("""Checking STOP conditions: {}, {}, {}, {}, {}, {}""".format(
                     self._kill_event.is_set(),
                     self.pending_priority_queue.qsize() != 0, self.pending_resource_queue.qsize() != 0,
                     self.pending_node_queue.qsize() != 0, self.pending_block_queue.qsize() != 0,
-
-                    node_queue.qsize() != 0, block_queue.qsize() != 0))
+                    resource_queue.qsize() != 0))
 
                 # This is the list of resource messages which can be reprocessed as if they
                 # had just arrived because the corresponding first task message has been
@@ -707,9 +679,6 @@ class DatabaseManager:
 @wrap_with_logs(target="database_manager")
 @typeguard.typechecked
 def dbm_starter(exception_q: mpq.Queue,
-                priority_msgs: mpq.Queue,
-                node_msgs: mpq.Queue,
-                block_msgs: mpq.Queue,
                 resource_msgs: mpq.Queue,
                 db_url: str,
                 logdir: str,
@@ -726,7 +695,7 @@ def dbm_starter(exception_q: mpq.Queue,
                   logdir=logdir,
                   logging_level=logging_level)
         logger.info("Starting dbm in dbm starter")
-        dbm.start(
+        dbm.start(resource_msgs)
     except KeyboardInterrupt:
         logger.exception("KeyboardInterrupt signal caught")
         dbm.close()
parsl/monitoring/monitoring.py
CHANGED
@@ -7,7 +7,7 @@ import queue
 import time
 from multiprocessing import Event, Process
 from multiprocessing.queues import Queue
-from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast
+from typing import TYPE_CHECKING, Any, Literal, Optional, Tuple, Union, cast
 
 import typeguard
 
@@ -138,27 +138,15 @@ class MonitoringHub(RepresentationMixin):
         self.exception_q: Queue[Tuple[str, str]]
         self.exception_q = SizedQueue(maxsize=10)
 
-        self.
-        self.priority_msgs = SizedQueue()
-
-        self.resource_msgs: Queue[AddressedMonitoringMessage]
+        self.resource_msgs: Queue[Union[AddressedMonitoringMessage, Tuple[Literal["STOP"], Literal[0]]]]
         self.resource_msgs = SizedQueue()
 
-        self.node_msgs: Queue[AddressedMonitoringMessage]
-        self.node_msgs = SizedQueue()
-
-        self.block_msgs: Queue[AddressedMonitoringMessage]
-        self.block_msgs = SizedQueue()
-
         self.router_exit_event: ms.Event
         self.router_exit_event = Event()
 
         self.router_proc = ForkProcess(target=router_starter,
                                        kwargs={"comm_q": comm_q,
                                                "exception_q": self.exception_q,
-                                               "priority_msgs": self.priority_msgs,
-                                               "node_msgs": self.node_msgs,
-                                               "block_msgs": self.block_msgs,
                                                "resource_msgs": self.resource_msgs,
                                                "exit_event": self.router_exit_event,
                                                "hub_address": self.hub_address,
@@ -173,7 +161,7 @@ class MonitoringHub(RepresentationMixin):
         self.router_proc.start()
 
         self.dbm_proc = ForkProcess(target=dbm_starter,
-                                    args=(self.exception_q, self.
+                                    args=(self.exception_q, self.resource_msgs,),
                                     kwargs={"logdir": self.logdir,
                                             "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
                                             "db_url": self.logging_endpoint,
@@ -192,7 +180,7 @@ class MonitoringHub(RepresentationMixin):
         self.filesystem_proc.start()
         logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
 
-        self.radio = MultiprocessingQueueRadioSender(self.
+        self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)
 
         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
@@ -249,7 +237,7 @@ class MonitoringHub(RepresentationMixin):
         logger.debug("Finished waiting for router termination")
         if len(exception_msgs) == 0:
             logger.debug("Sending STOP to DBM")
-            self.
+            self.resource_msgs.put(("STOP", 0))
         else:
             logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
         logger.debug("Waiting for DB termination")
@@ -267,14 +255,8 @@ class MonitoringHub(RepresentationMixin):
         logger.info("Closing monitoring multiprocessing queues")
         self.exception_q.close()
         self.exception_q.join_thread()
-        self.priority_msgs.close()
-        self.priority_msgs.join_thread()
         self.resource_msgs.close()
         self.resource_msgs.join_thread()
-        self.node_msgs.close()
-        self.node_msgs.join_thread()
-        self.block_msgs.close()
-        self.block_msgs.join_thread()
         logger.info("Closed monitoring multiprocessing queues")
 
 
parsl/monitoring/router.py
CHANGED
@@ -14,7 +14,6 @@ import typeguard
 import zmq
 
 from parsl.log_utils import set_file_logger
-from parsl.monitoring.message_type import MessageType
 from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
 from parsl.process_loggers import wrap_with_logs
 from parsl.utils import setproctitle
@@ -34,9 +33,6 @@ class MonitoringRouter:
                  logdir: str = ".",
                  logging_level: int = logging.INFO,
                  atexit_timeout: int = 3,   # in seconds
-                 priority_msgs: mpq.Queue,
-                 node_msgs: mpq.Queue,
-                 block_msgs: mpq.Queue,
                  resource_msgs: mpq.Queue,
                  exit_event: Event,
                  ):
@@ -57,8 +53,8 @@ class MonitoringRouter:
             Logging level as defined in the logging module. Default: logging.INFO
         atexit_timeout : float, optional
             The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
-
-
+        resource_msgs : multiprocessing.Queue
+            A multiprocessing queue to receive messages to be routed onwards to the database process
 
         exit_event : Event
             An event that the main Parsl process will set to signal that the monitoring router should shut down.
@@ -102,9 +98,6 @@ class MonitoringRouter:
                                                              min_port=zmq_port_range[0],
                                                              max_port=zmq_port_range[1])
 
-        self.priority_msgs = priority_msgs
-        self.node_msgs = node_msgs
-        self.block_msgs = block_msgs
         self.resource_msgs = resource_msgs
         self.exit_event = exit_event
 
@@ -170,24 +163,7 @@ class MonitoringRouter:
                     msg_0: AddressedMonitoringMessage
                     msg_0 = (msg, 0)
 
-
-                        self.node_msgs.put(msg_0)
-                    elif msg[0] == MessageType.RESOURCE_INFO:
-                        self.resource_msgs.put(msg_0)
-                    elif msg[0] == MessageType.BLOCK_INFO:
-                        self.block_msgs.put(msg_0)
-                    elif msg[0] == MessageType.TASK_INFO:
-                        self.priority_msgs.put(msg_0)
-                    elif msg[0] == MessageType.WORKFLOW_INFO:
-                        self.priority_msgs.put(msg_0)
-                    else:
-                        # There is a type: ignore here because if msg[0]
-                        # is of the correct type, this code is unreachable,
-                        # but there is no verification that the message
-                        # received from zmq_receiver_channel.recv_pyobj() is actually
-                        # of that type.
-                        self.logger.error("Discarding message "  # type: ignore[unreachable]
-                                          f"from interchange with unknown type {msg[0].value}")
+                    self.resource_msgs.put(msg_0)
                 except zmq.Again:
                     pass
                 except Exception:
@@ -207,9 +183,6 @@ def router_starter(*,
 def router_starter(*,
                    comm_q: mpq.Queue,
                    exception_q: mpq.Queue,
-                   priority_msgs: mpq.Queue,
-                   node_msgs: mpq.Queue,
-                   block_msgs: mpq.Queue,
                    resource_msgs: mpq.Queue,
                    exit_event: Event,
 
@@ -226,9 +199,6 @@ def router_starter(*,
                                   zmq_port_range=zmq_port_range,
                                   logdir=logdir,
                                   logging_level=logging_level,
-                                  priority_msgs=priority_msgs,
-                                  node_msgs=node_msgs,
-                                  block_msgs=block_msgs,
                                   resource_msgs=resource_msgs,
                                   exit_event=exit_event)
     except Exception as e:
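The three monitoring changes above (db_manager.py, monitoring.py, router.py) collapse the separate priority/node/block queues into the single resource_msgs queue, with a ("STOP", 0) tuple used as the shutdown sentinel for the database manager. A generic, stand-alone sketch of that single-queue pattern; the names below are illustrative and are not Parsl's actual classes:

```python
import multiprocessing as mp
from typing import Tuple

STOP: Tuple[str, int] = ("STOP", 0)  # shutdown sentinel, mirroring the tuple used in the diff

def consumer(q: mp.Queue) -> None:
    # A single queue now carries every message type; routing happens on the consumer side.
    while True:
        msg = q.get()
        if msg == STOP:
            break
        payload, _address = msg
        print("would write to the monitoring database:", payload)

if __name__ == "__main__":
    q: mp.Queue = mp.Queue()
    proc = mp.Process(target=consumer, args=(q,))
    proc.start()
    q.put(({"type": "TASK_INFO", "task_id": 1}, 0))  # (message, address) pair
    q.put(STOP)
    proc.join()
```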
parsl/providers/slurm/slurm.py
CHANGED
@@ -20,7 +20,7 @@ from parsl.utils import RepresentationMixin, wtime_to_minutes
 logger = logging.getLogger(__name__)
 
 # From https://slurm.schedmd.com/sacct.html#SECTION_JOB-STATE-CODES
-
+sacct_translate_table = {
     'PENDING': JobState.PENDING,
     'RUNNING': JobState.RUNNING,
     'CANCELLED': JobState.CANCELLED,
@@ -37,6 +37,20 @@ translate_table = {
     'REQUEUED': JobState.PENDING
 }
 
+squeue_translate_table = {
+    'PD': JobState.PENDING,
+    'R': JobState.RUNNING,
+    'CA': JobState.CANCELLED,
+    'CF': JobState.PENDING,  # (configuring),
+    'CG': JobState.RUNNING,  # (completing),
+    'CD': JobState.COMPLETED,
+    'F': JobState.FAILED,  # (failed),
+    'TO': JobState.TIMEOUT,  # (timeout),
+    'NF': JobState.FAILED,  # (node failure),
+    'RV': JobState.FAILED,  # (revoked) and
+    'SE': JobState.FAILED  # (special exit state)
+}
+
 
 class SlurmProvider(ClusterProvider, RepresentationMixin):
     """Slurm Execution Provider
@@ -155,6 +169,23 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
 
         self.regex_job_id = regex_job_id
         self.worker_init = worker_init + '\n'
+        # Check if sacct works and if not fall back to squeue
+        cmd = "sacct -X"
+        logger.debug("Executing %s", cmd)
+        retcode, stdout, stderr = self.execute_wait(cmd)
+        # If sacct fails it should return retcode=1 stderr="Slurm accounting storage is disabled"
+        logger.debug(f"sacct returned retcode={retcode} stderr={stderr}")
+        if retcode == 0:
+            logger.debug("using sacct to get job status")
+            # Using state%20 to get enough characters to not truncate output
+            # of the state. Without output can look like "<job_id> CANCELLED+"
+            self._cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'"
+            self._translate_table = sacct_translate_table
+        else:
+            logger.debug(f"sacct failed with retcode={retcode}")
+            logger.debug("falling back to using squeue to get job status")
+            self._cmd = "squeue --noheader --format='%i %t' --job '{0}'"
+            self._translate_table = squeue_translate_table
 
     def _status(self):
         '''Returns the status list for a list of job_ids
@@ -172,16 +203,14 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
             logger.debug('No active jobs, skipping status update')
             return
 
-
-        # of the state. Without output can look like "<job_id> CANCELLED+"
-        cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'".format(job_id_list)
+        cmd = self._cmd.format(job_id_list)
         logger.debug("Executing %s", cmd)
         retcode, stdout, stderr = self.execute_wait(cmd)
-        logger.debug("sacct returned %s %s", stdout, stderr)
+        logger.debug("sacct/squeue returned %s %s", stdout, stderr)
 
         # Execute_wait failed. Do no update
         if retcode != 0:
-            logger.warning("sacct failed with non-zero exit code {}".format(retcode))
+            logger.warning("sacct/squeue failed with non-zero exit code {}".format(retcode))
             return
 
         jobs_missing = set(self.resources.keys())
@@ -193,9 +222,9 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
             # For example "<job_id> CANCELLED by <user_id>"
             # This splits and ignores anything past the first two unpacked values
             job_id, slurm_state, *ignore = line.split()
-            if slurm_state not in
+            if slurm_state not in self._translate_table:
                 logger.warning(f"Slurm status {slurm_state} is not recognized")
-            status =
+            status = self._translate_table.get(slurm_state, JobState.UNKNOWN)
             logger.debug("Updating job {} with slurm status {} to parsl state {!s}".format(job_id, slurm_state, status))
             self.resources[job_id]['status'] = JobStatus(status,
                                                          stdout_path=self.resources[job_id]['job_stdout_path'],
@@ -203,9 +232,10 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
             jobs_missing.remove(job_id)
 
         # sacct can get job info after jobs have completed so this path shouldn't be hit
-        #
+        # squeue does not report on jobs that are not running. So we are filling in the
+        # blanks for missing jobs, we might lose some information about why the jobs failed.
         for missing_job in jobs_missing:
-            logger.
+            logger.debug("Updating missing job {} to completed status".format(missing_job))
             self.resources[missing_job]['status'] = JobStatus(
                 JobState.COMPLETED, stdout_path=self.resources[missing_job]['job_stdout_path'],
                 stderr_path=self.resources[missing_job]['job_stderr_path'])
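The provider now probes sacct once at construction time and falls back to squeue when accounting is disabled. A stand-alone sketch of the same probe-and-fall-back idea, with a hypothetical run() helper standing in for the provider's execute_wait():

```python
import subprocess

def run(cmd: str):
    # Hypothetical stand-in for SlurmProvider.execute_wait(): returns (retcode, stdout, stderr).
    proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    return proc.returncode, proc.stdout, proc.stderr

retcode, _, stderr = run("sacct -X")
if retcode == 0:
    # Accounting is available: keep using sacct, which also reports finished jobs.
    status_cmd = "sacct -X --noheader --format=jobid,state%20 --job '{0}'"
else:
    # e.g. "Slurm accounting storage is disabled": fall back to squeue, which only
    # reports queued/running jobs, so jobs it no longer lists are assumed COMPLETED.
    status_cmd = "squeue --noheader --format='%i %t' --job '{0}'"

print("status command template:", status_cmd)
```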
parsl/tests/test_htex/test_multiple_disconnected_blocks.py
CHANGED
@@ -21,16 +21,14 @@ def local_config():
                 poll_period=100,
                 max_workers_per_node=1,
                 provider=LocalProvider(
-                    worker_init="
-                    init_blocks=2
-                    max_blocks=4,
-                    min_blocks=0,
+                    worker_init="exit 0",
+                    init_blocks=2
                 ),
             )
         ],
         run_dir="/tmp/test_htex",
         max_idletime=0.5,
-        strategy='
+        strategy='none',
     )
 
 
parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py
CHANGED
@@ -78,6 +78,6 @@ def test_row_counts(tmpd_cwd, strategy):
         (c, ) = result.first()
         assert c == 1, "There should be a single pending status"
 
-        result = connection.execute(text("SELECT COUNT(*) FROM block WHERE block_id = 0 AND status = '
+        result = connection.execute(text("SELECT COUNT(*) FROM block WHERE block_id = 0 AND status = 'SCALED_IN' AND run_id = :run_id"), binds)
         (c, ) = result.first()
         assert c == 1, "There should be a single cancelled status"
parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py
ADDED
@@ -0,0 +1,85 @@
+import time
+
+import pytest
+
+import parsl
+from parsl.channels import LocalChannel
+from parsl.config import Config
+from parsl.executors import HighThroughputExecutor
+from parsl.launchers import WrappedLauncher
+from parsl.providers import LocalProvider
+
+
+def local_config():
+    # see the comments inside test_regression for reasoning about why each
+    # of these parameters is set why it is.
+    return Config(
+        max_idletime=1,
+
+        strategy='htex_auto_scale',
+        strategy_period=1,
+
+        executors=[
+            HighThroughputExecutor(
+                label="htex_local",
+                encrypted=True,
+                provider=LocalProvider(
+                    init_blocks=1,
+                    min_blocks=0,
+                    max_blocks=1,
+                    launcher=WrappedLauncher(prepend="sleep inf ; "),
+                ),
+            )
+        ],
+    )
+
+
+@parsl.python_app
+def task():
+    return 7
+
+
+@pytest.mark.local
+def test_regression(try_assert):
+    # The above config means that we should start scaling out one initial
+    # block, but then scale it back in after a second or so if the executor
+    # is kept idle (which this test does using try_assert).
+
+    # Because of 'sleep inf' in the WrappedLaucher, the block will not ever
+    # register.
+
+    # The bug being tested is about mistreatment of blocks which are scaled in
+    # before they have a chance to register, and the above forces that to
+    # happen.
+
+    # After that scaling in has happened, we should see that we have one block
+    # and it should be in a terminal state. The below try_assert waits for
+    # that to become true.
+
+    # At that time, we should also see htex reporting no blocks registered - as
+    # mentioned above, that is a necessary part of the bug being tested here.
+
+    # Give 10 strategy periods for the above to happen: each step of scale up,
+    # and scale down due to idleness isn't guaranteed to happen in exactly one
+    # scaling step.
+
+    htex = parsl.dfk().executors['htex_local']
+
+    try_assert(lambda: len(htex.status_facade) == 1 and htex.status_facade['0'].terminal,
+               timeout_ms=10000)
+
+    assert htex.connected_blocks() == [], "No block should have connected to interchange"
+
+    # Now we can reconfigure the launcher to let subsequent blocks launch ok,
+    # and run a trivial task. That trivial task will scale up a new block and
+    # run the task successfully.
+
+    # Prior to issue #3568, the bug was that the scale in of the first
+    # block earlier in the test case would have incorrectly been treated as a
+    # failure, and then the block error handler would have treated that failure
+    # as a permanent htex failure, and so the task execution below would raise
+    # a BadStateException rather than attempt to run the task.
+
+    assert htex.provider.launcher.prepend != "", "Pre-req: prepend attribute should exist and be non-empty"
+    htex.provider.launcher.prepend = ""
+    assert task().result() == 7
parsl/version.py
CHANGED
{parsl-2024.8.19.data → parsl-2024.9.2.data}/scripts/interchange.py
CHANGED
@@ -375,7 +375,7 @@ class Interchange:
 
         self.zmq_context.destroy()
         delta = time.time() - start
-        logger.info("Processed {} tasks in {} seconds"
+        logger.info(f"Processed {self.count} tasks in {delta} seconds")
         logger.warning("Exiting")
 
     def process_task_outgoing_incoming(
@@ -396,9 +396,8 @@ class Interchange:
         try:
             msg = json.loads(message[1].decode('utf-8'))
         except Exception:
-            logger.warning("Got Exception reading message from manager: {!r}"
-
-            logger.debug("Message: \n{!r}\n".format(message[1]))
+            logger.warning(f"Got Exception reading message from manager: {manager_id!r}", exc_info=True)
+            logger.debug("Message:\n %r\n", message[1])
            return
 
         # perform a bit of validation on the structure of the deserialized
@@ -406,7 +405,7 @@ class Interchange:
         # in obviously malformed cases
         if not isinstance(msg, dict) or 'type' not in msg:
             logger.error(f"JSON message was not correctly formatted from manager: {manager_id!r}")
-            logger.debug("Message
+            logger.debug("Message:\n %r\n", message[1])
             return
 
         if msg['type'] == 'registration':
@@ -425,7 +424,7 @@ class Interchange:
             self.connected_block_history.append(msg['block_id'])
 
             interesting_managers.add(manager_id)
-            logger.info("Adding manager: {!r} to ready queue"
+            logger.info(f"Adding manager: {manager_id!r} to ready queue")
             m = self._ready_managers[manager_id]
 
             # m is a ManagerRecord, but msg is a dict[Any,Any] and so can
@@ -434,12 +433,12 @@ class Interchange:
             # later.
             m.update(msg)  # type: ignore[typeddict-item]
 
-            logger.info("Registration info for manager {!r}: {}"
+            logger.info(f"Registration info for manager {manager_id!r}: {msg}")
             self._send_monitoring_info(monitoring_radio, m)
 
             if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
                     msg['parsl_v'] != self.current_platform['parsl_v']):
-                logger.error("Manager {!r} has incompatible version info with the interchange"
+                logger.error(f"Manager {manager_id!r} has incompatible version info with the interchange")
                 logger.debug("Setting kill event")
                 kill_event.set()
                 e = VersionMismatch("py.v={} parsl.v={}".format(self.current_platform['python_v'].rsplit(".", 1)[0],
@@ -452,16 +451,15 @@ class Interchange:
                 self.results_outgoing.send(pkl_package)
                 logger.error("Sent failure reports, shutting down interchange")
             else:
-                logger.info("Manager {!r} has compatible Parsl version {
-                logger.info("Manager {!r} has compatible Python version {
-                            msg['python_v'].rsplit(".", 1)[0]))
+                logger.info(f"Manager {manager_id!r} has compatible Parsl version {msg['parsl_v']}")
+                logger.info(f"Manager {manager_id!r} has compatible Python version {msg['python_v'].rsplit('.', 1)[0]}")
         elif msg['type'] == 'heartbeat':
             self._ready_managers[manager_id]['last_heartbeat'] = time.time()
-            logger.debug("Manager
+            logger.debug("Manager %r sent heartbeat via tasks connection", manager_id)
             self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
         elif msg['type'] == 'drain':
             self._ready_managers[manager_id]['draining'] = True
-            logger.debug(
+            logger.debug("Manager %r requested drain", manager_id)
         else:
             logger.error(f"Unexpected message type received from manager: {msg['type']}")
         logger.debug("leaving task_outgoing section")
@@ -484,9 +482,11 @@ class Interchange:
     def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
         # Check if there are tasks that could be sent to managers
 
-        logger.debug(
-            total
-
+        logger.debug(
+            "Managers count (interesting/total): %d/%d",
+            len(interesting_managers),
+            len(self._ready_managers)
+        )
 
         if interesting_managers and not self.pending_task_queue.empty():
             shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)
@@ -497,7 +497,7 @@ class Interchange:
                 tasks_inflight = len(m['tasks'])
                 real_capacity = m['max_capacity'] - tasks_inflight
 
-                if
+                if real_capacity and m["active"] and not m["draining"]:
                     tasks = self.get_tasks(real_capacity)
                     if tasks:
                         self.task_outgoing.send_multipart([manager_id, b'', pickle.dumps(tasks)])
@@ -506,19 +506,19 @@ class Interchange:
                         tids = [t['task_id'] for t in tasks]
                         m['tasks'].extend(tids)
                         m['idle_since'] = None
-                        logger.debug("Sent tasks:
+                        logger.debug("Sent tasks: %s to manager %r", tids, manager_id)
                         # recompute real_capacity after sending tasks
                         real_capacity = m['max_capacity'] - tasks_inflight
                         if real_capacity > 0:
-                            logger.debug("Manager
+                            logger.debug("Manager %r has free capacity %s", manager_id, real_capacity)
                             # ... so keep it in the interesting_managers list
                         else:
-                            logger.debug("Manager
+                            logger.debug("Manager %r is now saturated", manager_id)
                             interesting_managers.remove(manager_id)
                 else:
                     interesting_managers.remove(manager_id)
                     # logger.debug("Nothing to send to manager {}".format(manager_id))
-            logger.debug("leaving _ready_managers section, with
+            logger.debug("leaving _ready_managers section, with %s managers still interesting", len(interesting_managers))
         else:
             logger.debug("either no interesting managers or no tasks, so skipping manager pass")
 
@@ -528,9 +528,9 @@ class Interchange:
         logger.debug("entering results_incoming section")
         manager_id, *all_messages = self.results_incoming.recv_multipart()
         if manager_id not in self._ready_managers:
-            logger.warning("Received a result from a un-registered manager: {!r}"
+            logger.warning(f"Received a result from a un-registered manager: {manager_id!r}")
        else:
-            logger.debug(
+            logger.debug("Got %s result items in batch from manager %r", len(all_messages), manager_id)
 
             b_messages = []
 
@@ -548,10 +548,10 @@ class Interchange:
 
                     monitoring_radio.send(r['payload'])
                 elif r['type'] == 'heartbeat':
-                    logger.debug(
+                    logger.debug("Manager %r sent heartbeat via results connection", manager_id)
                     b_messages.append((p_message, r))
                 else:
-                    logger.error("Interchange discarding result_queue message of unknown type:
+                    logger.error("Interchange discarding result_queue message of unknown type: %s", r["type"])
 
             got_result = False
             m = self._ready_managers[manager_id]
@@ -560,14 +560,16 @@ class Interchange:
                 if r['type'] == 'result':
                     got_result = True
                     try:
-                        logger.debug(
+                        logger.debug("Removing task %s from manager record %r", r["task_id"], manager_id)
                         m['tasks'].remove(r['task_id'])
                     except Exception:
                         # If we reach here, there's something very wrong.
-                        logger.exception(
+                        logger.exception(
+                            "Ignoring exception removing task_id %s for manager %r with task list %s",
                             r['task_id'],
                             manager_id,
-                            m[
+                            m["tasks"]
+                        )
 
             b_messages_to_send = []
             for (b_message, _) in b_messages:
@@ -578,7 +580,7 @@ class Interchange:
                 self.results_outgoing.send_multipart(b_messages_to_send)
                 logger.debug("Sent messages on results_outgoing")
 
-            logger.debug(
+            logger.debug("Current tasks on manager %r: %s", manager_id, m["tasks"])
             if len(m['tasks']) == 0 and m['idle_since'] is None:
                 m['idle_since'] = time.time()
 
{parsl-2024.8.19.dist-info → parsl-2024.9.2.dist-info}/METADATA
CHANGED
@@ -1,9 +1,9 @@
 Metadata-Version: 2.1
 Name: parsl
-Version: 2024.
+Version: 2024.9.2
 Summary: Simple data dependent workflows in Python
 Home-page: https://github.com/Parsl/parsl
-Download-URL: https://github.com/Parsl/parsl/archive/2024.
+Download-URL: https://github.com/Parsl/parsl/archive/2024.09.02.tar.gz
 Author: The Parsl Team
 Author-email: parsl@googlegroups.com
 License: Apache 2.0
{parsl-2024.8.19.dist-info → parsl-2024.9.2.dist-info}/RECORD
CHANGED
@@ -8,7 +8,7 @@ parsl/multiprocessing.py,sha256=MyaEcEq-Qf860u7V98u-PZrPNdtzOZL_NW6EhIJnmfQ,1937
 parsl/process_loggers.py,sha256=uQ7Gd0W72Jz7rrcYlOMfLsAEhkRltxXJL2MgdduJjEw,1136
 parsl/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/utils.py,sha256=91FjQiTUY383ueAjkBAgE21My9nba6SP2a2SrbB1r1Q,11250
-parsl/version.py,sha256
+parsl/version.py,sha256=-CxPczDJTqi5fdaWV26l0UO4NHlIPdBwBCFIx6tfkMQ,131
 parsl/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/app/app.py,sha256=0gbM4AH2OtFOLsv07I5nglpElcwMSOi-FzdZZfrk7So,8532
 parsl/app/bash.py,sha256=jm2AvePlCT9DZR7H_4ANDWxatp5dN_22FUlT_gWhZ-g,5528
@@ -39,6 +39,7 @@ parsl/configs/expanse.py,sha256=ADUY3GZWSfVKmqFWbgdfC85kRxNPChqOGwly0XdcKSw,1033
 parsl/configs/frontera.py,sha256=6n0TMvF2IFdJ3g5NdFcM-rg5Na_6dDroF0ZIozk3-LU,1495
 parsl/configs/htex_local.py,sha256=v6VG9UoWoosy3ls66lToTyGahZFjoLb5ni0nVWKfKNY,542
 parsl/configs/illinoiscluster.py,sha256=ZR22A8uwFb8tzSzmU1D0kR0qcr5Thr0j-7Nb5hiCgQ8,1170
+parsl/configs/improv.py,sha256=le9fDip-Mr-HqKObiyHXbdR-Ne7cy15Ao5ONoUzCSaE,1252
 parsl/configs/kubernetes.py,sha256=s6ABVRwHEKsIFi-w9gc5OK-P0UDmlAZsoHc6OZ3oOD4,1325
 parsl/configs/local_threads.py,sha256=I1VFfGo2TMTrBL9g_rlG3TEqEWkhL-AHpkqJ3lvcTf8,221
 parsl/configs/midway.py,sha256=An2Z-TbL3b6AP3uQwauxtUqZaYO2CtUiP8XH05hpWks,1221
@@ -71,7 +72,7 @@ parsl/dataflow/taskrecord.py,sha256=-FuujdZQ1y5GSc-PJ91QKGT-Kp0lrg70MFDoxpbWI1Q,
 parsl/executors/__init__.py,sha256=Cg8e-F2NUaBD8A9crDAXKCSdoBEwQVIdgm4FlXd-wvk,476
 parsl/executors/base.py,sha256=5A59mCXPjYNCep9JgfvIjBdZvGV-1mNVHklr-ZIEojg,5200
 parsl/executors/errors.py,sha256=xVswxgi7vmJcUMCeYDAPK8sQT2kHFFROVoOr0dnmcWE,2098
-parsl/executors/status_handling.py,sha256=
+parsl/executors/status_handling.py,sha256=nxbkiGr6f3xDc0nsUeSrMMxlj7UD32K7nOLCLzfthDs,15416
 parsl/executors/threads.py,sha256=hJt1LzxphqX4fe_9R9Cf1MU0lepWTU_eJe8O665B0Xo,3352
 parsl/executors/flux/__init__.py,sha256=P9grTTeRPXfqXurFhlSS7XhmE6tTbnCnyQ1f9b-oYHE,136
 parsl/executors/flux/execute_parsl_task.py,sha256=gRN7F4HhdrKQ-bvn4wXrquBzFOp_9WF88hMIeUaRg5I,1553
@@ -79,8 +80,8 @@ parsl/executors/flux/executor.py,sha256=8_xakLUu5zNJAHL0LbeTCFEWqWzRK1eE-3ep4GII
 parsl/executors/flux/flux_instance_manager.py,sha256=5T3Rp7ZM-mlT0Pf0Gxgs5_YmnaPrSF9ec7zvRfLfYJw,2129
 parsl/executors/high_throughput/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/executors/high_throughput/errors.py,sha256=Sak8e8UpiEcXefUjMHbhyXc4Rn7kJtOoh7L8wreBQdk,1638
-parsl/executors/high_throughput/executor.py,sha256=
-parsl/executors/high_throughput/interchange.py,sha256=
+parsl/executors/high_throughput/executor.py,sha256=x6DGXdriDIPpfDK6yms7XTUrkwxNCHNfz6X9kJRvt2w,37904
+parsl/executors/high_throughput/interchange.py,sha256=WP9zseYYb0B8522j8wt3yhO12bzmFIxdCIepEU-4oWA,30877
 parsl/executors/high_throughput/manager_record.py,sha256=yn3L8TUJFkgm2lX1x0SeS9mkvJowC0s2VIMCFiU7ThM,455
 parsl/executors/high_throughput/manager_selector.py,sha256=uRaEtcbDO2vXf8vjEcm7bfZVdeUlSPTRc3G4oFRO29M,820
 parsl/executors/high_throughput/monitoring_info.py,sha256=HC0drp6nlXQpAop5PTUKNjdXMgtZVvrBL0JzZJebPP4,298
@@ -113,20 +114,20 @@ parsl/jobs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/jobs/error_handlers.py,sha256=BBXwUAMJpBm0HxV1P-I6jv7ZF9wcrhnCfzSTlsd2g4w,2319
 parsl/jobs/errors.py,sha256=cpSQXCrlKtuHsQf7usjF-lX8XsDkFnE5kWpmFjiN6OU,178
 parsl/jobs/job_status_poller.py,sha256=b37JOqDpSesqeSreEh1HzfVTFnD5Aoy6k8JDXkkPDmk,2192
-parsl/jobs/states.py,sha256=
+parsl/jobs/states.py,sha256=dUM8gC4YVpUjLMARJJ_tDERs6oHsoNheAtG6JWPIJt4,5058
 parsl/jobs/strategy.py,sha256=KYcIpjWVKLYbM0TXhue9Zp2a7I1zkbHx4raPFiDlewA,13799
 parsl/launchers/__init__.py,sha256=jJeDOWGKJjvpmWTLsj1zSqce_UAhWRc_IO-TzaOAlII,579
 parsl/launchers/base.py,sha256=CblcvPTJiu-MNLWaRtFe29SZQ0BpTOlaY8CGcHdlHIE,538
 parsl/launchers/errors.py,sha256=8YMV_CHpBNVa4eXkGE4x5DaFQlZkDCRCHmBktYcY6TA,467
 parsl/launchers/launchers.py,sha256=VB--fiVv_IQne3DydTMSdGUY0o0g69puAs-Hd3mJ2vo,15464
 parsl/monitoring/__init__.py,sha256=0ywNz6i0lM1xo_7_BIxhETDGeVd2C_0wwD7qgeaMR4c,83
-parsl/monitoring/db_manager.py,sha256=
+parsl/monitoring/db_manager.py,sha256=l7Qiub4JsR6QUzTYUAJ9sVytZOvba2QMBdFH3cGbNIo,33336
 parsl/monitoring/errors.py,sha256=D6jpYzEzp0d6FmVKGqhvjAxr4ztZfJX2s-aXemH9bBU,148
 parsl/monitoring/message_type.py,sha256=Khn88afNxcOIciKiCK4GLnn90I5BlRTiOL3zK-P07yQ,401
-parsl/monitoring/monitoring.py,sha256=
+parsl/monitoring/monitoring.py,sha256=q_U2zpcd_hy0cxdWNXF_qhNBe1SQDipStvD1LdcWhlo,13098
 parsl/monitoring/radios.py,sha256=cHdpBOW1ITYvFnOgYjziuZOauq8p7mlSBOvcbIP78mg,6437
 parsl/monitoring/remote.py,sha256=avIWMvejN0LeIXpt_RCXJxGLbsXhapUab2rS5Tmjca4,13739
-parsl/monitoring/router.py,sha256=
+parsl/monitoring/router.py,sha256=8zWTaYIXWsgpMranTTEPhTPqQSmT2ePK8JJmfW8K34s,9256
 parsl/monitoring/types.py,sha256=_WGizCTgQVOkJ2dvNfsvHpYBj21Ky3bJsmyIskIx10I,631
 parsl/monitoring/queries/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/monitoring/queries/pandas.py,sha256=0Z2r0rjTKCemf0eaDkF1irvVHn5g7KC5SYETvQPRxwU,2232
@@ -186,7 +187,7 @@ parsl/providers/pbspro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 parsl/providers/pbspro/pbspro.py,sha256=jh9rzSOKRf0LKtqHSaolqVQtRa1jyjcZLsjk8Wp-llg,8794
 parsl/providers/pbspro/template.py,sha256=y-Dher--t5Eury-c7cAuSZs9FEUXWiruFUI07v81558,315
 parsl/providers/slurm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parsl/providers/slurm/slurm.py,sha256=
+parsl/providers/slurm/slurm.py,sha256=qFG0MNSV6oG5mBYusS8y53DR13Nhq9DxQ6bGfncbJeQ,15719
 parsl/providers/slurm/template.py,sha256=KpgBEFMc1ps-38jdrk13xUGx9TCivu-iF90jgQDdiEQ,315
 parsl/providers/torque/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/providers/torque/template.py,sha256=4qfc2gmlEhRCAD7erFDOs4prJQ43I8s4E8DSUSVQx3A,358
@@ -340,7 +341,7 @@ parsl/tests/test_htex/test_htex.py,sha256=5ylQvWgmSLP3lOdoHxqK9wkvAgfgeJx6gihKPk
 parsl/tests/test_htex/test_manager_failure.py,sha256=N-obuSZ8f7XA_XcddoN2LWKSVtpKUZvTHb7BFelS3iQ,1143
 parsl/tests/test_htex/test_managers_command.py,sha256=Y-eUjtBzwW9erCYdph9bOesbkUvX8QUPqXt27DCgVS8,951
 parsl/tests/test_htex/test_missing_worker.py,sha256=gyp5i7_t-JHyJGtz_eXZKKBY5w8oqLOIxO6cJgGJMtQ,745
-parsl/tests/test_htex/test_multiple_disconnected_blocks.py,sha256=
+parsl/tests/test_htex/test_multiple_disconnected_blocks.py,sha256=2vXZoIx4NuAWYuiNoL5Gxr85w72qZ7Kdb3JGh0FufTg,1867
 parsl/tests/test_htex/test_resource_spec_validation.py,sha256=k1zQ--46bCyhOnt2UTaYnSh0I2UhwX747ISAfy8xPvk,952
 parsl/tests/test_htex/test_worker_failure.py,sha256=Uz-RHI-LK78FMjXUvrUFmo4iYfmpDVBUcBxxRb3UG9M,603
 parsl/tests/test_htex/test_zmq_binding.py,sha256=Bq1HHuMxBE_AcaP1VZ-RqE4euCHO__Du05b2UZ5H1RA,3950
@@ -349,7 +350,7 @@ parsl/tests/test_monitoring/test_app_names.py,sha256=ayyxySGWpKSe9dDw2UeJo1dicxj
 parsl/tests/test_monitoring/test_basic.py,sha256=nQERwVH56CjrKc_YSsMxH5UziJDqN2357Vhyd0brbRU,4177
 parsl/tests/test_monitoring/test_db_locks.py,sha256=3s3c1xhKo230ZZIJ3f1Ca4U7LcEdXnanOGVXQyNlk2U,2895
 parsl/tests/test_monitoring/test_fuzz_zmq.py,sha256=--3-pQUvXXbkr8v_BEJoPvVvNly1oXvrD2nJh6yl_0M,3436
-parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py,sha256=
+parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py,sha256=BAnl80waEaE41pvtpYD-AbNgdfF7QBgVwcCN9IsFPTM,2746
 parsl/tests/test_monitoring/test_incomplete_futures.py,sha256=ZnO1sFSwlWUBHX64C_zwfTVRVC_UFNlU4h0POgx6NEo,2005
 parsl/tests/test_monitoring/test_memoization_representation.py,sha256=dknv2nO7pNZ1jGxWGsC_AW3rs90gjMIeC5d7pIJ75Xc,2645
 parsl/tests/test_monitoring/test_stdouterr.py,sha256=9FQSfiaMrOpoSwravZuEwmdgUgI7iG0TPRucsYC_NJA,4498
@@ -415,6 +416,7 @@ parsl/tests/test_regression/test_98.py,sha256=E7dituuonKN5uWocZkJYZlaE5x5rDM4MZl
 parsl/tests/test_scaling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parsl/tests/test_scaling/test_block_error_handler.py,sha256=OS1IyiK8gjRFI1VzpmOvEnKsPev2vKmC6Z2Hp5LaHpA,6068
 parsl/tests/test_scaling/test_regression_1621.py,sha256=cAPjJ0p_VLZm9Z6EK7QuOgeO5KpcUXQ0ar698T6uMy4,1944
+parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py,sha256=uL4dmaxqix9K6P-5vDTFqPye1BIeyynJjiYZBx5XI3E,2982
 parsl/tests/test_scaling/test_scale_down.py,sha256=u8TbbVM2PXgy4Zg7bAkh0C-KQuF1kD_WEsO79R0Y-UE,2820
 parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py,sha256=A-aDudFnWvOr5Z4m3Z0WW7-MJZ6ZveEneogZEzSom1k,4596
 parsl/tests/test_scaling/test_scale_down_htex_unregistered.py,sha256=4DYZB9BMDzyC659bf-gmU3ltnRvCgXVrfnnehb7cL5c,2029
@@ -459,13 +461,13 @@ parsl/usage_tracking/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
 parsl/usage_tracking/api.py,sha256=iaCY58Dc5J4UM7_dJzEEs871P1p1HdxBMtNGyVdzc9g,1821
 parsl/usage_tracking/levels.py,sha256=xbfzYEsd55KiZJ-mzNgPebvOH4rRHum04hROzEf41tU,291
 parsl/usage_tracking/usage.py,sha256=qNEJ7nPimqd3Y7OWFLdYmNwJ6XDKlyfV_fTzasxsQw8,8690
-parsl-2024.
-parsl-2024.
-parsl-2024.
-parsl-2024.
-parsl-2024.
-parsl-2024.
-parsl-2024.
-parsl-2024.
-parsl-2024.
-parsl-2024.
+parsl-2024.9.2.data/scripts/exec_parsl_function.py,sha256=RUkJ4JSJAjr7YyRZ58zhMdg8cR5dVV9odUl3AuzNf3k,7802
+parsl-2024.9.2.data/scripts/interchange.py,sha256=2tsbwd055SEnSpWLNNoqMW6o6ohRJFNSgvgN_umsqN8,30864
+parsl-2024.9.2.data/scripts/parsl_coprocess.py,sha256=zrVjEqQvFOHxsLufPi00xzMONagjVwLZbavPM7bbjK4,5722
+parsl-2024.9.2.data/scripts/process_worker_pool.py,sha256=78QKnV5KbY_vcteC6k60gpDE4wEk6hsciet_qzs9QoU,43061
+parsl-2024.9.2.dist-info/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
+parsl-2024.9.2.dist-info/METADATA,sha256=RHRyaL2xjgp_GVytcH5CI5Q9uKAfxJ54ZJzfKC-rFNY,4120
+parsl-2024.9.2.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+parsl-2024.9.2.dist-info/entry_points.txt,sha256=XqnsWDYoEcLbsMcpnYGKLEnSBmaIe1YoM5YsBdJG2tI,176
+parsl-2024.9.2.dist-info/top_level.txt,sha256=PIheYoUFQtF2icLsgOykgU-Cjuwr2Oi6On2jo5RYgRM,6
+parsl-2024.9.2.dist-info/RECORD,,
{parsl-2024.8.19.data → parsl-2024.9.2.data}/scripts/exec_parsl_function.py
File without changes
{parsl-2024.8.19.data → parsl-2024.9.2.data}/scripts/parsl_coprocess.py
File without changes
{parsl-2024.8.19.data → parsl-2024.9.2.data}/scripts/process_worker_pool.py
File without changes
{parsl-2024.8.19.dist-info → parsl-2024.9.2.dist-info}/LICENSE
File without changes
{parsl-2024.8.19.dist-info → parsl-2024.9.2.dist-info}/WHEEL
File without changes
{parsl-2024.8.19.dist-info → parsl-2024.9.2.dist-info}/entry_points.txt
File without changes
{parsl-2024.8.19.dist-info → parsl-2024.9.2.dist-info}/top_level.txt
File without changes