parsl 2025.8.4__py3-none-any.whl → 2025.11.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. parsl/__init__.py +0 -4
  2. parsl/app/bash.py +1 -1
  3. parsl/benchmark/perf.py +73 -17
  4. parsl/concurrent/__init__.py +95 -14
  5. parsl/curvezmq.py +0 -16
  6. parsl/data_provider/globus.py +3 -1
  7. parsl/dataflow/dflow.py +107 -207
  8. parsl/dataflow/memoization.py +144 -31
  9. parsl/dataflow/states.py +5 -5
  10. parsl/executors/base.py +2 -2
  11. parsl/executors/execute_task.py +2 -8
  12. parsl/executors/flux/executor.py +4 -6
  13. parsl/executors/globus_compute.py +0 -4
  14. parsl/executors/high_throughput/executor.py +86 -25
  15. parsl/executors/high_throughput/interchange.py +55 -42
  16. parsl/executors/high_throughput/mpi_executor.py +1 -2
  17. parsl/executors/high_throughput/mpi_resource_management.py +7 -14
  18. parsl/executors/high_throughput/process_worker_pool.py +32 -7
  19. parsl/executors/high_throughput/zmq_pipes.py +36 -67
  20. parsl/executors/radical/executor.py +2 -6
  21. parsl/executors/radical/rpex_worker.py +2 -2
  22. parsl/executors/taskvine/executor.py +5 -1
  23. parsl/executors/threads.py +5 -2
  24. parsl/jobs/states.py +2 -2
  25. parsl/jobs/strategy.py +7 -6
  26. parsl/monitoring/db_manager.py +21 -23
  27. parsl/monitoring/monitoring.py +2 -2
  28. parsl/monitoring/radios/filesystem.py +2 -1
  29. parsl/monitoring/radios/htex.py +2 -1
  30. parsl/monitoring/radios/multiprocessing.py +2 -1
  31. parsl/monitoring/radios/udp.py +2 -1
  32. parsl/monitoring/radios/udp_router.py +2 -2
  33. parsl/monitoring/radios/zmq_router.py +2 -2
  34. parsl/multiprocessing.py +0 -49
  35. parsl/providers/base.py +24 -37
  36. parsl/providers/pbspro/pbspro.py +1 -1
  37. parsl/serialize/__init__.py +6 -9
  38. parsl/serialize/facade.py +0 -32
  39. parsl/tests/configs/local_threads_globus.py +18 -14
  40. parsl/tests/configs/taskvine_ex.py +1 -1
  41. parsl/tests/manual_tests/test_memory_limits.py +1 -1
  42. parsl/tests/sites/test_concurrent.py +51 -3
  43. parsl/tests/test_checkpointing/test_periodic.py +15 -9
  44. parsl/tests/test_checkpointing/test_python_checkpoint_1.py +6 -3
  45. parsl/tests/test_checkpointing/test_regression_233.py +0 -1
  46. parsl/tests/test_curvezmq.py +0 -42
  47. parsl/tests/test_execute_task.py +2 -11
  48. parsl/tests/test_htex/test_command_concurrency_regression_1321.py +54 -0
  49. parsl/tests/test_htex/test_htex.py +36 -1
  50. parsl/tests/test_htex/test_interchange_exit_bad_registration.py +2 -2
  51. parsl/tests/test_htex/test_priority_queue.py +26 -3
  52. parsl/tests/test_htex/test_zmq_binding.py +2 -1
  53. parsl/tests/test_mpi_apps/test_mpi_scheduler.py +18 -43
  54. parsl/tests/test_python_apps/test_basic.py +0 -14
  55. parsl/tests/test_python_apps/test_depfail_propagation.py +11 -1
  56. parsl/tests/test_python_apps/test_exception.py +19 -0
  57. parsl/tests/test_python_apps/test_garbage_collect.py +1 -6
  58. parsl/tests/test_python_apps/test_memoize_2.py +11 -1
  59. parsl/tests/test_python_apps/test_memoize_exception.py +41 -0
  60. parsl/tests/test_regression/test_3874.py +47 -0
  61. parsl/tests/test_scaling/test_regression_3696_oscillation.py +1 -0
  62. parsl/tests/test_staging/test_staging_globus.py +2 -2
  63. parsl/tests/test_utils/test_representation_mixin.py +53 -0
  64. parsl/tests/unit/test_globus_compute_executor.py +11 -2
  65. parsl/utils.py +11 -3
  66. parsl/version.py +1 -1
  67. {parsl-2025.8.4.data → parsl-2025.11.10.data}/scripts/interchange.py +55 -42
  68. {parsl-2025.8.4.data → parsl-2025.11.10.data}/scripts/process_worker_pool.py +32 -7
  69. {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/METADATA +64 -50
  70. {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/RECORD +76 -81
  71. {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/WHEEL +1 -1
  72. parsl/tests/configs/local_threads_checkpoint_periodic.py +0 -11
  73. parsl/tests/configs/local_threads_no_cache.py +0 -11
  74. parsl/tests/site_tests/test_provider.py +0 -88
  75. parsl/tests/site_tests/test_site.py +0 -70
  76. parsl/tests/test_aalst_patterns.py +0 -474
  77. parsl/tests/test_docs/test_workflow2.py +0 -42
  78. parsl/tests/test_error_handling/test_rand_fail.py +0 -171
  79. parsl/tests/test_regression/test_854.py +0 -62
  80. parsl/tests/test_serialization/test_pack_resource_spec.py +0 -23
  81. {parsl-2025.8.4.data → parsl-2025.11.10.data}/scripts/exec_parsl_function.py +0 -0
  82. {parsl-2025.8.4.data → parsl-2025.11.10.data}/scripts/parsl_coprocess.py +0 -0
  83. {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/entry_points.txt +0 -0
  84. {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info/licenses}/LICENSE +0 -0
  85. {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/top_level.txt +0 -0
parsl/dataflow/dflow.py CHANGED
@@ -6,7 +6,6 @@ import datetime
6
6
  import inspect
7
7
  import logging
8
8
  import os
9
- import pickle
10
9
  import random
11
10
  import sys
12
11
  import threading
@@ -50,7 +49,7 @@ from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSen
50
49
  from parsl.monitoring.remote import monitor_wrapper
51
50
  from parsl.process_loggers import wrap_with_logs
52
51
  from parsl.usage_tracking.usage import UsageTracker
53
- from parsl.utils import Timer, get_all_checkpoints, get_std_fname_mode, get_version
52
+ from parsl.utils import get_std_fname_mode, get_version
54
53
 
55
54
  logger = logging.getLogger(__name__)
56
55
 
@@ -101,8 +100,6 @@ class DataFlowKernel:
101
100
 
102
101
  logger.info("Parsl version: {}".format(get_version()))
103
102
 
104
- self.checkpoint_lock = threading.Lock()
105
-
106
103
  self.usage_tracker = UsageTracker(self)
107
104
  self.usage_tracker.send_start_message()
108
105
 
@@ -168,18 +165,12 @@ class DataFlowKernel:
168
165
  self.monitoring_radio.send((MessageType.WORKFLOW_INFO,
169
166
  workflow_info))
170
167
 
171
- if config.checkpoint_files is not None:
172
- checkpoint_files = config.checkpoint_files
173
- elif config.checkpoint_files is None and config.checkpoint_mode is not None:
174
- checkpoint_files = get_all_checkpoints(self.run_dir)
175
- else:
176
- checkpoint_files = []
177
-
178
- self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
179
- self.checkpointed_tasks = 0
180
- self._checkpoint_timer = None
181
- self.checkpoint_mode = config.checkpoint_mode
182
- self.checkpointable_tasks: List[TaskRecord] = []
168
+ self.memoizer = Memoizer(memoize=config.app_cache,
169
+ checkpoint_mode=config.checkpoint_mode,
170
+ checkpoint_files=config.checkpoint_files,
171
+ checkpoint_period=config.checkpoint_period)
172
+ self.memoizer.run_dir = self.run_dir
173
+ self.memoizer.start()
183
174
 
184
175
  # this must be set before executors are added since add_executors calls
185
176
  # job_status_poller.add_executors.
@@ -195,22 +186,11 @@ class DataFlowKernel:
195
186
  self.add_executors(config.executors)
196
187
  self.add_executors([parsl_internal_executor])
197
188
 
198
- if self.checkpoint_mode == "periodic":
199
- if config.checkpoint_period is None:
200
- raise ConfigurationError("Checkpoint period must be specified with periodic checkpoint mode")
201
- else:
202
- try:
203
- h, m, s = map(int, config.checkpoint_period.split(':'))
204
- except Exception:
205
- raise ConfigurationError("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
206
- checkpoint_period = (h * 3600) + (m * 60) + s
207
- self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period, name="Checkpoint")
208
-
209
189
  self.task_count = 0
210
190
  self.tasks: Dict[int, TaskRecord] = {}
211
191
  self.submitter_lock = threading.Lock()
212
192
 
213
- self.dependency_launch_pool = cf.ThreadPoolExecutor(max_workers=1, thread_name_prefix="Dependency-Launch")
193
+ self._task_launch_pool = cf.ThreadPoolExecutor(max_workers=1, thread_name_prefix="Task-Launch")
214
194
 
215
195
  self.dependency_resolver = self.config.dependency_resolver if self.config.dependency_resolver is not None \
216
196
  else SHALLOW_DEPENDENCY_RESOLVER
@@ -239,7 +219,7 @@ class DataFlowKernel:
239
219
  else:
240
220
  raise InternalConsistencyError(f"Exit case for {mode} should be unreachable, validated by typeguard on Config()")
241
221
 
242
- def _send_task_log_info(self, task_record: TaskRecord) -> None:
222
+ def _send_task_info(self, task_record: TaskRecord) -> None:
243
223
  if self.monitoring_radio:
244
224
  task_log_info = self._create_task_log_info(task_record)
245
225
  self.monitoring_radio.send((MessageType.TASK_INFO, task_log_info))
@@ -371,84 +351,70 @@ class DataFlowKernel:
371
351
  else:
372
352
  task_record['fail_cost'] += 1
373
353
 
374
- if task_record['status'] == States.dep_fail:
354
+ if isinstance(e, DependencyError):
375
355
  logger.info("Task {} failed due to dependency failure so skipping retries".format(task_id))
376
- task_record['time_returned'] = datetime.datetime.now()
377
- self._send_task_log_info(task_record)
378
- with task_record['app_fu']._update_lock:
379
- task_record['app_fu'].set_exception(e)
356
+ self._complete_task_exception(task_record, States.dep_fail, e)
380
357
 
381
358
  elif task_record['fail_cost'] <= self._config.retries:
382
359
 
383
360
  # record the final state for this try before we mutate for retries
384
- self.update_task_state(task_record, States.fail_retryable)
385
- self._send_task_log_info(task_record)
361
+ self._update_task_state(task_record, States.fail_retryable)
362
+ self._send_task_info(task_record)
386
363
 
387
364
  task_record['try_id'] += 1
388
- self.update_task_state(task_record, States.pending)
365
+ self._update_task_state(task_record, States.pending)
389
366
  task_record['try_time_launched'] = None
390
367
  task_record['try_time_returned'] = None
391
368
  task_record['fail_history'] = []
392
- self._send_task_log_info(task_record)
369
+ self._send_task_info(task_record)
393
370
 
394
371
  logger.info("Task {} marked for retry".format(task_id))
395
372
 
396
373
  else:
397
374
  logger.exception("Task {} failed after {} retry attempts".format(task_id,
398
375
  task_record['try_id']))
399
- task_record['time_returned'] = datetime.datetime.now()
400
- self.update_task_state(task_record, States.failed)
401
- task_record['time_returned'] = datetime.datetime.now()
402
- self._send_task_log_info(task_record)
403
- with task_record['app_fu']._update_lock:
404
- task_record['app_fu'].set_exception(e)
376
+ self._complete_task_exception(task_record, States.failed, e)
405
377
 
406
378
  else:
407
379
  if task_record['from_memo']:
408
- self._complete_task(task_record, States.memo_done, res)
409
- self._send_task_log_info(task_record)
380
+ self._complete_task_result(task_record, States.memo_done, res)
381
+ elif not task_record['join']:
382
+ self._complete_task_result(task_record, States.exec_done, res)
410
383
  else:
411
- if not task_record['join']:
412
- self._complete_task(task_record, States.exec_done, res)
413
- self._send_task_log_info(task_record)
384
+ # This is a join task, and the original task's function code has
385
+ # completed. That means that the future returned by that code
386
+ # will be available inside the executor future, so we can now
387
+ # record the inner app ID in monitoring, and add a completion
388
+ # listener to that inner future.
389
+
390
+ joinable = future.result()
391
+
392
+ # Fail with a TypeError if the joinapp python body returned
393
+ # something we can't join on.
394
+ if isinstance(joinable, Future):
395
+ self._update_task_state(task_record, States.joining)
396
+ task_record['joins'] = joinable
397
+ task_record['join_lock'] = threading.Lock()
398
+ self._send_task_info(task_record)
399
+ joinable.add_done_callback(partial(self.handle_join_update, task_record))
400
+ elif joinable == []: # got a list, but it had no entries, and specifically, no Futures.
401
+ self._update_task_state(task_record, States.joining)
402
+ task_record['joins'] = joinable
403
+ task_record['join_lock'] = threading.Lock()
404
+ self._send_task_info(task_record)
405
+ self.handle_join_update(task_record, None)
406
+ elif isinstance(joinable, list) and [j for j in joinable if not isinstance(j, Future)] == []:
407
+ self._update_task_state(task_record, States.joining)
408
+ task_record['joins'] = joinable
409
+ task_record['join_lock'] = threading.Lock()
410
+ self._send_task_info(task_record)
411
+ for inner_future in joinable:
412
+ inner_future.add_done_callback(partial(self.handle_join_update, task_record))
414
413
  else:
415
- # This is a join task, and the original task's function code has
416
- # completed. That means that the future returned by that code
417
- # will be available inside the executor future, so we can now
418
- # record the inner app ID in monitoring, and add a completion
419
- # listener to that inner future.
420
-
421
- joinable = future.result()
422
-
423
- # Fail with a TypeError if the joinapp python body returned
424
- # something we can't join on.
425
- if isinstance(joinable, Future):
426
- self.update_task_state(task_record, States.joining)
427
- task_record['joins'] = joinable
428
- task_record['join_lock'] = threading.Lock()
429
- self._send_task_log_info(task_record)
430
- joinable.add_done_callback(partial(self.handle_join_update, task_record))
431
- elif joinable == []: # got a list, but it had no entries, and specifically, no Futures.
432
- self.update_task_state(task_record, States.joining)
433
- task_record['joins'] = joinable
434
- task_record['join_lock'] = threading.Lock()
435
- self._send_task_log_info(task_record)
436
- self.handle_join_update(task_record, None)
437
- elif isinstance(joinable, list) and [j for j in joinable if not isinstance(j, Future)] == []:
438
- self.update_task_state(task_record, States.joining)
439
- task_record['joins'] = joinable
440
- task_record['join_lock'] = threading.Lock()
441
- self._send_task_log_info(task_record)
442
- for inner_future in joinable:
443
- inner_future.add_done_callback(partial(self.handle_join_update, task_record))
444
- else:
445
- task_record['time_returned'] = datetime.datetime.now()
446
- self.update_task_state(task_record, States.failed)
447
- task_record['time_returned'] = datetime.datetime.now()
448
- self._send_task_log_info(task_record)
449
- with task_record['app_fu']._update_lock:
450
- task_record['app_fu'].set_exception(
451
- TypeError(f"join_app body must return a Future or list of Futures, got {joinable} of type {type(joinable)}"))
414
+ self._complete_task_exception(
415
+ task_record,
416
+ States.failed,
417
+ TypeError(f"join_app body must return a Future or list of Futures, got {joinable} of type {type(joinable)}"))
452
418
 
453
419
  self._log_std_streams(task_record)
454
420
 
@@ -519,10 +485,7 @@ class DataFlowKernel:
519
485
  # no need to update the fail cost because join apps are never
520
486
  # retried
521
487
 
522
- self.update_task_state(task_record, States.failed)
523
- task_record['time_returned'] = datetime.datetime.now()
524
- with task_record['app_fu']._update_lock:
525
- task_record['app_fu'].set_exception(e)
488
+ self._complete_task_exception(task_record, States.failed, e)
526
489
 
527
490
  else:
528
491
  # all the joinables succeeded, so construct a result:
@@ -535,12 +498,10 @@ class DataFlowKernel:
535
498
  res.append(future.result())
536
499
  else:
537
500
  raise TypeError(f"Unknown joinable type {type(joinable)}")
538
- self._complete_task(task_record, States.exec_done, res)
501
+ self._complete_task_result(task_record, States.exec_done, res)
539
502
 
540
503
  self._log_std_streams(task_record)
541
504
 
542
- self._send_task_log_info(task_record)
543
-
544
505
  def handle_app_update(self, task_record: TaskRecord, future: AppFuture) -> None:
545
506
  """This function is called as a callback when an AppFuture
546
507
  is in its final state.
@@ -561,40 +522,50 @@ class DataFlowKernel:
561
522
  if not task_record['app_fu'] == future:
562
523
  logger.error("Internal consistency error: callback future is not the app_fu in task structure, for task {}".format(task_id))
563
524
 
564
- self.memoizer.update_memo(task_record, future)
565
-
566
- # Cover all checkpointing cases here:
567
- # Do we need to checkpoint now, or queue for later,
568
- # or do nothing?
569
- if self.checkpoint_mode == 'task_exit':
570
- self.checkpoint(tasks=[task_record])
571
- elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
572
- with self.checkpoint_lock:
573
- self.checkpointable_tasks.append(task_record)
574
- elif self.checkpoint_mode is None:
575
- pass
576
- else:
577
- raise InternalConsistencyError(f"Invalid checkpoint mode {self.checkpoint_mode}")
525
+ self.memoizer.update_checkpoint(task_record)
578
526
 
579
527
  self.wipe_task(task_id)
580
528
  return
581
529
 
582
- def _complete_task(self, task_record: TaskRecord, new_state: States, result: Any) -> None:
530
+ def _complete_task_result(self, task_record: TaskRecord, new_state: States, result: Any) -> None:
583
531
  """Set a task into a completed state
584
532
  """
585
533
  assert new_state in FINAL_STATES
586
534
  assert new_state not in FINAL_FAILURE_STATES
587
535
  old_state = task_record['status']
588
536
 
589
- self.update_task_state(task_record, new_state)
537
+ self._update_task_state(task_record, new_state)
590
538
 
591
539
  logger.info(f"Task {task_record['id']} completed ({old_state.name} -> {new_state.name})")
592
540
  task_record['time_returned'] = datetime.datetime.now()
593
541
 
542
+ self.memoizer.update_memo_result(task_record, result)
543
+
544
+ self._send_task_info(task_record)
545
+
594
546
  with task_record['app_fu']._update_lock:
595
547
  task_record['app_fu'].set_result(result)
596
548
 
597
- def update_task_state(self, task_record: TaskRecord, new_state: States) -> None:
549
+ def _complete_task_exception(self, task_record: TaskRecord, new_state: States, exception: BaseException) -> None:
550
+ """Set a task into a failure state
551
+ """
552
+ assert new_state in FINAL_STATES
553
+ assert new_state in FINAL_FAILURE_STATES
554
+ old_state = task_record['status']
555
+
556
+ self._update_task_state(task_record, new_state)
557
+
558
+ logger.info(f"Task {task_record['id']} failed ({old_state.name} -> {new_state.name})")
559
+ task_record['time_returned'] = datetime.datetime.now()
560
+
561
+ self.memoizer.update_memo_exception(task_record, exception)
562
+
563
+ self._send_task_info(task_record)
564
+
565
+ with task_record['app_fu']._update_lock:
566
+ task_record['app_fu'].set_exception(exception)
567
+
568
+ def _update_task_state(self, task_record: TaskRecord, new_state: States) -> None:
598
569
  """Updates a task record state, and recording an appropriate change
599
570
  to task state counters.
600
571
  """
@@ -637,7 +608,7 @@ class DataFlowKernel:
637
608
  launch_if_ready is thread safe, so may be called from any thread
638
609
  or callback.
639
610
  """
640
- self.dependency_launch_pool.submit(self._launch_if_ready_async, task_record)
611
+ self._task_launch_pool.submit(self._launch_if_ready_async, task_record)
641
612
 
642
613
  @wrap_with_logs
643
614
  def _launch_if_ready_async(self, task_record: TaskRecord) -> None:
@@ -645,7 +616,7 @@ class DataFlowKernel:
645
616
  _launch_if_ready will launch the specified task, if it is ready
646
617
  to run (for example, without dependencies, and in pending state).
647
618
  """
648
- exec_fu = None
619
+ exec_fu: Future
649
620
 
650
621
  task_id = task_record['id']
651
622
  with task_record['task_launch_lock']:
@@ -684,28 +655,24 @@ class DataFlowKernel:
684
655
  else:
685
656
  logger.info(
686
657
  "Task {} failed due to dependency failure".format(task_id))
687
- # Raise a dependency exception
688
- self.update_task_state(task_record, States.dep_fail)
689
-
690
- self._send_task_log_info(task_record)
691
658
 
692
659
  exec_fu = Future()
693
660
  exec_fu.set_exception(DependencyError(exceptions_tids,
694
661
  task_id))
695
662
 
696
- if exec_fu:
697
- assert isinstance(exec_fu, Future)
698
- try:
699
- exec_fu.add_done_callback(partial(self.handle_exec_update, task_record))
700
- except Exception:
701
- # this exception is ignored here because it is assumed that exception
702
- # comes from directly executing handle_exec_update (because exec_fu is
703
- # done already). If the callback executes later, then any exception
704
- # coming out of the callback will be ignored and not propate anywhere,
705
- # so this block attempts to keep the same behaviour here.
706
- logger.error("add_done_callback got an exception which will be ignored", exc_info=True)
663
+ assert isinstance(exec_fu, Future), "Every code path leading here needs to define exec_fu"
707
664
 
708
- task_record['exec_fu'] = exec_fu
665
+ try:
666
+ exec_fu.add_done_callback(partial(self.handle_exec_update, task_record))
667
+ except Exception:
668
+ # this exception is ignored here because it is assumed that exception
669
+ # comes from directly executing handle_exec_update (because exec_fu is
670
+ # done already). If the callback executes later, then any exception
671
+ # coming out of the callback will be ignored and not propate anywhere,
672
+ # so this block attempts to keep the same behaviour here.
673
+ logger.error("add_done_callback got an exception which will be ignored", exc_info=True)
674
+
675
+ task_record['exec_fu'] = exec_fu
709
676
 
710
677
  def launch_task(self, task_record: TaskRecord) -> Future:
711
678
  """Handle the actual submission of the task to the executor layer.
@@ -759,9 +726,9 @@ class DataFlowKernel:
759
726
 
760
727
  with self.submitter_lock:
761
728
  exec_fu = executor.submit(function, task_record['resource_specification'], *args, **kwargs)
762
- self.update_task_state(task_record, States.launched)
729
+ self._update_task_state(task_record, States.launched)
763
730
 
764
- self._send_task_log_info(task_record)
731
+ self._send_task_info(task_record)
765
732
 
766
733
  if hasattr(exec_fu, "parsl_executor_task_id"):
767
734
  logger.info(
@@ -1034,7 +1001,7 @@ class DataFlowKernel:
1034
1001
  'try_time_returned': None,
1035
1002
  'resource_specification': resource_specification}
1036
1003
 
1037
- self.update_task_state(task_record, States.unsched)
1004
+ self._update_task_state(task_record, States.unsched)
1038
1005
 
1039
1006
  for kw in ['stdout', 'stderr']:
1040
1007
  if kw in app_kwargs:
@@ -1087,10 +1054,10 @@ class DataFlowKernel:
1087
1054
  waiting_message))
1088
1055
 
1089
1056
  app_fu.add_done_callback(partial(self.handle_app_update, task_record))
1090
- self.update_task_state(task_record, States.pending)
1057
+ self._update_task_state(task_record, States.pending)
1091
1058
  logger.debug("Task {} set to pending state with AppFuture: {}".format(task_id, task_record['app_fu']))
1092
1059
 
1093
- self._send_task_log_info(task_record)
1060
+ self._send_task_info(task_record)
1094
1061
 
1095
1062
  # at this point add callbacks to all dependencies to do a launch_if_ready
1096
1063
  # call whenever a dependency completes.
@@ -1142,7 +1109,7 @@ class DataFlowKernel:
1142
1109
  executor.monitoring_messages = self.monitoring.resource_msgs
1143
1110
  logger.debug("Starting monitoring receiver for executor %s "
1144
1111
  "with remote monitoring radio config %s",
1145
- executor, executor.remote_monitoring_radio)
1112
+ executor.label, executor.remote_monitoring_radio)
1146
1113
 
1147
1114
  executor.monitoring_receiver = executor.remote_monitoring_radio.create_receiver(resource_msgs=executor.monitoring_messages,
1148
1115
  run_dir=executor.run_dir)
@@ -1202,14 +1169,7 @@ class DataFlowKernel:
1202
1169
 
1203
1170
  self.log_task_states()
1204
1171
 
1205
- # Checkpointing takes priority over the rest of the tasks
1206
- # checkpoint if any valid checkpoint method is specified
1207
- if self.checkpoint_mode is not None:
1208
- self.checkpoint()
1209
-
1210
- if self._checkpoint_timer:
1211
- logger.info("Stopping checkpoint timer")
1212
- self._checkpoint_timer.close()
1172
+ self.memoizer.close()
1213
1173
 
1214
1174
  # Send final stats
1215
1175
  self.usage_tracker.send_end_message()
@@ -1243,9 +1203,9 @@ class DataFlowKernel:
1243
1203
  self.monitoring.close()
1244
1204
  logger.info("Terminated monitoring")
1245
1205
 
1246
- logger.info("Terminating dependency launch pool")
1247
- self.dependency_launch_pool.shutdown()
1248
- logger.info("Terminated dependency launch pool")
1206
+ logger.info("Terminating task launch pool")
1207
+ self._task_launch_pool.shutdown()
1208
+ logger.info("Terminated task launch pool")
1249
1209
 
1250
1210
  logger.info("Unregistering atexit hook")
1251
1211
  atexit.unregister(self.atexit_cleanup)
@@ -1267,68 +1227,8 @@ class DataFlowKernel:
1267
1227
  # should still see it.
1268
1228
  logger.info("DFK cleanup complete")
1269
1229
 
1270
- def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> str:
1271
- """Checkpoint the dfk incrementally to a checkpoint file.
1272
-
1273
- When called, every task that has been completed yet not
1274
- checkpointed is checkpointed to a file.
1275
-
1276
- Kwargs:
1277
- - tasks (List of task records) : List of task ids to checkpoint. Default=None
1278
- if set to None, we iterate over all tasks held by the DFK.
1279
-
1280
- .. note::
1281
- Checkpointing only works if memoization is enabled
1282
-
1283
- Returns:
1284
- Checkpoint dir if checkpoints were written successfully.
1285
- By default the checkpoints are written to the RUNDIR of the current
1286
- run under RUNDIR/checkpoints/tasks.pkl
1287
- """
1288
- with self.checkpoint_lock:
1289
- if tasks:
1290
- checkpoint_queue = tasks
1291
- else:
1292
- checkpoint_queue = self.checkpointable_tasks
1293
- self.checkpointable_tasks = []
1294
-
1295
- checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
1296
- checkpoint_tasks = checkpoint_dir + '/tasks.pkl'
1297
-
1298
- if not os.path.exists(checkpoint_dir):
1299
- os.makedirs(checkpoint_dir, exist_ok=True)
1300
-
1301
- count = 0
1302
-
1303
- with open(checkpoint_tasks, 'ab') as f:
1304
- for task_record in checkpoint_queue:
1305
- task_id = task_record['id']
1306
-
1307
- app_fu = task_record['app_fu']
1308
-
1309
- if app_fu.done() and app_fu.exception() is None:
1310
- hashsum = task_record['hashsum']
1311
- if not hashsum:
1312
- continue
1313
- t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
1314
-
1315
- # We are using pickle here since pickle dumps to a file in 'ab'
1316
- # mode behave like a incremental log.
1317
- pickle.dump(t, f)
1318
- count += 1
1319
- logger.debug("Task {} checkpointed".format(task_id))
1320
-
1321
- self.checkpointed_tasks += count
1322
-
1323
- if count == 0:
1324
- if self.checkpointed_tasks == 0:
1325
- logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
1326
- else:
1327
- logger.debug("No tasks checkpointed in this pass.")
1328
- else:
1329
- logger.info("Done checkpointing {} tasks".format(count))
1330
-
1331
- return checkpoint_dir
1230
+ def checkpoint(self) -> None:
1231
+ self.memoizer.checkpoint_queue()
1332
1232
 
1333
1233
  @staticmethod
1334
1234
  def _log_std_streams(task_record: TaskRecord) -> None: