parsl 2025.8.4__py3-none-any.whl → 2025.11.10__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- parsl/__init__.py +0 -4
- parsl/app/bash.py +1 -1
- parsl/benchmark/perf.py +73 -17
- parsl/concurrent/__init__.py +95 -14
- parsl/curvezmq.py +0 -16
- parsl/data_provider/globus.py +3 -1
- parsl/dataflow/dflow.py +107 -207
- parsl/dataflow/memoization.py +144 -31
- parsl/dataflow/states.py +5 -5
- parsl/executors/base.py +2 -2
- parsl/executors/execute_task.py +2 -8
- parsl/executors/flux/executor.py +4 -6
- parsl/executors/globus_compute.py +0 -4
- parsl/executors/high_throughput/executor.py +86 -25
- parsl/executors/high_throughput/interchange.py +55 -42
- parsl/executors/high_throughput/mpi_executor.py +1 -2
- parsl/executors/high_throughput/mpi_resource_management.py +7 -14
- parsl/executors/high_throughput/process_worker_pool.py +32 -7
- parsl/executors/high_throughput/zmq_pipes.py +36 -67
- parsl/executors/radical/executor.py +2 -6
- parsl/executors/radical/rpex_worker.py +2 -2
- parsl/executors/taskvine/executor.py +5 -1
- parsl/executors/threads.py +5 -2
- parsl/jobs/states.py +2 -2
- parsl/jobs/strategy.py +7 -6
- parsl/monitoring/db_manager.py +21 -23
- parsl/monitoring/monitoring.py +2 -2
- parsl/monitoring/radios/filesystem.py +2 -1
- parsl/monitoring/radios/htex.py +2 -1
- parsl/monitoring/radios/multiprocessing.py +2 -1
- parsl/monitoring/radios/udp.py +2 -1
- parsl/monitoring/radios/udp_router.py +2 -2
- parsl/monitoring/radios/zmq_router.py +2 -2
- parsl/multiprocessing.py +0 -49
- parsl/providers/base.py +24 -37
- parsl/providers/pbspro/pbspro.py +1 -1
- parsl/serialize/__init__.py +6 -9
- parsl/serialize/facade.py +0 -32
- parsl/tests/configs/local_threads_globus.py +18 -14
- parsl/tests/configs/taskvine_ex.py +1 -1
- parsl/tests/manual_tests/test_memory_limits.py +1 -1
- parsl/tests/sites/test_concurrent.py +51 -3
- parsl/tests/test_checkpointing/test_periodic.py +15 -9
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +6 -3
- parsl/tests/test_checkpointing/test_regression_233.py +0 -1
- parsl/tests/test_curvezmq.py +0 -42
- parsl/tests/test_execute_task.py +2 -11
- parsl/tests/test_htex/test_command_concurrency_regression_1321.py +54 -0
- parsl/tests/test_htex/test_htex.py +36 -1
- parsl/tests/test_htex/test_interchange_exit_bad_registration.py +2 -2
- parsl/tests/test_htex/test_priority_queue.py +26 -3
- parsl/tests/test_htex/test_zmq_binding.py +2 -1
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +18 -43
- parsl/tests/test_python_apps/test_basic.py +0 -14
- parsl/tests/test_python_apps/test_depfail_propagation.py +11 -1
- parsl/tests/test_python_apps/test_exception.py +19 -0
- parsl/tests/test_python_apps/test_garbage_collect.py +1 -6
- parsl/tests/test_python_apps/test_memoize_2.py +11 -1
- parsl/tests/test_python_apps/test_memoize_exception.py +41 -0
- parsl/tests/test_regression/test_3874.py +47 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +1 -0
- parsl/tests/test_staging/test_staging_globus.py +2 -2
- parsl/tests/test_utils/test_representation_mixin.py +53 -0
- parsl/tests/unit/test_globus_compute_executor.py +11 -2
- parsl/utils.py +11 -3
- parsl/version.py +1 -1
- {parsl-2025.8.4.data → parsl-2025.11.10.data}/scripts/interchange.py +55 -42
- {parsl-2025.8.4.data → parsl-2025.11.10.data}/scripts/process_worker_pool.py +32 -7
- {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/METADATA +64 -50
- {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/RECORD +76 -81
- {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/WHEEL +1 -1
- parsl/tests/configs/local_threads_checkpoint_periodic.py +0 -11
- parsl/tests/configs/local_threads_no_cache.py +0 -11
- parsl/tests/site_tests/test_provider.py +0 -88
- parsl/tests/site_tests/test_site.py +0 -70
- parsl/tests/test_aalst_patterns.py +0 -474
- parsl/tests/test_docs/test_workflow2.py +0 -42
- parsl/tests/test_error_handling/test_rand_fail.py +0 -171
- parsl/tests/test_regression/test_854.py +0 -62
- parsl/tests/test_serialization/test_pack_resource_spec.py +0 -23
- {parsl-2025.8.4.data → parsl-2025.11.10.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.8.4.data → parsl-2025.11.10.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/entry_points.txt +0 -0
- {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info/licenses}/LICENSE +0 -0
- {parsl-2025.8.4.dist-info → parsl-2025.11.10.dist-info}/top_level.txt +0 -0
parsl/dataflow/dflow.py
CHANGED

```diff
@@ -6,7 +6,6 @@ import datetime
 import inspect
 import logging
 import os
-import pickle
 import random
 import sys
 import threading
@@ -50,7 +49,7 @@ from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
 from parsl.monitoring.remote import monitor_wrapper
 from parsl.process_loggers import wrap_with_logs
 from parsl.usage_tracking.usage import UsageTracker
-from parsl.utils import Timer, get_all_checkpoints, get_std_fname_mode, get_version
+from parsl.utils import get_std_fname_mode, get_version

 logger = logging.getLogger(__name__)

@@ -101,8 +100,6 @@ class DataFlowKernel:

         logger.info("Parsl version: {}".format(get_version()))

-        self.checkpoint_lock = threading.Lock()
-
         self.usage_tracker = UsageTracker(self)
         self.usage_tracker.send_start_message()

@@ -168,18 +165,12 @@ class DataFlowKernel:
             self.monitoring_radio.send((MessageType.WORKFLOW_INFO,
                                         workflow_info))

-        if config.checkpoint_files is not None:
-            checkpoint_files = config.checkpoint_files
-        elif config.checkpoint_files is None and config.checkpoint_mode is not None:
-            checkpoint_files = get_all_checkpoints(self.run_dir)
-        else:
-            checkpoint_files = []
-
-        self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
-        self.checkpointed_tasks = 0
-        self._checkpoint_timer = None
-        self.checkpoint_mode = config.checkpoint_mode
-        self.checkpointable_tasks: List[TaskRecord] = []
+        self.memoizer = Memoizer(memoize=config.app_cache,
+                                 checkpoint_mode=config.checkpoint_mode,
+                                 checkpoint_files=config.checkpoint_files,
+                                 checkpoint_period=config.checkpoint_period)
+        self.memoizer.run_dir = self.run_dir
+        self.memoizer.start()

         # this must be set before executors are added since add_executors calls
         # job_status_poller.add_executors.
@@ -195,22 +186,11 @@ class DataFlowKernel:
         self.add_executors(config.executors)
         self.add_executors([parsl_internal_executor])

-        if self.checkpoint_mode == "periodic":
-            if config.checkpoint_period is None:
-                raise ConfigurationError("Checkpoint period must be specified with periodic checkpoint mode")
-            else:
-                try:
-                    h, m, s = map(int, config.checkpoint_period.split(':'))
-                except Exception:
-                    raise ConfigurationError("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
-                checkpoint_period = (h * 3600) + (m * 60) + s
-                self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period, name="Checkpoint")
-
         self.task_count = 0
         self.tasks: Dict[int, TaskRecord] = {}
         self.submitter_lock = threading.Lock()

-        self._dependency_launch_pool = cf.ThreadPoolExecutor(max_workers=1, thread_name_prefix="Dependency-Launch")
+        self._task_launch_pool = cf.ThreadPoolExecutor(max_workers=1, thread_name_prefix="Task-Launch")

         self.dependency_resolver = self.config.dependency_resolver if self.config.dependency_resolver is not None \
             else SHALLOW_DEPENDENCY_RESOLVER
@@ -239,7 +219,7 @@ class DataFlowKernel:
         else:
             raise InternalConsistencyError(f"Exit case for {mode} should be unreachable, validated by typeguard on Config()")

-    def _send_task_log_info(self, task_record: TaskRecord) -> None:
+    def _send_task_info(self, task_record: TaskRecord) -> None:
         if self.monitoring_radio:
             task_log_info = self._create_task_log_info(task_record)
             self.monitoring_radio.send((MessageType.TASK_INFO, task_log_info))
@@ -371,84 +351,70 @@ class DataFlowKernel:
         else:
             task_record['fail_cost'] += 1

-            if …
+            if isinstance(e, DependencyError):
                 logger.info("Task {} failed due to dependency failure so skipping retries".format(task_id))
-                task_record['time_returned'] = datetime.datetime.now()
-                self._send_task_log_info(task_record)
-                with task_record['app_fu']._update_lock:
-                    task_record['app_fu'].set_exception(e)
+                self._complete_task_exception(task_record, States.dep_fail, e)

             elif task_record['fail_cost'] <= self._config.retries:

                 # record the final state for this try before we mutate for retries
-                self.update_task_state(task_record, States.fail_retryable)
-                self._send_task_log_info(task_record)
+                self._update_task_state(task_record, States.fail_retryable)
+                self._send_task_info(task_record)

                 task_record['try_id'] += 1
-                self.update_task_state(task_record, States.pending)
+                self._update_task_state(task_record, States.pending)
                 task_record['try_time_launched'] = None
                 task_record['try_time_returned'] = None
                 task_record['fail_history'] = []
-                self._send_task_log_info(task_record)
+                self._send_task_info(task_record)

                 logger.info("Task {} marked for retry".format(task_id))

             else:
                 logger.exception("Task {} failed after {} retry attempts".format(task_id,
                                                                                  task_record['try_id']))
-                task_record['time_returned'] = datetime.datetime.now()
-                self.update_task_state(task_record, States.failed)
-                task_record['time_returned'] = datetime.datetime.now()
-                self._send_task_log_info(task_record)
-                with task_record['app_fu']._update_lock:
-                    task_record['app_fu'].set_exception(e)
+                self._complete_task_exception(task_record, States.failed, e)

         else:
             if task_record['from_memo']:
-                self._complete_task(task_record, States.memo_done, res)
-                self._send_task_log_info(task_record)
+                self._complete_task_result(task_record, States.memo_done, res)
+            elif not task_record['join']:
+                self._complete_task_result(task_record, States.exec_done, res)
             else:
-                if not task_record['join']:
-                    self._complete_task(task_record, States.exec_done, res)
-                    self._send_task_log_info(task_record)
+                # This is a join task, and the original task's function code has
+                # completed. That means that the future returned by that code
+                # will be available inside the executor future, so we can now
+                # record the inner app ID in monitoring, and add a completion
+                # listener to that inner future.
+
+                joinable = future.result()
+
+                # Fail with a TypeError if the joinapp python body returned
+                # something we can't join on.
+                if isinstance(joinable, Future):
+                    self._update_task_state(task_record, States.joining)
+                    task_record['joins'] = joinable
+                    task_record['join_lock'] = threading.Lock()
+                    self._send_task_info(task_record)
+                    joinable.add_done_callback(partial(self.handle_join_update, task_record))
+                elif joinable == []:  # got a list, but it had no entries, and specifically, no Futures.
+                    self._update_task_state(task_record, States.joining)
+                    task_record['joins'] = joinable
+                    task_record['join_lock'] = threading.Lock()
+                    self._send_task_info(task_record)
+                    self.handle_join_update(task_record, None)
+                elif isinstance(joinable, list) and [j for j in joinable if not isinstance(j, Future)] == []:
+                    self._update_task_state(task_record, States.joining)
+                    task_record['joins'] = joinable
+                    task_record['join_lock'] = threading.Lock()
+                    self._send_task_info(task_record)
+                    for inner_future in joinable:
+                        inner_future.add_done_callback(partial(self.handle_join_update, task_record))
                 else:
-                    # This is a join task, and the original task's function code has
-                    # completed. That means that the future returned by that code
-                    # will be available inside the executor future, so we can now
-                    # record the inner app ID in monitoring, and add a completion
-                    # listener to that inner future.
-
-                    joinable = future.result()
-
-                    # Fail with a TypeError if the joinapp python body returned
-                    # something we can't join on.
-                    if isinstance(joinable, Future):
-                        self.update_task_state(task_record, States.joining)
-                        task_record['joins'] = joinable
-                        task_record['join_lock'] = threading.Lock()
-                        self._send_task_log_info(task_record)
-                        joinable.add_done_callback(partial(self.handle_join_update, task_record))
-                    elif joinable == []:  # got a list, but it had no entries, and specifically, no Futures.
-                        self.update_task_state(task_record, States.joining)
-                        task_record['joins'] = joinable
-                        task_record['join_lock'] = threading.Lock()
-                        self._send_task_log_info(task_record)
-                        self.handle_join_update(task_record, None)
-                    elif isinstance(joinable, list) and [j for j in joinable if not isinstance(j, Future)] == []:
-                        self.update_task_state(task_record, States.joining)
-                        task_record['joins'] = joinable
-                        task_record['join_lock'] = threading.Lock()
-                        self._send_task_log_info(task_record)
-                        for inner_future in joinable:
-                            inner_future.add_done_callback(partial(self.handle_join_update, task_record))
-                    else:
-                        task_record['time_returned'] = datetime.datetime.now()
-                        self.update_task_state(task_record, States.failed)
-                        task_record['time_returned'] = datetime.datetime.now()
-                        self._send_task_log_info(task_record)
-                        with task_record['app_fu']._update_lock:
-                            task_record['app_fu'].set_exception(
-                                TypeError(f"join_app body must return a Future or list of Futures, got {joinable} of type {type(joinable)}"))
+                    self._complete_task_exception(
+                        task_record,
+                        States.failed,
+                        TypeError(f"join_app body must return a Future or list of Futures, got {joinable} of type {type(joinable)}"))

         self._log_std_streams(task_record)

@@ -519,10 +485,7 @@ class DataFlowKernel:
             # no need to update the fail cost because join apps are never
             # retried

-            self.update_task_state(task_record, States.failed)
-            task_record['time_returned'] = datetime.datetime.now()
-            with task_record['app_fu']._update_lock:
-                task_record['app_fu'].set_exception(e)
+            self._complete_task_exception(task_record, States.failed, e)

         else:
             # all the joinables succeeded, so construct a result:
@@ -535,12 +498,10 @@ class DataFlowKernel:
                     res.append(future.result())
                 else:
                     raise TypeError(f"Unknown joinable type {type(joinable)}")
-            self._complete_task(task_record, States.exec_done, res)
+            self._complete_task_result(task_record, States.exec_done, res)

         self._log_std_streams(task_record)

-        self._send_task_log_info(task_record)
-
     def handle_app_update(self, task_record: TaskRecord, future: AppFuture) -> None:
         """This function is called as a callback when an AppFuture
         is in its final state.
@@ -561,40 +522,50 @@ class DataFlowKernel:
         if not task_record['app_fu'] == future:
             logger.error("Internal consistency error: callback future is not the app_fu in task structure, for task {}".format(task_id))

-        self.memoizer.update_memo(task_record, future)
-
-        # Cover all checkpointing cases here:
-        # Do we need to checkpoint now, or queue for later,
-        # or do nothing?
-        if self.checkpoint_mode == 'task_exit':
-            self.checkpoint(tasks=[task_record])
-        elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
-            with self.checkpoint_lock:
-                self.checkpointable_tasks.append(task_record)
-        elif self.checkpoint_mode is None:
-            pass
-        else:
-            raise InternalConsistencyError(f"Invalid checkpoint mode {self.checkpoint_mode}")
+        self.memoizer.update_checkpoint(task_record)

         self.wipe_task(task_id)
         return

-    def _complete_task(self, task_record: TaskRecord, new_state: States, result: Any) -> None:
+    def _complete_task_result(self, task_record: TaskRecord, new_state: States, result: Any) -> None:
         """Set a task into a completed state
         """
         assert new_state in FINAL_STATES
         assert new_state not in FINAL_FAILURE_STATES
         old_state = task_record['status']

-        self.update_task_state(task_record, new_state)
+        self._update_task_state(task_record, new_state)

         logger.info(f"Task {task_record['id']} completed ({old_state.name} -> {new_state.name})")
         task_record['time_returned'] = datetime.datetime.now()

+        self.memoizer.update_memo_result(task_record, result)
+
+        self._send_task_info(task_record)
+
         with task_record['app_fu']._update_lock:
             task_record['app_fu'].set_result(result)

-    def update_task_state(self, task_record: TaskRecord, new_state: States) -> None:
+    def _complete_task_exception(self, task_record: TaskRecord, new_state: States, exception: BaseException) -> None:
+        """Set a task into a failure state
+        """
+        assert new_state in FINAL_STATES
+        assert new_state in FINAL_FAILURE_STATES
+        old_state = task_record['status']
+
+        self._update_task_state(task_record, new_state)
+
+        logger.info(f"Task {task_record['id']} failed ({old_state.name} -> {new_state.name})")
+        task_record['time_returned'] = datetime.datetime.now()
+
+        self.memoizer.update_memo_exception(task_record, exception)
+
+        self._send_task_info(task_record)
+
+        with task_record['app_fu']._update_lock:
+            task_record['app_fu'].set_exception(exception)
+
+    def _update_task_state(self, task_record: TaskRecord, new_state: States) -> None:
         """Updates a task record state, and recording an appropriate change
         to task state counters.
         """
@@ -637,7 +608,7 @@ class DataFlowKernel:
         launch_if_ready is thread safe, so may be called from any thread
         or callback.
         """
-        self._dependency_launch_pool.submit(self._launch_if_ready_async, task_record)
+        self._task_launch_pool.submit(self._launch_if_ready_async, task_record)

     @wrap_with_logs
     def _launch_if_ready_async(self, task_record: TaskRecord) -> None:
@@ -645,7 +616,7 @@ class DataFlowKernel:
        _launch_if_ready will launch the specified task, if it is ready
        to run (for example, without dependencies, and in pending state).
        """
-        exec_fu = None
+        exec_fu: Future

        task_id = task_record['id']
        with task_record['task_launch_lock']:
@@ -684,28 +655,24 @@ class DataFlowKernel:
             else:
                 logger.info(
                     "Task {} failed due to dependency failure".format(task_id))
-                # Raise a dependency exception
-                self.update_task_state(task_record, States.dep_fail)
-
-                self._send_task_log_info(task_record)

                 exec_fu = Future()
                 exec_fu.set_exception(DependencyError(exceptions_tids,
                                                       task_id))

-
-            assert isinstance(exec_fu, Future)
-            try:
-                exec_fu.add_done_callback(partial(self.handle_exec_update, task_record))
-            except Exception:
-                # this exception is ignored here because it is assumed that exception
-                # comes from directly executing handle_exec_update (because exec_fu is
-                # done already). If the callback executes later, then any exception
-                # coming out of the callback will be ignored and not propate anywhere,
-                # so this block attempts to keep the same behaviour here.
-                logger.error("add_done_callback got an exception which will be ignored", exc_info=True)
+        assert isinstance(exec_fu, Future), "Every code path leading here needs to define exec_fu"

-            task_record['exec_fu'] = exec_fu
+        try:
+            exec_fu.add_done_callback(partial(self.handle_exec_update, task_record))
+        except Exception:
+            # this exception is ignored here because it is assumed that exception
+            # comes from directly executing handle_exec_update (because exec_fu is
+            # done already). If the callback executes later, then any exception
+            # coming out of the callback will be ignored and not propate anywhere,
+            # so this block attempts to keep the same behaviour here.
+            logger.error("add_done_callback got an exception which will be ignored", exc_info=True)
+
+        task_record['exec_fu'] = exec_fu

     def launch_task(self, task_record: TaskRecord) -> Future:
         """Handle the actual submission of the task to the executor layer.
@@ -759,9 +726,9 @@ class DataFlowKernel:

         with self.submitter_lock:
             exec_fu = executor.submit(function, task_record['resource_specification'], *args, **kwargs)
-        self.update_task_state(task_record, States.launched)
+        self._update_task_state(task_record, States.launched)

-        self._send_task_log_info(task_record)
+        self._send_task_info(task_record)

         if hasattr(exec_fu, "parsl_executor_task_id"):
             logger.info(
@@ -1034,7 +1001,7 @@ class DataFlowKernel:
                        'try_time_returned': None,
                        'resource_specification': resource_specification}

-        self.update_task_state(task_record, States.unsched)
+        self._update_task_state(task_record, States.unsched)

         for kw in ['stdout', 'stderr']:
             if kw in app_kwargs:
@@ -1087,10 +1054,10 @@ class DataFlowKernel:
                                            waiting_message))

         app_fu.add_done_callback(partial(self.handle_app_update, task_record))
-        self.update_task_state(task_record, States.pending)
+        self._update_task_state(task_record, States.pending)
         logger.debug("Task {} set to pending state with AppFuture: {}".format(task_id, task_record['app_fu']))

-        self._send_task_log_info(task_record)
+        self._send_task_info(task_record)

         # at this point add callbacks to all dependencies to do a launch_if_ready
         # call whenever a dependency completes.
@@ -1142,7 +1109,7 @@ class DataFlowKernel:
             executor.monitoring_messages = self.monitoring.resource_msgs
             logger.debug("Starting monitoring receiver for executor %s "
                          "with remote monitoring radio config %s",
-                         executor, executor.remote_monitoring_radio)
+                         executor.label, executor.remote_monitoring_radio)

             executor.monitoring_receiver = executor.remote_monitoring_radio.create_receiver(resource_msgs=executor.monitoring_messages,
                                                                                             run_dir=executor.run_dir)
@@ -1202,14 +1169,7 @@ class DataFlowKernel:

         self.log_task_states()

-
-        # checkpoint if any valid checkpoint method is specified
-        if self.checkpoint_mode is not None:
-            self.checkpoint()
-
-        if self._checkpoint_timer:
-            logger.info("Stopping checkpoint timer")
-            self._checkpoint_timer.close()
+        self.memoizer.close()

         # Send final stats
         self.usage_tracker.send_end_message()
@@ -1243,9 +1203,9 @@ class DataFlowKernel:
             self.monitoring.close()
             logger.info("Terminated monitoring")

-        logger.info("Terminating dependency launch pool")
-        self._dependency_launch_pool.shutdown()
-        logger.info("Terminated dependency launch pool")
+        logger.info("Terminating task launch pool")
+        self._task_launch_pool.shutdown()
+        logger.info("Terminated task launch pool")

         logger.info("Unregistering atexit hook")
         atexit.unregister(self.atexit_cleanup)
@@ -1267,68 +1227,8 @@ class DataFlowKernel:
         # should still see it.
         logger.info("DFK cleanup complete")

-    def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> str:
-        """Checkpoint the dfk incrementally to a checkpoint file.
-
-        When called, every task that has been completed yet not
-        checkpointed is checkpointed to a file.
-
-        Kwargs:
-            - tasks (List of task records) : List of task ids to checkpoint. Default=None
-                                             if set to None, we iterate over all tasks held by the DFK.
-
-        .. note::
-            Checkpointing only works if memoization is enabled
-
-        Returns:
-            Checkpoint dir if checkpoints were written successfully.
-            By default the checkpoints are written to the RUNDIR of the current
-            run under RUNDIR/checkpoints/tasks.pkl
-        """
-        with self.checkpoint_lock:
-            if tasks:
-                checkpoint_queue = tasks
-            else:
-                checkpoint_queue = self.checkpointable_tasks
-                self.checkpointable_tasks = []
-
-            checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
-            checkpoint_tasks = checkpoint_dir + '/tasks.pkl'
-
-            if not os.path.exists(checkpoint_dir):
-                os.makedirs(checkpoint_dir, exist_ok=True)
-
-            count = 0
-
-            with open(checkpoint_tasks, 'ab') as f:
-                for task_record in checkpoint_queue:
-                    task_id = task_record['id']
-
-                    app_fu = task_record['app_fu']
-
-                    if app_fu.done() and app_fu.exception() is None:
-                        hashsum = task_record['hashsum']
-                        if not hashsum:
-                            continue
-                        t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
-
-                        # We are using pickle here since pickle dumps to a file in 'ab'
-                        # mode behave like a incremental log.
-                        pickle.dump(t, f)
-                        count += 1
-                        logger.debug("Task {} checkpointed".format(task_id))
-
-            self.checkpointed_tasks += count
-
-            if count == 0:
-                if self.checkpointed_tasks == 0:
-                    logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
-                else:
-                    logger.debug("No tasks checkpointed in this pass.")
-            else:
-                logger.info("Done checkpointing {} tasks".format(count))
-
-            return checkpoint_dir
+    def checkpoint(self) -> None:
+        self.memoizer.checkpoint_queue()

     @staticmethod
     def _log_std_streams(task_record: TaskRecord) -> None:
```