parsl 2025.10.13__py3-none-any.whl → 2025.10.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of parsl might be problematic.

parsl/app/bash.py CHANGED
@@ -88,7 +88,7 @@ def remote_side_bash_executor(func, *args, **kwargs):
  raise pe.AppTimeout(f"App {func_name} exceeded walltime: {timeout} seconds")
 
  except Exception as e:
- raise pe.AppException(f"App {func_name} caught exception with returncode: {returncode}", e)
+ raise pe.AppException(f"App {func_name} caught exception", e)
 
  if returncode != 0:
  raise pe.BashExitFailure(func_name, proc.returncode)
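
Note on the bash.py change above: only the AppException message text changes; the exception types raised by bash apps are unchanged. A minimal sketch of how these exceptions surface to callers (the executor choice and the failing command are illustrative assumptions, not part of this release):

    import parsl
    from parsl import bash_app
    from parsl.app.errors import BashExitFailure
    from parsl.config import Config
    from parsl.executors.threads import ThreadPoolExecutor

    parsl.load(Config(executors=[ThreadPoolExecutor()]))

    @bash_app
    def fail_cmd():
        return "exit 3"  # command exits with a non-zero status

    # A non-zero exit status surfaces as BashExitFailure on the AppFuture;
    # AppException (whose message this release shortens) is raised only when
    # the bash wrapper itself hits an exception while running the command.
    assert isinstance(fail_cmd().exception(), BashExitFailure)

    parsl.dfk().cleanup()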
parsl/dataflow/dflow.py CHANGED
@@ -6,7 +6,6 @@ import datetime
  import inspect
  import logging
  import os
- import pickle
  import random
  import sys
  import threading
@@ -50,7 +49,7 @@ from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSen
  from parsl.monitoring.remote import monitor_wrapper
  from parsl.process_loggers import wrap_with_logs
  from parsl.usage_tracking.usage import UsageTracker
- from parsl.utils import Timer, get_all_checkpoints, get_std_fname_mode, get_version
+ from parsl.utils import get_std_fname_mode, get_version
 
  logger = logging.getLogger(__name__)
 
@@ -101,8 +100,6 @@ class DataFlowKernel:
 
  logger.info("Parsl version: {}".format(get_version()))
 
- self.checkpoint_lock = threading.Lock()
-
  self.usage_tracker = UsageTracker(self)
  self.usage_tracker.send_start_message()
 
@@ -168,18 +165,12 @@ class DataFlowKernel:
  self.monitoring_radio.send((MessageType.WORKFLOW_INFO,
  workflow_info))
 
- if config.checkpoint_files is not None:
- checkpoint_files = config.checkpoint_files
- elif config.checkpoint_files is None and config.checkpoint_mode is not None:
- checkpoint_files = get_all_checkpoints(self.run_dir)
- else:
- checkpoint_files = []
-
- self.memoizer = Memoizer(memoize=config.app_cache, checkpoint_files=checkpoint_files)
- self.checkpointed_tasks = 0
- self._checkpoint_timer = None
- self.checkpoint_mode = config.checkpoint_mode
- self.checkpointable_tasks: List[TaskRecord] = []
+ self.memoizer = Memoizer(memoize=config.app_cache,
+ checkpoint_mode=config.checkpoint_mode,
+ checkpoint_files=config.checkpoint_files,
+ checkpoint_period=config.checkpoint_period)
+ self.memoizer.run_dir = self.run_dir
+ self.memoizer.start()
 
  # this must be set before executors are added since add_executors calls
  # job_status_poller.add_executors.
@@ -195,17 +186,6 @@ class DataFlowKernel:
  self.add_executors(config.executors)
  self.add_executors([parsl_internal_executor])
 
- if self.checkpoint_mode == "periodic":
- if config.checkpoint_period is None:
- raise ConfigurationError("Checkpoint period must be specified with periodic checkpoint mode")
- else:
- try:
- h, m, s = map(int, config.checkpoint_period.split(':'))
- except Exception:
- raise ConfigurationError("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
- checkpoint_period = (h * 3600) + (m * 60) + s
- self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period, name="Checkpoint")
-
  self.task_count = 0
  self.tasks: Dict[int, TaskRecord] = {}
  self.submitter_lock = threading.Lock()
@@ -371,13 +351,9 @@ class DataFlowKernel:
  else:
  task_record['fail_cost'] += 1
 
- if task_record['status'] == States.dep_fail:
+ if isinstance(e, DependencyError):
  logger.info("Task {} failed due to dependency failure so skipping retries".format(task_id))
- task_record['time_returned'] = datetime.datetime.now()
- self._send_task_log_info(task_record)
- self.memoizer.update_memo(task_record)
- with task_record['app_fu']._update_lock:
- task_record['app_fu'].set_exception(e)
+ self._complete_task_exception(task_record, States.dep_fail, e)
 
  elif task_record['fail_cost'] <= self._config.retries:
 
@@ -397,61 +373,48 @@ class DataFlowKernel:
  else:
  logger.exception("Task {} failed after {} retry attempts".format(task_id,
  task_record['try_id']))
- task_record['time_returned'] = datetime.datetime.now()
- self.update_task_state(task_record, States.failed)
- task_record['time_returned'] = datetime.datetime.now()
- self._send_task_log_info(task_record)
- self.memoizer.update_memo(task_record)
- with task_record['app_fu']._update_lock:
- task_record['app_fu'].set_exception(e)
+ self._complete_task_exception(task_record, States.failed, e)
 
  else:
  if task_record['from_memo']:
- self._complete_task(task_record, States.memo_done, res)
- self._send_task_log_info(task_record)
+ self._complete_task_result(task_record, States.memo_done, res)
+ elif not task_record['join']:
+ self._complete_task_result(task_record, States.exec_done, res)
  else:
- if not task_record['join']:
- self._complete_task(task_record, States.exec_done, res)
+ # This is a join task, and the original task's function code has
+ # completed. That means that the future returned by that code
+ # will be available inside the executor future, so we can now
+ # record the inner app ID in monitoring, and add a completion
+ # listener to that inner future.
+
+ joinable = future.result()
+
+ # Fail with a TypeError if the joinapp python body returned
+ # something we can't join on.
+ if isinstance(joinable, Future):
+ self.update_task_state(task_record, States.joining)
+ task_record['joins'] = joinable
+ task_record['join_lock'] = threading.Lock()
+ self._send_task_log_info(task_record)
+ joinable.add_done_callback(partial(self.handle_join_update, task_record))
+ elif joinable == []: # got a list, but it had no entries, and specifically, no Futures.
+ self.update_task_state(task_record, States.joining)
+ task_record['joins'] = joinable
+ task_record['join_lock'] = threading.Lock()
  self._send_task_log_info(task_record)
+ self.handle_join_update(task_record, None)
+ elif isinstance(joinable, list) and [j for j in joinable if not isinstance(j, Future)] == []:
+ self.update_task_state(task_record, States.joining)
+ task_record['joins'] = joinable
+ task_record['join_lock'] = threading.Lock()
+ self._send_task_log_info(task_record)
+ for inner_future in joinable:
+ inner_future.add_done_callback(partial(self.handle_join_update, task_record))
  else:
- # This is a join task, and the original task's function code has
- # completed. That means that the future returned by that code
- # will be available inside the executor future, so we can now
- # record the inner app ID in monitoring, and add a completion
- # listener to that inner future.
-
- joinable = future.result()
-
- # Fail with a TypeError if the joinapp python body returned
- # something we can't join on.
- if isinstance(joinable, Future):
- self.update_task_state(task_record, States.joining)
- task_record['joins'] = joinable
- task_record['join_lock'] = threading.Lock()
- self._send_task_log_info(task_record)
- joinable.add_done_callback(partial(self.handle_join_update, task_record))
- elif joinable == []: # got a list, but it had no entries, and specifically, no Futures.
- self.update_task_state(task_record, States.joining)
- task_record['joins'] = joinable
- task_record['join_lock'] = threading.Lock()
- self._send_task_log_info(task_record)
- self.handle_join_update(task_record, None)
- elif isinstance(joinable, list) and [j for j in joinable if not isinstance(j, Future)] == []:
- self.update_task_state(task_record, States.joining)
- task_record['joins'] = joinable
- task_record['join_lock'] = threading.Lock()
- self._send_task_log_info(task_record)
- for inner_future in joinable:
- inner_future.add_done_callback(partial(self.handle_join_update, task_record))
- else:
- task_record['time_returned'] = datetime.datetime.now()
- self.update_task_state(task_record, States.failed)
- task_record['time_returned'] = datetime.datetime.now()
- self._send_task_log_info(task_record)
- self.memoizer.update_memo(task_record)
- with task_record['app_fu']._update_lock:
- task_record['app_fu'].set_exception(
- TypeError(f"join_app body must return a Future or list of Futures, got {joinable} of type {type(joinable)}"))
+ self._complete_task_exception(
+ task_record,
+ States.failed,
+ TypeError(f"join_app body must return a Future or list of Futures, got {joinable} of type {type(joinable)}"))
 
  self._log_std_streams(task_record)
 
@@ -522,11 +485,7 @@ class DataFlowKernel:
  # no need to update the fail cost because join apps are never
  # retried
 
- self.update_task_state(task_record, States.failed)
- task_record['time_returned'] = datetime.datetime.now()
- self.memoizer.update_memo(task_record)
- with task_record['app_fu']._update_lock:
- task_record['app_fu'].set_exception(e)
+ self._complete_task_exception(task_record, States.failed, e)
 
  else:
  # all the joinables succeeded, so construct a result:
@@ -539,12 +498,10 @@ class DataFlowKernel:
  res.append(future.result())
  else:
  raise TypeError(f"Unknown joinable type {type(joinable)}")
- self._complete_task(task_record, States.exec_done, res)
+ self._complete_task_result(task_record, States.exec_done, res)
 
  self._log_std_streams(task_record)
 
- self._send_task_log_info(task_record)
-
  def handle_app_update(self, task_record: TaskRecord, future: AppFuture) -> None:
  """This function is called as a callback when an AppFuture
  is in its final state.
@@ -565,23 +522,12 @@ class DataFlowKernel:
  if not task_record['app_fu'] == future:
  logger.error("Internal consistency error: callback future is not the app_fu in task structure, for task {}".format(task_id))
 
- # Cover all checkpointing cases here:
- # Do we need to checkpoint now, or queue for later,
- # or do nothing?
- if self.checkpoint_mode == 'task_exit':
- self.checkpoint(tasks=[task_record])
- elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
- with self.checkpoint_lock:
- self.checkpointable_tasks.append(task_record)
- elif self.checkpoint_mode is None:
- pass
- else:
- raise InternalConsistencyError(f"Invalid checkpoint mode {self.checkpoint_mode}")
+ self.memoizer.update_checkpoint(task_record)
 
  self.wipe_task(task_id)
  return
 
- def _complete_task(self, task_record: TaskRecord, new_state: States, result: Any) -> None:
+ def _complete_task_result(self, task_record: TaskRecord, new_state: States, result: Any) -> None:
  """Set a task into a completed state
  """
  assert new_state in FINAL_STATES
@@ -594,9 +540,31 @@ class DataFlowKernel:
  task_record['time_returned'] = datetime.datetime.now()
 
  self.memoizer.update_memo(task_record)
+
+ self._send_task_log_info(task_record)
+
  with task_record['app_fu']._update_lock:
  task_record['app_fu'].set_result(result)
 
+ def _complete_task_exception(self, task_record: TaskRecord, new_state: States, exception: BaseException) -> None:
+ """Set a task into a failure state
+ """
+ assert new_state in FINAL_STATES
+ assert new_state in FINAL_FAILURE_STATES
+ old_state = task_record['status']
+
+ self.update_task_state(task_record, new_state)
+
+ logger.info(f"Task {task_record['id']} failed ({old_state.name} -> {new_state.name})")
+ task_record['time_returned'] = datetime.datetime.now()
+
+ self.memoizer.update_memo(task_record)
+
+ self._send_task_log_info(task_record)
+
+ with task_record['app_fu']._update_lock:
+ task_record['app_fu'].set_exception(exception)
+
  def update_task_state(self, task_record: TaskRecord, new_state: States) -> None:
  """Updates a task record state, and recording an appropriate change
  to task state counters.
@@ -648,7 +616,7 @@ class DataFlowKernel:
  _launch_if_ready will launch the specified task, if it is ready
  to run (for example, without dependencies, and in pending state).
  """
- exec_fu = None
+ exec_fu: Future
 
  task_id = task_record['id']
  with task_record['task_launch_lock']:
@@ -687,28 +655,24 @@ class DataFlowKernel:
  else:
  logger.info(
  "Task {} failed due to dependency failure".format(task_id))
- # Raise a dependency exception
- self.update_task_state(task_record, States.dep_fail)
-
- self._send_task_log_info(task_record)
 
  exec_fu = Future()
  exec_fu.set_exception(DependencyError(exceptions_tids,
  task_id))
 
- if exec_fu:
- assert isinstance(exec_fu, Future)
- try:
- exec_fu.add_done_callback(partial(self.handle_exec_update, task_record))
- except Exception:
- # this exception is ignored here because it is assumed that exception
- # comes from directly executing handle_exec_update (because exec_fu is
- # done already). If the callback executes later, then any exception
- # coming out of the callback will be ignored and not propate anywhere,
- # so this block attempts to keep the same behaviour here.
- logger.error("add_done_callback got an exception which will be ignored", exc_info=True)
+ assert isinstance(exec_fu, Future), "Every code path leading here needs to define exec_fu"
 
- task_record['exec_fu'] = exec_fu
+ try:
+ exec_fu.add_done_callback(partial(self.handle_exec_update, task_record))
+ except Exception:
+ # this exception is ignored here because it is assumed that exception
+ # comes from directly executing handle_exec_update (because exec_fu is
+ # done already). If the callback executes later, then any exception
+ # coming out of the callback will be ignored and not propate anywhere,
+ # so this block attempts to keep the same behaviour here.
+ logger.error("add_done_callback got an exception which will be ignored", exc_info=True)
+
+ task_record['exec_fu'] = exec_fu
 
  def launch_task(self, task_record: TaskRecord) -> Future:
  """Handle the actual submission of the task to the executor layer.
@@ -1205,13 +1169,7 @@ class DataFlowKernel:
 
  self.log_task_states()
 
- # checkpoint if any valid checkpoint method is specified
- if self.checkpoint_mode is not None:
- self.checkpoint()
-
- if self._checkpoint_timer:
- logger.info("Stopping checkpoint timer")
- self._checkpoint_timer.close()
+ self.memoizer.close()
 
  # Send final stats
  self.usage_tracker.send_end_message()
@@ -1269,66 +1227,8 @@ class DataFlowKernel:
  # should still see it.
  logger.info("DFK cleanup complete")
 
- def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> None:
- """Checkpoint the dfk incrementally to a checkpoint file.
-
- When called, every task that has been completed yet not
- checkpointed is checkpointed to a file.
-
- Kwargs:
- - tasks (List of task records) : List of task ids to checkpoint. Default=None
- if set to None, we iterate over all tasks held by the DFK.
-
- .. note::
- Checkpointing only works if memoization is enabled
-
- Returns:
- Checkpoint dir if checkpoints were written successfully.
- By default the checkpoints are written to the RUNDIR of the current
- run under RUNDIR/checkpoints/tasks.pkl
- """
- with self.checkpoint_lock:
- if tasks:
- checkpoint_queue = tasks
- else:
- checkpoint_queue = self.checkpointable_tasks
- self.checkpointable_tasks = []
-
- checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
- checkpoint_tasks = checkpoint_dir + '/tasks.pkl'
-
- if not os.path.exists(checkpoint_dir):
- os.makedirs(checkpoint_dir, exist_ok=True)
-
- count = 0
-
- with open(checkpoint_tasks, 'ab') as f:
- for task_record in checkpoint_queue:
- task_id = task_record['id']
-
- app_fu = task_record['app_fu']
-
- if app_fu.done() and app_fu.exception() is None:
- hashsum = task_record['hashsum']
- if not hashsum:
- continue
- t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
-
- # We are using pickle here since pickle dumps to a file in 'ab'
- # mode behave like a incremental log.
- pickle.dump(t, f)
- count += 1
- logger.debug("Task {} checkpointed".format(task_id))
-
- self.checkpointed_tasks += count
-
- if count == 0:
- if self.checkpointed_tasks == 0:
- logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
- else:
- logger.debug("No tasks checkpointed in this pass.")
- else:
- logger.info("Done checkpointing {} tasks".format(count))
+ def checkpoint(self) -> None:
+ self.memoizer.checkpoint()
 
  @staticmethod
  def _log_std_streams(task_record: TaskRecord) -> None:
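
The join-handling block above is where a join_app body's return value is inspected; anything other than a Future or a list of Futures now fails through _complete_task_exception with the TypeError shown in the diff. A brief sketch of that contract, assuming a Parsl config is already loaded (the app bodies here are illustrative):

    from parsl import join_app, python_app

    @python_app
    def add(a, b):
        return a + b

    @join_app
    def fan_in(n):
        # Returning a Future, or a list of Futures, is joinable; returning
        # anything else makes the join task fail with the TypeError above.
        return [add(i, i) for i in range(n)]

    # fan_in(3).result() becomes [0, 2, 4] once the inner futures complete.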
parsl/dataflow/memoization.py CHANGED
@@ -4,15 +4,18 @@ import hashlib
  import logging
  import os
  import pickle
+ import threading
  import types
  from concurrent.futures import Future
  from functools import lru_cache, singledispatch
- from typing import Any, Dict, List, Optional, Sequence
+ from typing import Any, Dict, List, Literal, Optional, Sequence
 
  import typeguard
 
  from parsl.dataflow.errors import BadCheckpoint
  from parsl.dataflow.taskrecord import TaskRecord
+ from parsl.errors import ConfigurationError, InternalConsistencyError
+ from parsl.utils import Timer, get_all_checkpoints
 
  logger = logging.getLogger(__name__)
 
@@ -146,7 +149,13 @@ class Memoizer:
 
  """
 
- def __init__(self, *, memoize: bool = True, checkpoint_files: Sequence[str]):
+ run_dir: str
+
+ def __init__(self, *,
+ memoize: bool = True,
+ checkpoint_files: Sequence[str] | None,
+ checkpoint_period: Optional[str],
+ checkpoint_mode: Literal['task_exit', 'periodic', 'dfk_exit', 'manual'] | None):
  """Initialize the memoizer.
 
  KWargs:
@@ -155,6 +164,26 @@ class Memoizer:
  """
  self.memoize = memoize
 
+ self.checkpointed_tasks = 0
+
+ self.checkpoint_lock = threading.Lock()
+
+ self.checkpoint_files = checkpoint_files
+ self.checkpoint_mode = checkpoint_mode
+ self.checkpoint_period = checkpoint_period
+
+ self.checkpointable_tasks: List[TaskRecord] = []
+
+ self._checkpoint_timer: Timer | None = None
+
+ def start(self) -> None:
+ if self.checkpoint_files is not None:
+ checkpoint_files = self.checkpoint_files
+ elif self.checkpoint_files is None and self.checkpoint_mode is not None:
+ checkpoint_files = get_all_checkpoints(self.run_dir)
+ else:
+ checkpoint_files = []
+
  checkpoint = self.load_checkpoints(checkpoint_files)
 
  if self.memoize:
@@ -164,6 +193,26 @@ class Memoizer:
  logger.info("App caching disabled for all apps")
  self.memo_lookup_table = {}
 
+ if self.checkpoint_mode == "periodic":
+ if self.checkpoint_period is None:
+ raise ConfigurationError("Checkpoint period must be specified with periodic checkpoint mode")
+ else:
+ try:
+ h, m, s = map(int, self.checkpoint_period.split(':'))
+ except Exception:
+ raise ConfigurationError("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(self.checkpoint_period))
+ checkpoint_period = (h * 3600) + (m * 60) + s
+ self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period, name="Checkpoint")
+
+ def close(self) -> None:
+ if self.checkpoint_mode is not None:
+ logger.info("Making final checkpoint")
+ self.checkpoint()
+
+ if self._checkpoint_timer:
+ logger.info("Stopping checkpoint timer")
+ self._checkpoint_timer.close()
+
  def make_hash(self, task: TaskRecord) -> str:
  """Create a hash of the task inputs.
 
@@ -324,3 +373,78 @@ class Memoizer:
  return self._load_checkpoints(checkpointDirs)
  else:
  return {}
+
+ def update_checkpoint(self, task_record: TaskRecord) -> None:
+ if self.checkpoint_mode == 'task_exit':
+ self.checkpoint(task=task_record)
+ elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
+ with self.checkpoint_lock:
+ self.checkpointable_tasks.append(task_record)
+ elif self.checkpoint_mode is None:
+ pass
+ else:
+ raise InternalConsistencyError(f"Invalid checkpoint mode {self.checkpoint_mode}")
+
+ def checkpoint(self, *, task: Optional[TaskRecord] = None) -> None:
+ """Checkpoint the dfk incrementally to a checkpoint file.
+
+ When called with no argument, all tasks registered in self.checkpointable_tasks
+ will be checkpointed. When called with a single TaskRecord argument, that task will be
+ checkpointed.
+
+ By default the checkpoints are written to the RUNDIR of the current
+ run under RUNDIR/checkpoints/tasks.pkl
+
+ Kwargs:
+ - task (Optional task records) : A task to checkpoint. Default=None, meaning all
+ tasks registered for checkpointing.
+
+ .. note::
+ Checkpointing only works if memoization is enabled
+
+ """
+ with self.checkpoint_lock:
+
+ if task:
+ checkpoint_queue = [task]
+ else:
+ checkpoint_queue = self.checkpointable_tasks
+
+ checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
+ checkpoint_tasks = checkpoint_dir + '/tasks.pkl'
+
+ if not os.path.exists(checkpoint_dir):
+ os.makedirs(checkpoint_dir, exist_ok=True)
+
+ count = 0
+
+ with open(checkpoint_tasks, 'ab') as f:
+ for task_record in checkpoint_queue:
+ task_id = task_record['id']
+
+ app_fu = task_record['app_fu']
+
+ if app_fu.done() and app_fu.exception() is None:
+ hashsum = task_record['hashsum']
+ if not hashsum:
+ continue
+ t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
+
+ # We are using pickle here since pickle dumps to a file in 'ab'
+ # mode behave like a incremental log.
+ pickle.dump(t, f)
+ count += 1
+ logger.debug("Task {} checkpointed".format(task_id))
+
+ self.checkpointed_tasks += count
+
+ if count == 0:
+ if self.checkpointed_tasks == 0:
+ logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
+ else:
+ logger.debug("No tasks checkpointed in this pass.")
+ else:
+ logger.info("Done checkpointing {} tasks".format(count))
+
+ if not task:
+ self.checkpointable_tasks = []
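
For orientation on the refactor above: the checkpoint settings stay on the Parsl Config, and the DataFlowKernel now forwards them into Memoizer, which owns the checkpoint timer and the tasks.pkl writer. A minimal sketch of a configuration that exercises the periodic path (the executor choice here is an illustrative assumption):

    import parsl
    from parsl.config import Config
    from parsl.executors.threads import ThreadPoolExecutor

    config = Config(
        executors=[ThreadPoolExecutor(label="local_threads")],
        app_cache=True,                # checkpointing only works with memoization enabled
        checkpoint_mode="periodic",    # now validated in Memoizer.start() rather than in dflow.py
        checkpoint_period="00:30:00",  # HH:MM:SS, parsed with split(':') as shown above
    )

    parsl.load(config)
    # Completed app results are appended to <run_dir>/checkpoint/tasks.pkl every 30 minutes.
    parsl.dfk().cleanup()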
parsl/dataflow/states.py CHANGED
@@ -67,10 +67,10 @@ class States(IntEnum):
  return self.__class__.__name__ + "." + self.name
 
 
- FINAL_STATES = [States.exec_done, States.memo_done, States.failed, States.dep_fail]
- """States from which we will never move to another state, because the job has
- either definitively completed or failed."""
-
- FINAL_FAILURE_STATES = [States.failed, States.dep_fail]
+ FINAL_FAILURE_STATES = {States.failed, States.dep_fail}
  """States which are final and which indicate a failure. This must
  be a subset of FINAL_STATES"""
+
+ FINAL_STATES = {States.exec_done, States.memo_done, *FINAL_FAILURE_STATES}
+ """States from which we will never move to another state, because the job has
+ either definitively completed or failed."""
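
These containers change from lists to sets, and FINAL_STATES is now built from FINAL_FAILURE_STATES, which makes the documented subset relationship hold by construction. A small sketch of what callers can rely on after this change:

    from parsl.dataflow.states import FINAL_FAILURE_STATES, FINAL_STATES, States

    # The subset invariant noted in the docstring is now structural.
    assert FINAL_FAILURE_STATES <= FINAL_STATES

    # Membership tests, such as the asserts in the new _complete_task_exception, are unchanged.
    assert States.dep_fail in FINAL_FAILURE_STATES
    assert States.exec_done in FINAL_STATES and States.exec_done not in FINAL_FAILURE_STATES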
parsl/jobs/strategy.py CHANGED
@@ -185,6 +185,11 @@ class Strategy:
 
  for executor in executors:
  label = executor.label
+
+ if executor.bad_state_is_set:
+ logger.info(f"Not strategizing for executor {label} because bad state is set")
+ continue
+
  logger.debug(f"Strategizing for executor {label}")
 
  if self.executors[label]['first']:
@@ -213,12 +218,8 @@
 
  logger.debug(f"Slot ratio calculation: active_slots = {active_slots}, active_tasks = {active_tasks}")
 
- if hasattr(executor, 'connected_workers'):
- logger.debug('Executor {} has {} active tasks, {}/{} running/pending blocks, and {} connected workers'.format(
- label, active_tasks, running, pending, executor.connected_workers()))
- else:
- logger.debug('Executor {} has {} active tasks and {}/{} running/pending blocks'.format(
- label, active_tasks, running, pending))
+ logger.debug('Executor {} has {} active tasks and {}/{} running/pending blocks'.format(
+ label, active_tasks, running, pending))
 
  # reset idle timer if executor has active tasks
 
parsl/tests/test_htex/test_priority_queue.py CHANGED
@@ -18,7 +18,7 @@ def fake_task(parsl_resource_specification=None):
 
 
  @pytest.mark.local
- def test_priority_queue():
+ def test_priority_queue(try_assert):
  provider = LocalProvider(
  init_blocks=0,
  max_blocks=0,
@@ -30,6 +30,7 @@ def test_priority_queue():
  max_workers_per_node=1,
  manager_selector=RandomManagerSelector(),
  provider=provider,
+ worker_debug=True, # needed to instrospect interchange logs
  )
 
  config = Config(
@@ -50,6 +51,22 @@ def test_priority_queue():
  spec = {'priority': priority}
  futures[(priority, i)] = fake_task(parsl_resource_specification=spec)
 
+ # wait for the interchange to have received all tasks
+ # (which happens asynchronously to the main thread, and is otherwise
+ # a race condition which can cause this test to fail)
+
+ n = len(priorities)
+
+ def interchange_logs_task_count():
+ with open(htex.worker_logdir + "/interchange.log", "r") as f:
+ lines = f.readlines()
+ for line in lines:
+ if f"Fetched {n} tasks so far" in line:
+ return True
+ return False
+
+ try_assert(interchange_logs_task_count)
+
  provider.max_blocks = 1
  htex.scale_out_facade(1) # don't wait for the JSP to catch up
 
parsl/tests/test_python_apps/test_depfail_propagation.py CHANGED
@@ -1,5 +1,7 @@
+ import parsl
  from parsl import python_app
  from parsl.dataflow.errors import DependencyError
+ from parsl.dataflow.states import States
 
 
  @python_app
@@ -14,6 +16,7 @@ def depends(parent):
 
  def test_depfail_once():
  """Test the simplest dependency failure case"""
+ start_dep_fail_count = parsl.dfk().task_state_counts[States.dep_fail]
  f1 = fails()
  f2 = depends(f1)
 
@@ -25,9 +28,12 @@ def test_depfail_once():
  # in the DependencyError message
  assert ("task " + str(f1.task_record['id'])) in str(f2.exception())
 
+ assert parsl.dfk().task_state_counts[States.dep_fail] == start_dep_fail_count + 1
+
 
  def test_depfail_chain():
  """Test that dependency failures chain"""
+ start_dep_fail_count = parsl.dfk().task_state_counts[States.dep_fail]
  f1 = fails()
  f2 = depends(f1)
  f3 = depends(f2)
@@ -39,11 +45,13 @@ def test_depfail_chain():
  assert isinstance(f3.exception(), DependencyError)
  assert isinstance(f4.exception(), DependencyError)
 
+ assert parsl.dfk().task_state_counts[States.dep_fail] == start_dep_fail_count + 3
+
 
  def test_depfail_branches():
  """Test that dependency failures propagate in the
  presence of multiple downstream tasks."""
-
+ start_dep_fail_count = parsl.dfk().task_state_counts[States.dep_fail]
  f1 = fails()
  f2 = depends(f1)
  f3 = depends(f1)
@@ -52,3 +60,5 @@ def test_depfail_branches():
  assert not isinstance(f1.exception(), DependencyError)
  assert isinstance(f2.exception(), DependencyError)
  assert isinstance(f3.exception(), DependencyError)
+
+ assert parsl.dfk().task_state_counts[States.dep_fail] == start_dep_fail_count + 2
parsl/tests/test_scaling/test_regression_3696_oscillation.py CHANGED
@@ -51,6 +51,7 @@ def test_htex_strategy_does_not_oscillate(ns):
  executor.outstanding = lambda: n_tasks
  executor.status_facade = statuses
  executor.workers_per_node = n_workers
+ executor.bad_state_is_set = False
 
  provider.parallelism = 1
  provider.init_blocks = 0
parsl/version.py CHANGED
@@ -3,4 +3,4 @@
  Year.Month.Day[alpha/beta/..]
  Alphas will be numbered like this -> 2024.12.10a0
  """
- VERSION = '2025.10.13'
+ VERSION = '2025.10.20'
parsl-2025.10.13.dist-info/METADATA → parsl-2025.10.20.dist-info/METADATA CHANGED
@@ -1,9 +1,9 @@
  Metadata-Version: 2.1
  Name: parsl
- Version: 2025.10.13
+ Version: 2025.10.20
  Summary: Simple data dependent workflows in Python
  Home-page: https://github.com/Parsl/parsl
- Download-URL: https://github.com/Parsl/parsl/archive/2025.10.13.tar.gz
+ Download-URL: https://github.com/Parsl/parsl/archive/2025.10.20.tar.gz
  Author: The Parsl Team
  Author-email: parsl@googlegroups.com
  License: Apache 2.0
parsl-2025.10.13.dist-info/RECORD → parsl-2025.10.20.dist-info/RECORD CHANGED
@@ -8,10 +8,10 @@ parsl/multiprocessing.py,sha256=xqieTLko3DrHykCqqSHQszMwd8ORYllrgz6Qc_PsHCE,2112
  parsl/process_loggers.py,sha256=uQ7Gd0W72Jz7rrcYlOMfLsAEhkRltxXJL2MgdduJjEw,1136
  parsl/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  parsl/utils.py,sha256=smVYTusMoYUTD5N9OxTW5bh6o2iioh0NnfjrBAj8zYk,14452
- parsl/version.py,sha256=rx6RT17unUOKqi7eD7qVlatEYlVhzqpa0D-qZ3mnuMs,131
+ parsl/version.py,sha256=whi_IdOncV7eAqL5UV49y8XFRCw7SVxlohTSQa_fU70,131
  parsl/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  parsl/app/app.py,sha256=0gbM4AH2OtFOLsv07I5nglpElcwMSOi-FzdZZfrk7So,8532
- parsl/app/bash.py,sha256=jm2AvePlCT9DZR7H_4ANDWxatp5dN_22FUlT_gWhZ-g,5528
+ parsl/app/bash.py,sha256=VYIUTvy3qbjR7MzVO9jErui2WMZteIeuc7iGK6NSjL0,5498
  parsl/app/errors.py,sha256=SQQ1fNp8834DZnoRnlsoZn1WMAFM3fnh2CNHRPmFcKc,3854
  parsl/app/futures.py,sha256=2tMUeKIuDzwuhLIWlsEiZuDrhkxxsUed4QUbQuQg20Y,2826
  parsl/app/python.py,sha256=0hrz2BppVOwwNfh5hnoP70Yv56gSRkIoT-fP9XNb4v4,2331
@@ -55,12 +55,12 @@ parsl/data_provider/staging.py,sha256=ZDZuuFg38pjUStegKPcvPsfGp3iMeReMzfU6DSwtJj
  parsl/data_provider/zip.py,sha256=S4kVuH9lxAegRURYbvIUR7EYYBOccyslaqyCrVWUBhw,4497
  parsl/dataflow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  parsl/dataflow/dependency_resolvers.py,sha256=Om8Dgh7a0ZwgXAc6TlhxLSzvxXHDlNNV1aBNiD3JTNY,3325
- parsl/dataflow/dflow.py,sha256=shxgZ5ulMYPnvyKx4nOrdhCZYLX3JgXPo1OC3168OZw,63087
+ parsl/dataflow/dflow.py,sha256=AQKNtTwqk6YkzzDFEWmQ3dFHmDT8r1PuBF2RBhWC4Q8,58047
  parsl/dataflow/errors.py,sha256=daVfr2BWs1zRsGD6JtosEMttWHvK1df1Npiu_MUvFKg,3998
  parsl/dataflow/futures.py,sha256=08LuP-HFiHBIZmeKCjlsazw_WpQ5fwevrU2_WbidkYw,6080
- parsl/dataflow/memoization.py,sha256=AyO1khMwlbuGJQQk-l_wJRj0QeOHTOnmlvzXgQdNNQk,11977
+ parsl/dataflow/memoization.py,sha256=xWR09aZkQ695NIqyXQRCVl3OzioXQPzY3_3zqXd3ggA,16918
  parsl/dataflow/rundirs.py,sha256=JZdzybVGubY35jL2YiKcDo65ZmRl1WyOApc8ajYxztc,1087
- parsl/dataflow/states.py,sha256=hV6mfv-y4A6xrujeQglcomnfEs7y3Xm2g6JFwC6dvgQ,2612
+ parsl/dataflow/states.py,sha256=7i3s0QIOn0TA42YCjvEAVumNd8G0IlOvX6vOfG3Sy3U,2603
  parsl/dataflow/taskrecord.py,sha256=qIW7T6hn9dYTuNPdUura3HQwwUpUJACwPP5REm5COf4,3042
  parsl/executors/__init__.py,sha256=PEuXYrnVqwlaz_nt82s9D_YNaVsX7ET29DeIZRUR8hw,577
  parsl/executors/base.py,sha256=n-_tFtigMguc_alT8vSf1zKl2EuoGC1lmUewTv3dgsc,4990
@@ -110,7 +110,7 @@ parsl/jobs/error_handlers.py,sha256=BBXwUAMJpBm0HxV1P-I6jv7ZF9wcrhnCfzSTlsd2g4w,
  parsl/jobs/errors.py,sha256=cpSQXCrlKtuHsQf7usjF-lX8XsDkFnE5kWpmFjiN6OU,178
  parsl/jobs/job_status_poller.py,sha256=b37JOqDpSesqeSreEh1HzfVTFnD5Aoy6k8JDXkkPDmk,2192
  parsl/jobs/states.py,sha256=dUM8gC4YVpUjLMARJJ_tDERs6oHsoNheAtG6JWPIJt4,5058
- parsl/jobs/strategy.py,sha256=VxFicpEq6l4bkoFQItHCpQGv9-8jPuP_rMLV1yYZ26Q,13805
+ parsl/jobs/strategy.py,sha256=Ssw_24xtxb5w8CpBL6Cm11MvcX0qzXvMiHOrOX3-pWs,13671
  parsl/launchers/__init__.py,sha256=jJeDOWGKJjvpmWTLsj1zSqce_UAhWRc_IO-TzaOAlII,579
  parsl/launchers/base.py,sha256=CblcvPTJiu-MNLWaRtFe29SZQ0BpTOlaY8CGcHdlHIE,538
  parsl/launchers/errors.py,sha256=8YMV_CHpBNVa4eXkGE4x5DaFQlZkDCRCHmBktYcY6TA,467
@@ -318,7 +318,7 @@ parsl/tests/test_htex/test_manager_selector_by_block.py,sha256=VQqSE6MDhGpDSjShG
  parsl/tests/test_htex/test_managers_command.py,sha256=SCwkfyGB-Udgu5L2yDMpR5bsaT-aNjNkiXxtuRb25DI,1622
  parsl/tests/test_htex/test_missing_worker.py,sha256=gyp5i7_t-JHyJGtz_eXZKKBY5w8oqLOIxO6cJgGJMtQ,745
  parsl/tests/test_htex/test_multiple_disconnected_blocks.py,sha256=2vXZoIx4NuAWYuiNoL5Gxr85w72qZ7Kdb3JGh0FufTg,1867
- parsl/tests/test_htex/test_priority_queue.py,sha256=sAs9W4I0LsmvPpuN9Q66yRY4zoSOEo0eMFh6DXlih0I,2336
+ parsl/tests/test_htex/test_priority_queue.py,sha256=qnU5ueFsl7sLlJ4p_PVash5a9fYNLRbk7V4COnNuOmY,3007
  parsl/tests/test_htex/test_resource_spec_validation.py,sha256=ZXW02jDd1rNxjBLh1jHyiz31zNoB9JzDw94aWllXFd4,1102
  parsl/tests/test_htex/test_worker_failure.py,sha256=Uz-RHI-LK78FMjXUvrUFmo4iYfmpDVBUcBxxRb3UG9M,603
  parsl/tests/test_htex/test_zmq_binding.py,sha256=SmX_63vvXKnzWISBr8HnJCrRqubx7K0blvgjq4Px2gc,4391
@@ -358,7 +358,7 @@ parsl/tests/test_python_apps/test_context_manager.py,sha256=8kUgcxN-6cz2u-lUoDhM
  parsl/tests/test_python_apps/test_dep_standard_futures.py,sha256=kMOMZLaxJMmpABCUVniDIOIfkEqflZyhKjS_wkDti7A,1049
  parsl/tests/test_python_apps/test_dependencies.py,sha256=IRiTI_lPoWBSFSFnaBlE6Bv08PKEaf-qj5dfqO2RjT0,272
  parsl/tests/test_python_apps/test_dependencies_deep.py,sha256=Cuow2LLGY7zffPFj89AOIwKlXxHtsin3v_UIhfdwV_w,1542
- parsl/tests/test_python_apps/test_depfail_propagation.py,sha256=3q3HlVWrOixFtXWBvR_ypKtbdAHAJcKndXQ5drwrBQU,1488
+ parsl/tests/test_python_apps/test_depfail_propagation.py,sha256=TSXBgcFSxqkaEeVl_cCfQfdCmCgTTRi2q2mSr2RH6Tc,2024
  parsl/tests/test_python_apps/test_fail.py,sha256=gMuZwxZNaUCaonlUX-7SOBvXg8kidkBcEeqKLEvqpYM,1692
  parsl/tests/test_python_apps/test_fibonacci_iterative.py,sha256=ly2s5HuB9R53Z2FM_zy0WWdOk01iVhgcwSpQyK6ErIY,573
  parsl/tests/test_python_apps/test_fibonacci_recursive.py,sha256=q7LMFcu_pJSNPdz8iY0UiRoIweEWIBGwMjQffHWAuDc,592
@@ -400,7 +400,7 @@ parsl/tests/test_scaling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZ
  parsl/tests/test_scaling/test_block_error_handler.py,sha256=OS1IyiK8gjRFI1VzpmOvEnKsPev2vKmC6Z2Hp5LaHpA,6068
  parsl/tests/test_scaling/test_regression_1621.py,sha256=e3-bkHR3d8LxA-uY0BugyWgYzksh00I_UbaA-jHOzKY,1872
  parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py,sha256=bjE_NIBoWK6heEz5LN0tzE1977vUA9kVemAYCqcIbzY,2942
- parsl/tests/test_scaling/test_regression_3696_oscillation.py,sha256=xbRY1sNmPvpliwg0nLDCS2JcIviVPHHCOe3y1W9iIlY,3637
+ parsl/tests/test_scaling/test_regression_3696_oscillation.py,sha256=gjf5DDX_X-iZtekDQffsa3DBw8_vWarQh5ztkxcSkX0,3675
  parsl/tests/test_scaling/test_scale_down.py,sha256=q_H6YAaID-n6Yj_FVElhufApzsbD08ItRopjgRBlDvU,2769
  parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py,sha256=2w4BxKyWXrow9PMRZzIFdiB1EVZ8YRTmqsP-RNoOx7Q,4525
  parsl/tests/test_scaling/test_scale_down_htex_unregistered.py,sha256=OrdnYmd58n7UfkANPJ7mzha4WSCPdbgJRX1O1Zdu0tI,1954
@@ -450,13 +450,13 @@ parsl/usage_tracking/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
  parsl/usage_tracking/api.py,sha256=iaCY58Dc5J4UM7_dJzEEs871P1p1HdxBMtNGyVdzc9g,1821
  parsl/usage_tracking/levels.py,sha256=xbfzYEsd55KiZJ-mzNgPebvOH4rRHum04hROzEf41tU,291
  parsl/usage_tracking/usage.py,sha256=hbMo5BYgIWqMcFWqN-HYP1TbwNrTonpv-usfwnCFJKY,9212
- parsl-2025.10.13.data/scripts/exec_parsl_function.py,sha256=YXKVVIa4zXmOtz-0Ca4E_5nQfN_3S2bh2tB75uZZB4w,7774
- parsl-2025.10.13.data/scripts/interchange.py,sha256=Kn0yJnpcRsc37gfhD6mGkoX9wD7vP_QgWst7qwUjj5o,26145
- parsl-2025.10.13.data/scripts/parsl_coprocess.py,sha256=zrVjEqQvFOHxsLufPi00xzMONagjVwLZbavPM7bbjK4,5722
- parsl-2025.10.13.data/scripts/process_worker_pool.py,sha256=euc3xPPw1zFdXVjgbSvyyIcvjcEZGXZTi0aSj23Vp-g,41370
- parsl-2025.10.13.dist-info/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
- parsl-2025.10.13.dist-info/METADATA,sha256=VL1Yq8GWBXD6N7tstvWAcC5Tfs1rCOX8ldffVN6HtCo,4007
- parsl-2025.10.13.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- parsl-2025.10.13.dist-info/entry_points.txt,sha256=XqnsWDYoEcLbsMcpnYGKLEnSBmaIe1YoM5YsBdJG2tI,176
- parsl-2025.10.13.dist-info/top_level.txt,sha256=PIheYoUFQtF2icLsgOykgU-Cjuwr2Oi6On2jo5RYgRM,6
- parsl-2025.10.13.dist-info/RECORD,,
+ parsl-2025.10.20.data/scripts/exec_parsl_function.py,sha256=YXKVVIa4zXmOtz-0Ca4E_5nQfN_3S2bh2tB75uZZB4w,7774
+ parsl-2025.10.20.data/scripts/interchange.py,sha256=Kn0yJnpcRsc37gfhD6mGkoX9wD7vP_QgWst7qwUjj5o,26145
+ parsl-2025.10.20.data/scripts/parsl_coprocess.py,sha256=zrVjEqQvFOHxsLufPi00xzMONagjVwLZbavPM7bbjK4,5722
+ parsl-2025.10.20.data/scripts/process_worker_pool.py,sha256=euc3xPPw1zFdXVjgbSvyyIcvjcEZGXZTi0aSj23Vp-g,41370
+ parsl-2025.10.20.dist-info/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
+ parsl-2025.10.20.dist-info/METADATA,sha256=00bQzNdWQ0pCl_MRkEY5s59WLk9r67BfY5t6LNALEqA,4007
+ parsl-2025.10.20.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ parsl-2025.10.20.dist-info/entry_points.txt,sha256=XqnsWDYoEcLbsMcpnYGKLEnSBmaIe1YoM5YsBdJG2tI,176
+ parsl-2025.10.20.dist-info/top_level.txt,sha256=PIheYoUFQtF2icLsgOykgU-Cjuwr2Oi6On2jo5RYgRM,6
+ parsl-2025.10.20.dist-info/RECORD,,