parsl 2025.9.8__py3-none-any.whl → 2025.11.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/app/bash.py +1 -1
- parsl/benchmark/perf.py +73 -17
- parsl/concurrent/__init__.py +95 -14
- parsl/curvezmq.py +0 -16
- parsl/data_provider/globus.py +3 -1
- parsl/dataflow/dflow.py +106 -204
- parsl/dataflow/memoization.py +146 -19
- parsl/dataflow/states.py +5 -5
- parsl/executors/base.py +2 -2
- parsl/executors/execute_task.py +2 -8
- parsl/executors/flux/executor.py +4 -6
- parsl/executors/globus_compute.py +0 -4
- parsl/executors/high_throughput/executor.py +86 -24
- parsl/executors/high_throughput/interchange.py +39 -20
- parsl/executors/high_throughput/mpi_executor.py +1 -2
- parsl/executors/high_throughput/mpi_resource_management.py +7 -14
- parsl/executors/high_throughput/process_worker_pool.py +32 -7
- parsl/executors/high_throughput/zmq_pipes.py +36 -67
- parsl/executors/radical/executor.py +2 -6
- parsl/executors/radical/rpex_worker.py +2 -2
- parsl/executors/taskvine/executor.py +5 -1
- parsl/executors/threads.py +5 -2
- parsl/jobs/states.py +2 -2
- parsl/jobs/strategy.py +7 -6
- parsl/monitoring/monitoring.py +2 -2
- parsl/monitoring/radios/filesystem.py +2 -1
- parsl/monitoring/radios/htex.py +2 -1
- parsl/monitoring/radios/multiprocessing.py +2 -1
- parsl/monitoring/radios/udp.py +2 -1
- parsl/multiprocessing.py +0 -49
- parsl/providers/base.py +24 -37
- parsl/providers/pbspro/pbspro.py +1 -1
- parsl/serialize/__init__.py +6 -9
- parsl/serialize/facade.py +0 -32
- parsl/tests/configs/local_threads_globus.py +18 -14
- parsl/tests/configs/taskvine_ex.py +1 -1
- parsl/tests/sites/test_concurrent.py +51 -3
- parsl/tests/test_checkpointing/test_periodic.py +15 -9
- parsl/tests/test_checkpointing/test_regression_233.py +0 -1
- parsl/tests/test_curvezmq.py +0 -42
- parsl/tests/test_execute_task.py +2 -11
- parsl/tests/test_htex/test_command_concurrency_regression_1321.py +54 -0
- parsl/tests/test_htex/test_htex.py +36 -1
- parsl/tests/test_htex/test_interchange_exit_bad_registration.py +2 -2
- parsl/tests/test_htex/test_priority_queue.py +26 -3
- parsl/tests/test_htex/test_zmq_binding.py +2 -1
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +18 -43
- parsl/tests/test_python_apps/test_basic.py +0 -14
- parsl/tests/test_python_apps/test_depfail_propagation.py +11 -1
- parsl/tests/test_python_apps/test_exception.py +19 -0
- parsl/tests/test_python_apps/test_garbage_collect.py +1 -6
- parsl/tests/test_python_apps/test_memoize_2.py +11 -1
- parsl/tests/test_regression/test_3874.py +47 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +1 -0
- parsl/tests/test_staging/test_staging_globus.py +2 -2
- parsl/tests/unit/test_globus_compute_executor.py +11 -2
- parsl/utils.py +8 -3
- parsl/version.py +1 -1
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/interchange.py +39 -20
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/process_worker_pool.py +32 -7
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/METADATA +64 -50
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/RECORD +68 -74
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/WHEEL +1 -1
- parsl/tests/configs/local_threads_checkpoint_periodic.py +0 -11
- parsl/tests/configs/local_threads_no_cache.py +0 -11
- parsl/tests/site_tests/test_provider.py +0 -88
- parsl/tests/site_tests/test_site.py +0 -70
- parsl/tests/test_aalst_patterns.py +0 -474
- parsl/tests/test_docs/test_workflow2.py +0 -42
- parsl/tests/test_error_handling/test_rand_fail.py +0 -171
- parsl/tests/test_regression/test_854.py +0 -62
- parsl/tests/test_serialization/test_pack_resource_spec.py +0 -23
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.9.8.data → parsl-2025.11.10.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/entry_points.txt +0 -0
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info/licenses}/LICENSE +0 -0
- {parsl-2025.9.8.dist-info → parsl-2025.11.10.dist-info}/top_level.txt +0 -0
parsl/dataflow/memoization.py
CHANGED
@@ -4,19 +4,18 @@ import hashlib
 import logging
 import os
 import pickle
+import threading
+import types
+from concurrent.futures import Future
 from functools import lru_cache, singledispatch
-from typing import Any, Dict, List, Optional, Sequence
+from typing import Any, Dict, List, Literal, Optional, Sequence

 import typeguard

 from parsl.dataflow.errors import BadCheckpoint
 from parsl.dataflow.taskrecord import TaskRecord
-
-
-from parsl import DataFlowKernel  # import loop at runtime - needed for typechecking - TODO turn into "if typing:"
-
-import types
-from concurrent.futures import Future
+from parsl.errors import ConfigurationError, InternalConsistencyError
+from parsl.utils import Timer, get_all_checkpoints

 logger = logging.getLogger(__name__)

@@ -150,19 +149,41 @@ class Memoizer:

     """

-
-    """Initialize the memoizer.
+    run_dir: str

-
-
+    def __init__(self, *,
+                 memoize: bool = True,
+                 checkpoint_files: Sequence[str] | None,
+                 checkpoint_period: Optional[str],
+                 checkpoint_mode: Literal['task_exit', 'periodic', 'dfk_exit', 'manual'] | None):
+        """Initialize the memoizer.

         KWargs:
             - memoize (Bool): enable memoization or not.
             - checkpoint (Dict): A checkpoint loaded as a dict.
         """
-        self.dfk = dfk
         self.memoize = memoize

+        self.checkpointed_tasks = 0
+
+        self.checkpoint_lock = threading.Lock()
+
+        self.checkpoint_files = checkpoint_files
+        self.checkpoint_mode = checkpoint_mode
+        self.checkpoint_period = checkpoint_period
+
+        self.checkpointable_tasks: List[TaskRecord] = []
+
+        self._checkpoint_timer: Timer | None = None
+
+    def start(self) -> None:
+        if self.checkpoint_files is not None:
+            checkpoint_files = self.checkpoint_files
+        elif self.checkpoint_files is None and self.checkpoint_mode is not None:
+            checkpoint_files = get_all_checkpoints(self.run_dir)
+        else:
+            checkpoint_files = []
+
         checkpoint = self.load_checkpoints(checkpoint_files)

         if self.memoize:

@@ -172,6 +193,26 @@ class Memoizer:
             logger.info("App caching disabled for all apps")
             self.memo_lookup_table = {}

+        if self.checkpoint_mode == "periodic":
+            if self.checkpoint_period is None:
+                raise ConfigurationError("Checkpoint period must be specified with periodic checkpoint mode")
+            else:
+                try:
+                    h, m, s = map(int, self.checkpoint_period.split(':'))
+                except Exception:
+                    raise ConfigurationError("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(self.checkpoint_period))
+                checkpoint_period = (h * 3600) + (m * 60) + s
+                self._checkpoint_timer = Timer(self.checkpoint_queue, interval=checkpoint_period, name="Checkpoint")
+
+    def close(self) -> None:
+        if self.checkpoint_mode is not None:
+            logger.info("Making final checkpoint")
+            self.checkpoint_queue()
+
+        if self._checkpoint_timer:
+            logger.info("Stopping checkpoint timer")
+            self._checkpoint_timer.close()
+
     def make_hash(self, task: TaskRecord) -> str:
         """Create a hash of the task inputs.

@@ -242,16 +283,20 @@ class Memoizer:
         assert isinstance(result, Future) or result is None
         return result

-    def update_memo(self, task: TaskRecord, r: Future[Any]) -> None:
+    def update_memo_result(self, task: TaskRecord, r: Any) -> None:
+        self._update_memo(task)
+
+    def update_memo_exception(self, task: TaskRecord, e: BaseException) -> None:
+        self._update_memo(task)
+
+    def _update_memo(self, task: TaskRecord) -> None:
         """Updates the memoization lookup table with the result from a task.
+        This doesn't move any values around but associates the memoization
+        hashsum with the completed (by success or failure) AppFuture.

         Args:
-             - task (task) : A task record from dfk.tasks
-             - r (Result future): Result future
+             - task (TaskRecord) : A task record from dfk.tasks
         """
-        # TODO: could use typeguard
-        assert isinstance(r, Future)
-
         task_id = task['id']

         if not self.memoize or not task['memoize'] or 'hashsum' not in task:

@@ -265,7 +310,7 @@ class Memoizer:
             logger.info(f"Replacing app cache entry {task['hashsum']} with result from task {task_id}")
         else:
             logger.debug(f"Storing app cache entry {task['hashsum']} with result from task {task_id}")
-        self.memo_lookup_table[task['hashsum']] = r
+        self.memo_lookup_table[task['hashsum']] = task['app_fu']

     def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[Any]]:
         """Load a checkpoint file into a lookup table.

@@ -334,3 +379,85 @@ class Memoizer:
             return self._load_checkpoints(checkpointDirs)
         else:
             return {}
+
+    def update_checkpoint(self, task_record: TaskRecord) -> None:
+        if self.checkpoint_mode == 'task_exit':
+            self.checkpoint_one(task=task_record)
+        elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
+            with self.checkpoint_lock:
+                self.checkpointable_tasks.append(task_record)
+        elif self.checkpoint_mode is None:
+            pass
+        else:
+            raise InternalConsistencyError(f"Invalid checkpoint mode {self.checkpoint_mode}")
+
+    def checkpoint_one(self, *, task: TaskRecord) -> None:
+        """Checkpoint a single task to a checkpoint file.
+
+        By default the checkpoints are written to the RUNDIR of the current
+        run under RUNDIR/checkpoints/tasks.pkl
+
+        Kwargs:
+            - task : A task to checkpoint.
+
+        .. note::
+            Checkpointing only works if memoization is enabled
+
+        """
+        with self.checkpoint_lock:
+            self._checkpoint_these_tasks([task])
+
+    def checkpoint_queue(self) -> None:
+        """Checkpoint all tasks registered in self.checkpointable_tasks.
+
+        By default the checkpoints are written to the RUNDIR of the current
+        run under RUNDIR/checkpoints/tasks.pkl
+
+        .. note::
+            Checkpointing only works if memoization is enabled
+
+        """
+        with self.checkpoint_lock:
+            self._checkpoint_these_tasks(self.checkpointable_tasks)
+            self.checkpointable_tasks = []
+
+    def _checkpoint_these_tasks(self, checkpoint_queue: List[TaskRecord]) -> None:
+        """Checkpoint a list of task records.
+
+        The checkpoint lock must be held when invoking this method.
+        """
+        checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
+        checkpoint_tasks = checkpoint_dir + '/tasks.pkl'
+
+        if not os.path.exists(checkpoint_dir):
+            os.makedirs(checkpoint_dir, exist_ok=True)
+
+        count = 0
+
+        with open(checkpoint_tasks, 'ab') as f:
+            for task_record in checkpoint_queue:
+                task_id = task_record['id']
+
+                app_fu = task_record['app_fu']
+
+                if app_fu.done() and app_fu.exception() is None:
+                    hashsum = task_record['hashsum']
+                    if not hashsum:
+                        continue
+                    t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
+
+                    # We are using pickle here since pickle dumps to a file in 'ab'
+                    # mode behave like a incremental log.
+                    pickle.dump(t, f)
+                    count += 1
+                    logger.debug("Task {} checkpointed".format(task_id))
+
+        self.checkpointed_tasks += count
+
+        if count == 0:
+            if self.checkpointed_tasks == 0:
+                logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
+            else:
+                logger.debug("No tasks checkpointed in this pass.")
+        else:
+            logger.info("Done checkpointing {} tasks".format(count))
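Taken together, the keyword-only constructor plus the new start()/close() methods move the checkpointing lifecycle into the Memoizer: start() resolves checkpoint files (falling back to get_all_checkpoints(run_dir)) and arms the periodic Timer, and close() takes a final checkpoint. A minimal lifecycle sketch against the signature shown above; standalone use like this is hypothetical (in the package the DataFlowKernel constructs and drives this object), and the run_dir path is invented:

    from parsl.dataflow.memoization import Memoizer

    memoizer = Memoizer(memoize=True,
                        checkpoint_files=None,         # start() then discovers prior checkpoints
                        checkpoint_period="00:30:00",  # HH:MM:SS, required for 'periodic' mode
                        checkpoint_mode="periodic")
    memoizer.run_dir = "/tmp/example_runinfo/000"  # run_dir is the class attribute declared above
    memoizer.start()       # loads checkpoints, starts the periodic Timer
    try:
        pass               # tasks would be hashed, memoized and checkpointed here
    finally:
        memoizer.close()   # final checkpoint_queue() pass, then stops the Timer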
parsl/dataflow/states.py
CHANGED
@@ -67,10 +67,10 @@ class States(IntEnum):
         return self.__class__.__name__ + "." + self.name


-FINAL_STATES = [States.exec_done, States.memo_done, States.failed, States.dep_fail]
-"""States from which we will never move to another state, because the job has
-either definitively completed or failed."""
-
-FINAL_FAILURE_STATES = [States.failed, States.dep_fail]
+FINAL_FAILURE_STATES = {States.failed, States.dep_fail}
 """States which are final and which indicate a failure. This must
 be a subset of FINAL_STATES"""
+
+FINAL_STATES = {States.exec_done, States.memo_done, *FINAL_FAILURE_STATES}
+"""States from which we will never move to another state, because the job has
+either definitively completed or failed."""
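The change swaps list literals for sets and builds FINAL_STATES from FINAL_FAILURE_STATES, so the documented subset invariant now holds by construction. A small illustration, using only the names shown in the diff:

    from parsl.dataflow.states import FINAL_FAILURE_STATES, FINAL_STATES, States

    # Membership tests read the same as before, but are now O(1) set lookups.
    assert States.dep_fail in FINAL_FAILURE_STATES
    assert States.exec_done in FINAL_STATES

    # "This must be a subset of FINAL_STATES" is now true by construction:
    assert FINAL_FAILURE_STATES <= FINAL_STATES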
parsl/executors/base.py
CHANGED
@@ -80,11 +80,11 @@ class ParslExecutor(metaclass=ABCMeta):
         self.shutdown()
         return False

-    @abstractmethod
     def start(self) -> None:
         """Start the executor.

-        Any spin-up operations (for example: starting thread pools) should be performed here.
+        By default, this does nothing, but this method should be overridden to
+        perform any spin-up operations (for example: starting thread pools).
         """
         pass
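With @abstractmethod dropped, start() becomes an optional hook: executors with no spin-up work can simply inherit the no-op, which is why the GlobusComputeExecutor override below goes away. A hypothetical subclass as a sketch, assuming submit() and shutdown() remain the required abstract surface:

    from concurrent.futures import Future

    from parsl.executors.base import ParslExecutor

    class SketchExecutor(ParslExecutor):
        """Hypothetical executor: no start() override needed after this change."""

        def submit(self, func, resource_specification, *args, **kwargs) -> Future:
            fut: Future = Future()
            fut.set_result(func(*args, **kwargs))  # run inline, purely for the sketch
            return fut

        def shutdown(self):
            pass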
parsl/executors/execute_task.py
CHANGED
@@ -1,17 +1,11 @@
-import os
-
-from parsl.serialize import unpack_res_spec_apply_message
+from parsl.serialize import unpack_apply_message


 def execute_task(bufs: bytes):
     """Deserialize the buffer and execute the task.
     Returns the result or throws exception.
     """
-    f, args, kwargs, resource_spec = unpack_res_spec_apply_message(bufs)
-
-    for varname in resource_spec:
-        envname = "PARSL_" + str(varname).upper()
-        os.environ[envname] = str(resource_spec[varname])
+    f, args, kwargs = unpack_apply_message(bufs)

     # We might need to look into callability of the function from itself
     # since we change it's name in the new namespace
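execute_task() no longer receives a resource specification, so the PARSL_* environment-variable export is gone from this path and the buffer round-trips through the plain pack/unpack pair. A minimal round-trip sketch, pairing it with the pack_apply_message import that appears elsewhere in this diff:

    from parsl.executors.execute_task import execute_task
    from parsl.serialize import pack_apply_message

    def add(x: int, y: int) -> int:
        return x + y

    # pack_apply_message is the serializing counterpart of the
    # unpack_apply_message call shown in the diff above.
    buf = pack_apply_message(add, (1, 2), {})
    assert execute_task(buf) == 3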
parsl/executors/flux/executor.py
CHANGED
@@ -24,7 +24,7 @@ from parsl.executors.flux.execute_parsl_task import __file__ as _WORKER_PATH
 from parsl.executors.flux.flux_instance_manager import __file__ as _MANAGER_PATH
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
-from parsl.serialize import deserialize, pack_res_spec_apply_message
+from parsl.serialize import deserialize, pack_apply_message
 from parsl.serialize.errors import SerializationError
 from parsl.utils import RepresentationMixin

@@ -224,7 +224,7 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
         # add a ``weakref.finalize()`` function for joining the executor thread
         weakref.finalize(
             self,
-            lambda x, y: x.set() or y.join(),
+            lambda x, y: x.set() or y.join(),  # type: ignore[func-returns-value]
             self._stop_event,
             self._submission_thread,
         )

@@ -284,10 +284,8 @@ class FluxExecutor(ParslExecutor, RepresentationMixin):
         infile = os.path.join(self.working_dir, f"{task_id}_in{os.extsep}pkl")
         outfile = os.path.join(self.working_dir, f"{task_id}_out{os.extsep}pkl")
         try:
-            fn_buf = pack_res_spec_apply_message(
-                func, args, kwargs,
-                resource_specification={},
-                buffer_threshold=1024 * 1024
+            fn_buf = pack_apply_message(
+                func, args, kwargs, buffer_threshold=1 << 20,
             )
         except TypeError:
             raise SerializationError(func.__name__)
parsl/executors/globus_compute.py
CHANGED

@@ -76,10 +76,6 @@ class GlobusComputeExecutor(ParslExecutor, RepresentationMixin):
         self.storage_access = storage_access
         self.working_dir = working_dir

-    def start(self) -> None:
-        """ Start the Globus Compute Executor """
-        super().start()
-
     def submit(self, func: Callable, resource_specification: Dict[str, Any], *args: Any, **kwargs: Any) -> Future:
         """ Submit func to globus-compute
parsl/executors/high_throughput/executor.py
CHANGED

@@ -35,7 +35,7 @@ from parsl.monitoring.radios.zmq_router import ZMQRadioReceiver, start_zmq_receiver
 from parsl.process_loggers import wrap_with_logs
 from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider
-from parsl.serialize import deserialize, pack_res_spec_apply_message
+from parsl.serialize import deserialize, pack_apply_message
 from parsl.serialize.errors import DeserializationError, SerializationError
 from parsl.usage_tracking.api import UsageInformation
 from parsl.utils import RepresentationMixin

@@ -160,6 +160,12 @@ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
 """  # Documentation for params used by both HTEx and MPIEx


+class HTEXFuture(Future):
+    def __init__(self, task_id) -> None:
+        super().__init__()
+        self.parsl_executor_task_id = task_id
+
+
 class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
     __doc__ = f"""Executor designed for cluster-scale

@@ -237,7 +243,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
     @typeguard.typechecked
     def __init__(self,
                  label: str = 'HighThroughputExecutor',
-                 provider: ExecutionProvider = LocalProvider(),
+                 provider: Optional[ExecutionProvider] = None,
                  launch_cmd: Optional[str] = None,
                  interchange_launch_cmd: Optional[Sequence[str]] = None,
                  address: Optional[str] = None,

@@ -267,7 +273,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):

         logger.debug("Initializing HighThroughputExecutor")

-        BlockProviderExecutor.__init__(self, provider=provider, block_error_handler=block_error_handler)
+        BlockProviderExecutor.__init__(self,
+                                       provider=provider if provider else LocalProvider(),
+                                       block_error_handler=block_error_handler)
         self.label = label
         self.worker_debug = worker_debug
         self.storage_access = storage_access

@@ -332,6 +340,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         self.encrypted = encrypted
         self.cert_dir = None

+        # This flag will enable/disable internal Python mismatch checks
+        # between the interchange and worker managers. This serves as a
+        # temporary workaround for Globus Compute to support different
+        # Python versions at the endpoint and worker layers. We can drop
+        # the flag once we implement modular internal message protocols.
+        self._check_python_mismatch: bool = True
+
         if not launch_cmd:
             launch_cmd = DEFAULT_LAUNCH_CMD
         self.launch_cmd = launch_cmd

@@ -494,10 +509,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
         else:

             for serialized_msg in msgs:
-                try:
-                    msg = pickle.loads(serialized_msg)
-                except pickle.UnpicklingError:
-                    raise BadMessage("Message received could not be unpickled")
+                msg = pickle.loads(serialized_msg)

                 if msg['type'] == 'result':
                     try:

@@ -568,6 +580,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
                 "cert_dir": self.cert_dir,
                 "manager_selector": self.manager_selector,
                 "run_id": self.run_id,
+                "_check_python_mismatch": self._check_python_mismatch,
         }

         config_pickle = pickle.dumps(interchange_config)

@@ -663,7 +676,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
             logger.debug("Sending hold to manager: {}".format(manager['manager']))
             self._hold_manager(manager['manager'])

-    def submit(self, func, resource_specification, *args, **kwargs):
+    def submit(self, func: Callable, resource_specification: dict, *args, **kwargs) -> HTEXFuture:
         """Submits work to the outgoing_q.

         The outgoing_q is an external process listens on this

@@ -684,34 +697,83 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):

         self.validate_resource_spec(resource_specification)

-        if self.bad_state_is_set:
-            raise self.executor_exception
-
-        self._task_counter += 1
-        task_id = self._task_counter
-
         # handle people sending blobs gracefully
         if logger.getEffectiveLevel() <= logging.DEBUG:
             args_to_print = tuple([ar if len(ar := repr(arg)) < 100 else (ar[:100] + '...') for arg in args])
             logger.debug("Pushing function {} to queue with args {}".format(func, args_to_print))

-        fut = Future()
-        fut.parsl_executor_task_id = task_id
-        self.tasks[task_id] = fut
-
         try:
-            fn_buf = pack_res_spec_apply_message(func, args, kwargs,
-                                                 resource_specification=resource_specification,
-                                                 buffer_threshold=1024 * 1024)
+            fn_buf = pack_apply_message(func, args, kwargs, buffer_threshold=1 << 20)
         except TypeError:
             raise SerializationError(func.__name__)

-        msg = {"task_id": task_id, "resource_spec": resource_specification, "buffer": fn_buf}
+        context = {}
+        if resource_specification:
+            context["resource_spec"] = resource_specification
+
+        return self.submit_payload(context, fn_buf)
+
+    def submit_payload(self, context: dict, buffer: bytes) -> HTEXFuture:
+        """
+        Submit specially crafted payloads.
+
+        For use-cases where the ``HighThroughputExecutor`` consumer needs the payload
+        handled by the worker in a special way.  For example, if the function is
+        serialized differently than Parsl's default approach, or if the task must
+        be setup more precisely than Parsl's default ``execute_task`` allows.
+
+        An example interaction:
+
+        .. code-block: python
+
+            >>> htex: HighThroughputExecutor  # setup prior to this example
+            >>> ctxt = {
+            ...     "task_executor": {
+            ...         "f": "full.import.path.of.custom_execute_task",
+            ...         "a": ("additional", "arguments"),
+            ...         "k": {"some": "keyword", "args": "here"}
+            ...     }
+            ... }
+            >>> fn_buf = custom_serialize(task_func, *task_args, **task_kwargs)
+            >>> fut = htex.submit_payload(ctxt, fn_buf)
+
+        The custom ``custom_execute_task`` would be dynamically imported, and
+        invoked as:
+
+        .. code-block: python
+
+            args = ("additional", "arguments")
+            kwargs = {"some": "keyword", "args": "here"}
+            result = custom_execute_task(fn_buf, *args, **kwargs)
+
+        Parameters
+        ----------
+        context:
+            A task-specific context associated with the function buffer.  Parsl
+            currently implements the keys ``task_executor`` and ``resource_spec``
+
+        buffer:
+            A serialized function, that will be deserialized and executed by
+            ``execute_task`` (or custom function, if ``task_executor`` is specified)
+
+        Returns
+        -------
+        An HTEXFuture (a normal Future, with the attribute ``.parsl_executor_task_id``
+        set).  The future will be set to done when the associated function buffer has
+        been invoked and completed.
+        """
+        if self.bad_state_is_set:
+            raise self.executor_exception
+
+        self._task_counter += 1
+        task_id = self._task_counter
+
+        fut = HTEXFuture(task_id)
+        self.tasks[task_id] = fut

-        # Post task to the outgoing queue
+        msg = {"task_id": task_id, "context": context, "buffer": buffer}
         self.outgoing_q.put(msg)

-        # Return the future
         return fut

     @property
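The ad-hoc parsl_executor_task_id attribute formerly bolted onto a plain Future is now formalized as the HTEXFuture subclass, which both submit() and submit_payload() return. A standalone sketch of the same pattern, runnable without an executor:

    from concurrent.futures import Future

    class HTEXFuture(Future):
        """Mirror of the subclass added above: a Future carrying its task id."""
        def __init__(self, task_id) -> None:
            super().__init__()
            self.parsl_executor_task_id = task_id

    fut = HTEXFuture(42)
    fut.set_result("done")
    assert fut.parsl_executor_task_id == 42
    assert fut.result() == "done"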
parsl/executors/high_throughput/interchange.py
CHANGED

@@ -23,7 +23,6 @@ from parsl.monitoring.radios.base import MonitoringRadioSender
 from parsl.monitoring.radios.zmq import ZMQRadioSender
 from parsl.process_loggers import wrap_with_logs
 from parsl.serialize import serialize as serialize_object
-from parsl.utils import setproctitle
 from parsl.version import VERSION as PARSL_VERSION

 PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)

@@ -56,6 +55,7 @@ class Interchange:
         cert_dir: Optional[str],
         manager_selector: ManagerSelector,
         run_id: str,
+        _check_python_mismatch: bool,
     ) -> None:
         """
         Parameters

@@ -99,6 +99,11 @@ class Interchange:

         cert_dir : str | None
             Path to the certificate directory.
+
+        _check_python_mismatch : bool
+            If True, the interchange and worker managers must run the same version of
+            Python.  Running different versions can cause inter-process communication
+            errors, so proceed with caution.
         """
         self.cert_dir = cert_dir
         self.logdir = logdir

@@ -126,15 +131,13 @@ class Interchange:
         logger.info("Connected to client")

         self.run_id = run_id
+        self._check_python_mismatch = _check_python_mismatch

         self.hub_address = hub_address
         self.hub_zmq_port = hub_zmq_port

         self.pending_task_queue: SortedList[Any] = SortedList(key=lambda tup: (tup[0], tup[1]))

-        # count of tasks that have been received from the submit side
-        self.task_counter = 0
-
         # count of tasks that have been sent out to worker pools
         self.count = 0

@@ -157,6 +160,7 @@ class Interchange:
         logger.info(f"Bound to port {worker_port} for incoming worker connections")

         self._ready_managers: Dict[bytes, ManagerRecord] = {}
+        self._logged_manager_count_token: object = None
         self.connected_block_history: List[str] = []

         self.heartbeat_threshold = heartbeat_threshold

@@ -213,7 +217,7 @@ class Interchange:

         reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)

-        if self.command_channel in self.socks and self.socks[self.command_channel] == zmq.POLLIN:
+        if self.socks.get(self.command_channel) == zmq.POLLIN:
             logger.debug("entering command_server section")

             command_req = self.command_channel.recv_pyobj()

@@ -310,6 +314,7 @@ class Interchange:
             self.process_manager_socket_message(interesting_managers, monitoring_radio, kill_event)
             self.expire_bad_managers(interesting_managers, monitoring_radio)
             self.expire_drained_managers(interesting_managers, monitoring_radio)
+            self.log_manager_counts(interesting_managers)
             self.process_tasks_to_send(interesting_managers, monitoring_radio)

         self.zmq_context.destroy()

@@ -321,20 +326,20 @@ class Interchange:
         """Process incoming task message(s).
         """

-        if self.task_incoming in self.socks and self.socks[self.task_incoming] == zmq.POLLIN:
+        if self.socks.get(self.task_incoming) == zmq.POLLIN:
             logger.debug("start task_incoming section")
             msg = self.task_incoming.recv_pyobj()

             # Process priority, higher number = lower priority
-            resource_spec = msg.get('resource_spec', {})
+            task_id = msg['task_id']
+            resource_spec = msg['context'].get('resource_spec', {})
             priority = resource_spec.get('priority', float('inf'))
-            queue_entry = (-priority, -
+            queue_entry = (-priority, -task_id, msg)

-            logger.debug("
+            logger.debug("Putting task %s onto pending_task_queue", task_id)

             self.pending_task_queue.add(queue_entry)
-            self.task_counter += 1
-            logger.debug(f"Fetched {self.task_counter} tasks so far")
+            logger.debug("Put task %s onto pending_task_queue", task_id)

     def process_manager_socket_message(
         self,

@@ -354,9 +359,10 @@ class Interchange:
             mtype = meta['type']
         except Exception as e:
             logger.warning(
-                f'Failed to read manager message ([{type(e).__name__}] {e})'
+                'Failed to read manager message; ignoring message'
+                f' (Exception: [{type(e).__name__}] {e})'
             )
-            logger.debug('
+            logger.debug('Raw message bytes:\n  %r\n', msg_parts, exc_info=e)
             return

         logger.debug(

@@ -396,7 +402,9 @@ class Interchange:
             logger.info(f'Registration info for manager {manager_id!r}: {meta}')
             self._send_monitoring_info(monitoring_radio, new_rec)

-            if (mgr_minor_py, mgr_parsl_v) != (ix_minor_py, ix_parsl_v):
+            python_mismatch: bool = ix_minor_py != mgr_minor_py
+            parsl_mismatch: bool = ix_parsl_v != mgr_parsl_v
+            if parsl_mismatch or (self._check_python_mismatch and python_mismatch):
                 kill_event.set()
                 vm_exc = VersionMismatch(
                     f"py.v={ix_minor_py} parsl.v={ix_parsl_v}",

@@ -517,15 +525,24 @@ class Interchange:
                 m['active'] = False
                 self._send_monitoring_info(monitoring_radio, m)

+    def log_manager_counts(self, interesting_managers: Set[bytes]) -> None:
+        count_interesting = len(interesting_managers)
+        count_ready = len(self._ready_managers)
+
+        new_logged_manager_count_token = (count_interesting, count_ready)
+
+        if self._logged_manager_count_token != new_logged_manager_count_token:
+
+            logger.debug(
+                "Managers count (interesting/total): %d/%d",
+                count_interesting,
+                count_ready
+            )
+            self._logged_manager_count_token = new_logged_manager_count_token
+
     def process_tasks_to_send(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
         # Check if there are tasks that could be sent to managers

-        logger.debug(
-            "Managers count (interesting/total): %d/%d",
-            len(interesting_managers),
-            len(self._ready_managers)
-        )
-
         if interesting_managers and self.pending_task_queue:
             shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)

@@ -618,6 +635,8 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:


 if __name__ == "__main__":
+    from parsl.utils import setproctitle
+
     setproctitle("parsl: HTEX interchange")

     config = pickle.load(sys.stdin.buffer)
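log_manager_counts() replaces the unconditional per-loop debug line with change detection: the (interesting, ready) pair is kept as a token and a line is logged only when it changes, which keeps the interchange log quiet in steady state. The same pattern in isolation, as a runnable sketch:

    import logging

    logger = logging.getLogger("sketch")

    class CountLogger:
        """Standalone sketch of the token pattern used by log_manager_counts."""

        def __init__(self) -> None:
            self._token: object = None  # None never equals an (int, int) tuple

        def log(self, interesting: int, ready: int) -> None:
            token = (interesting, ready)
            if self._token != token:
                logger.debug("Managers count (interesting/total): %d/%d", interesting, ready)
                self._token = token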
parsl/executors/high_throughput/mpi_executor.py
CHANGED

@@ -16,7 +16,6 @@ from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.jobs.states import JobStatus
 from parsl.launchers import SimpleLauncher
 from parsl.monitoring.radios.base import RadioConfig
-from parsl.providers import LocalProvider
 from parsl.providers.base import ExecutionProvider


@@ -47,7 +46,7 @@ class MPIExecutor(HighThroughputExecutor):
     @typeguard.typechecked
     def __init__(self,
                  label: str = 'MPIExecutor',
-                 provider: ExecutionProvider = LocalProvider(),
+                 provider: Optional[ExecutionProvider] = None,
                  launch_cmd: Optional[str] = None,
                  interchange_launch_cmd: Optional[str] = None,
                  address: Optional[str] = None,