parsl 2025.1.13__py3-none-any.whl → 2025.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/configs/gc_multisite.py +27 -0
- parsl/configs/gc_tutorial.py +18 -0
- parsl/dataflow/dflow.py +23 -103
- parsl/dataflow/errors.py +60 -18
- parsl/dataflow/memoization.py +76 -2
- parsl/dataflow/taskrecord.py +1 -3
- parsl/executors/__init__.py +3 -1
- parsl/executors/globus_compute.py +125 -0
- parsl/executors/high_throughput/errors.py +1 -1
- parsl/executors/high_throughput/executor.py +16 -15
- parsl/executors/high_throughput/interchange.py +74 -96
- parsl/executors/high_throughput/zmq_pipes.py +0 -1
- parsl/tests/configs/globus_compute.py +20 -0
- parsl/tests/conftest.py +4 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +0 -3
- parsl/tests/test_error_handling/test_resource_spec.py +3 -0
- parsl/tests/test_htex/test_interchange_exit_bad_registration.py +120 -0
- parsl/tests/test_htex/test_resource_spec_validation.py +0 -7
- parsl/tests/test_python_apps/test_dep_standard_futures.py +3 -0
- parsl/tests/test_python_apps/test_fail.py +23 -8
- parsl/tests/test_python_apps/test_join.py +6 -0
- parsl/tests/test_python_apps/test_memoize_1.py +0 -1
- parsl/tests/unit/test_globus_compute_executor.py +104 -0
- parsl/usage_tracking/usage.py +13 -8
- parsl/version.py +1 -1
- {parsl-2025.1.13.data → parsl-2025.1.27.data}/scripts/interchange.py +74 -96
- {parsl-2025.1.13.dist-info → parsl-2025.1.27.dist-info}/METADATA +5 -2
- {parsl-2025.1.13.dist-info → parsl-2025.1.27.dist-info}/RECORD +35 -30
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -42
- {parsl-2025.1.13.data → parsl-2025.1.27.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.1.13.data → parsl-2025.1.27.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.1.13.data → parsl-2025.1.27.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2025.1.13.dist-info → parsl-2025.1.27.dist-info}/LICENSE +0 -0
- {parsl-2025.1.13.dist-info → parsl-2025.1.27.dist-info}/WHEEL +0 -0
- {parsl-2025.1.13.dist-info → parsl-2025.1.27.dist-info}/entry_points.txt +0 -0
- {parsl-2025.1.13.dist-info → parsl-2025.1.27.dist-info}/top_level.txt +0 -0
parsl/configs/gc_multisite.py
ADDED
@@ -0,0 +1,27 @@
+from globus_compute_sdk import Executor
+
+from parsl.config import Config
+from parsl.executors import GlobusComputeExecutor
+from parsl.usage_tracking.levels import LEVEL_1
+
+# Please start your own endpoint on perlmutter following instructions below to use this config:
+# https://globus-compute.readthedocs.io/en/stable/endpoints/endpoint_examples.html#perlmutter-nersc
+perlmutter_endpoint = 'YOUR_PERLMUTTER_ENDPOINT_UUID'
+
+# Please start your own endpoint on expanse following instructions below to use this config:
+# https://globus-compute.readthedocs.io/en/stable/endpoints/endpoint_examples.html#expanse-sdsc
+expanse_endpoint = 'YOUR_EXPANSE_ENDPOINT_UUID'
+
+config = Config(
+    executors=[
+        GlobusComputeExecutor(
+            executor=Executor(endpoint_id=perlmutter_endpoint),
+            label="Perlmutter",
+        ),
+        GlobusComputeExecutor(
+            executor=Executor(endpoint_id=expanse_endpoint),
+            label="Expanse",
+        ),
+    ],
+    usage_tracking=LEVEL_1,
+)
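A brief usage sketch (not part of the diff) for the multi-site config above; it assumes the two placeholder endpoint UUIDs have been replaced with real endpoints, and uses Parsl's standard executors= label targeting on python_app:

import parsl
from parsl import python_app
from parsl.configs.gc_multisite import config


@python_app(executors=['Perlmutter'])
def on_perlmutter():
    import socket
    return socket.gethostname()


@python_app(executors=['Expanse'])
def on_expanse():
    import socket
    return socket.gethostname()


with parsl.load(config):
    # Each app is routed to the executor whose label matches.
    print(on_perlmutter().result(), on_expanse().result())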
parsl/configs/gc_tutorial.py
ADDED
@@ -0,0 +1,18 @@
+from globus_compute_sdk import Executor
+
+from parsl.config import Config
+from parsl.executors import GlobusComputeExecutor
+from parsl.usage_tracking.levels import LEVEL_1
+
+# Public tutorial endpoint
+tutorial_endpoint = '4b116d3c-1703-4f8f-9f6f-39921e5864df'
+
+config = Config(
+    executors=[
+        GlobusComputeExecutor(
+            executor=Executor(endpoint_id=tutorial_endpoint),
+            label="Tutorial_Endpoint_py3.11",
+        )
+    ],
+    usage_tracking=LEVEL_1,
+)
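And a minimal sketch (again not part of the diff) of driving the tutorial config from a client; parsl.load and python_app are standard Parsl APIs, while actual execution depends on the public tutorial endpoint being reachable:

import parsl
from parsl import python_app
from parsl.configs.gc_tutorial import config


@python_app
def platform_info():
    import platform
    return platform.platform()


with parsl.load(config):
    # The app runs remotely on the tutorial endpoint; the result is fetched locally.
    print(platform_info().result())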
parsl/dataflow/dflow.py
CHANGED
@@ -28,7 +28,7 @@ from parsl.config import Config
 from parsl.data_provider.data_manager import DataManager
 from parsl.data_provider.files import File
 from parsl.dataflow.dependency_resolvers import SHALLOW_DEPENDENCY_RESOLVER
-from parsl.dataflow.errors import
+from parsl.dataflow.errors import DependencyError, JoinError
 from parsl.dataflow.futures import AppFuture
 from parsl.dataflow.memoization import Memoizer
 from parsl.dataflow.rundirs import make_rundir
@@ -161,13 +161,13 @@ class DataFlowKernel:
                                  workflow_info))

         if config.checkpoint_files is not None:
-
+            checkpoint_files = config.checkpoint_files
         elif config.checkpoint_files is None and config.checkpoint_mode is not None:
-
+            checkpoint_files = get_all_checkpoints(self.run_dir)
         else:
-
+            checkpoint_files = []

-        self.memoizer = Memoizer(self, memoize=config.app_cache,
+        self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
         self.checkpointed_tasks = 0
         self._checkpoint_timer = None
         self.checkpoint_mode = config.checkpoint_mode
@@ -484,24 +484,18 @@ class DataFlowKernel:

         # now we know each joinable Future is done
         # so now look for any exceptions
-        exceptions_tids: List[Tuple[BaseException,
+        exceptions_tids: List[Tuple[BaseException, str]]
         exceptions_tids = []
         if isinstance(joinable, Future):
             je = joinable.exception()
             if je is not None:
-
-                tid = joinable.task_record['id']
-            else:
-                tid = None
+                tid = self.render_future_description(joinable)
             exceptions_tids = [(je, tid)]
         elif isinstance(joinable, list):
             for future in joinable:
                 je = future.exception()
                 if je is not None:
-
-                    tid = joinable.task_record['id']
-                else:
-                    tid = None
+                    tid = self.render_future_description(future)
                 exceptions_tids.append((je, tid))
         else:
             raise TypeError(f"Unknown joinable type {type(joinable)}")
@@ -918,13 +912,7 @@ class DataFlowKernel:
         dep_failures = []

         def append_failure(e: Exception, dep: Future) -> None:
-
-            # then refer to the task ID.
-            # Otherwise make a repr of the Future object.
-            if hasattr(dep, 'task_record') and dep.task_record['dfk'] == self:
-                tid = "task " + repr(dep.task_record['id'])
-            else:
-                tid = repr(dep)
+            tid = self.render_future_description(dep)
             dep_failures.extend([(e, tid)])

         # Replace item in args
@@ -1076,10 +1064,7 @@ class DataFlowKernel:

         depend_descs = []
         for d in depends:
-
-                depend_descs.append("task {}".format(d.tid))
-            else:
-                depend_descs.append(repr(d))
+            depend_descs.append(self.render_future_description(d))

         if depend_descs != []:
             waiting_message = "waiting on {}".format(", ".join(depend_descs))
@@ -1215,10 +1200,8 @@ class DataFlowKernel:
             self._checkpoint_timer.close()

         # Send final stats
-        logger.info("Sending end message for usage tracking")
         self.usage_tracker.send_end_message()
         self.usage_tracker.close()
-        logger.info("Closed usage tracking")

         logger.info("Closing job status poller")
         self.job_status_poller.close()
@@ -1280,7 +1263,7 @@ class DataFlowKernel:
         Returns:
             Checkpoint dir if checkpoints were written successfully.
             By default the checkpoints are written to the RUNDIR of the current
-            run under RUNDIR/checkpoints/
+            run under RUNDIR/checkpoints/tasks.pkl
         """
         with self.checkpoint_lock:
             if tasks:
@@ -1290,18 +1273,11 @@ class DataFlowKernel:
                 self.checkpointable_tasks = []

             checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
-            checkpoint_dfk = checkpoint_dir + '/dfk.pkl'
             checkpoint_tasks = checkpoint_dir + '/tasks.pkl'

             if not os.path.exists(checkpoint_dir):
                 os.makedirs(checkpoint_dir, exist_ok=True)

-            with open(checkpoint_dfk, 'wb') as f:
-                state = {'rundir': self.run_dir,
-                         'task_count': self.task_count
-                         }
-                pickle.dump(state, f)
-
             count = 0

             with open(checkpoint_tasks, 'ab') as f:
@@ -1334,74 +1310,6 @@ class DataFlowKernel:

         return checkpoint_dir

-    def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[Any]]:
-        """Load a checkpoint file into a lookup table.
-
-        The data being loaded from the pickle file mostly contains input
-        attributes of the task: func, args, kwargs, env...
-        To simplify the check of whether the exact task has been completed
-        in the checkpoint, we hash these input params and use it as the key
-        for the memoized lookup table.
-
-        Args:
-            - checkpointDirs (list) : List of filepaths to checkpoints
-               Eg. ['runinfo/001', 'runinfo/002']
-
-        Returns:
-            - memoized_lookup_table (dict)
-        """
-        memo_lookup_table = {}
-
-        for checkpoint_dir in checkpointDirs:
-            logger.info("Loading checkpoints from {}".format(checkpoint_dir))
-            checkpoint_file = os.path.join(checkpoint_dir, 'tasks.pkl')
-            try:
-                with open(checkpoint_file, 'rb') as f:
-                    while True:
-                        try:
-                            data = pickle.load(f)
-                            # Copy and hash only the input attributes
-                            memo_fu: Future = Future()
-                            assert data['exception'] is None
-                            memo_fu.set_result(data['result'])
-                            memo_lookup_table[data['hash']] = memo_fu
-
-                        except EOFError:
-                            # Done with the checkpoint file
-                            break
-            except FileNotFoundError:
-                reason = "Checkpoint file was not found: {}".format(
-                    checkpoint_file)
-                logger.error(reason)
-                raise BadCheckpoint(reason)
-            except Exception:
-                reason = "Failed to load checkpoint: {}".format(
-                    checkpoint_file)
-                logger.error(reason)
-                raise BadCheckpoint(reason)
-
-            logger.info("Completed loading checkpoint: {0} with {1} tasks".format(checkpoint_file,
-                                                                                  len(memo_lookup_table.keys())))
-        return memo_lookup_table
-
-    @typeguard.typechecked
-    def load_checkpoints(self, checkpointDirs: Optional[Sequence[str]]) -> Dict[str, Future]:
-        """Load checkpoints from the checkpoint files into a dictionary.
-
-        The results are used to pre-populate the memoizer's lookup_table
-
-        Kwargs:
-            - checkpointDirs (list) : List of run folder to use as checkpoints
-              Eg. ['runinfo/001', 'runinfo/002']
-
-        Returns:
-            - dict containing, hashed -> future mappings
-        """
-        if checkpointDirs:
-            return self._load_checkpoints(checkpointDirs)
-        else:
-            return {}
-
     @staticmethod
     def _log_std_streams(task_record: TaskRecord) -> None:
         tid = task_record['id']
@@ -1438,6 +1346,18 @@ class DataFlowKernel:
                               '' if label is None else '_{}'.format(label),
                               kw))

+    def render_future_description(self, dep: Future) -> str:
+        """Renders a description of the future in the context of the
+        current DFK.
+        """
+        if isinstance(dep, AppFuture) and dep.task_record['dfk'] == self:
+            tid = "task " + repr(dep.task_record['id'])
+        elif isinstance(dep, DataFuture):
+            tid = "DataFuture from task " + repr(dep.tid)
+        else:
+            tid = repr(dep)
+        return tid
+

 class DataFlowKernelLoader:
     """Manage which DataFlowKernel is active.
parsl/dataflow/errors.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import
+from typing import List, Sequence, Tuple

 from parsl.errors import ParslError

@@ -29,35 +29,77 @@ class BadCheckpoint(DataFlowException):
         return self.reason


-class
-    """Error raised if an app
-    in a
+class PropagatedException(DataFlowException):
+    """Error raised if an app fails because there was an error
+    in a related task. This is intended to be subclassed for
+    dependency and join_app errors.

     Args:
-        - dependent_exceptions_tids: List of exceptions and
-          dependencies which failed. The
-          the repr of a non-
+        - dependent_exceptions_tids: List of exceptions and brief descriptions
+          for dependencies which failed. The description might be a task ID or
+          the repr of a non-AppFuture.
         - task_id: Task ID of the task that failed because of the dependency error
     """

-    def __init__(self,
+    def __init__(self,
+                 dependent_exceptions_tids: Sequence[Tuple[BaseException, str]],
+                 task_id: int,
+                 *,
+                 failure_description: str) -> None:
         self.dependent_exceptions_tids = dependent_exceptions_tids
         self.task_id = task_id
+        self._failure_description = failure_description
+
+        (cause, cause_sequence) = self._find_any_root_cause()
+        self.__cause__ = cause
+        self._cause_sequence = cause_sequence

     def __str__(self) -> str:
-
-        return f"
+        sequence_text = " <- ".join(self._cause_sequence)
+        return f"{self._failure_description} for task {self.task_id}. " \
+               f"The representative cause is via {sequence_text}"
+
+    def _find_any_root_cause(self) -> Tuple[BaseException, List[str]]:
+        """Looks recursively through self.dependent_exceptions_tids to find
+        an exception that caused this propagated error, that is not itself
+        a propagated error.
+        """
+        e: BaseException = self
+        dep_ids = []
+        while isinstance(e, PropagatedException) and len(e.dependent_exceptions_tids) >= 1:
+            id_txt = e.dependent_exceptions_tids[0][1]
+            assert isinstance(id_txt, str)
+            # if there are several causes for this exception, label that
+            # there are more so that we know that the representative fail
+            # sequence is not the full story.
+            if len(e.dependent_exceptions_tids) > 1:
+                id_txt += " (+ others)"
+            dep_ids.append(id_txt)
+            e = e.dependent_exceptions_tids[0][0]
+        return e, dep_ids
+
+
+class DependencyError(PropagatedException):
+    """Error raised if an app cannot run because there was an error
+    in a dependency. There can be several exceptions (one from each
+    dependency) and DependencyError collects them all together.

+    Args:
+        - dependent_exceptions_tids: List of exceptions and brief descriptions
+          for dependencies which failed. The description might be a task ID or
+          the repr of a non-AppFuture.
+        - task_id: Task ID of the task that failed because of the dependency error
+    """
+    def __init__(self, dependent_exceptions_tids: Sequence[Tuple[BaseException, str]], task_id: int) -> None:
+        super().__init__(dependent_exceptions_tids, task_id,
+                         failure_description="Dependency failure")

-
+
+class JoinError(PropagatedException):
     """Error raised if apps joining into a join_app raise exceptions.
     There can be several exceptions (one from each joining app),
     and JoinError collects them all together.
     """
-    def __init__(self, dependent_exceptions_tids: Sequence[Tuple[BaseException,
-
-
-
-    def __str__(self) -> str:
-        dep_tids = [tid for (exception, tid) in self.dependent_exceptions_tids]
-        return "Join failure for task {} with failed join dependencies from tasks {}".format(self.task_id, dep_tids)
+    def __init__(self, dependent_exceptions_tids: Sequence[Tuple[BaseException, str]], task_id: int) -> None:
+        super().__init__(dependent_exceptions_tids, task_id,
+                         failure_description="Join failure")
parsl/dataflow/memoization.py
CHANGED
@@ -2,10 +2,14 @@ from __future__ import annotations

 import hashlib
 import logging
+import os
 import pickle
 from functools import lru_cache, singledispatch
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence

+import typeguard
+
+from parsl.dataflow.errors import BadCheckpoint
 from parsl.dataflow.taskrecord import TaskRecord

 if TYPE_CHECKING:
@@ -146,7 +150,7 @@ class Memoizer:

     """

-    def __init__(self, dfk: DataFlowKernel, memoize: bool = True,
+    def __init__(self, dfk: DataFlowKernel, *, memoize: bool = True, checkpoint_files: Sequence[str]):
         """Initialize the memoizer.

         Args:
@@ -159,6 +163,8 @@ class Memoizer:
         self.dfk = dfk
         self.memoize = memoize

+        checkpoint = self.load_checkpoints(checkpoint_files)
+
         if self.memoize:
             logger.info("App caching initialized")
             self.memo_lookup_table = checkpoint
@@ -274,3 +280,71 @@ class Memoizer:
         else:
             logger.debug(f"Storing app cache entry {task['hashsum']} with result from task {task_id}")
             self.memo_lookup_table[task['hashsum']] = r
+
+    def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[Any]]:
+        """Load a checkpoint file into a lookup table.
+
+        The data being loaded from the pickle file mostly contains input
+        attributes of the task: func, args, kwargs, env...
+        To simplify the check of whether the exact task has been completed
+        in the checkpoint, we hash these input params and use it as the key
+        for the memoized lookup table.
+
+        Args:
+            - checkpointDirs (list) : List of filepaths to checkpoints
+               Eg. ['runinfo/001', 'runinfo/002']
+
+        Returns:
+            - memoized_lookup_table (dict)
+        """
+        memo_lookup_table = {}
+
+        for checkpoint_dir in checkpointDirs:
+            logger.info("Loading checkpoints from {}".format(checkpoint_dir))
+            checkpoint_file = os.path.join(checkpoint_dir, 'tasks.pkl')
+            try:
+                with open(checkpoint_file, 'rb') as f:
+                    while True:
+                        try:
+                            data = pickle.load(f)
+                            # Copy and hash only the input attributes
+                            memo_fu: Future = Future()
+                            assert data['exception'] is None
+                            memo_fu.set_result(data['result'])
+                            memo_lookup_table[data['hash']] = memo_fu
+
+                        except EOFError:
+                            # Done with the checkpoint file
+                            break
+            except FileNotFoundError:
+                reason = "Checkpoint file was not found: {}".format(
+                    checkpoint_file)
+                logger.error(reason)
+                raise BadCheckpoint(reason)
+            except Exception:
+                reason = "Failed to load checkpoint: {}".format(
+                    checkpoint_file)
+                logger.error(reason)
+                raise BadCheckpoint(reason)
+
+            logger.info("Completed loading checkpoint: {0} with {1} tasks".format(checkpoint_file,
+                                                                                  len(memo_lookup_table.keys())))
+        return memo_lookup_table
+
+    @typeguard.typechecked
+    def load_checkpoints(self, checkpointDirs: Optional[Sequence[str]]) -> Dict[str, Future]:
+        """Load checkpoints from the checkpoint files into a dictionary.
+
+        The results are used to pre-populate the memoizer's lookup_table
+
+        Kwargs:
+            - checkpointDirs (list) : List of run folder to use as checkpoints
+              Eg. ['runinfo/001', 'runinfo/002']
+
+        Returns:
+            - dict containing, hashed -> future mappings
+        """
+        if checkpointDirs:
+            return self._load_checkpoints(checkpointDirs)
+        else:
+            return {}
parsl/dataflow/taskrecord.py
CHANGED
@@ -43,12 +43,11 @@ class TaskRecord(TypedDict, total=False):
        executed on.
     """

-    retries_left: int
     fail_count: int
     fail_cost: float
     fail_history: List[str]

-    checkpoint: bool
+    checkpoint: bool
     """Should this task be checkpointed?
     """

@@ -68,7 +67,6 @@ class TaskRecord(TypedDict, total=False):

     # these three could be more strongly typed perhaps but I'm not thinking about that now
     func: Callable
-    fn_hash: str
     args: Sequence[Any]
     # in some places we uses a Tuple[Any, ...] and in some places a List[Any].
     # This is an attempt to correctly type both of those.
parsl/executors/__init__.py
CHANGED
@@ -1,4 +1,5 @@
 from parsl.executors.flux.executor import FluxExecutor
+from parsl.executors.globus_compute import GlobusComputeExecutor
 from parsl.executors.high_throughput.executor import HighThroughputExecutor
 from parsl.executors.high_throughput.mpi_executor import MPIExecutor
 from parsl.executors.threads import ThreadPoolExecutor
@@ -8,4 +9,5 @@ __all__ = ['ThreadPoolExecutor',
            'HighThroughputExecutor',
            'MPIExecutor',
            'WorkQueueExecutor',
-           'FluxExecutor'
+           'FluxExecutor',
+           'GlobusComputeExecutor']
parsl/executors/globus_compute.py
ADDED
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+import copy
+from concurrent.futures import Future
+from typing import Any, Callable, Dict
+
+import typeguard
+
+from parsl.errors import OptionalModuleMissing
+from parsl.executors.base import ParslExecutor
+from parsl.utils import RepresentationMixin
+
+try:
+    from globus_compute_sdk import Executor
+    _globus_compute_enabled = True
+except ImportError:
+    _globus_compute_enabled = False
+
+
+class GlobusComputeExecutor(ParslExecutor, RepresentationMixin):
+    """ GlobusComputeExecutor enables remote execution on Globus Compute endpoints
+
+    GlobusComputeExecutor is a thin wrapper over globus_compute_sdk.Executor
+    Refer to `globus-compute user documentation <https://globus-compute.readthedocs.io/en/latest/executor.html>`_
+    and `reference documentation <https://globus-compute.readthedocs.io/en/latest/reference/executor.html>`_
+    for more details.
+
+    .. note::
+       As a remote execution system, Globus Compute relies on serialization to ship
+       tasks and results between the Parsl client side and the remote Globus Compute
+       Endpoint side. Serialization is unreliable across python versions, and
+       wrappers used by Parsl assume identical Parsl versions across on both sides.
+       We recommend using matching Python, Parsl and Globus Compute version on both
+       the client side and the endpoint side for stable behavior.
+
+    """
+
+    @typeguard.typechecked
+    def __init__(
+            self,
+            executor: Executor,
+            label: str = 'GlobusComputeExecutor',
+    ):
+        """
+        Parameters
+        ----------
+
+        executor: globus_compute_sdk.Executor
+            Pass a globus_compute_sdk Executor that will be used to execute
+            tasks on a globus_compute endpoint. Refer to `globus-compute docs
+            <https://globus-compute.readthedocs.io/en/latest/reference/executor.html#globus-compute-executor>`_
+
+        label:
+            a label to name the executor
+        """
+        if not _globus_compute_enabled:
+            raise OptionalModuleMissing(
+                ['globus-compute-sdk'],
+                "GlobusComputeExecutor requires globus-compute-sdk installed"
+            )
+
+        super().__init__()
+        self.executor: Executor = executor
+        self.resource_specification = self.executor.resource_specification
+        self.user_endpoint_config = self.executor.user_endpoint_config
+        self.label = label
+
+    def start(self) -> None:
+        """ Start the Globus Compute Executor """
+        pass
+
+    def submit(self, func: Callable, resource_specification: Dict[str, Any], *args: Any, **kwargs: Any) -> Future:
+        """ Submit func to globus-compute
+
+
+        Parameters
+        ----------
+
+        func: Callable
+            Python function to execute remotely
+
+        resource_specification: Dict[str, Any]
+            Resource specification can be used specify MPI resources required by MPI applications on
+            Endpoints configured to use globus compute's MPIEngine. GCE also accepts *user_endpoint_config*
+            to configure endpoints when the endpoint is a `Multi-User Endpoint
+            <https://globus-compute.readthedocs.io/en/latest/endpoints/endpoints.html#templating-endpoint-configuration>`_
+
+        args:
+            Args to pass to the function
+
+        kwargs:
+            kwargs to pass to the function
+
+        Returns
+        -------
+
+        Future
+        """
+        res_spec = copy.deepcopy(resource_specification or self.resource_specification)
+        # Pop user_endpoint_config since it is illegal in resource_spec for globus_compute
+        if res_spec:
+            user_endpoint_config = res_spec.pop('user_endpoint_config', self.user_endpoint_config)
+        else:
+            user_endpoint_config = self.user_endpoint_config
+
+        try:
+            self.executor.resource_specification = res_spec
+            self.executor.user_endpoint_config = user_endpoint_config
+            return self.executor.submit(func, *args, **kwargs)
+        finally:
+            # Reset executor state to defaults set at configuration time
+            self.executor.resource_specification = self.resource_specification
+            self.executor.user_endpoint_config = self.user_endpoint_config
+
+    def shutdown(self):
+        """Clean-up the resources associated with the Executor.
+
+        GCE.shutdown will cancel all futures that have not yet registered with
+        Globus Compute and will not wait for the launched futures to complete.
+        This method explicitly shutsdown the result_watcher thread to avoid
+        it waiting for outstanding futures at thread exit.
+        """
+        self.executor.shutdown(wait=False, cancel_futures=True)
+        result_watcher = self.executor._get_result_watcher()
+        result_watcher.shutdown(wait=False, cancel_futures=True)
parsl/executors/high_throughput/errors.py
CHANGED
@@ -27,7 +27,7 @@ class VersionMismatch(Exception):
     def __str__(self) -> str:
         return (
             f"Manager version info {self.manager_version} does not match interchange"
-            f" version info {self.interchange_version}
+            f" version info {self.interchange_version}"
         )
