parsl 2024.4.15__py3-none-any.whl → 2024.4.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/addresses.py +2 -2
- parsl/app/bash.py +10 -2
- parsl/app/errors.py +3 -5
- parsl/config.py +10 -1
- parsl/data_provider/zip.py +32 -0
- parsl/dataflow/dflow.py +102 -62
- parsl/dataflow/futures.py +26 -5
- parsl/executors/base.py +16 -0
- parsl/executors/high_throughput/executor.py +7 -1
- parsl/executors/taskvine/executor.py +6 -0
- parsl/executors/workqueue/executor.py +6 -0
- parsl/monitoring/monitoring.py +15 -0
- parsl/providers/kubernetes/kube.py +20 -1
- parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
- parsl/tests/conftest.py +12 -1
- parsl/tests/test_bash_apps/test_basic.py +2 -0
- parsl/tests/test_bash_apps/test_std_uri.py +128 -0
- parsl/tests/test_checkpointing/test_periodic.py +20 -33
- parsl/tests/test_checkpointing/test_task_exit.py +1 -1
- parsl/tests/test_htex/test_basic.py +2 -2
- parsl/tests/test_htex/test_missing_worker.py +0 -4
- parsl/tests/test_htex/test_zmq_binding.py +1 -0
- parsl/tests/test_monitoring/test_stdouterr.py +137 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +2 -8
- parsl/tests/test_python_apps/test_context_manager.py +3 -3
- parsl/tests/test_scaling/test_regression_1621.py +11 -11
- parsl/tests/test_staging/test_staging_stdout.py +61 -0
- parsl/tests/test_staging/test_zip_in.py +42 -0
- parsl/tests/test_staging/test_zip_to_zip.py +44 -0
- parsl/tests/unit/__init__.py +0 -0
- parsl/tests/unit/test_file.py +99 -0
- parsl/usage_tracking/api.py +66 -0
- parsl/usage_tracking/usage.py +39 -26
- parsl/utils.py +11 -2
- parsl/version.py +1 -1
- {parsl-2024.4.15.dist-info → parsl-2024.4.29.dist-info}/METADATA +4 -4
- {parsl-2024.4.15.dist-info → parsl-2024.4.29.dist-info}/RECORD +44 -36
- {parsl-2024.4.15.data → parsl-2024.4.29.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.4.15.data → parsl-2024.4.29.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.4.15.data → parsl-2024.4.29.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.4.15.dist-info → parsl-2024.4.29.dist-info}/LICENSE +0 -0
- {parsl-2024.4.15.dist-info → parsl-2024.4.29.dist-info}/WHEEL +0 -0
- {parsl-2024.4.15.dist-info → parsl-2024.4.29.dist-info}/entry_points.txt +0 -0
- {parsl-2024.4.15.dist-info → parsl-2024.4.29.dist-info}/top_level.txt +0 -0
parsl/addresses.py
CHANGED
@@ -113,7 +113,7 @@ def get_all_addresses() -> Set[str]:
         try:
             s_addresses.add(address_by_interface(interface))
         except Exception:
-            logger.
+            logger.debug("Ignoring failure to fetch address from interface {}".format(interface))

     resolution_functions: List[Callable[[], str]]
     resolution_functions = [address_by_hostname, address_by_route, address_by_query]
@@ -121,7 +121,7 @@ def get_all_addresses() -> Set[str]:
         try:
             s_addresses.add(f())
         except Exception:
-            logger.
+            logger.debug("Ignoring an address finder exception")

     return s_addresses

parsl/app/bash.py
CHANGED
@@ -5,6 +5,7 @@ import logging

 from parsl.app.errors import wrap_error
 from parsl.app.app import AppBase
+from parsl.data_provider.files import File
 from parsl.dataflow.dflow import DataFlowKernelLoader

 logger = logging.getLogger(__name__)
@@ -54,13 +55,20 @@ def remote_side_bash_executor(func, *args, **kwargs):
         if stdfspec is None:
             return None

-
+        if isinstance(stdfspec, File):
+            # a File is an os.PathLike and so we can use it directly for
+            # the subsequent file operations
+            fname = stdfspec
+            mode = "w"
+        else:
+            fname, mode = get_std_fname_mode(fdname, stdfspec)
+
         try:
             if os.path.dirname(fname):
                 os.makedirs(os.path.dirname(fname), exist_ok=True)
             fd = open(fname, mode)
         except Exception as e:
-            raise pe.BadStdStreamFile(fname
+            raise pe.BadStdStreamFile(str(fname)) from e
         return fd

     std_out = open_std_fd('stdout')
parsl/app/errors.py
CHANGED
@@ -78,16 +78,14 @@ class BadStdStreamFile(ParslError):

     Contains:
     reason(string)
-    exception object
     """

-    def __init__(self, reason: str
-        super().__init__(reason
+    def __init__(self, reason: str) -> None:
+        super().__init__(reason)
         self._reason = reason
-        self._exception = exception

     def __repr__(self) -> str:
-        return "Bad Stream File: {}
+        return "Bad Stream File: {}".format(self._reason)

     def __str__(self) -> str:
         return self.__repr__()
parsl/config.py
CHANGED
@@ -10,11 +10,12 @@ from parsl.executors.threads import ThreadPoolExecutor
 from parsl.errors import ConfigurationError
 from parsl.dataflow.taskrecord import TaskRecord
 from parsl.monitoring import MonitoringHub
+from parsl.usage_tracking.api import UsageInformation

 logger = logging.getLogger(__name__)


-class Config(RepresentationMixin):
+class Config(RepresentationMixin, UsageInformation):
     """
     Specification of Parsl configuration options.

@@ -50,6 +51,9 @@ class Config(RepresentationMixin):
         of 1.
     run_dir : str, optional
         Path to run directory. Default is 'runinfo'.
+    std_autopath : function, optional
+        Sets the function used to generate stdout/stderr specifications when parsl.AUTO_LOGPATH is used. If no function
+        is specified, generates paths that look like: ``rundir/NNN/task_logs/X/task_{id}_{name}{label}.{out/err}``
     strategy : str, optional
         Strategy to use for scaling blocks according to workflow needs. Can be 'simple', 'htex_auto_scale', 'none'
         or `None`.
@@ -89,6 +93,7 @@ class Config(RepresentationMixin):
                  retries: int = 0,
                  retry_handler: Optional[Callable[[Exception, TaskRecord], float]] = None,
                  run_dir: str = 'runinfo',
+                 std_autopath: Optional[Callable] = None,
                  strategy: Optional[str] = 'simple',
                  strategy_period: Union[float, int] = 5,
                  max_idletime: float = 120.0,
@@ -129,6 +134,7 @@ class Config(RepresentationMixin):
         self.usage_tracking = usage_tracking
         self.initialize_logging = initialize_logging
         self.monitoring = monitoring
+        self.std_autopath: Optional[Callable] = std_autopath

     @property
     def executors(self) -> Sequence[ParslExecutor]:
@@ -144,3 +150,6 @@ class Config(RepresentationMixin):
         if len(duplicates) > 0:
             raise ConfigurationError('Executors must have unique labels ({})'.format(
                 ', '.join(['label={}'.format(repr(d)) for d in duplicates])))
+
+    def get_usage_information(self):
+        return {"executors_len": len(self.executors)}
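The new std_autopath option gives workflows control over where parsl.AUTO_LOGNAME stdout/stderr paths land. A minimal sketch of supplying a custom autopath function; the (task_record, kw) call signature is taken from the dflow.py hunks later in this diff, while my_autopath and the my_logs directory are made-up illustration names, not part of this release:

    import os

    import parsl
    from parsl.config import Config
    from parsl.executors.threads import ThreadPoolExecutor

    def my_autopath(task_record, kw):
        # kw is 'stdout' or 'stderr'; return the path AUTO_LOGNAME should expand to.
        return os.path.join("my_logs", "task_{}.{}".format(task_record['id'], kw))

    parsl.load(Config(executors=[ThreadPoolExecutor()], std_autopath=my_autopath))

    @parsl.bash_app
    def hello(stdout=parsl.AUTO_LOGNAME):
        return "echo hello"

    hello().result()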
parsl/data_provider/zip.py
CHANGED
@@ -42,6 +42,12 @@ class ZipFileStaging(Staging):
     """

     def can_stage_out(self, file: File) -> bool:
+        return self.is_zip_url(file)
+
+    def can_stage_in(self, file: File) -> bool:
+        return self.is_zip_url(file)
+
+    def is_zip_url(self, file: File) -> bool:
         logger.debug("archive provider checking File {}".format(repr(file)))

         # First check if this is the scheme we care about
@@ -76,6 +82,20 @@ class ZipFileStaging(Staging):
         app_fut = stage_out_app(zip_path, inside_path, working_dir, inputs=[file], _parsl_staging_inhibit=True, parent_fut=parent_fut)
         return app_fut

+    def stage_in(self, dm, executor, file, parent_fut):
+        assert file.scheme == 'zip'
+
+        zip_path, inside_path = zip_path_split(file.path)
+
+        working_dir = dm.dfk.executors[executor].working_dir
+
+        if working_dir:
+            file.local_path = os.path.join(working_dir, inside_path)
+
+        stage_in_app = _zip_stage_in_app(dm)
+        app_fut = stage_in_app(zip_path, inside_path, working_dir, outputs=[file], _parsl_staging_inhibit=True, parent_fut=parent_fut)
+        return app_fut._outputs[0]
+

 def _zip_stage_out(zip_file, inside_path, working_dir, parent_fut=None, inputs=[], _parsl_staging_inhibit=True):
     file = inputs[0]
@@ -93,6 +113,18 @@ def _zip_stage_out_app(dm):
     return parsl.python_app(executors=['_parsl_internal'], data_flow_kernel=dm.dfk)(_zip_stage_out)


+def _zip_stage_in(zip_file, inside_path, working_dir, *, parent_fut, outputs, _parsl_staging_inhibit=True):
+    with filelock.FileLock(zip_file + ".lock"):
+        with zipfile.ZipFile(zip_file, mode='r') as z:
+            content = z.read(inside_path)
+    with open(outputs[0], "wb") as of:
+        of.write(content)
+
+
+def _zip_stage_in_app(dm):
+    return parsl.python_app(executors=['_parsl_internal'], data_flow_kernel=dm.dfk)(_zip_stage_in)
+
+
 def zip_path_split(path: str) -> Tuple[str, str]:
     """Split zip: path into a zipfile name and a contained-file name.
     """
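These hunks add stage-in support to the zip staging provider, mirroring the existing stage-out path, so a file packed inside an archive can now be used as an app input as well as an output. A rough sketch of how that might look from user code; the storage_access wiring and the zip:/archive.zip/inside/path URL shape are assumptions inferred from zip_path_split and the new tests listed in the header (test_zip_in.py, test_zip_to_zip.py), not something spelled out in this diff:

    import parsl
    from parsl.config import Config
    from parsl.data_provider.files import File
    from parsl.data_provider.zip import ZipFileStaging
    from parsl.executors import HighThroughputExecutor

    # Hypothetical executor with a working_dir and the zip provider in storage_access.
    parsl.load(Config(executors=[
        HighThroughputExecutor(
            working_dir="/tmp/parsl_work",
            storage_access=[ZipFileStaging()],
        )
    ]))

    @parsl.bash_app
    def copy(inputs=(), outputs=()):
        return "cat {} > {}".format(inputs[0], outputs[0])

    # Stage data.txt in from one archive and stage the result out into another.
    fut = copy(inputs=[File("zip:/tmp/in.zip/data.txt")],
               outputs=[File("zip:/tmp/out.zip/data.txt")])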
parsl/dataflow/dflow.py
CHANGED
@@ -219,14 +219,18 @@ class DataFlowKernel:
             task_log_info = self._create_task_log_info(task_record)
             self.monitoring.send(MessageType.TASK_INFO, task_log_info)

-    def _create_task_log_info(self, task_record):
+    def _create_task_log_info(self, task_record: TaskRecord) -> Dict[str, Any]:
         """
         Create the dictionary that will be included in the log.
         """
         info_to_monitor = ['func_name', 'memoize', 'hashsum', 'fail_count', 'fail_cost', 'status',
                            'id', 'time_invoked', 'try_time_launched', 'time_returned', 'try_time_returned', 'executor']

-
+        # mypy cannot verify that these task_record[k] references are valid:
+        # They are valid if all entries in info_to_monitor are declared in the definition of TaskRecord
+        # This type: ignore[literal-required] asserts that fact.
+        task_log_info = {"task_" + k: task_record[k] for k in info_to_monitor}  # type: ignore[literal-required]
+
         task_log_info['run_id'] = self.run_id
         task_log_info['try_id'] = task_record['try_id']
         task_log_info['timestamp'] = datetime.datetime.now()
@@ -238,33 +242,28 @@ class DataFlowKernel:
         task_log_info['task_inputs'] = str(task_record['kwargs'].get('inputs', None))
         task_log_info['task_outputs'] = str(task_record['kwargs'].get('outputs', None))
         task_log_info['task_stdin'] = task_record['kwargs'].get('stdin', None)
-        stdout_spec = task_record['kwargs'].get('stdout', None)
-        stderr_spec = task_record['kwargs'].get('stderr', None)

-
-
-
+        def std_spec_to_name(name, spec):
+            if spec is None:
+                name = ""
+            elif isinstance(spec, File):
+                name = spec.url
+            else:
+                # fallthrough case is various str, os.PathLike, tuple modes that
+                # can be interpreted by get_std_fname_mode.
+                try:
+                    name, _ = get_std_fname_mode(name, spec)
+                except Exception:
+                    logger.exception(f"Could not parse {name} specification {spec} for task {task_record['id']}")
+                    name = ""
+            return name

-
-
-            stdout_name, _ = get_std_fname_mode('stdout', stdout_spec)
-        except Exception:
-            logger.exception("Could not parse stdout specification {} for task {}".format(stdout_spec, task_record['id']))
-            stdout_name = ""
-        else:
-            stdout_name = ""
+        stdout_spec = task_record['kwargs'].get('stdout')
+        task_log_info['task_stdout'] = std_spec_to_name('stdout', stdout_spec)

-
-
-            stderr_name, _ = get_std_fname_mode('stderr', stderr_spec)
-        except Exception:
-            logger.exception("Could not parse stderr specification {} for task {}".format(stderr_spec, task_record['id']))
-            stderr_name = ""
-        else:
-            stderr_name = ""
+        stderr_spec = task_record['kwargs'].get('stderr')
+        task_log_info['task_stderr'] = std_spec_to_name('stderr', stderr_spec)

-        task_log_info['task_stdout'] = stdout_name
-        task_log_info['task_stderr'] = stderr_name
         task_log_info['task_fail_history'] = ",".join(task_record['fail_history'])
         task_log_info['task_depends'] = None
         if task_record['depends'] is not None:
@@ -774,6 +773,10 @@ class DataFlowKernel:
                 (inputs[idx], func) = self.data_manager.optionally_stage_in(f, func, executor)

         for kwarg, f in kwargs.items():
+            # stdout and stderr files should not be staging in (they will be staged *out*
+            # in _add_output_deps)
+            if kwarg in ['stdout', 'stderr']:
+                continue
             (kwargs[kwarg], func) = self.data_manager.optionally_stage_in(f, func, executor)

         newargs = list(args)
@@ -786,33 +789,55 @@ class DataFlowKernel:
         logger.debug("Adding output dependencies")
         outputs = kwargs.get('outputs', [])
         app_fut._outputs = []
-
-
+
+        # Pass over all possible outputs: the outputs kwarg, stdout and stderr
+        # and for each of those, perform possible stage-out. This can result in:
+        # a DataFuture to be exposed in app_fut to represent the completion of
+        # that stageout (sometimes backed by a new sub-workflow for separate-task
+        # stageout), a replacement for the function to be executed (intended to
+        # be the original function wrapped with an in-task stageout wrapper), a
+        # rewritten File object to be passed to task to be executed
+
+        def stageout_one_file(file: File, rewritable_func: Callable):
+            if not self.check_staging_inhibited(kwargs):
                 # replace a File with a DataFuture - either completing when the stageout
                 # future completes, or if no stage out future is returned, then when the
                 # app itself completes.

                 # The staging code will get a clean copy which it is allowed to mutate,
                 # while the DataFuture-contained original will not be modified by any staging.
-                f_copy =
-                outputs[idx] = f_copy
+                f_copy = file.cleancopy()

-                logger.debug("Submitting stage out for output file {}".format(repr(
+                logger.debug("Submitting stage out for output file {}".format(repr(file)))
                 stageout_fut = self.data_manager.stage_out(f_copy, executor, app_fut)
                 if stageout_fut:
-                    logger.debug("Adding a dependency on stageout future for {}".format(repr(
-
+                    logger.debug("Adding a dependency on stageout future for {}".format(repr(file)))
+                    df = DataFuture(stageout_fut, file, tid=app_fut.tid)
                 else:
-                    logger.debug("No stageout dependency for {}".format(repr(
-
+                    logger.debug("No stageout dependency for {}".format(repr(file)))
+                    df = DataFuture(app_fut, file, tid=app_fut.tid)

                 # this is a hook for post-task stageout
                 # note that nothing depends on the output - which is maybe a bug
                 # in the not-very-tested stageout system?
-
+                rewritable_func = self.data_manager.replace_task_stage_out(f_copy, rewritable_func, executor)
+                return rewritable_func, f_copy, df
             else:
-                logger.debug("Not performing output staging for: {}".format(repr(
-
+                logger.debug("Not performing output staging for: {}".format(repr(file)))
+                return rewritable_func, file, DataFuture(app_fut, file, tid=app_fut.tid)
+
+        for idx, file in enumerate(outputs):
+            func, outputs[idx], o = stageout_one_file(file, func)
+            app_fut._outputs.append(o)
+
+        file = kwargs.get('stdout')
+        if isinstance(file, File):
+            func, kwargs['stdout'], app_fut._stdout_future = stageout_one_file(file, func)
+
+        file = kwargs.get('stderr')
+        if isinstance(file, File):
+            func, kwargs['stderr'], app_fut._stderr_future = stageout_one_file(file, func)
+
         return func

     def _gather_all_deps(self, args: Sequence[Any], kwargs: Dict[str, Any]) -> List[Future]:
@@ -970,32 +995,16 @@ class DataFlowKernel:
         executor = random.choice(choices)
         logger.debug("Task {} will be sent to executor {}".format(task_id, executor))

-        # The below uses func.__name__ before it has been wrapped by any staging code.
-
-        label = app_kwargs.get('label')
-        for kw in ['stdout', 'stderr']:
-            if kw in app_kwargs:
-                if app_kwargs[kw] == parsl.AUTO_LOGNAME:
-                    if kw not in ignore_for_cache:
-                        ignore_for_cache += [kw]
-                    app_kwargs[kw] = os.path.join(
-                        self.run_dir,
-                        'task_logs',
-                        str(int(task_id / 10000)).zfill(4),  # limit logs to 10k entries per directory
-                        'task_{}_{}{}.{}'.format(
-                            str(task_id).zfill(4),
-                            func.__name__,
-                            '' if label is None else '_{}'.format(label),
-                            kw)
-                    )
-
         resource_specification = app_kwargs.get('parsl_resource_specification', {})

         task_record: TaskRecord
-        task_record = {'
+        task_record = {'args': app_args,
+                       'depends': [],
                        'dfk': self,
                        'executor': executor,
+                       'func': func,
                        'func_name': func.__name__,
+                       'kwargs': app_kwargs,
                        'memoize': cache,
                        'hashsum': None,
                        'exec_fu': None,
@@ -1017,18 +1026,30 @@ class DataFlowKernel:

         self.update_task_state(task_record, States.unsched)

+        for kw in ['stdout', 'stderr']:
+            if kw in app_kwargs:
+                if app_kwargs[kw] == parsl.AUTO_LOGNAME:
+                    if kw not in ignore_for_cache:
+                        ignore_for_cache += [kw]
+                    if self.config.std_autopath is None:
+                        app_kwargs[kw] = self.default_std_autopath(task_record, kw)
+                    else:
+                        app_kwargs[kw] = self.config.std_autopath(task_record, kw)
+
         app_fu = AppFuture(task_record)
+        task_record['app_fu'] = app_fu

         # Transform remote input files to data futures
         app_args, app_kwargs, func = self._add_input_deps(executor, app_args, app_kwargs, func)

         func = self._add_output_deps(executor, app_args, app_kwargs, app_fu, func)

+        # Replace the function invocation in the TaskRecord with whatever file-staging
+        # substitutions have been made.
         task_record.update({
             'args': app_args,
             'func': func,
-            'kwargs': app_kwargs
-            'app_fu': app_fu})
+            'kwargs': app_kwargs})

         assert task_id not in self.tasks

@@ -1140,8 +1161,6 @@ class DataFlowKernel:
             executor.hub_port = self.hub_zmq_port
             if self.monitoring:
                 executor.monitoring_radio = self.monitoring.radio
-            else:
-                executor.monitoring_radio = None
             if hasattr(executor, 'provider'):
                 if hasattr(executor.provider, 'script_dir'):
                     executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
@@ -1221,8 +1240,10 @@ class DataFlowKernel:
             self._checkpoint_timer.close()

         # Send final stats
+        logger.info("Sending end message for usage tracking")
         self.usage_tracker.send_end_message()
         self.usage_tracker.close()
+        logger.info("Closed usage tracking")

         logger.info("Closing job status poller")
         self.job_status_poller.close()
@@ -1402,14 +1423,33 @@ class DataFlowKernel:
                 logger.info(f"{name} for task {tid} will not be redirected.")
             elif isinstance(target, str):
                 logger.info(f"{name} for task {tid} will be redirected to {target}")
-            elif isinstance(target,
+            elif isinstance(target, os.PathLike):
+                logger.info(f"{name} for task {tid} will be redirected to {os.fspath(target)}")
+            elif isinstance(target, tuple) and len(target) == 2 and isinstance(target[0], str):
                 logger.info(f"{name} for task {tid} will be redirected to {target[0]} with mode {target[1]}")
+            elif isinstance(target, tuple) and len(target) == 2 and isinstance(target[0], os.PathLike):
+                logger.info(f"{name} for task {tid} will be redirected to {os.fspath(target[0])} with mode {target[1]}")
+            elif isinstance(target, DataFuture):
+                logger.info(f"{name} for task {tid} will staged to {target.file_obj.url}")
             else:
                 logger.error(f"{name} for task {tid} has unknown specification: {target!r}")

         log_std_stream("Standard out", task_record['app_fu'].stdout)
         log_std_stream("Standard error", task_record['app_fu'].stderr)

+    def default_std_autopath(self, taskrecord, kw):
+        label = taskrecord['kwargs'].get('label')
+        task_id = taskrecord['id']
+        return os.path.join(
+            self.run_dir,
+            'task_logs',
+            str(int(task_id / 10000)).zfill(4),  # limit logs to 10k entries per directory
+            'task_{}_{}{}.{}'.format(
+                str(task_id).zfill(4),
+                taskrecord['func_name'],
+                '' if label is None else '_{}'.format(label),
+                kw))
+

 class DataFlowKernelLoader:
     """Manage which DataFlowKernel is active.
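For reference, the path layout produced by the new default_std_autopath method above can be reproduced standalone; the run directory, task id, and function name below are made-up example values:

    import os

    def default_task_log_path(run_dir, task_id, func_name, label, kw):
        # Mirrors the default_std_autopath hunk above: group task logs into
        # directories of 10k tasks and name each file task_<id>_<func>[_<label>].<kw>.
        return os.path.join(
            run_dir,
            'task_logs',
            str(int(task_id / 10000)).zfill(4),
            'task_{}_{}{}.{}'.format(
                str(task_id).zfill(4),
                func_name,
                '' if label is None else '_{}'.format(label),
                kw))

    print(default_task_log_path("runinfo/000", 12, "my_app", None, "stdout"))
    # -> runinfo/000/task_logs/0000/task_0012_my_app.stdout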
parsl/dataflow/futures.py
CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 from concurrent.futures import Future
 import logging
 import threading
-from typing import Any, Optional, Sequence
+from typing import Any, Optional, Sequence, Union

 import parsl.app.app as app

@@ -70,13 +70,34 @@ class AppFuture(Future):
         self._outputs = []
         self.task_record = task_record

+        self._stdout_future: Optional[DataFuture] = None
+        self._stderr_future: Optional[DataFuture] = None
+
     @property
-    def stdout(self) ->
-
+    def stdout(self) -> Union[None, str, DataFuture]:
+        """Return app stdout. If stdout was specified as a string, then this
+        property will return that string. If stdout was specified as a File,
+        then this property will return a DataFuture representing that file
+        stageout.
+        TODO: this can be a tuple too I think?"""
+        if self._stdout_future:
+            return self._stdout_future
+        else:
+            # this covers the str and None cases
+            return self.task_record['kwargs'].get('stdout')

     @property
-    def stderr(self) ->
-
+    def stderr(self) -> Union[None, str, DataFuture]:
+        """Return app stderr. If stdout was specified as a string, then this
+        property will return that string. If stdout was specified as a File,
+        then this property will return a DataFuture representing that file
+        stageout.
+        TODO: this can be a tuple too I think?"""
+        if self._stderr_future:
+            return self._stderr_future
+        else:
+            # this covers the str and None cases
+            return self.task_record['kwargs'].get('stderr')

     @property
     def tid(self) -> int:
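Together with the dflow.py and bash.py changes above, this means an app's stdout/stderr can be given either as a path string or as a parsl File that is staged out like any other output, with the AppFuture exposing a DataFuture in the latter case. A rough sketch of the difference; it assumes the default thread executor and that file:// stage-out of a local path is effectively a pass-through:

    import parsl
    from parsl.config import Config
    from parsl.data_provider.files import File
    from parsl.executors.threads import ThreadPoolExecutor

    parsl.load(Config(executors=[ThreadPoolExecutor()]))

    @parsl.bash_app
    def hello(stdout=None):
        return "echo hello"

    # String spec: AppFuture.stdout simply returns the string back.
    f1 = hello(stdout="hello.out")
    assert f1.stdout == "hello.out"

    # File spec: AppFuture.stdout is a DataFuture that completes once the
    # stdout file has been staged out.
    f2 = hello(stdout=File("file:///tmp/hello.out"))
    f2.stdout.result()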
parsl/executors/base.py
CHANGED
@@ -1,3 +1,4 @@
+import os
 from abc import ABCMeta, abstractmethod
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional
@@ -45,6 +46,21 @@ class ParslExecutor(metaclass=ABCMeta):
     label: str = "undefined"
     radio_mode: str = "udp"

+    def __init__(
+        self,
+        *,
+        hub_address: Optional[str] = None,
+        hub_port: Optional[int] = None,
+        monitoring_radio: Optional[MonitoringRadio] = None,
+        run_dir: str = ".",
+        run_id: Optional[str] = None,
+    ):
+        self.hub_address = hub_address
+        self.hub_port = hub_port
+        self.monitoring_radio = monitoring_radio
+        self.run_dir = os.path.abspath(run_dir)
+        self.run_id = run_id
+
     def __enter__(self) -> Self:
         return self

parsl/executors/high_throughput/executor.py
CHANGED
@@ -14,6 +14,7 @@ import math
 import warnings

 import parsl.launchers
+from parsl.usage_tracking.api import UsageInformation
 from parsl.serialize import pack_res_spec_apply_message, deserialize
 from parsl.serialize.errors import SerializationError, DeserializationError
 from parsl.app.errors import RemoteExceptionWrapper
@@ -62,7 +63,7 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
                       "--available-accelerators {accelerators}")


-class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
+class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
     """Executor designed for cluster-scale

     The HighThroughputExecutor system has the following components:
@@ -818,4 +819,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
             logger.info("Unable to terminate Interchange process; sending SIGKILL")
             self.interchange_proc.kill()

+        self.interchange_proc.close()
+
         logger.info("Finished HighThroughputExecutor shutdown attempt")
+
+    def get_usage_information(self):
+        return {"mpi": self.enable_mpi_mode}
parsl/executors/taskvine/executor.py
CHANGED
@@ -644,6 +644,12 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         logger.debug("Joining on factory process")
         self._factory_process.join()

+        # Shutdown multiprocessing queues
+        self._ready_task_queue.close()
+        self._ready_task_queue.join_thread()
+        self._finished_task_queue.close()
+        self._finished_task_queue.join_thread()
+
         self._is_shutdown = True
         logger.debug("TaskVine shutdown completed")

parsl/executors/workqueue/executor.py
CHANGED
@@ -735,6 +735,12 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         logger.debug("Joining on collector thread")
         self.collector_thread.join()

+        logger.debug("Closing multiprocessing queues")
+        self.task_queue.close()
+        self.task_queue.join_thread()
+        self.collector_queue.close()
+        self.collector_queue.join_thread()
+
         self.is_shutdown = True
         logger.debug("Work Queue shutdown completed")

parsl/monitoring/monitoring.py
CHANGED
@@ -195,6 +195,8 @@ class MonitoringHub(RepresentationMixin):

         try:
             comm_q_result = comm_q.get(block=True, timeout=120)
+            comm_q.close()
+            comm_q.join_thread()
         except queue.Empty:
             logger.error("Hub has not completed initialization in 120s. Aborting")
             raise Exception("Hub failed to start")
@@ -258,6 +260,19 @@ class MonitoringHub(RepresentationMixin):
             self.filesystem_proc.terminate()
             self.filesystem_proc.join()

+        logger.info("Closing monitoring multiprocessing queues")
+        self.exception_q.close()
+        self.exception_q.join_thread()
+        self.priority_msgs.close()
+        self.priority_msgs.join_thread()
+        self.resource_msgs.close()
+        self.resource_msgs.join_thread()
+        self.node_msgs.close()
+        self.node_msgs.join_thread()
+        self.block_msgs.close()
+        self.block_msgs.join_thread()
+        logger.info("Closed monitoring multiprocessing queues")
+

 @wrap_with_logs
 def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
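The TaskVine, Work Queue, and monitoring hunks above all apply the same stdlib shutdown pattern: close each multiprocessing queue and join its feeder thread so no helper threads outlive the component. A standalone illustration of that pattern (plain Python, not parsl code):

    import multiprocessing

    q = multiprocessing.Queue()
    q.put("final message")
    assert q.get() == "final message"

    # close() stops the queue from accepting new data; join_thread() then waits
    # for the internal feeder thread to flush any buffered items and exit.
    q.close()
    q.join_thread()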
parsl/providers/kubernetes/kube.py
CHANGED
@@ -105,7 +105,26 @@ class KubernetesProvider(ExecutionProvider, RepresentationMixin):
         if not _kubernetes_enabled:
             raise OptionalModuleMissing(['kubernetes'],
                                         "Kubernetes provider requires kubernetes module and config.")
-
+        try:
+            config.load_kube_config()
+        except config.config_exception.ConfigException:
+            # `load_kube_config` assumes a local kube-config file, and fails if not
+            # present, raising:
+            #
+            #   kubernetes.config.config_exception.ConfigException: Invalid
+            #   kube-config file. No configuration found.
+            #
+            # Since running a parsl driver script on a kubernetes cluster is a common
+            # pattern to enable worker-interchange communication, this enables an
+            # in-cluster config to be loaded if a kube-config file isn't found.
+            #
+            # Based on: https://github.com/kubernetes-client/python/issues/1005
+            try:
+                config.load_incluster_config()
+            except config.config_exception.ConfigException:
+                raise config.config_exception.ConfigException(
+                    "Failed to load both kube-config file and in-cluster configuration."
+                )

         self.namespace = namespace
         self.image = image
parsl/tests/configs/local_threads_checkpoint_periodic.py
CHANGED
@@ -1,13 +1,11 @@
 from parsl.config import Config
 from parsl.executors.threads import ThreadPoolExecutor

-
-
-
-
-
-
-
-
-    checkpoint_period='00:00:05'
-)
+
+def fresh_config():
+    tpe = ThreadPoolExecutor(label='local_threads_checkpoint_periodic', max_threads=1)
+    return Config(
+        executors=[tpe],
+        checkpoint_mode='periodic',
+        checkpoint_period='00:00:02'
+    )