parsl 2023.10.23__py3-none-any.whl → 2023.11.20__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- parsl/__init__.py +1 -0
- parsl/app/app.py +29 -21
- parsl/channels/base.py +12 -24
- parsl/config.py +19 -12
- parsl/configs/ad_hoc.py +2 -2
- parsl/dataflow/dflow.py +10 -4
- parsl/executors/base.py +1 -3
- parsl/executors/high_throughput/executor.py +3 -3
- parsl/executors/high_throughput/interchange.py +59 -53
- parsl/executors/high_throughput/process_worker_pool.py +2 -2
- parsl/executors/high_throughput/zmq_pipes.py +1 -1
- parsl/executors/radical/__init__.py +4 -0
- parsl/executors/radical/executor.py +550 -0
- parsl/executors/radical/rpex_master.py +42 -0
- parsl/executors/radical/rpex_resources.py +165 -0
- parsl/executors/radical/rpex_worker.py +61 -0
- parsl/executors/status_handling.py +1 -2
- parsl/executors/taskvine/exec_parsl_function.py +3 -4
- parsl/executors/taskvine/executor.py +18 -4
- parsl/executors/taskvine/factory.py +1 -1
- parsl/executors/taskvine/manager.py +12 -16
- parsl/executors/taskvine/utils.py +5 -5
- parsl/executors/threads.py +1 -2
- parsl/executors/workqueue/exec_parsl_function.py +2 -1
- parsl/executors/workqueue/executor.py +34 -24
- parsl/jobs/job_status_poller.py +2 -3
- parsl/monitoring/monitoring.py +6 -6
- parsl/monitoring/remote.py +1 -1
- parsl/monitoring/visualization/plots/default/workflow_plots.py +4 -4
- parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +2 -2
- parsl/providers/slurm/slurm.py +1 -1
- parsl/tests/configs/ad_hoc_cluster_htex.py +3 -3
- parsl/tests/configs/htex_ad_hoc_cluster.py +1 -1
- parsl/tests/configs/local_radical.py +20 -0
- parsl/tests/configs/local_radical_mpi.py +20 -0
- parsl/tests/configs/local_threads_monitoring.py +1 -1
- parsl/tests/conftest.py +6 -2
- parsl/tests/scaling_tests/vineex_condor.py +1 -1
- parsl/tests/scaling_tests/vineex_local.py +1 -1
- parsl/tests/scaling_tests/wqex_condor.py +1 -1
- parsl/tests/scaling_tests/wqex_local.py +1 -1
- parsl/tests/test_docs/test_kwargs.py +37 -0
- parsl/tests/test_python_apps/test_garbage_collect.py +1 -1
- parsl/tests/test_python_apps/test_lifted.py +3 -2
- parsl/tests/test_radical/__init__.py +0 -0
- parsl/tests/test_radical/test_mpi_funcs.py +27 -0
- parsl/tests/test_regression/test_1606_wait_for_current_tasks.py +1 -1
- parsl/utils.py +4 -4
- parsl/version.py +1 -1
- {parsl-2023.10.23.data → parsl-2023.11.20.data}/scripts/exec_parsl_function.py +2 -1
- {parsl-2023.10.23.data → parsl-2023.11.20.data}/scripts/process_worker_pool.py +2 -2
- {parsl-2023.10.23.dist-info → parsl-2023.11.20.dist-info}/METADATA +5 -2
- {parsl-2023.10.23.dist-info → parsl-2023.11.20.dist-info}/RECORD +58 -48
- {parsl-2023.10.23.dist-info → parsl-2023.11.20.dist-info}/WHEEL +1 -1
- {parsl-2023.10.23.data → parsl-2023.11.20.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2023.10.23.dist-info → parsl-2023.11.20.dist-info}/LICENSE +0 -0
- {parsl-2023.10.23.dist-info → parsl-2023.11.20.dist-info}/entry_points.txt +0 -0
- {parsl-2023.10.23.dist-info → parsl-2023.11.20.dist-info}/top_level.txt +0 -0
parsl/executors/radical/rpex_resources.py
ADDED
@@ -0,0 +1,165 @@
+import sys
+import json
+
+from typing import List
+
+_setup_paths: List[str]
+try:
+    import radical.pilot as rp
+    import radical.utils as ru
+except ImportError:
+    _setup_paths = []
+else:
+    _setup_paths = [rp.sdist_path,
+                    ru.sdist_path]
+
+
+MPI = "mpi"
+RP_ENV = "rp"
+CLIENT = "client"
+RPEX_ENV = "ve_rpex"
+MPI_WORKER = "MPIWorker"
+DEFAULT_WORKER = "DefaultWorker"
+
+
+class ResourceConfig:
+    """
+    This ResourceConfig class is an abstraction of the resource
+    configuration of the RAPTOR layer in the RADICAL-Pilot runtime system.
+
+    This class sets up the default configuration values for the executor and
+    allows the user to specify different resource requirements flexibly.
+
+    For more information:
+    https://radicalpilot.readthedocs.io/en/stable/tutorials/raptor.html
+
+    Parameters
+    ----------
+    masters : int
+        The number of masters to be deployed by RAPTOR.
+        Default is 1.
+
+    workers : int
+        The number of workers to be deployed by RAPTOR.
+        Default is 1.
+
+    worker_gpus_per_node : int
+        The number of GPUs a worker will operate on per node.
+        Default is 0.
+
+    worker_cores_per_node : int
+        The number of CPU cores a worker will operate on per node.
+        Default is 4.
+
+    cores_per_master : int
+        The number of cores a master will operate on per node.
+        Default is 1.
+
+    nodes_per_worker : int
+        The number of nodes to be occupied by every worker.
+        Default is 1.
+
+    pilot_env_path : str
+        The path to an exisitng pilot environment.
+        Default is an empty string (RADICAL-Pilot will create one).
+
+    pilot_env_name : str
+        The name of the pilot environment.
+        Default is "ve_rpex".
+
+    pilot_env_pre_exec : list
+        List of commands to be executed before starting the pilot environment.
+        Default is an empty list.
+
+    pilot_env_type : str
+        The type of the pilot environment (e.g., 'venv', 'conda').
+        Default is "venv".
+
+    pilot_env_setup : list
+        List of setup commands/packages for the pilot environment.
+        Default setup includes "parsl", rp.sdist_path, and ru.sdist_path.
+
+    python_v : str
+        The Python version to be used in the pilot environment.
+        Default is determined by the system's Python version.
+
+    worker_type : str
+        The type of worker(s) to be deployed by RAPTOR on the compute
+        resources.
+        Default is "DefaultWorker".
+    """
+
+    masters: int = 1
+    workers: int = 1
+
+    worker_gpus_per_node: int = 0
+    worker_cores_per_node: int = 4
+
+    cores_per_master: int = 1
+    nodes_per_worker: int = 1
+
+    pilot_env_mode: str = CLIENT
+    pilot_env_path: str = ""
+    pilot_env_type: str = "venv"
+    pilot_env_name: str = RP_ENV
+    pilot_env_pre_exec: List[str] = []
+    pilot_env_setup: List[str] = _setup_paths
+
+    python_v: str = f'{sys.version_info[0]}.{sys.version_info[1]}'
+    worker_type: str = DEFAULT_WORKER
+
+    def _get_cfg_file(cls, path=None):
+
+        # Default ENV mode for RP is to reuse
+        # the client side. If this is not the case,
+        # then RP will create a new env named ve_rpex
+        # The user need to make sure that under:
+        # $HOME/.radical/pilot/configs/*_resource.json
+        # that virtenv_mode = local
+        if cls.pilot_env_mode != CLIENT:
+            cls.pilot_env_name = RPEX_ENV
+
+        if MPI in cls.worker_type.lower() and \
+           "mpi4py" not in cls.pilot_env_setup:
+            cls.pilot_env_setup.append("mpi4py")
+
+        cfg = {
+            'n_masters': cls.masters,
+            'n_workers': cls.workers,
+            'gpus_per_node': cls.worker_gpus_per_node,
+            'cores_per_node': cls.worker_cores_per_node,
+            'cores_per_master': cls.cores_per_master,
+            'nodes_per_worker': cls.nodes_per_worker,
+
+            'pilot_env': {
+                "version": cls.python_v,
+                "name": cls.pilot_env_name,
+                "path": cls.pilot_env_path,
+                "type": cls.pilot_env_type,
+                "setup": cls.pilot_env_setup,
+                "pre_exec": cls.pilot_env_pre_exec
+            },
+
+            'pilot_env_mode': cls.pilot_env_mode,
+
+            'master_descr': {
+                "mode": rp.RAPTOR_MASTER,
+                "named_env": cls.pilot_env_name,
+                "executable": "python3 rpex_master.py",
+            },
+
+            'worker_descr': {
+                "mode": rp.RAPTOR_WORKER,
+                "named_env": cls.pilot_env_name,
+                "raptor_file": "./rpex_worker.py",
+                "raptor_class": cls.worker_type if
+                cls.worker_type.lower() != MPI else MPI_WORKER,
+            }}
+
+        # Convert the class instance to a cfg file.
+        config_path = 'rpex.cfg'
+        if path:
+            config_path = path + '/' + config_path
+        with open(config_path, 'w') as f:
+            json.dump(cfg, f, indent=4)
+        return config_path
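
Taken together, the new module lets a user describe RAPTOR resources and materialize them as a JSON config file. A minimal usage sketch based only on the code above, assuming radical.pilot is installed (the output directory is a hypothetical example):

    from parsl.executors.radical.rpex_resources import ResourceConfig

    rpex_cfg = ResourceConfig()
    rpex_cfg.worker_cores_per_node = 8   # override the default of 4
    rpex_cfg.worker_type = "mpi"         # selects MPIWorker and appends
                                         # "mpi4py" to pilot_env_setup

    # _get_cfg_file() serializes the settings to JSON; with path given,
    # the file lands at <path>/rpex.cfg
    config_path = rpex_cfg._get_cfg_file(path="/tmp")   # "/tmp" is a hypothetical choice
    print(config_path)                                  # -> /tmp/rpex.cfg
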
parsl/executors/radical/rpex_worker.py
ADDED
@@ -0,0 +1,61 @@
+import sys
+import radical.pilot as rp
+
+import parsl.app.errors as pe
+from parsl.app.bash import remote_side_bash_executor
+from parsl.serialize import unpack_apply_message, serialize
+from parsl.executors.high_throughput.process_worker_pool import execute_task
+
+
+class ParslWorker:
+
+    def _dispatch_func(self, task):
+
+        try:
+            buffer = rp.utils.deserialize_bson(task['description']['function'])
+            result = execute_task(buffer)
+            val = str(serialize(result, buffer_threshold=1000000))
+            exc = (None, None)
+            ret = 0
+            out = None
+            err = None
+        except Exception:
+            val = None
+            exc = (rp.utils.serialize_bson(pe.RemoteExceptionWrapper(*sys.exc_info())), None)
+            ret = 1
+            out = None
+            err = None
+
+        return out, err, ret, val, exc
+
+    def _dispatch_proc(self, task):
+
+        try:
+            buffer = rp.utils.deserialize_bson(task['description']['executable'])
+            func, args, kwargs = unpack_apply_message(buffer, {}, copy=False)
+            ret = remote_side_bash_executor(func, *args, **kwargs)
+            exc = (None, None)
+            val = None
+            out = None
+            err = None
+        except Exception:
+            val = None
+            exc = (rp.utils.serialize_bson(pe.RemoteExceptionWrapper(*sys.exc_info())), None)
+            ret = 1
+            out = None
+            err = None
+
+        return out, err, ret, val, exc
+
+
+class MPIWorker(rp.raptor.MPIWorker):
+    def _dispatch_func(self, task):
+        return super()._dispatch_func(task)
+
+
+class DefaultWorker(rp.raptor.DefaultWorker):
+    def _dispatch_func(self, task):
+        return ParslWorker()._dispatch_func(task)
+
+    def _dispatch_proc(self, task):
+        return ParslWorker()._dispatch_proc(task)
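
On failure, the `exc` slot of the five-tuple carries a BSON-serialized `parsl.app.errors.RemoteExceptionWrapper`. A hedged sketch of how such a wrapper behaves on its own, independent of RADICAL-Pilot (the division by zero is a stand-in for a failing task body):

    import sys
    from parsl.app.errors import RemoteExceptionWrapper

    try:
        1 / 0                                    # stand-in for a failing task
    except Exception:
        wrapper = RemoteExceptionWrapper(*sys.exc_info())

    # ...after shipping `wrapper` across the wire (e.g. via rp.utils.serialize_bson)...
    try:
        wrapper.reraise()                        # re-raises the captured exception
    except ZeroDivisionError as e:
        print(f"remote exception surfaced locally: {e}")
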
parsl/executors/status_handling.py
CHANGED
@@ -6,7 +6,6 @@ from abc import abstractmethod, abstractproperty
 from concurrent.futures import Future
 from typing import List, Any, Dict, Optional, Tuple, Union, Callable
 
-import parsl  # noqa F401
 from parsl.executors.base import ParslExecutor
 from parsl.executors.errors import BadStateException, ScalingFailed
 from parsl.jobs.states import JobStatus, JobState
@@ -193,7 +192,7 @@ class BlockProviderExecutor(ParslExecutor):
             raise ScalingFailed(self, "No execution provider available")
         block_ids = []
         logger.info(f"Scaling out by {blocks} blocks")
-        for i in range(blocks):
+        for _ in range(blocks):
            block_id = str(self._block_id_counter.get_id())
            logger.info(f"Allocated block ID {block_id}")
            try:
parsl/executors/taskvine/exec_parsl_function.py
CHANGED
@@ -1,11 +1,10 @@
 import traceback
 import sys
 
-import pickle
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.data_provider.files import File
 from parsl.utils import get_std_fname_mode
-from parsl.serialize import deserialize
+from parsl.serialize import deserialize, serialize
 
 # This scripts executes a parsl function which is pickled in 4 files:
 #
@@ -30,10 +29,10 @@ from parsl.serialize import deserialize
 #
 
 
-def dump_result_to_file(result_file: str, result_package):
+def dump_result_to_file(result_file: str, result):
     """ Dump a result to the given result file."""
     with open(result_file, "wb") as f_out:
-        pickle.dump(result_package, f_out)
+        f_out.write(serialize(result))
 
 
 def remap_location(mapping, parsl_file):
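
The switch from `pickle.dump` to `parsl.serialize` makes the result file symmetric with the executor-side `deserialize` call shown in the next hunk. A minimal round-trip sketch (file name hypothetical):

    from parsl.serialize import serialize, deserialize

    result = {"value": 42}                      # stand-in for a task's return value
    with open("result.pkl", "wb") as f_out:     # hypothetical file name
        f_out.write(serialize(result))

    with open("result.pkl", "rb") as f_in:
        assert deserialize(f_in.read()) == result
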
parsl/executors/taskvine/executor.py
CHANGED
@@ -22,7 +22,7 @@ from typing import List, Optional, Union, Literal
 # Import Parsl constructs
 import parsl.utils as putils
 from parsl.data_provider.staging import Staging
-from parsl.serialize import serialize
+from parsl.serialize import serialize, deserialize
 from parsl.data_provider.files import File
 from parsl.errors import OptionalModuleMissing
 from parsl.providers.base import ExecutionProvider
@@ -614,7 +614,6 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         self._factory_process.join()
 
         logger.debug("TaskVine shutdown completed")
-        return True
 
     @wrap_with_logs
     def _collect_taskvine_results(self):
@@ -639,11 +638,26 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
                 logger.debug(f'Updating Future for Parsl Task: {task_report.executor_id}. \
                               Task {task_report.executor_id} has result_received set to {task_report.result_received}')
                 if task_report.result_received:
-                    future.set_result(task_report.result)
+                    try:
+                        with open(task_report.result_file, 'rb') as f_in:
+                            result = deserialize(f_in.read())
+                    except Exception as e:
+                        logger.error(f'Cannot load result from result file {task_report.result_file}. Exception: {e}')
+                        ex = TaskVineTaskFailure('Cannot load result from result file', None)
+                        ex.__cause__ = e
+                        future.set_exception(ex)
+                    else:
+                        if isinstance(result, Exception):
+                            ex = TaskVineTaskFailure('Task execution raises an exception', result)
+                            ex.__cause__ = result
+                            future.set_exception(ex)
+                        else:
+                            future.set_result(result)
                 else:
                     # If there are no results, then the task failed according to one of
                     # taskvine modes, such as resource exhaustion.
-                    future.set_exception(TaskVineTaskFailure(task_report.reason, task_report.result))
+                    ex = TaskVineTaskFailure(task_report.reason, None)
+                    future.set_exception(ex)
 
             # decrement outstanding task counter
             with self._outstanding_tasks_lock:
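
The collector now distinguishes three outcomes: the result file is unreadable, it deserializes to an exception raised inside the task, or it holds a genuine result. A generic sketch of the pattern, with `FailureCls` standing in for `TaskVineTaskFailure` (and, further below, `WorkQueueTaskFailure`):

    from concurrent.futures import Future

    def set_outcome(future: Future, result, FailureCls) -> None:
        # Mirrors the branch added in both collectors: an object that is
        # itself an Exception is wrapped and raised through the Future.
        if isinstance(result, Exception):
            ex = FailureCls('Task execution raises an exception', result)
            ex.__cause__ = result
            future.set_exception(ex)
        else:
            future.set_result(result)
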
parsl/executors/taskvine/factory.py
CHANGED
@@ -30,7 +30,7 @@ def _taskvine_factory(should_stop, factory_config):
         else:
             factory = Factory(batch_type=factory_config.batch_type,
                               manager_host_port=f"{factory_config._project_address}:{factory_config._project_port}",
-
+                              )
     except Exception as e:
         raise TaskVineFactoryFailure(f'Cannot create factory with exception {e}')
 
parsl/executors/taskvine/manager.py
CHANGED
@@ -2,7 +2,6 @@ import logging
 import hashlib
 import subprocess
 import os
-import pickle
 import queue
 import shutil
 import uuid
@@ -229,7 +228,7 @@ def _taskvine_submit_wait(ready_task_queue=None,
                 logger.error("Unable to create executor task (mode:regular): {}".format(e))
                 finished_task_queue.put_nowait(VineTaskToParsl(executor_id=task.executor_id,
                                                                result_received=False,
-                                                               result=None,
+                                                               result_file=None,
                                                                reason="task could not be created by taskvine",
                                                                status=-1))
                 continue
@@ -268,7 +267,7 @@ def _taskvine_submit_wait(ready_task_queue=None,
                 logger.error("Unable to create executor task (mode:serverless): {}".format(e))
                 finished_task_queue.put_nowait(VineTaskToParsl(executor_id=task.executor_id,
                                                                result_received=False,
-                                                               result=None,
+                                                               result_file=None,
                                                                reason="task could not be created by taskvine",
                                                                status=-1))
             else:
@@ -369,7 +368,7 @@ def _taskvine_submit_wait(ready_task_queue=None,
                 logger.error("Unable to submit task to taskvine: {}".format(e))
                 finished_task_queue.put_nowait(VineTaskToParsl(executor_id=task.executor_id,
                                                                result_received=False,
-                                                               result=None,
+                                                               result_file=None,
                                                                reason="task could not be submited to taskvine",
                                                                status=-1))
                 continue
@@ -394,24 +393,21 @@ def _taskvine_submit_wait(ready_task_queue=None,
 
             logger.debug(f"completed executor task info: {executor_task_id}, {t.category}, {t.command}, {t.std_output}")
 
-            # A tasks completes 'succesfully' if it has result file
-            #
-            #
+            # A tasks completes 'succesfully' if it has result file.
+            # A check whether the Python object represented using this file can be
+            # deserialized happens later in the collector thread of the executor
+            # process.
             logger.debug("Looking for result in {}".format(result_file))
-            try:
-                with open(result_file, "rb") as f_in:
-                    result = pickle.load(f_in)
+            if os.path.exists(result_file):
                 logger.debug("Found result in {}".format(result_file))
                 finished_task_queue.put_nowait(VineTaskToParsl(executor_id=executor_task_id,
                                                                result_received=True,
-                                                               result=result,
+                                                               result_file=result_file,
                                                                reason=None,
                                                                status=t.exit_code))
             # If a result file could not be generated, explain the
-            # failure according to taskvine error codes.
-
-            # match the positive case.
-            except Exception as e:
+            # failure according to taskvine error codes.
+            else:
                 reason = _explain_taskvine_result(t)
                 logger.debug("Did not find result in {}".format(result_file))
                 logger.debug("Wrapper Script status: {}\nTaskVine Status: {}"
@@ -420,7 +416,7 @@ def _taskvine_submit_wait(ready_task_queue=None,
                              .format(executor_task_id, t.id, reason))
                 finished_task_queue.put_nowait(VineTaskToParsl(executor_id=executor_task_id,
                                                                result_received=False,
-                                                               result=None,
+                                                               result_file=None,
                                                                reason=reason,
                                                                status=t.exit_code))
 
parsl/executors/taskvine/utils.py
CHANGED
@@ -42,20 +42,20 @@ class ParslTaskToVine:
 
 class VineTaskToParsl:
     """
-    Support structure to communicate final status of TaskVine tasks to Parsl
-
-
+    Support structure to communicate final status of TaskVine tasks to Parsl.
+    result_file is only valid if result_received is True.
+    Reason and status are only valid if result_received is False.
     """
     def __init__(self,
                  executor_id: int,            # executor id of task
                  result_received: bool,       # whether result is received or not
-                 result,
+                 result_file: Optional[str],  # path to file that contains the serialized result object
                  reason: Optional[str],       # string describing why execution fails
                  status: Optional[int]        # exit code of execution of task
                  ):
         self.executor_id = executor_id
         self.result_received = result_received
-        self.result = result
+        self.result_file = result_file
         self.reason = reason
         self.status = status
 
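
Per the updated docstring, `result_file` is only meaningful when `result_received` is True, and `reason`/`status` only when it is False. A sketch of the two valid shapes (ids and paths hypothetical):

    from parsl.executors.taskvine.utils import VineTaskToParsl

    ok = VineTaskToParsl(executor_id=1, result_received=True,
                         result_file="/tmp/task_1.result", reason=None, status=0)
    failed = VineTaskToParsl(executor_id=2, result_received=False,
                             result_file=None, reason="resource exhaustion", status=-1)
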
parsl/executors/threads.py
CHANGED
@@ -72,9 +72,8 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):
 
         """
         logger.debug("Shutting down executor, which involves waiting for running tasks to complete")
-        x = self.executor.shutdown(wait=block)
+        self.executor.shutdown(wait=block)
         logger.debug("Done with executor shutdown")
-        return x
 
     def monitor_resources(self):
         """Resource monitoring sometimes deadlocks when using threads, so this function
parsl/executors/workqueue/exec_parsl_function.py
CHANGED
@@ -4,6 +4,7 @@ from parsl.utils import get_std_fname_mode
 import traceback
 import sys
 import pickle
+from parsl.serialize import serialize
 
 # This scripts executes a parsl function which is pickled in a file:
 #
@@ -32,7 +33,7 @@ def load_pickled_file(filename):
 
 def dump_result_to_file(result_file, result_package):
     with open(result_file, "wb") as f_out:
-        pickle.dump(result_package, f_out)
+        f_out.write(serialize(result_package))
 
 
 def remap_location(mapping, parsl_file):
parsl/executors/workqueue/executor.py
CHANGED
@@ -21,7 +21,7 @@ import inspect
 import shutil
 import itertools
 
-from parsl.serialize import pack_apply_message
+from parsl.serialize import pack_apply_message, deserialize
 import parsl.utils as putils
 from parsl.executors.errors import ExecutorError
 from parsl.data_provider.files import File
@@ -66,11 +66,11 @@ ParslTaskToWq = namedtuple('ParslTaskToWq',
 
 # Support structure to communicate final status of work queue tasks to parsl
 # if result_received is True:
-#
+#   result_file is the path to the file containing the result.
 # if result_received is False:
 #   reason and status are only valid if result_received is False
-#
-WqTaskToParsl = namedtuple('WqTaskToParsl', 'id result_received result reason status')
+#   result_file is None
+WqTaskToParsl = namedtuple('WqTaskToParsl', 'id result_received result_file reason status')
 
 # Support structure to report parsl filenames to work queue.
 # parsl_name is the local_name or filepath attribute of a parsl file object.
@@ -449,7 +449,7 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         input_files = []
         output_files = []
 
-        # Determine the input and output files that will exist at the
+        # Determine the input and output files that will exist at the workers:
         input_files += [self._register_file(f) for f in kwargs.get("inputs", []) if isinstance(f, File)]
         output_files += [self._register_file(f) for f in kwargs.get("outputs", []) if isinstance(f, File)]
 
@@ -707,7 +707,6 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         self.collector_thread.join()
 
         logger.debug("Work Queue shutdown completed")
-        return True
 
     @wrap_with_logs
     def _collect_work_queue_results(self):
@@ -729,14 +728,29 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
                 with self.tasks_lock:
                     future = self.tasks.pop(task_report.id)
                 logger.debug("Updating Future for executor task {}".format(task_report.id))
+                # If result_received, then there's a result file. The object inside the file
+                # may be a valid result or an exception caused within the function invocation.
+                # Otherwise there's no result file, implying errors from WorkQueue.
                 if task_report.result_received:
-                    future.set_result(task_report.result)
+                    try:
+                        with open(task_report.result_file, 'rb') as f_in:
+                            result = deserialize(f_in.read())
+                    except Exception as e:
+                        logger.error(f'Cannot load result from result file {task_report.result_file}. Exception: {e}')
+                        ex = WorkQueueTaskFailure('Cannot load result from result file', None)
+                        ex.__cause__ = e
+                        future.set_exception(ex)
+                    else:
+                        if isinstance(result, Exception):
+                            ex = WorkQueueTaskFailure('Task execution raises an exception', result)
+                            ex.__cause__ = result
+                            future.set_exception(ex)
+                        else:
+                            future.set_result(result)
                 else:
                     # If there are no results, then the task failed according to one of
                     # work queue modes, such as resource exhaustion.
-                    ex = WorkQueueTaskFailure(task_report.reason, task_report.result)
-                    if task_report.result is not None:
-                        ex.__cause__ = task_report.result
+                    ex = WorkQueueTaskFailure(task_report.reason, None)
                    future.set_exception(ex)
        finally:
            logger.debug("Marking all outstanding tasks as failed")
@@ -876,7 +890,7 @@ def _work_queue_submit_wait(*,
             logger.error("Unable to create task: {}".format(e))
             collector_queue.put_nowait(WqTaskToParsl(id=task.id,
                                                      result_received=False,
-                                                     result=None,
+                                                     result_file=None,
                                                      reason="task could not be created by work queue",
                                                      status=-1))
             continue
@@ -937,7 +951,7 @@ def _work_queue_submit_wait(*,
             logger.error("Unable to submit task to work queue: {}".format(e))
             collector_queue.put_nowait(WqTaskToParsl(id=task.id,
                                                      result_received=False,
-                                                     result=None,
+                                                     result_file=None,
                                                      reason="task could not be submited to work queue",
                                                      status=-1))
             continue
@@ -957,24 +971,20 @@ def _work_queue_submit_wait(*,
             logger.debug("Completed Work Queue task {}, executor task {}".format(t.id, t.tag))
             result_file = result_file_of_task_id.pop(t.tag)
 
-            # A tasks completes 'succesfully' if it has result file
-            #
-            #
+            # A tasks completes 'succesfully' if it has result file.
+            # The check whether this file can load a serialized Python object
+            # happens later in the collector thread of the executor process.
             logger.debug("Looking for result in {}".format(result_file))
-            try:
-                with open(result_file, "rb") as f_in:
-                    result = pickle.load(f_in)
+            if os.path.exists(result_file):
                 logger.debug("Found result in {}".format(result_file))
                 collector_queue.put_nowait(WqTaskToParsl(id=executor_task_id,
                                                          result_received=True,
-                                                         result=result,
+                                                         result_file=result_file,
                                                          reason=None,
                                                          status=t.return_status))
             # If a result file could not be generated, explain the
-            # failure according to work queue error codes.
-
-            # match the positive case.
-            except Exception as e:
+            # failure according to work queue error codes.
+            else:
                 reason = _explain_work_queue_result(t)
                 logger.debug("Did not find result in {}".format(result_file))
                 logger.debug("Wrapper Script status: {}\nWorkQueue Status: {}"
@@ -983,7 +993,7 @@ def _work_queue_submit_wait(*,
                              .format(executor_task_id, t.id, reason))
             collector_queue.put_nowait(WqTaskToParsl(id=executor_task_id,
                                                      result_received=False,
-                                                     result=None,
+                                                     result_file=None,
                                                      reason=reason,
                                                      status=t.return_status))
     logger.debug("Exiting WorkQueue Monitoring Process")
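
A design note on the manager-side change: the submit/wait process no longer unpickles result files itself; it only checks that the file exists and defers deserialization to the collector thread, so a corrupt result file surfaces on the task's Future instead of crashing the monitoring loop. A hedged sketch of that split (names hypothetical):

    import os

    def report_for(result_file):
        # Existence check only; deserialization happens later in the collector.
        if os.path.exists(result_file):
            return dict(result_received=True, result_file=result_file, reason=None)
        return dict(result_received=False, result_file=None,
                    reason="explained via Work Queue/TaskVine exit codes")
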
parsl/jobs/job_status_poller.py
CHANGED
@@ -1,9 +1,8 @@
 import logging
-import parsl  # noqa F401 (used in type annotation)
+import parsl
 import time
 import zmq
-from typing import Dict, Sequence
-from typing import List  # noqa F401 (used in type annotation)
+from typing import Dict, List, Sequence
 
 from parsl.jobs.states import JobStatus, JobState
 from parsl.jobs.strategy import Strategy
parsl/monitoring/monitoring.py
CHANGED
@@ -194,10 +194,10 @@ class MonitoringHub(RepresentationMixin):
                                                "logdir": self.logdir,
                                                "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
                                                "run_id": run_id
-
+                                       },
                                        name="Monitoring-Router-Process",
                                        daemon=True,
-
+                                       )
         self.router_proc.start()
 
         self.dbm_proc = ForkProcess(target=dbm_starter,
@@ -205,10 +205,10 @@ class MonitoringHub(RepresentationMixin):
                                     kwargs={"logdir": self.logdir,
                                             "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
                                             "db_url": self.logging_endpoint,
-
+                                    },
                                     name="Monitoring-DBM-Process",
                                     daemon=True,
-
+                                    )
         self.dbm_proc.start()
         self.logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))
 
@@ -216,7 +216,7 @@ class MonitoringHub(RepresentationMixin):
                                                args=(self.logdir, self.resource_msgs, run_dir),
                                                name="Monitoring-Filesystem-Process",
                                                daemon=True
-
+                                               )
         self.filesystem_proc.start()
         self.logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
 
@@ -359,7 +359,7 @@ class MonitoringRouter:
                  run_id: str,
                  logging_level: int = logging.INFO,
                  atexit_timeout: int = 3  # in seconds
-
+                 ):
         """ Initializes a monitoring configuration class.
 
         Parameters