parsl 2023.10.23__py3-none-any.whl → 2023.11.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. parsl/__init__.py +1 -0
  2. parsl/app/app.py +29 -21
  3. parsl/channels/base.py +12 -24
  4. parsl/config.py +19 -12
  5. parsl/configs/ad_hoc.py +2 -2
  6. parsl/dataflow/dflow.py +10 -4
  7. parsl/executors/base.py +1 -3
  8. parsl/executors/high_throughput/executor.py +3 -3
  9. parsl/executors/high_throughput/interchange.py +59 -53
  10. parsl/executors/high_throughput/process_worker_pool.py +2 -2
  11. parsl/executors/high_throughput/zmq_pipes.py +1 -1
  12. parsl/executors/radical/__init__.py +4 -0
  13. parsl/executors/radical/executor.py +550 -0
  14. parsl/executors/radical/rpex_master.py +42 -0
  15. parsl/executors/radical/rpex_resources.py +165 -0
  16. parsl/executors/radical/rpex_worker.py +61 -0
  17. parsl/executors/status_handling.py +1 -2
  18. parsl/executors/taskvine/exec_parsl_function.py +3 -4
  19. parsl/executors/taskvine/executor.py +18 -4
  20. parsl/executors/taskvine/factory.py +1 -1
  21. parsl/executors/taskvine/manager.py +12 -16
  22. parsl/executors/taskvine/utils.py +5 -5
  23. parsl/executors/threads.py +1 -2
  24. parsl/executors/workqueue/exec_parsl_function.py +2 -1
  25. parsl/executors/workqueue/executor.py +34 -24
  26. parsl/jobs/job_status_poller.py +2 -3
  27. parsl/monitoring/monitoring.py +6 -6
  28. parsl/monitoring/remote.py +1 -1
  29. parsl/monitoring/visualization/plots/default/workflow_plots.py +4 -4
  30. parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +2 -2
  31. parsl/providers/slurm/slurm.py +1 -1
  32. parsl/tests/configs/ad_hoc_cluster_htex.py +3 -3
  33. parsl/tests/configs/htex_ad_hoc_cluster.py +1 -1
  34. parsl/tests/configs/local_radical.py +20 -0
  35. parsl/tests/configs/local_radical_mpi.py +20 -0
  36. parsl/tests/configs/local_threads_monitoring.py +1 -1
  37. parsl/tests/conftest.py +6 -2
  38. parsl/tests/scaling_tests/vineex_condor.py +1 -1
  39. parsl/tests/scaling_tests/vineex_local.py +1 -1
  40. parsl/tests/scaling_tests/wqex_condor.py +1 -1
  41. parsl/tests/scaling_tests/wqex_local.py +1 -1
  42. parsl/tests/test_docs/test_kwargs.py +37 -0
  43. parsl/tests/test_python_apps/test_garbage_collect.py +1 -1
  44. parsl/tests/test_python_apps/test_lifted.py +3 -2
  45. parsl/tests/test_radical/__init__.py +0 -0
  46. parsl/tests/test_radical/test_mpi_funcs.py +27 -0
  47. parsl/tests/test_regression/test_1606_wait_for_current_tasks.py +1 -1
  48. parsl/utils.py +4 -4
  49. parsl/version.py +1 -1
  50. {parsl-2023.10.23.data → parsl-2023.11.20.data}/scripts/exec_parsl_function.py +2 -1
  51. {parsl-2023.10.23.data → parsl-2023.11.20.data}/scripts/process_worker_pool.py +2 -2
  52. {parsl-2023.10.23.dist-info → parsl-2023.11.20.dist-info}/METADATA +5 -2
  53. {parsl-2023.10.23.dist-info → parsl-2023.11.20.dist-info}/RECORD +58 -48
  54. {parsl-2023.10.23.dist-info → parsl-2023.11.20.dist-info}/WHEEL +1 -1
  55. {parsl-2023.10.23.data → parsl-2023.11.20.data}/scripts/parsl_coprocess.py +0 -0
  56. {parsl-2023.10.23.dist-info → parsl-2023.11.20.dist-info}/LICENSE +0 -0
  57. {parsl-2023.10.23.dist-info → parsl-2023.11.20.dist-info}/entry_points.txt +0 -0
  58. {parsl-2023.10.23.dist-info → parsl-2023.11.20.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,165 @@
+ import sys
+ import json
+
+ from typing import List
+
+ _setup_paths: List[str]
+ try:
+     import radical.pilot as rp
+     import radical.utils as ru
+ except ImportError:
+     _setup_paths = []
+ else:
+     _setup_paths = [rp.sdist_path,
+                     ru.sdist_path]
+
+
+ MPI = "mpi"
+ RP_ENV = "rp"
+ CLIENT = "client"
+ RPEX_ENV = "ve_rpex"
+ MPI_WORKER = "MPIWorker"
+ DEFAULT_WORKER = "DefaultWorker"
+
+
+ class ResourceConfig:
+     """
+     This ResourceConfig class is an abstraction of the resource
+     configuration of the RAPTOR layer in the RADICAL-Pilot runtime system.
+
+     This class sets up the default configuration values for the executor and
+     allows the user to specify different resource requirements flexibly.
+
+     For more information:
+     https://radicalpilot.readthedocs.io/en/stable/tutorials/raptor.html
+
+     Parameters
+     ----------
+     masters : int
+         The number of masters to be deployed by RAPTOR.
+         Default is 1.
+
+     workers : int
+         The number of workers to be deployed by RAPTOR.
+         Default is 1.
+
+     worker_gpus_per_node : int
+         The number of GPUs a worker will operate on per node.
+         Default is 0.
+
+     worker_cores_per_node : int
+         The number of CPU cores a worker will operate on per node.
+         Default is 4.
+
+     cores_per_master : int
+         The number of cores a master will operate on per node.
+         Default is 1.
+
+     nodes_per_worker : int
+         The number of nodes to be occupied by every worker.
+         Default is 1.
+
+     pilot_env_path : str
+         The path to an existing pilot environment.
+         Default is an empty string (RADICAL-Pilot will create one).
+
+     pilot_env_name : str
+         The name of the pilot environment.
+         Default is "ve_rpex".
+
+     pilot_env_pre_exec : list
+         List of commands to be executed before starting the pilot environment.
+         Default is an empty list.
+
+     pilot_env_type : str
+         The type of the pilot environment (e.g., 'venv', 'conda').
+         Default is "venv".
+
+     pilot_env_setup : list
+         List of setup commands/packages for the pilot environment.
+         Default setup includes "parsl", rp.sdist_path, and ru.sdist_path.
+
+     python_v : str
+         The Python version to be used in the pilot environment.
+         Default is determined by the system's Python version.
+
+     worker_type : str
+         The type of worker(s) to be deployed by RAPTOR on the compute
+         resources.
+         Default is "DefaultWorker".
+     """
+
+     masters: int = 1
+     workers: int = 1
+
+     worker_gpus_per_node: int = 0
+     worker_cores_per_node: int = 4
+
+     cores_per_master: int = 1
+     nodes_per_worker: int = 1
+
+     pilot_env_mode: str = CLIENT
+     pilot_env_path: str = ""
+     pilot_env_type: str = "venv"
+     pilot_env_name: str = RP_ENV
+     pilot_env_pre_exec: List[str] = []
+     pilot_env_setup: List[str] = _setup_paths
+
+     python_v: str = f'{sys.version_info[0]}.{sys.version_info[1]}'
+     worker_type: str = DEFAULT_WORKER
+
+     def _get_cfg_file(cls, path=None):
+
+         # Default ENV mode for RP is to reuse
+         # the client side. If this is not the case,
+         # then RP will create a new env named ve_rpex.
+         # The user needs to make sure that under
+         # $HOME/.radical/pilot/configs/*_resource.json
+         # virtenv_mode = local
+         if cls.pilot_env_mode != CLIENT:
+             cls.pilot_env_name = RPEX_ENV
+
+         if MPI in cls.worker_type.lower() and \
+            "mpi4py" not in cls.pilot_env_setup:
+             cls.pilot_env_setup.append("mpi4py")
+
+         cfg = {
+             'n_masters': cls.masters,
+             'n_workers': cls.workers,
+             'gpus_per_node': cls.worker_gpus_per_node,
+             'cores_per_node': cls.worker_cores_per_node,
+             'cores_per_master': cls.cores_per_master,
+             'nodes_per_worker': cls.nodes_per_worker,
+
+             'pilot_env': {
+                 "version": cls.python_v,
+                 "name": cls.pilot_env_name,
+                 "path": cls.pilot_env_path,
+                 "type": cls.pilot_env_type,
+                 "setup": cls.pilot_env_setup,
+                 "pre_exec": cls.pilot_env_pre_exec
+             },
+
+             'pilot_env_mode': cls.pilot_env_mode,
+
+             'master_descr': {
+                 "mode": rp.RAPTOR_MASTER,
+                 "named_env": cls.pilot_env_name,
+                 "executable": "python3 rpex_master.py",
+             },
+
+             'worker_descr': {
+                 "mode": rp.RAPTOR_WORKER,
+                 "named_env": cls.pilot_env_name,
+                 "raptor_file": "./rpex_worker.py",
+                 "raptor_class": cls.worker_type if
+                 cls.worker_type.lower() != MPI else MPI_WORKER,
+             }}
+
+         # Convert the class instance to a cfg file.
+         config_path = 'rpex.cfg'
+         if path:
+             config_path = path + '/' + config_path
+         with open(config_path, 'w') as f:
+             json.dump(cfg, f, indent=4)
+         return config_path
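
The `ResourceConfig` above is a plain attribute bag: `_get_cfg_file` reads the attributes and writes them out as the JSON file (`rpex.cfg`) that the new `RadicalPilotExecutor` consumes. A minimal usage sketch, not part of the diff; the import path is assumed from the new `parsl/executors/radical/__init__.py`:

```python
# Hypothetical usage sketch: tune a few attributes, then emit rpex.cfg.
# Assumes radical.pilot and radical.utils are installed.
from parsl.executors.radical import ResourceConfig

rpex_cfg = ResourceConfig()
rpex_cfg.worker_cores_per_node = 8
rpex_cfg.worker_type = "MPI"     # routes dispatch through the MPIWorker class

# Writes the JSON config into the given directory and returns its path.
cfg_path = rpex_cfg._get_cfg_file(path='/tmp')
```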
@@ -0,0 +1,61 @@
+ import sys
+ import radical.pilot as rp
+
+ import parsl.app.errors as pe
+ from parsl.app.bash import remote_side_bash_executor
+ from parsl.serialize import unpack_apply_message, serialize
+ from parsl.executors.high_throughput.process_worker_pool import execute_task
+
+
+ class ParslWorker:
+
+     def _dispatch_func(self, task):
+
+         try:
+             buffer = rp.utils.deserialize_bson(task['description']['function'])
+             result = execute_task(buffer)
+             val = str(serialize(result, buffer_threshold=1000000))
+             exc = (None, None)
+             ret = 0
+             out = None
+             err = None
+         except Exception:
+             val = None
+             exc = (rp.utils.serialize_bson(pe.RemoteExceptionWrapper(*sys.exc_info())), None)
+             ret = 1
+             out = None
+             err = None
+
+         return out, err, ret, val, exc
+
+     def _dispatch_proc(self, task):
+
+         try:
+             buffer = rp.utils.deserialize_bson(task['description']['executable'])
+             func, args, kwargs = unpack_apply_message(buffer, {}, copy=False)
+             ret = remote_side_bash_executor(func, *args, **kwargs)
+             exc = (None, None)
+             val = None
+             out = None
+             err = None
+         except Exception:
+             val = None
+             exc = (rp.utils.serialize_bson(pe.RemoteExceptionWrapper(*sys.exc_info())), None)
+             ret = 1
+             out = None
+             err = None
+
+         return out, err, ret, val, exc
+
+
+ class MPIWorker(rp.raptor.MPIWorker):
+     def _dispatch_func(self, task):
+         return super()._dispatch_func(task)
+
+
+ class DefaultWorker(rp.raptor.DefaultWorker):
+     def _dispatch_func(self, task):
+         return ParslWorker()._dispatch_func(task)
+
+     def _dispatch_proc(self, task):
+         return ParslWorker()._dispatch_proc(task)
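
Both dispatch methods return the five-tuple `(out, err, ret, val, exc)` that RAPTOR expects; on failure, `exc` carries a BSON-serialized `RemoteExceptionWrapper` instead of a value. A sketch of that round trip, not part of the diff, using only calls already shown above (`RemoteExceptionWrapper.reraise()` is assumed from `parsl.app.errors`):

```python
import sys
import radical.pilot as rp
import parsl.app.errors as pe

# Worker side: capture the in-flight exception exactly as _dispatch_func does.
try:
    raise ValueError("task failed remotely")
except Exception:
    packed = rp.utils.serialize_bson(pe.RemoteExceptionWrapper(*sys.exc_info()))

# Client side: unpack and re-raise the original ValueError with its traceback.
wrapper = rp.utils.deserialize_bson(packed)
wrapper.reraise()
```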
@@ -6,7 +6,6 @@ from abc import abstractmethod, abstractproperty
  from concurrent.futures import Future
  from typing import List, Any, Dict, Optional, Tuple, Union, Callable

- import parsl # noqa F401
  from parsl.executors.base import ParslExecutor
  from parsl.executors.errors import BadStateException, ScalingFailed
  from parsl.jobs.states import JobStatus, JobState
@@ -193,7 +192,7 @@ class BlockProviderExecutor(ParslExecutor):
              raise ScalingFailed(self, "No execution provider available")
          block_ids = []
          logger.info(f"Scaling out by {blocks} blocks")
-         for i in range(blocks):
+         for _ in range(blocks):
              block_id = str(self._block_id_counter.get_id())
              logger.info(f"Allocated block ID {block_id}")
              try:
@@ -1,11 +1,10 @@
  import traceback
  import sys

- import pickle
  from parsl.app.errors import RemoteExceptionWrapper
  from parsl.data_provider.files import File
  from parsl.utils import get_std_fname_mode
- from parsl.serialize import deserialize
+ from parsl.serialize import deserialize, serialize

  # This scripts executes a parsl function which is pickled in 4 files:
  #
@@ -30,10 +29,10 @@ from parsl.serialize import deserialize
  #


- def dump_result_to_file(result_file: str, result_package):
+ def dump_result_to_file(result_file: str, result):
      """ Dump a result to the given result file."""
      with open(result_file, "wb") as f_out:
-         pickle.dump(result_package, f_out)
+         f_out.write(serialize(result))


  def remap_location(mapping, parsl_file):
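
This writer change pairs with the reader changes in the executors below: result files are now produced by `parsl.serialize.serialize` and consumed by `parsl.serialize.deserialize`, instead of raw `pickle`. A small round-trip sketch (illustrative only) of the contract both sides now rely on:

```python
from parsl.serialize import serialize, deserialize

# A normal value survives the round trip...
payload = serialize({"answer": 42})        # bytes written to the result file
assert deserialize(payload) == {"answer": 42}

# ...and so does an exception object, which is a legal "result": the
# collector thread checks isinstance(result, Exception) after deserializing.
err = deserialize(serialize(ValueError("boom")))
assert isinstance(err, Exception)
```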
@@ -22,7 +22,7 @@ from typing import List, Optional, Union, Literal
  # Import Parsl constructs
  import parsl.utils as putils
  from parsl.data_provider.staging import Staging
- from parsl.serialize import serialize
+ from parsl.serialize import serialize, deserialize
  from parsl.data_provider.files import File
  from parsl.errors import OptionalModuleMissing
  from parsl.providers.base import ExecutionProvider
@@ -614,7 +614,6 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
          self._factory_process.join()

          logger.debug("TaskVine shutdown completed")
-         return True

      @wrap_with_logs
      def _collect_taskvine_results(self):
@@ -639,11 +638,26 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
              logger.debug(f'Updating Future for Parsl Task: {task_report.executor_id}. \
                  Task {task_report.executor_id} has result_received set to {task_report.result_received}')
              if task_report.result_received:
-                 future.set_result(task_report.result)
+                 try:
+                     with open(task_report.result_file, 'rb') as f_in:
+                         result = deserialize(f_in.read())
+                 except Exception as e:
+                     logger.error(f'Cannot load result from result file {task_report.result_file}. Exception: {e}')
+                     ex = TaskVineTaskFailure('Cannot load result from result file', None)
+                     ex.__cause__ = e
+                     future.set_exception(ex)
+                 else:
+                     if isinstance(result, Exception):
+                         ex = TaskVineTaskFailure('Task execution raises an exception', result)
+                         ex.__cause__ = result
+                         future.set_exception(ex)
+                     else:
+                         future.set_result(result)
              else:
                  # If there are no results, then the task failed according to one of
                  # taskvine modes, such as resource exhaustion.
-                 future.set_exception(TaskVineTaskFailure(task_report.reason, task_report.result))
+                 ex = TaskVineTaskFailure(task_report.reason, None)
+                 future.set_exception(ex)

              # decrement outstanding task counter
              with self._outstanding_tasks_lock:
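
From user code, the effect of this new collector path is that a failed TaskVine task surfaces as a `TaskVineTaskFailure` on the task's future, with the underlying error chained via `__cause__`. A hedged sketch (the import path and attribute names are assumed from the executor code above):

```python
from parsl.executors.taskvine.errors import TaskVineTaskFailure

# 'future' stands in for a future returned by a TaskVine-backed Parsl app.
try:
    value = future.result()
except TaskVineTaskFailure as failure:
    print(failure.reason)      # e.g. "Task execution raises an exception"
    print(failure.__cause__)   # the deserialized task-side exception, if any
```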
@@ -30,7 +30,7 @@ def _taskvine_factory(should_stop, factory_config):
      else:
          factory = Factory(batch_type=factory_config.batch_type,
                            manager_host_port=f"{factory_config._project_address}:{factory_config._project_port}",
-         )
+                           )
  except Exception as e:
      raise TaskVineFactoryFailure(f'Cannot create factory with exception {e}')

@@ -2,7 +2,6 @@ import logging
  import hashlib
  import subprocess
  import os
- import pickle
  import queue
  import shutil
  import uuid
@@ -229,7 +228,7 @@ def _taskvine_submit_wait(ready_task_queue=None,
              logger.error("Unable to create executor task (mode:regular): {}".format(e))
              finished_task_queue.put_nowait(VineTaskToParsl(executor_id=task.executor_id,
                                                             result_received=False,
-                                                            result=None,
+                                                            result_file=None,
                                                             reason="task could not be created by taskvine",
                                                             status=-1))
              continue
@@ -268,7 +267,7 @@ def _taskvine_submit_wait(ready_task_queue=None,
              logger.error("Unable to create executor task (mode:serverless): {}".format(e))
              finished_task_queue.put_nowait(VineTaskToParsl(executor_id=task.executor_id,
                                                             result_received=False,
-                                                            result=None,
+                                                            result_file=None,
                                                             reason="task could not be created by taskvine",
                                                             status=-1))
          else:
@@ -369,7 +368,7 @@ def _taskvine_submit_wait(ready_task_queue=None,
              logger.error("Unable to submit task to taskvine: {}".format(e))
              finished_task_queue.put_nowait(VineTaskToParsl(executor_id=task.executor_id,
                                                             result_received=False,
-                                                            result=None,
+                                                            result_file=None,
                                                             reason="task could not be submited to taskvine",
                                                             status=-1))
              continue
@@ -394,24 +393,21 @@ def _taskvine_submit_wait(ready_task_queue=None,

          logger.debug(f"completed executor task info: {executor_task_id}, {t.category}, {t.command}, {t.std_output}")

-         # A tasks completes 'succesfully' if it has result file,
-         # and it can be loaded. This may mean that the 'success' is
-         # an exception.
+         # A task completes 'successfully' if it has a result file.
+         # A check whether the Python object represented using this file can be
+         # deserialized happens later in the collector thread of the executor
+         # process.
          logger.debug("Looking for result in {}".format(result_file))
-         try:
-             with open(result_file, "rb") as f_in:
-                 result = pickle.load(f_in)
+         if os.path.exists(result_file):
              logger.debug("Found result in {}".format(result_file))
              finished_task_queue.put_nowait(VineTaskToParsl(executor_id=executor_task_id,
                                                             result_received=True,
-                                                            result=result,
+                                                            result_file=result_file,
                                                             reason=None,
                                                             status=t.exit_code))
          # If a result file could not be generated, explain the
-         # failure according to taskvine error codes. We generate
-         # an exception and wrap it with RemoteExceptionWrapper, to
-         # match the positive case.
-         except Exception as e:
+         # failure according to taskvine error codes.
+         else:
              reason = _explain_taskvine_result(t)
              logger.debug("Did not find result in {}".format(result_file))
              logger.debug("Wrapper Script status: {}\nTaskVine Status: {}"
@@ -420,7 +416,7 @@ def _taskvine_submit_wait(ready_task_queue=None,
                           .format(executor_task_id, t.id, reason))
              finished_task_queue.put_nowait(VineTaskToParsl(executor_id=executor_task_id,
                                                             result_received=False,
-                                                            result=e,
+                                                            result_file=None,
                                                             reason=reason,
                                                             status=t.exit_code))

@@ -42,20 +42,20 @@ class ParslTaskToVine:

  class VineTaskToParsl:
      """
-     Support structure to communicate final status of TaskVine tasks to Parsl
-     result is only valid if result_received is True
-     reason and status are only valid if result_received is False
+     Support structure to communicate final status of TaskVine tasks to Parsl.
+     result_file is only valid if result_received is True.
+     Reason and status are only valid if result_received is False.
      """
      def __init__(self,
                   executor_id: int,            # executor id of task
                   result_received: bool,       # whether result is received or not
-                  result,                      # result object if available
+                  result_file: Optional[str],  # path to file that contains the serialized result object
                   reason: Optional[str],       # string describing why execution fails
                   status: Optional[int]        # exit code of execution of task
                   ):
          self.executor_id = executor_id
          self.result_received = result_received
-         self.result = result
+         self.result_file = result_file
          self.reason = reason
          self.status = status

@@ -72,9 +72,8 @@ class ThreadPoolExecutor(ParslExecutor, RepresentationMixin):

          """
          logger.debug("Shutting down executor, which involves waiting for running tasks to complete")
-         x = self.executor.shutdown(wait=block)
+         self.executor.shutdown(wait=block)
          logger.debug("Done with executor shutdown")
-         return x

      def monitor_resources(self):
          """Resource monitoring sometimes deadlocks when using threads, so this function
@@ -4,6 +4,7 @@ from parsl.utils import get_std_fname_mode
  import traceback
  import sys
  import pickle
+ from parsl.serialize import serialize

  # This scripts executes a parsl function which is pickled in a file:
  #
@@ -32,7 +33,7 @@ def load_pickled_file(filename):

  def dump_result_to_file(result_file, result_package):
      with open(result_file, "wb") as f_out:
-         pickle.dump(result_package, f_out)
+         f_out.write(serialize(result_package))


  def remap_location(mapping, parsl_file):
@@ -21,7 +21,7 @@ import inspect
  import shutil
  import itertools

- from parsl.serialize import pack_apply_message
+ from parsl.serialize import pack_apply_message, deserialize
  import parsl.utils as putils
  from parsl.executors.errors import ExecutorError
  from parsl.data_provider.files import File
@@ -66,11 +66,11 @@ ParslTaskToWq = namedtuple('ParslTaskToWq',

  # Support structure to communicate final status of work queue tasks to parsl
  # if result_received is True:
- #   result is the result
+ #   result_file is the path to the file containing the result.
  # if result_received is False:
  #   reason and status are only valid if result_received is False
- #   result is either None or an exception raised while looking for a result
- WqTaskToParsl = namedtuple('WqTaskToParsl', 'id result_received result reason status')
+ #   result_file is None
+ WqTaskToParsl = namedtuple('WqTaskToParsl', 'id result_received result_file reason status')

  # Support structure to report parsl filenames to work queue.
  # parsl_name is the local_name or filepath attribute of a parsl file object.
@@ -449,7 +449,7 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
          input_files = []
          output_files = []

-         # Determine the input and output files that will exist at the workes:
+         # Determine the input and output files that will exist at the workers:
          input_files += [self._register_file(f) for f in kwargs.get("inputs", []) if isinstance(f, File)]
          output_files += [self._register_file(f) for f in kwargs.get("outputs", []) if isinstance(f, File)]

@@ -707,7 +707,6 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
          self.collector_thread.join()

          logger.debug("Work Queue shutdown completed")
-         return True

      @wrap_with_logs
      def _collect_work_queue_results(self):
@@ -729,14 +728,29 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
              with self.tasks_lock:
                  future = self.tasks.pop(task_report.id)
              logger.debug("Updating Future for executor task {}".format(task_report.id))
+             # If result_received, then there's a result file. The object inside the file
+             # may be a valid result or an exception caused within the function invocation.
+             # Otherwise there's no result file, implying errors from WorkQueue.
              if task_report.result_received:
-                 future.set_result(task_report.result)
+                 try:
+                     with open(task_report.result_file, 'rb') as f_in:
+                         result = deserialize(f_in.read())
+                 except Exception as e:
+                     logger.error(f'Cannot load result from result file {task_report.result_file}. Exception: {e}')
+                     ex = WorkQueueTaskFailure('Cannot load result from result file', None)
+                     ex.__cause__ = e
+                     future.set_exception(ex)
+                 else:
+                     if isinstance(result, Exception):
+                         ex = WorkQueueTaskFailure('Task execution raises an exception', result)
+                         ex.__cause__ = result
+                         future.set_exception(ex)
+                     else:
+                         future.set_result(result)
              else:
                  # If there are no results, then the task failed according to one of
                  # work queue modes, such as resource exhaustion.
-                 ex = WorkQueueTaskFailure(task_report.reason, task_report.result)
-                 if task_report.result is not None:
-                     ex.__cause__ = task_report.result
+                 ex = WorkQueueTaskFailure(task_report.reason, None)
                  future.set_exception(ex)
      finally:
          logger.debug("Marking all outstanding tasks as failed")
@@ -876,7 +890,7 @@ def _work_queue_submit_wait(*,
              logger.error("Unable to create task: {}".format(e))
              collector_queue.put_nowait(WqTaskToParsl(id=task.id,
                                                       result_received=False,
-                                                      result=None,
+                                                      result_file=None,
                                                       reason="task could not be created by work queue",
                                                       status=-1))
              continue
@@ -937,7 +951,7 @@ def _work_queue_submit_wait(*,
              logger.error("Unable to submit task to work queue: {}".format(e))
              collector_queue.put_nowait(WqTaskToParsl(id=task.id,
                                                       result_received=False,
-                                                      result=None,
+                                                      result_file=None,
                                                       reason="task could not be submited to work queue",
                                                       status=-1))
              continue
@@ -957,24 +971,20 @@ def _work_queue_submit_wait(*,
          logger.debug("Completed Work Queue task {}, executor task {}".format(t.id, t.tag))
          result_file = result_file_of_task_id.pop(t.tag)

-         # A tasks completes 'succesfully' if it has result file,
-         # and it can be loaded. This may mean that the 'success' is
-         # an exception.
+         # A task completes 'successfully' if it has a result file.
+         # The check whether this file can load a serialized Python object
+         # happens later in the collector thread of the executor process.
          logger.debug("Looking for result in {}".format(result_file))
-         try:
-             with open(result_file, "rb") as f_in:
-                 result = pickle.load(f_in)
+         if os.path.exists(result_file):
              logger.debug("Found result in {}".format(result_file))
              collector_queue.put_nowait(WqTaskToParsl(id=executor_task_id,
                                                       result_received=True,
-                                                      result=result,
+                                                      result_file=result_file,
                                                       reason=None,
                                                       status=t.return_status))
          # If a result file could not be generated, explain the
-         # failure according to work queue error codes. We generate
-         # an exception and wrap it with RemoteExceptionWrapper, to
-         # match the positive case.
-         except Exception as e:
+         # failure according to work queue error codes.
+         else:
              reason = _explain_work_queue_result(t)
              logger.debug("Did not find result in {}".format(result_file))
              logger.debug("Wrapper Script status: {}\nWorkQueue Status: {}"
@@ -983,7 +993,7 @@ def _work_queue_submit_wait(*,
                           .format(executor_task_id, t.id, reason))
              collector_queue.put_nowait(WqTaskToParsl(id=executor_task_id,
                                                       result_received=False,
-                                                      result=e,
+                                                      result_file=None,
                                                       reason=reason,
                                                       status=t.return_status))
      logger.debug("Exiting WorkQueue Monitoring Process")
@@ -1,9 +1,8 @@
  import logging
- import parsl # noqa F401 (used in string type annotation)
+ import parsl
  import time
  import zmq
- from typing import Dict, Sequence
- from typing import List # noqa F401 (used in type annotation)
+ from typing import Dict, List, Sequence

  from parsl.jobs.states import JobStatus, JobState
  from parsl.jobs.strategy import Strategy
@@ -194,10 +194,10 @@ class MonitoringHub(RepresentationMixin):
                                                     "logdir": self.logdir,
                                                     "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
                                                     "run_id": run_id
-                                            },
+                                                    },
                                             name="Monitoring-Router-Process",
                                             daemon=True,
-         )
+                                            )
          self.router_proc.start()

          self.dbm_proc = ForkProcess(target=dbm_starter,
@@ -205,10 +205,10 @@ class MonitoringHub(RepresentationMixin):
                                      kwargs={"logdir": self.logdir,
                                              "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
                                              "db_url": self.logging_endpoint,
-                                     },
+                                             },
                                      name="Monitoring-DBM-Process",
                                      daemon=True,
-         )
+                                     )
          self.dbm_proc.start()
          self.logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))

@@ -216,7 +216,7 @@ class MonitoringHub(RepresentationMixin):
                                                 args=(self.logdir, self.resource_msgs, run_dir),
                                                 name="Monitoring-Filesystem-Process",
                                                 daemon=True
-         )
+                                                )
          self.filesystem_proc.start()
          self.logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")

@@ -359,7 +359,7 @@ class MonitoringRouter:
                   run_id: str,
                   logging_level: int = logging.INFO,
                   atexit_timeout: int = 3   # in seconds
-         ):
+                  ):
          """ Initializes a monitoring configuration class.

          Parameters
@@ -143,7 +143,7 @@ def send_first_last_message(try_id: int,
                              'first_msg': not is_last,
                              'last_msg': is_last,
                              'timestamp': datetime.datetime.now()
-     })
+                              })
      radio.send(msg)
      return