parsl 2024.4.1__py3-none-any.whl → 2024.4.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/data_provider/data_manager.py +2 -1
- parsl/data_provider/zip.py +104 -0
- parsl/dataflow/dflow.py +57 -48
- parsl/dataflow/futures.py +0 -7
- parsl/executors/base.py +12 -9
- parsl/executors/high_throughput/executor.py +14 -19
- parsl/executors/high_throughput/process_worker_pool.py +3 -1
- parsl/executors/status_handling.py +82 -9
- parsl/executors/taskvine/executor.py +7 -2
- parsl/executors/workqueue/executor.py +8 -3
- parsl/jobs/job_status_poller.py +27 -107
- parsl/jobs/strategy.py +31 -32
- parsl/monitoring/monitoring.py +14 -23
- parsl/monitoring/radios.py +15 -0
- parsl/monitoring/remote.py +2 -1
- parsl/monitoring/router.py +7 -6
- parsl/providers/local/local.py +1 -1
- parsl/tests/configs/htex_local_alternate.py +2 -1
- parsl/tests/configs/taskvine_ex.py +1 -2
- parsl/tests/configs/workqueue_ex.py +1 -2
- parsl/tests/conftest.py +6 -7
- parsl/tests/test_bash_apps/test_basic.py +5 -4
- parsl/tests/test_bash_apps/test_error_codes.py +0 -3
- parsl/tests/test_bash_apps/test_kwarg_storage.py +0 -1
- parsl/tests/test_bash_apps/test_memoize.py +0 -2
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +0 -1
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +0 -1
- parsl/tests/test_bash_apps/test_multiline.py +0 -1
- parsl/tests/test_bash_apps/test_stdout.py +11 -6
- parsl/tests/test_monitoring/test_basic.py +46 -21
- parsl/tests/test_monitoring/test_fuzz_zmq.py +10 -1
- parsl/tests/test_python_apps/test_outputs.py +0 -1
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +74 -0
- parsl/tests/test_staging/test_zip_out.py +113 -0
- parsl/version.py +1 -1
- {parsl-2024.4.1.data → parsl-2024.4.15.data}/scripts/process_worker_pool.py +3 -1
- {parsl-2024.4.1.dist-info → parsl-2024.4.15.dist-info}/METADATA +3 -2
- {parsl-2024.4.1.dist-info → parsl-2024.4.15.dist-info}/RECORD +44 -41
- {parsl-2024.4.1.data → parsl-2024.4.15.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.4.1.data → parsl-2024.4.15.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.4.1.dist-info → parsl-2024.4.15.dist-info}/LICENSE +0 -0
- {parsl-2024.4.1.dist-info → parsl-2024.4.15.dist-info}/WHEEL +0 -0
- {parsl-2024.4.1.dist-info → parsl-2024.4.15.dist-info}/entry_points.txt +0 -0
- {parsl-2024.4.1.dist-info → parsl-2024.4.15.dist-info}/top_level.txt +0 -0
parsl/data_provider/data_manager.py
CHANGED
@@ -7,6 +7,7 @@ from parsl.data_provider.files import File
 from parsl.data_provider.file_noop import NoOpFileStaging
 from parsl.data_provider.ftp import FTPSeparateTaskStaging
 from parsl.data_provider.http import HTTPSeparateTaskStaging
+from parsl.data_provider.zip import ZipFileStaging
 from parsl.data_provider.staging import Staging
 
 if TYPE_CHECKING:
@@ -17,7 +18,7 @@ logger = logging.getLogger(__name__)
 # these will be shared between all executors that do not explicitly
 # override, so should not contain executor-specific state
 default_staging: List[Staging]
-default_staging = [NoOpFileStaging(), FTPSeparateTaskStaging(), HTTPSeparateTaskStaging()]
+default_staging = [NoOpFileStaging(), FTPSeparateTaskStaging(), HTTPSeparateTaskStaging(), ZipFileStaging()]
 
 
 class DataManager:
parsl/data_provider/zip.py
ADDED
@@ -0,0 +1,104 @@
+import filelock
+import logging
+import os
+import parsl
+import zipfile
+
+from typing import Tuple
+
+from parsl.data_provider.staging import Staging
+from parsl.data_provider.files import File
+from parsl.errors import ParslError
+
+
+logger = logging.getLogger(__name__)
+
+
+class ZipAuthorityError(ParslError):
+    def __init__(self, file):
+        self.file = file
+
+    def __str__(self):
+        return f"ZipFileStaging cannot stage Files with an authority (netloc) section ({self.file.netloc}), for {self.file.url}"
+
+
+class ZipFileStaging(Staging):
+    """A stage-out provider for zip files.
+
+    This provider will stage out files by writing them into the specified zip
+    file.
+
+    The filename of both the zip file and the file contained in that zip are
+    specified using a zip: URL, like this:
+
+    zip:/tmp/foo/this.zip/inside/here.txt
+
+    This URL names a zip file ``/tmp/foo/this.zip`` containing a file
+    ``inside/here.txt``.
+
+    The provider will use the Python filelock package to lock the zip file so
+    that it does not conflict with other instances of itself. This lock will
+    not protect against other modifications to the zip file.
+    """
+
+    def can_stage_out(self, file: File) -> bool:
+        logger.debug("archive provider checking File {}".format(repr(file)))
+
+        # First check if this is the scheme we care about
+        if file.scheme != "zip":
+            return False
+
+        # This is some basic validation to check that the user isn't specifying
+        # an authority section and expecting it to mean something.
+        if file.netloc != "":
+            raise ZipAuthorityError(file)
+
+        # If we got this far, we can stage this file
+        return True
+
+    def stage_out(self, dm, executor, file, parent_fut):
+        assert file.scheme == 'zip'
+
+        zip_path, inside_path = zip_path_split(file.path)
+
+        working_dir = dm.dfk.executors[executor].working_dir
+
+        if working_dir:
+            file.local_path = os.path.join(working_dir, inside_path)
+
+            # TODO: I think its the right behaviour that a staging out provider should create the directory structure
+            # for the file to be placed in?
+            os.makedirs(os.path.dirname(file.local_path), exist_ok=True)
+        else:
+            raise RuntimeError("zip file staging requires a working_dir to be specified")
+
+        stage_out_app = _zip_stage_out_app(dm)
+        app_fut = stage_out_app(zip_path, inside_path, working_dir, inputs=[file], _parsl_staging_inhibit=True, parent_fut=parent_fut)
+        return app_fut
+
+
+def _zip_stage_out(zip_file, inside_path, working_dir, parent_fut=None, inputs=[], _parsl_staging_inhibit=True):
+    file = inputs[0]
+
+    os.makedirs(os.path.dirname(zip_file), exist_ok=True)
+
+    with filelock.FileLock(zip_file + ".lock"):
+        with zipfile.ZipFile(zip_file, mode='a', compression=zipfile.ZIP_DEFLATED) as z:
+            z.write(file, arcname=inside_path)
+
+    os.remove(file)
+
+
+def _zip_stage_out_app(dm):
+    return parsl.python_app(executors=['_parsl_internal'], data_flow_kernel=dm.dfk)(_zip_stage_out)
+
+
+def zip_path_split(path: str) -> Tuple[str, str]:
+    """Split zip: path into a zipfile name and a contained-file name.
+    """
+    index = path.find(".zip/")
+
+    zip_path = path[:index + 4]
+    inside_path = path[index + 5:]
+
+    return (zip_path, inside_path)
parsl/dataflow/dflow.py
CHANGED
@@ -177,10 +177,11 @@ class DataFlowKernel:
 
         # this must be set before executors are added since add_executors calls
         # job_status_poller.add_executors.
+        radio = self.monitoring.radio if self.monitoring else None
         self.job_status_poller = JobStatusPoller(strategy=self.config.strategy,
                                                  strategy_period=self.config.strategy_period,
                                                  max_idletime=self.config.max_idletime,
-
+                                                 monitoring=radio)
 
         self.executors: Dict[str, ParslExecutor] = {}
 
@@ -239,16 +240,29 @@ class DataFlowKernel:
         task_log_info['task_stdin'] = task_record['kwargs'].get('stdin', None)
         stdout_spec = task_record['kwargs'].get('stdout', None)
         stderr_spec = task_record['kwargs'].get('stderr', None)
-
-
-
-
-
-
-
-
-
-
+
+        # stdout and stderr strings are set to the filename if we can
+        # interpret the specification; otherwise, set to the empty string
+        # (on exception, or when not specified)
+
+        if stdout_spec is not None:
+            try:
+                stdout_name, _ = get_std_fname_mode('stdout', stdout_spec)
+            except Exception:
+                logger.exception("Could not parse stdout specification {} for task {}".format(stdout_spec, task_record['id']))
+                stdout_name = ""
+        else:
+            stdout_name = ""
+
+        if stderr_spec is not None:
+            try:
+                stderr_name, _ = get_std_fname_mode('stderr', stderr_spec)
+            except Exception:
+                logger.exception("Could not parse stderr specification {} for task {}".format(stderr_spec, task_record['id']))
+                stderr_name = ""
+        else:
+            stderr_name = ""
+
         task_log_info['task_stdout'] = stdout_name
         task_log_info['task_stderr'] = stderr_name
         task_log_info['task_fail_history'] = ",".join(task_record['fail_history'])
@@ -674,14 +688,6 @@ class DataFlowKernel:
     def launch_task(self, task_record: TaskRecord) -> Future:
         """Handle the actual submission of the task to the executor layer.
 
-        If the app task has the executors attributes not set (default=='all')
-        the task is launched on a randomly selected executor from the
-        list of executors. This behavior could later be updated to support
-        binding to executors based on user specified criteria.
-
-        If the app task specifies a particular set of executors, it will be
-        targeted at those specific executors.
-
         Args:
             task_record : The task record
 
@@ -714,14 +720,18 @@ class DataFlowKernel:
 
         if self.monitoring is not None and self.monitoring.resource_monitoring_enabled:
             wrapper_logging_level = logging.DEBUG if self.monitoring.monitoring_debug else logging.INFO
-            (function, args, kwargs) = monitor_wrapper(function,
-
-
-
-
-
-
-
+            (function, args, kwargs) = monitor_wrapper(f=function,
+                                                       args=args,
+                                                       kwargs=kwargs,
+                                                       x_try_id=try_id,
+                                                       x_task_id=task_id,
+                                                       monitoring_hub_url=self.monitoring.monitoring_hub_url,
+                                                       run_id=self.run_id,
+                                                       logging_level=wrapper_logging_level,
+                                                       sleep_dur=self.monitoring.resource_monitoring_interval,
+                                                       radio_mode=executor.radio_mode,
+                                                       monitor_resources=executor.monitor_resources(),
+                                                       run_dir=self.run_dir)
 
         with self.submitter_lock:
             exec_fu = executor.submit(function, task_record['resource_specification'], *args, **kwargs)
@@ -1128,6 +1138,10 @@ class DataFlowKernel:
             executor.run_dir = self.run_dir
             executor.hub_address = self.hub_address
             executor.hub_port = self.hub_zmq_port
+            if self.monitoring:
+                executor.monitoring_radio = self.monitoring.radio
+            else:
+                executor.monitoring_radio = None
             if hasattr(executor, 'provider'):
                 if hasattr(executor.provider, 'script_dir'):
                     executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
@@ -1214,21 +1228,7 @@ class DataFlowKernel:
         self.job_status_poller.close()
         logger.info("Terminated job status poller")
 
-        logger.info("
-
-        for ef in self.job_status_poller._executor_facades:
-            if not ef.executor.bad_state_is_set:
-                logger.info(f"Scaling in executor {ef.executor.label}")
-
-                # this code needs to be at least as many blocks as need
-                # cancelling, but it is safe to be more, as the scaling
-                # code will cope with being asked to cancel more blocks
-                # than exist.
-                block_count = len(ef.status)
-                ef.scale_in(block_count)
-
-            else: # and bad_state_is_set
-                logger.warning(f"Not scaling in executor {ef.executor.label} because it is in bad state")
+        logger.info("Shutting down executors")
 
         for executor in self.executors.values():
             logger.info(f"Shutting down executor {executor.label}")
@@ -1245,8 +1245,7 @@ class DataFlowKernel:
                                   'tasks_completed_count': self.task_state_counts[States.exec_done],
                                   "time_began": self.time_began,
                                   'time_completed': self.time_completed,
-                                  'run_id': self.run_id, 'rundir': self.run_dir
-                                  'exit_now': True})
+                                  'run_id': self.run_id, 'rundir': self.run_dir})
 
             logger.info("Terminating monitoring")
             self.monitoring.close()
@@ -1396,10 +1395,20 @@ class DataFlowKernel:
 
     @staticmethod
     def _log_std_streams(task_record: TaskRecord) -> None:
-
-
-
-
+        tid = task_record['id']
+
+        def log_std_stream(name: str, target) -> None:
+            if target is None:
+                logger.info(f"{name} for task {tid} will not be redirected.")
+            elif isinstance(target, str):
+                logger.info(f"{name} for task {tid} will be redirected to {target}")
+            elif isinstance(target, tuple) and len(target) == 2:
+                logger.info(f"{name} for task {tid} will be redirected to {target[0]} with mode {target[1]}")
+            else:
+                logger.error(f"{name} for task {tid} has unknown specification: {target!r}")
+
+        log_std_stream("Standard out", task_record['app_fu'].stdout)
+        log_std_stream("Standard error", task_record['app_fu'].stderr)
 
 
 class DataFlowKernelLoader:
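The stdout/stderr handling added to the task log info construction above relies on get_std_fname_mode, which understands both a bare filename and a (filename, mode) pair; anything else is logged and recorded as an empty string. A small hedged illustration follows; the app itself is hypothetical and a loaded parsl config is assumed.

from parsl import bash_app


@bash_app
def hello(stdout=None, stderr=None):
    # Hypothetical app; stdout/stderr are parsl's special redirection kwargs.
    return "echo hello"


# Both spec forms below are parsed into a filename for the monitoring record;
# an unparseable spec only affects the recorded name, not task execution.
hello(stdout="hello.out", stderr="hello.err")                  # plain filenames
hello(stdout=("hello.out", "a"), stderr=("hello.err", "w"))    # (filename, mode) pairs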
parsl/dataflow/futures.py
CHANGED
@@ -1,10 +1,3 @@
-"""This module implements the AppFutures.
-
-We have two basic types of futures:
-    1. DataFutures which represent data objects
-    2. AppFutures which represent the futures on App/Leaf tasks.
-
-"""
 from __future__ import annotations
 
 from concurrent.futures import Future
parsl/executors/base.py
CHANGED
@@ -1,9 +1,9 @@
 from abc import ABCMeta, abstractmethod
 from concurrent.futures import Future
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional
 from typing_extensions import Literal, Self
 
-from parsl.
+from parsl.monitoring.radios import MonitoringRadio
 
 
 class ParslExecutor(metaclass=ABCMeta):
@@ -79,13 +79,6 @@ class ParslExecutor(metaclass=ABCMeta):
         """
         pass
 
-    def create_monitoring_info(self, status: Dict[str, JobStatus]) -> List[object]:
-        """Create a monitoring message for each block based on the poll status.
-
-        :return: a list of dictionaries mapping to the info of each block
-        """
-        return []
-
     def monitor_resources(self) -> bool:
         """Should resource monitoring happen for tasks on running on this executor?
 
@@ -135,3 +128,13 @@ class ParslExecutor(metaclass=ABCMeta):
     @hub_port.setter
     def hub_port(self, value: Optional[int]) -> None:
        self._hub_port = value
+
+    @property
+    def monitoring_radio(self) -> Optional[MonitoringRadio]:
+        """Local radio for sending monitoring messages
+        """
+        return self._monitoring_radio
+
+    @monitoring_radio.setter
+    def monitoring_radio(self, value: Optional[MonitoringRadio]) -> None:
+        self._monitoring_radio = value
parsl/executors/high_throughput/executor.py
CHANGED
@@ -5,7 +5,6 @@ import typeguard
 import logging
 import threading
 import queue
-import datetime
 import pickle
 from dataclasses import dataclass
 from multiprocessing import Process, Queue
@@ -18,7 +17,7 @@ import parsl.launchers
 from parsl.serialize import pack_res_spec_apply_message, deserialize
 from parsl.serialize.errors import SerializationError, DeserializationError
 from parsl.app.errors import RemoteExceptionWrapper
-from parsl.jobs.states import JobStatus, JobState
+from parsl.jobs.states import JobStatus, JobState, TERMINAL_STATES
 from parsl.executors.high_throughput import zmq_pipes
 from parsl.executors.high_throughput import interchange
 from parsl.executors.errors import (
@@ -677,22 +676,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         # Return the future
         return fut
 
-    def create_monitoring_info(self, status):
-        """ Create a msg for monitoring based on the poll status
-
-        """
-        msg = []
-        for bid, s in status.items():
-            d = {}
-            d['run_id'] = self.run_id
-            d['status'] = s.status_name
-            d['timestamp'] = datetime.datetime.now()
-            d['executor_label'] = self.label
-            d['job_id'] = self.blocks_to_job_id.get(bid, None)
-            d['block_id'] = bid
-            msg.append(d)
-        return msg
-
     @property
     def workers_per_node(self) -> Union[int, float]:
         return self._workers_per_node
@@ -730,8 +713,20 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
             tasks: int  # sum of tasks in this block
             idle: float  # shortest idle time of any manager in this block
 
+        # block_info will be populated from two sources:
+        # the Job Status Poller mutable block list, and the list of blocks
+        # which have connected to the interchange.
+
+        def new_block_info():
+            return BlockInfo(tasks=0, idle=float('inf'))
+
+        block_info: Dict[str, BlockInfo] = defaultdict(new_block_info)
+
+        for block_id, job_status in self._status.items():
+            if job_status.state not in TERMINAL_STATES:
+                block_info[block_id] = new_block_info()
+
         managers = self.connected_managers()
-        block_info: Dict[str, BlockInfo] = defaultdict(lambda: BlockInfo(tasks=0, idle=float('inf')))
         for manager in managers:
             if not manager['active']:
                 continue
parsl/executors/high_throughput/process_worker_pool.py
CHANGED
@@ -361,7 +361,9 @@ class Manager:
                     kill_event.set()
                 else:
                     task_recv_counter += len(tasks)
-                    logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format(
+                    logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format(
+                        [t['task_id'] for t in tasks], task_recv_counter
+                    ))
 
                     for task in tasks:
                         self.task_scheduler.put_task(task)
parsl/executors/status_handling.py
CHANGED
@@ -1,15 +1,18 @@
 from __future__ import annotations
+import datetime
 import logging
 import threading
+import time
 from itertools import compress
 from abc import abstractmethod, abstractproperty
 from concurrent.futures import Future
-from typing import List, Any, Dict, Optional, Tuple, Union, Callable
+from typing import List, Any, Dict, Optional, Sequence, Tuple, Union, Callable
 
 from parsl.executors.base import ParslExecutor
 from parsl.executors.errors import BadStateException, ScalingFailed
 from parsl.jobs.states import JobStatus, JobState
 from parsl.jobs.error_handlers import simple_error_handler, noop_error_handler
+from parsl.monitoring.message_type import MessageType
 from parsl.providers.base import ExecutionProvider
 from parsl.utils import AtomicIDCounter
 
@@ -71,6 +74,9 @@ class BlockProviderExecutor(ParslExecutor):
         self.blocks_to_job_id = {}  # type: Dict[str, str]
         self.job_ids_to_block = {}  # type: Dict[str, str]
 
+        self._last_poll_time = 0.0
+        self._status = {}  # type: Dict[str, JobStatus]
+
     def _make_status_dict(self, block_ids: List[str], status_list: List[JobStatus]) -> Dict[str, JobStatus]:
         """Given a list of block ids and a list of corresponding status strings,
         returns a dictionary mapping each block id to the corresponding status
@@ -102,12 +108,6 @@ class BlockProviderExecutor(ParslExecutor):
         else:
             return self._provider.status_polling_interval
 
-    def _fail_job_async(self, block_id: str, message: str):
-        """Marks a job that has failed to start but would not otherwise be included in status()
-        as failed and report it in status()
-        """
-        self._simulated_status[block_id] = JobStatus(JobState.FAILED, message)
-
     @abstractproperty
     def outstanding(self) -> int:
         """This should return the number of tasks that the executor has been given to run (waiting to run, and running now)"""
@@ -198,8 +198,7 @@ class BlockProviderExecutor(ParslExecutor):
                 self.job_ids_to_block[job_id] = block_id
                 block_ids.append(block_id)
             except Exception as ex:
-                self.
-                     "Failed to start block {}: {}".format(block_id, ex))
+                self._simulated_status[block_id] = JobStatus(JobState.FAILED, "Failed to start block {}: {}".format(block_id, ex))
         return block_ids
 
     @abstractmethod
@@ -241,3 +240,77 @@ class BlockProviderExecutor(ParslExecutor):
     @abstractproperty
     def workers_per_node(self) -> Union[int, float]:
         pass
+
+    def send_monitoring_info(self, status: Dict) -> None:
+        # Send monitoring info for HTEX when monitoring enabled
+        if self.monitoring_radio:
+            msg = self.create_monitoring_info(status)
+            logger.debug("Sending message {} to hub from job status poller".format(msg))
+            self.monitoring_radio.send((MessageType.BLOCK_INFO, msg))
+
+    def create_monitoring_info(self, status: Dict[str, JobStatus]) -> Sequence[object]:
+        """Create a monitoring message for each block based on the poll status.
+        """
+        msg = []
+        for bid, s in status.items():
+            d: Dict[str, Any] = {}
+            d['run_id'] = self.run_id
+            d['status'] = s.status_name
+            d['timestamp'] = datetime.datetime.now()
+            d['executor_label'] = self.label
+            d['job_id'] = self.blocks_to_job_id.get(bid, None)
+            d['block_id'] = bid
+            msg.append(d)
+        return msg
+
+    def poll_facade(self) -> None:
+        now = time.time()
+        if now >= self._last_poll_time + self.status_polling_interval:
+            previous_status = self._status
+            self._status = self.status()
+            self._last_poll_time = now
+            delta_status = {}
+            for block_id in self._status:
+                if block_id not in previous_status \
+                        or previous_status[block_id].state != self._status[block_id].state:
+                    delta_status[block_id] = self._status[block_id]
+
+            if delta_status:
+                self.send_monitoring_info(delta_status)
+
+    @property
+    def status_facade(self) -> Dict[str, JobStatus]:
+        """Return the status of all jobs/blocks of the executor of this poller.
+
+        :return: a dictionary mapping block ids (in string) to job status
+        """
+        return self._status
+
+    def scale_in_facade(self, n: int, max_idletime: Optional[float] = None) -> List[str]:
+
+        if max_idletime is None:
+            block_ids = self.scale_in(n)
+        else:
+            # This is a HighThroughputExecutor-specific interface violation.
+            # This code hopes, through pan-codebase reasoning, that this
+            # scale_in method really does come from HighThroughputExecutor,
+            # and so does have an extra max_idletime parameter not present
+            # in the executor interface.
+            block_ids = self.scale_in(n, max_idletime=max_idletime)  # type: ignore[call-arg]
+        if block_ids is not None:
+            new_status = {}
+            for block_id in block_ids:
+                new_status[block_id] = JobStatus(JobState.CANCELLED)
+                del self._status[block_id]
+            self.send_monitoring_info(new_status)
+        return block_ids
+
+    def scale_out_facade(self, n: int) -> List[str]:
+        block_ids = self.scale_out(n)
+        if block_ids is not None:
+            new_status = {}
+            for block_id in block_ids:
+                new_status[block_id] = JobStatus(JobState.PENDING)
+            self.send_monitoring_info(new_status)
+            self._status.update(new_status)
+        return block_ids
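Taken together, these facade methods give the job status poller one surface per executor: refresh status, read it, and scale. A simplified sketch of the loop they enable follows; the real JobStatusPoller also runs the scaling strategy and error handlers, so treat this as an assumption-laden outline rather than the actual poller code.

import time


def poll_loop(executors, period=5.0):
    # Hypothetical driver loop over BlockProviderExecutor instances.
    while True:
        for executor in executors:
            executor.poll_facade()            # re-poll the provider, send BLOCK_INFO deltas
            status = executor.status_facade   # {block_id: JobStatus}
            # A scaling strategy would inspect `status` here and call
            # executor.scale_out_facade(n) or executor.scale_in_facade(n).
        time.sleep(period)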
parsl/executors/taskvine/executor.py
CHANGED
@@ -596,7 +596,7 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return 1
 
-    def scale_in(self, count):
+    def scale_in(self, count: int) -> List[str]:
         """Scale in method. Cancel a given number of blocks
         """
         # Obtain list of blocks to kill
@@ -605,9 +605,14 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
 
         # Cancel the blocks provisioned
         if self.provider:
-
+            logger.info(f"Scaling in jobs: {kill_ids}")
+            r = self.provider.cancel(kill_ids)
+            job_ids = self._filter_scale_in_ids(kill_ids, r)
+            block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
+            return block_ids_killed
         else:
             logger.error("No execution provider available to scale")
+            return []
 
     def shutdown(self, *args, **kwargs):
         """Shutdown the executor. Sets flag to cancel the submit process and
parsl/executors/workqueue/executor.py
CHANGED
@@ -691,7 +691,7 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
     def workers_per_node(self) -> Union[int, float]:
         return 1
 
-    def scale_in(self, count):
+    def scale_in(self, count: int) -> List[str]:
         """Scale in method.
         """
         # Obtain list of blocks to kill
@@ -700,9 +700,14 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
 
         # Cancel the blocks provisioned
        if self.provider:
-
+            logger.info(f"Scaling in jobs: {kill_ids}")
+            r = self.provider.cancel(kill_ids)
+            job_ids = self._filter_scale_in_ids(kill_ids, r)
+            block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
+            return block_ids_killed
         else:
-            logger.error("No execution provider available to scale")
+            logger.error("No execution provider available to scale in")
+            return []
 
     def shutdown(self, *args, **kwargs):
         """Shutdown the executor. Sets flag to cancel the submit process and