parsl 2024.3.4__py3-none-any.whl → 2024.3.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/addresses.py +3 -1
- parsl/config.py +4 -0
- parsl/dataflow/dflow.py +14 -7
- parsl/dataflow/taskrecord.py +3 -1
- parsl/executors/high_throughput/executor.py +34 -10
- parsl/executors/high_throughput/interchange.py +43 -10
- parsl/executors/high_throughput/manager_record.py +1 -0
- parsl/executors/high_throughput/process_worker_pool.py +48 -7
- parsl/executors/taskvine/executor.py +6 -3
- parsl/executors/taskvine/manager.py +1 -0
- parsl/executors/taskvine/manager_config.py +3 -4
- parsl/jobs/job_status_poller.py +4 -3
- parsl/jobs/strategy.py +2 -1
- parsl/launchers/launchers.py +6 -6
- parsl/log_utils.py +8 -4
- parsl/monitoring/db_manager.py +29 -7
- parsl/monitoring/monitoring.py +15 -54
- parsl/monitoring/remote.py +29 -0
- parsl/monitoring/visualization/models.py +7 -0
- parsl/monitoring/visualization/plots/default/workflow_plots.py +3 -0
- parsl/monitoring/visualization/views.py +2 -1
- parsl/providers/cluster_provider.py +1 -3
- parsl/providers/slurm/slurm.py +13 -2
- parsl/tests/configs/user_opts.py +5 -2
- parsl/tests/test_htex/test_drain.py +78 -0
- parsl/tests/test_monitoring/test_app_names.py +86 -0
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +3 -11
- parsl/usage_tracking/usage.py +5 -9
- parsl/utils.py +2 -2
- parsl/version.py +1 -1
- {parsl-2024.3.4.data → parsl-2024.3.18.data}/scripts/process_worker_pool.py +48 -7
- {parsl-2024.3.4.dist-info → parsl-2024.3.18.dist-info}/METADATA +2 -2
- {parsl-2024.3.4.dist-info → parsl-2024.3.18.dist-info}/RECORD +39 -38
- {parsl-2024.3.4.dist-info → parsl-2024.3.18.dist-info}/WHEEL +1 -1
- parsl/configs/bluewaters.py +0 -28
- {parsl-2024.3.4.data → parsl-2024.3.18.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.3.4.data → parsl-2024.3.18.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.3.4.dist-info → parsl-2024.3.18.dist-info}/LICENSE +0 -0
- {parsl-2024.3.4.dist-info → parsl-2024.3.18.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.4.dist-info → parsl-2024.3.18.dist-info}/top_level.txt +0 -0
parsl/launchers/launchers.py
CHANGED
@@ -8,16 +8,16 @@ logger = logging.getLogger(__name__)
 class SimpleLauncher(Launcher):
     """ Does no wrapping. Just returns the command as-is
     """
-    def
+    def __init__(self, debug: bool = True) -> None:
         super().__init__(debug=debug)
 
     def __call__(self, command: str, tasks_per_node: int, nodes_per_block: int) -> str:
-        """
-        Args:
-        - command (string): The command string to be launched
-        - task_block (string) : bash evaluated string.
 
-        """
+        if nodes_per_block > 1:
+            logger.warning('Simple Launcher only supports single node per block. '
+                           f'Requested nodes: {nodes_per_block}. '
+                           'You may be getting fewer workers than expected')
+
         return command
 
 
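The hunk above changes SimpleLauncher's behaviour as well as its constructor: a block that requests more than one node now logs a warning instead of silently under-provisioning. A minimal sketch of how that surfaces, assuming a local Python session with parsl installed; the command string is an arbitrary placeholder:

```python
from parsl.launchers import SimpleLauncher

# debug is the constructor flag shown in the hunk above; it defaults to True.
launcher = SimpleLauncher(debug=False)

# SimpleLauncher returns the command unchanged; with nodes_per_block > 1 it now
# also logs a warning that fewer workers than expected may start.
cmd = launcher("echo hello", tasks_per_node=1, nodes_per_block=2)
assert cmd == "echo hello"
```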
parsl/log_utils.py
CHANGED
@@ -28,7 +28,7 @@ DEFAULT_FORMAT = (
 def set_stream_logger(name: str = 'parsl',
                       level: int = logging.DEBUG,
                       format_string: Optional[str] = None,
-                      stream: Optional[io.TextIOWrapper] = None) ->
+                      stream: Optional[io.TextIOWrapper] = None) -> logging.Logger:
     """Add a stream log handler.
 
     Args:
@@ -39,7 +39,7 @@ def set_stream_logger(name: str = 'parsl',
             If not specified, the default stream for logging.StreamHandler is used.
 
     Returns:
-        -
+        - logger for specified name
     """
     if format_string is None:
         # format_string = "%(asctime)s %(name)s [%(levelname)s] Thread:%(thread)d %(message)s"
@@ -59,12 +59,14 @@ def set_stream_logger(name: str = 'parsl',
     futures_logger = logging.getLogger("concurrent.futures")
     futures_logger.addHandler(handler)
 
+    return logger
+
 
 @typeguard.typechecked
 def set_file_logger(filename: str,
                     name: str = 'parsl',
                     level: int = logging.DEBUG,
-                    format_string: Optional[str] = None) ->
+                    format_string: Optional[str] = None) -> logging.Logger:
     """Add a file log handler.
 
     Args:
@@ -74,7 +76,7 @@ def set_file_logger(filename: str,
        - format_string (string): Set the format string
 
     Returns:
-        -
+        - logger for specified name
     """
     if format_string is None:
         format_string = DEFAULT_FORMAT
@@ -91,3 +93,5 @@ def set_file_logger(filename: str,
     # concurrent.futures
     futures_logger = logging.getLogger("concurrent.futures")
     futures_logger.addHandler(handler)
+
+    return logger
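Both helpers now return the logger they configure instead of returning nothing. A short sketch of how a caller can use that, assuming parsl is installed and the log file path (an example name, not one this diff introduces) is writable:

```python
import logging

from parsl.log_utils import set_file_logger, set_stream_logger

# set_stream_logger now hands back the configured logger, so further handler or
# level tweaks do not need a second logging.getLogger() lookup.
logger = set_stream_logger(name="parsl", level=logging.INFO)
logger.info("stream handler attached")

# set_file_logger behaves the same way; "parsl.log" is just an illustrative path.
file_logger = set_file_logger("parsl.log", level=logging.DEBUG)
file_logger.debug("file handler attached")
```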
parsl/monitoring/db_manager.py
CHANGED
@@ -103,7 +103,13 @@ class Database:
     def rollback(self) -> None:
         self.session.rollback()
 
-    def _generate_mappings(
+    def _generate_mappings(
+            self,
+            table: Table,
+            columns: Optional[List[str]] = None,
+            messages: List[MonitoringMessage] = [],
+    ) -> List[Dict[str, Any]]:
+
         mappings = []
         for msg in messages:
             m = {}
@@ -250,6 +256,12 @@ class Database:
             'psutil_process_disk_write', Float, nullable=True)
         psutil_process_status = Column(
             'psutil_process_status', Text, nullable=True)
+        psutil_cpu_num = Column(
+            'psutil_cpu_num', Text, nullable=True)
+        psutil_process_num_ctx_switches_voluntary = Column(
+            'psutil_process_num_ctx_switches_voluntary', Float, nullable=True)
+        psutil_process_num_ctx_switches_involuntary = Column(
+            'psutil_process_num_ctx_switches_involuntary', Float, nullable=True)
         __table_args__ = (
             PrimaryKeyConstraint('try_id', 'task_id', 'run_id', 'timestamp'),
         )
@@ -518,7 +530,10 @@ class DatabaseManager:
                             reprocessable_first_resource_messages.append(msg)
                         else:
                             if task_try_id in deferred_resource_messages:
-                                logger.error(
+                                logger.error(
+                                    "Task {} already has a deferred resource message. "
+                                    "Discarding previous message.".format(msg['task_id'])
+                                )
                             deferred_resource_messages[task_try_id] = msg
                     elif msg['last_msg']:
                         # This assumes that the primary key has been added
@@ -544,7 +559,10 @@ class DatabaseManager:
                 if reprocessable_last_resource_messages:
                     self._insert(table=STATUS, messages=reprocessable_last_resource_messages)
             except Exception:
-                logger.exception(
+                logger.exception(
+                    "Exception in db loop: this might have been a malformed message, "
+                    "or some other error. monitoring data may have been lost"
+                )
                 exception_happened = True
         if exception_happened:
             raise RuntimeError("An exception happened sometime during database processing and should have been logged in database_manager.log")
@@ -571,8 +589,10 @@ class DatabaseManager:
                     self._dispatch_to_internal(x)
                 elif queue_tag == 'resource':
                     assert isinstance(x, tuple), "_migrate_logs_to_internal was expecting a tuple, got {}".format(x)
-                    assert x[0] == MessageType.RESOURCE_INFO,
-                    "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue,
+                    assert x[0] == MessageType.RESOURCE_INFO, (
+                        "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue, "
+                        "got tag {}, message {}".format(x[0], x)
+                    )
                     self._dispatch_to_internal(x)
                 elif queue_tag == 'node':
                     assert len(x) == 2, "expected message tuple to have exactly two elements"
@@ -613,7 +633,8 @@ class DatabaseManager:
                 # if retried - for example, the database being locked because someone else is readying
                 # the tables we are trying to write to. If that assumption is wrong, then this loop
                 # may go on forever.
-                logger.warning("Got a database OperationalError.
+                logger.warning("Got a database OperationalError. "
+                               "Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
                 self.db.rollback()
                 time.sleep(1)  # hard coded 1s wait - this should be configurable or exponential backoff or something
 
@@ -640,7 +661,8 @@ class DatabaseManager:
                 done = True
             except sa.exc.OperationalError as e:
                 # hoping that this is a database locked error during _update, not some other problem
-                logger.warning("Got a database OperationalError.
+                logger.warning("Got a database OperationalError. "
+                               "Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
                 self.db.rollback()
                 time.sleep(1)  # hard coded 1s wait - this should be configurable or exponential backoff or something
             except KeyboardInterrupt:
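The new columns land in the monitoring database alongside the existing psutil fields. A hedged sketch of inspecting them after a monitored run; the database path and the "resource" table name are assumptions about a typical sqlite-backed run, not something this diff fixes:

```python
import sqlite3

# runinfo/monitoring.db is the usual default location; "resource" is the table
# name assumed here for the Resource mapping shown in the hunk above.
conn = sqlite3.connect("runinfo/monitoring.db")
rows = conn.execute(
    "SELECT task_id, psutil_cpu_num, "
    "psutil_process_num_ctx_switches_voluntary, "
    "psutil_process_num_ctx_switches_involuntary "
    "FROM resource ORDER BY timestamp DESC LIMIT 5"
).fetchall()
for row in rows:
    print(row)
conn.close()
```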
parsl/monitoring/monitoring.py
CHANGED
@@ -15,6 +15,7 @@ import parsl.monitoring.remote
 from parsl.multiprocessing import ForkProcess, SizedQueue
 from multiprocessing import Process
 from multiprocessing.queues import Queue
+from parsl.log_utils import set_file_logger
 from parsl.utils import RepresentationMixin
 from parsl.process_loggers import wrap_with_logs
 from parsl.utils import setproctitle
@@ -38,40 +39,6 @@ else:
 logger = logging.getLogger(__name__)
 
 
-def start_file_logger(filename: str, name: str = 'monitoring', level: int = logging.DEBUG, format_string: Optional[str] = None) -> logging.Logger:
-    """Add a stream log handler.
-
-    Parameters
-    ---------
-
-    filename: string
-        Name of the file to write logs to. Required.
-    name: string
-        Logger name.
-    level: logging.LEVEL
-        Set the logging level. Default=logging.DEBUG
-    - format_string (string): Set the format string
-    format_string: string
-        Format string to use.
-
-    Returns
-    -------
-        None.
-    """
-    if format_string is None:
-        format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s] %(message)s"
-
-    logger = logging.getLogger(name)
-    logger.setLevel(level)
-    logger.propagate = False
-    handler = logging.FileHandler(filename)
-    handler.setLevel(level)
-    formatter = logging.Formatter(format_string, datefmt='%Y-%m-%d %H:%M:%S')
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    return logger
-
-
 @typeguard.typechecked
 class MonitoringHub(RepresentationMixin):
     def __init__(self,
@@ -79,9 +46,6 @@ class MonitoringHub(RepresentationMixin):
                  hub_port: Optional[int] = None,
                  hub_port_range: Tuple[int, int] = (55050, 56000),
 
-                 client_address: str = "127.0.0.1",
-                 client_port_range: Tuple[int, int] = (55000, 56000),
-
                  workflow_name: Optional[str] = None,
                  workflow_version: Optional[str] = None,
                  logging_endpoint: Optional[str] = None,
@@ -106,11 +70,6 @@ class MonitoringHub(RepresentationMixin):
             to deliver monitoring messages to the monitoring router.
             Note that despite the similar name, this is not related to hub_port.
             Default: (55050, 56000)
-        client_address : str
-            The ip address at which the dfk will be able to reach Hub. Default: "127.0.0.1"
-        client_port_range : tuple(int, int)
-            The MonitoringHub picks ports at random from the range which will be used by Hub.
-            Default: (55000, 56000)
         workflow_name : str
             The name for the workflow. Default to the name of the parsl script
         workflow_version : str
@@ -145,9 +104,6 @@ class MonitoringHub(RepresentationMixin):
        if _db_manager_excepts:
            raise _db_manager_excepts
 
-       self.client_address = client_address
-       self.client_port_range = client_port_range
-
        self.hub_address = hub_address
        self.hub_port = hub_port
        self.hub_port_range = hub_port_range
@@ -290,8 +246,12 @@ class MonitoringHub(RepresentationMixin):
            self._dfk_channel.close()
            if exception_msgs:
                for exception_msg in exception_msgs:
-                   self.logger.error(
-
+                   self.logger.error(
+                       "{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(
+                           exception_msg[0],
+                           exception_msg[1]
+                       )
+                   )
                self.router_proc.terminate()
                self.dbm_proc.terminate()
                self.filesystem_proc.terminate()
@@ -333,9 +293,9 @@ class MonitoringHub(RepresentationMixin):
 
 @wrap_with_logs
 def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
-    logger =
-
-
+    logger = set_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
+                             name="monitoring_filesystem_radio",
+                             level=logging.INFO)
 
     logger.info("Starting filesystem radio receiver")
     setproctitle("parsl: monitoring filesystem receiver")
@@ -401,9 +361,9 @@ class MonitoringRouter:
 
         """
         os.makedirs(logdir, exist_ok=True)
-        self.logger =
-
-
+        self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
+                                      name="monitoring_router",
+                                      level=logging_level)
        self.logger.debug("Monitoring router starting")
 
        self.hub_address = hub_address
@@ -489,7 +449,8 @@ class MonitoringRouter:
                    # but there is no verification that the message
                    # received from ic_channel.recv_pyobj() is actually
                    # of that type.
-                   self.logger.error(
+                   self.logger.error("Discarding message "  # type: ignore[unreachable]
+                                     f"from interchange with unknown type {msg[0].value}")
            except zmq.Again:
                pass
            except Exception:
parsl/monitoring/remote.py
CHANGED
@@ -201,6 +201,8 @@ def monitor(pid: int,
 
     children_user_time = {}  # type: Dict[int, float]
     children_system_time = {}  # type: Dict[int, float]
+    children_num_ctx_switches_voluntary = {}  # type: Dict[int, float]
+    children_num_ctx_switches_involuntary = {}  # type: Dict[int, float]
 
     def accumulate_and_prepare() -> Dict[str, Any]:
         d = {"psutil_process_" + str(k): v for k, v in pm.as_dict().items() if k in simple}
@@ -218,6 +220,15 @@ def monitor(pid: int,
         logging.debug("got children")
 
         d["psutil_cpu_count"] = psutil.cpu_count()
+
+        # note that this will be the CPU number of the base process, not anything launched by it
+        d["psutil_cpu_num"] = pm.cpu_num()
+
+        pctxsw = pm.num_ctx_switches()
+
+        d["psutil_process_num_ctx_switches_voluntary"] = pctxsw.voluntary
+        d["psutil_process_num_ctx_switches_involuntary"] = pctxsw.involuntary
+
         d['psutil_process_memory_virtual'] = pm.memory_info().vms
         d['psutil_process_memory_resident'] = pm.memory_info().rss
         d['psutil_process_time_user'] = pm.cpu_times().user
@@ -238,6 +249,11 @@ def monitor(pid: int,
                 child_system_time = child.cpu_times().system
                 children_user_time[child.pid] = child_user_time
                 children_system_time[child.pid] = child_system_time
+
+                pctxsw = child.num_ctx_switches()
+                children_num_ctx_switches_voluntary[child.pid] = pctxsw.voluntary
+                children_num_ctx_switches_involuntary[child.pid] = pctxsw.involuntary
+
                 d['psutil_process_memory_virtual'] += child.memory_info().vms
                 d['psutil_process_memory_resident'] += child.memory_info().rss
                 try:
@@ -248,14 +264,27 @@ def monitor(pid: int,
                     logging.exception("Exception reading IO counters for child {k}. Recorded IO usage may be incomplete".format(k=k), exc_info=True)
                     d['psutil_process_disk_write'] += 0
                     d['psutil_process_disk_read'] += 0
+
         total_children_user_time = 0.0
         for child_pid in children_user_time:
             total_children_user_time += children_user_time[child_pid]
+
         total_children_system_time = 0.0
         for child_pid in children_system_time:
             total_children_system_time += children_system_time[child_pid]
+
+        total_children_num_ctx_switches_voluntary = 0.0
+        for child_pid in children_num_ctx_switches_voluntary:
+            total_children_num_ctx_switches_voluntary += children_num_ctx_switches_voluntary[child_pid]
+
+        total_children_num_ctx_switches_involuntary = 0.0
+        for child_pid in children_num_ctx_switches_involuntary:
+            total_children_num_ctx_switches_involuntary += children_num_ctx_switches_involuntary[child_pid]
+
         d['psutil_process_time_user'] += total_children_user_time
         d['psutil_process_time_system'] += total_children_system_time
+        d['psutil_process_num_ctx_switches_voluntary'] += total_children_num_ctx_switches_voluntary
+        d['psutil_process_num_ctx_switches_involuntary'] += total_children_num_ctx_switches_involuntary
         logging.debug("sending message")
         return d
 
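The monitor now also records which CPU the monitored process last ran on and its context-switch counts, with child-process counts summed in. The underlying psutil calls, shown in isolation; note that psutil only exposes cpu_num() on some platforms (for example Linux), so this sketch is platform-dependent:

```python
import psutil

pm = psutil.Process()

# CPU the process most recently ran on (not available on Windows or macOS).
print("cpu_num:", pm.cpu_num())

# Named tuple of voluntary/involuntary counts, mirroring the new resource fields.
ctx = pm.num_ctx_switches()
print("voluntary:", ctx.voluntary, "involuntary:", ctx.involuntary)
```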
parsl/monitoring/visualization/models.py
CHANGED
@@ -102,5 +102,12 @@ class Resource(db.Model):
         'psutil_process_disk_write', db.Float, nullable=True)
     psutil_process_status = db.Column(
         'psutil_process_status', db.Text, nullable=True)
+    psutil_cpu_num = db.Column(
+        'psutil_cpu_num', db.Text, nullable=True)
+    psutil_process_num_ctx_switches_voluntary = db.Column(
+        'psutil_process_num_ctx_switches_voluntary', db.Float, nullable=True)
+    psutil_process_num_ctx_switches_involuntary = db.Column(
+        'psutil_process_num_ctx_switches_involuntary', db.Float, nullable=True)
+
     __table_args__ = (
         db.PrimaryKeyConstraint('task_id', 'run_id', 'timestamp'),)
parsl/monitoring/visualization/plots/default/workflow_plots.py
CHANGED
@@ -27,6 +27,9 @@ gantt_colors = {'unsched': 'rgb(240, 240, 240)',
 
 def task_gantt_plot(df_task, df_status, time_completed=None):
 
+    if df_task.empty:
+        return None
+
     # if the workflow is not recorded as completed, then assume
     # that tasks should continue in their last state until now,
     # rather than the workflow end time.
parsl/monitoring/visualization/views.py
CHANGED
@@ -8,7 +8,8 @@ from parsl.monitoring.visualization.models import Workflow, Task, Status, db
 
 from parsl.monitoring.visualization.plots.default.workflow_plots import task_gantt_plot, task_per_app_plot, workflow_dag_plot
 from parsl.monitoring.visualization.plots.default.task_plots import time_series_memory_per_task_plot
-from parsl.monitoring.visualization.plots.default.workflow_resource_plots import resource_distribution_plot,
+from parsl.monitoring.visualization.plots.default.workflow_resource_plots import (resource_distribution_plot,
+                                                                                  resource_efficiency, worker_efficiency)
 
 dummy = True
 
parsl/providers/cluster_provider.py
CHANGED
@@ -91,7 +91,7 @@ class ClusterProvider(ExecutionProvider):
              - configs (dict) : configs that get pushed into the template
 
         Returns:
-              -
+              - None
 
         Raises:
               SchedulerMissingArgs : If template is missing args
@@ -117,8 +117,6 @@ class ClusterProvider(ExecutionProvider):
             logger.error("Uncategorized error: %s", e)
             raise e
 
-        return True
-
     @abstractmethod
     def _status(self):
         pass
parsl/providers/slurm/slurm.py
CHANGED
@@ -280,11 +280,22 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
             else:
                 logger.error("Could not read job ID from submit command standard output.")
                 logger.error("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip())
-                raise SubmitException(
+                raise SubmitException(
+                    job_name,
+                    "Could not read job ID from submit command standard output",
+                    stdout=stdout,
+                    stderr=stderr,
+                    retcode=retcode
+                )
         else:
             logger.error("Submit command failed")
             logger.error("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip())
-            raise SubmitException(
+            raise SubmitException(
+                job_name, "Could not read job ID from submit command standard output",
+                stdout=stdout,
+                stderr=stderr,
+                retcode=retcode
+            )
 
     def cancel(self, job_ids):
         ''' Cancels the jobs specified by a list of job ids
parsl/tests/configs/user_opts.py
CHANGED
@@ -52,13 +52,16 @@ user_opts = {
     #     'username': MIDWAY_USERNAME,
     #     'script_dir': '/scratch/midway2/{}/parsl_scripts'.format(MIDWAY_USERNAME),
     #     'scheduler_options': "",
-    #     'worker_init': 'cd /scratch/midway2/{}/parsl_scripts;
+    #     'worker_init': 'cd /scratch/midway2/{}/parsl_scripts; '
+    #                    'module load Anaconda3/5.1.0; source activate parsl_testing;'
+    #                    .format(MIDWAY_USERNAME),
     # },
     # 'osg': {
     #     'username': OSG_USERNAME,
     #     'script_dir': '/home/{}/parsl_scripts'.format(OSG_USERNAME),
     #     'scheduler_options': "",
-    #     'worker_init' : 'module load python/3.5.2; python3 -m venv parsl_env;
+    #     'worker_init' : 'module load python/3.5.2; python3 -m venv parsl_env;
+    #                      source parsl_env/bin/activate; python3 -m pip install parsl==0.5.2'
     # },
     # 'swan': {
     #     'username': SWAN_USERNAME,
parsl/tests/test_htex/test_drain.py
ADDED
@@ -0,0 +1,78 @@
+import parsl
+import pytest
+import time
+
+from parsl.providers import LocalProvider
+from parsl.channels import LocalChannel
+from parsl.launchers import SimpleLauncher
+
+from parsl.config import Config
+from parsl.executors import HighThroughputExecutor
+
+# this constant is used to scale some durations that happen
+# based around the expected drain period: the drain period
+# is TIME_CONST seconds, and the single executed task will
+# last twice that many number of seconds.
+TIME_CONST = 1
+
+
+def local_config():
+    return Config(
+        executors=[
+            HighThroughputExecutor(
+                label="htex_local",
+                drain_period=TIME_CONST,
+                worker_debug=True,
+                cores_per_worker=1,
+                encrypted=True,
+                provider=LocalProvider(
+                    channel=LocalChannel(),
+                    init_blocks=1,
+                    min_blocks=0,
+                    max_blocks=0,
+                    launcher=SimpleLauncher(),
+                ),
+            )
+        ],
+        strategy='none',
+    )
+
+
+@parsl.python_app
+def f(n):
+    import time
+    time.sleep(n)
+
+
+@pytest.mark.local
+def test_drain(try_assert):
+
+    htex = parsl.dfk().executors['htex_local']
+
+    # wait till we have a block running...
+
+    try_assert(lambda: len(htex.connected_managers()) == 1)
+
+    managers = htex.connected_managers()
+    assert managers[0]['active'], "The manager should be active"
+    assert not managers[0]['draining'], "The manager should not be draining"
+
+    fut = f(TIME_CONST * 2)
+
+    time.sleep(TIME_CONST)
+
+    # this assert should happen *very fast* after the above delay...
+    try_assert(lambda: htex.connected_managers()[0]['draining'], timeout_ms=500)
+
+    # and the test task should still be running...
+    assert not fut.done(), "The test task should still be running"
+
+    fut.result()
+
+    # and now we should see the manager disappear...
+    # ... with strategy='none', this should be coming from draining but
+    # that information isn't immediately obvious from the absence in
+    # connected managers.
+    # As with the above draining assert, this should happen very fast after
+    # the task ends.
+    try_assert(lambda: len(htex.connected_managers()) == 0, timeout_ms=500)
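This new test exercises the worker-drain support added to the HighThroughputExecutor in this release (see the executor.py, interchange.py and process_worker_pool.py entries above). A hedged sketch of enabling it outside the test suite; the values below are illustrative, not defaults this diff sets:

```python
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.providers import LocalProvider

# drain_period is given in seconds (the test above treats TIME_CONST as seconds);
# after that long a worker pool stops accepting new tasks and shuts down once
# its running tasks have finished.
config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_local",
            drain_period=30 * 60,  # drain each block after roughly 30 minutes
            provider=LocalProvider(init_blocks=1),
        )
    ]
)
```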
parsl/tests/test_monitoring/test_app_names.py
ADDED
@@ -0,0 +1,86 @@
+"""Tests monitoring records app name under various decoration patterns.
+"""
+
+import os
+import parsl
+import pytest
+import time
+
+from parsl.tests.configs.htex_local_alternate import fresh_config
+
+
+@parsl.python_app
+def regular_decorated_app():
+    return 5
+
+
+@pytest.mark.local
+def get_regular_decorated_app():
+    return regular_decorated_app
+
+
+def for_decoration_later():
+    return 77
+
+
+def get_for_decoration_later():
+    return parsl.python_app(for_decoration_later)
+
+
+def get_decorated_closure():
+
+    r = 53
+
+    @parsl.python_app
+    def decorated_closure():
+        return r
+
+    return decorated_closure
+
+
+@pytest.mark.local
+@pytest.mark.parametrize("get_app,expected_name,expected_result",
+                         [(get_regular_decorated_app, "regular_decorated_app", 5),
+                          (get_for_decoration_later, "for_decoration_later", 77),
+                          (get_decorated_closure, "decorated_closure", 53)
+                          ])
+def test_app_name(get_app, expected_name, expected_result, tmpd_cwd):
+
+    # this is imported here rather than at module level because
+    # it isn't available in a plain parsl install, so this module
+    # would otherwise fail to import and break even a basic test
+    # run.
+    import sqlalchemy
+
+    c = fresh_config()
+    c.run_dir = tmpd_cwd
+    c.monitoring.logging_endpoint = f"sqlite:///{tmpd_cwd}/monitoring.db"
+    parsl.load(c)
+
+    app = get_app()
+    assert app().result() == expected_result
+
+    parsl.dfk().cleanup()
+    parsl.clear()
+
+    engine = sqlalchemy.create_engine(c.monitoring.logging_endpoint)
+    with engine.begin() as connection:
+
+        def count_rows(table: str):
+            result = connection.execute(f"SELECT COUNT(*) FROM {table}")
+            (c, ) = result.first()
+            return c
+
+        # one workflow...
+        assert count_rows("workflow") == 1
+
+        # ... with one task ...
+        assert count_rows("task") == 1
+
+        # ... that was tried once ...
+        assert count_rows("try") == 1
+
+        # ... and has the expected name.
+        result = connection.execute("SELECT task_func_name FROM task")
+        (c, ) = result.first()
+        assert c == expected_name
parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py
CHANGED
@@ -37,6 +37,7 @@ def local_config():
         ],
         max_idletime=0.5,
         strategy='htex_auto_scale',
+        strategy_period=0.1
     )
 
 
@@ -62,16 +63,6 @@ def waiting_app(ident: int, outputs=(), inputs=()):
 def test_scale_out(tmpd_cwd, try_assert):
     dfk = parsl.dfk()
 
-    # reconfigure scaling strategy to run faster than usual. This allows
-    # this test to complete faster - at time of writing 27s with default
-    # 5s strategy, vs XXXX with 0.5s strategy.
-
-    # check this attribute still exists, in the presence of ongoing
-    # development, so we have some belief that setting it will not be
-    # setting a now-ignored parameter.
-    assert hasattr(dfk.job_status_poller, 'interval')
-    dfk.job_status_poller.interval = 0.1
-
     num_managers = len(dfk.executors['htex_local'].connected_managers())
 
     assert num_managers == 0, "Expected 0 managers at start"
@@ -98,7 +89,8 @@ def test_scale_out(tmpd_cwd, try_assert):
 
     assert dfk.executors['htex_local'].outstanding == 0
 
-    # now we can launch one "long" task -
+    # now we can launch one "long" task -
+    # and what should happen is that the connected_managers count "eventually" (?) converges to 1 and stays there.
 
     finish_path = tmpd_cwd / "stage2_workers_may_continue"
 