parsl 2024.2.26__py3-none-any.whl → 2024.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/addresses.py +1 -1
- parsl/configs/ASPIRE1.py +1 -1
- parsl/configs/ad_hoc.py +1 -1
- parsl/configs/bridges.py +1 -1
- parsl/configs/cc_in2p3.py +1 -1
- parsl/configs/expanse.py +1 -1
- parsl/configs/frontera.py +1 -1
- parsl/configs/kubernetes.py +1 -1
- parsl/configs/midway.py +1 -1
- parsl/configs/osg.py +1 -1
- parsl/configs/stampede2.py +1 -1
- parsl/dataflow/dflow.py +11 -6
- parsl/dataflow/taskrecord.py +3 -1
- parsl/executors/high_throughput/executor.py +69 -37
- parsl/executors/high_throughput/interchange.py +78 -59
- parsl/executors/high_throughput/process_worker_pool.py +40 -28
- parsl/executors/taskvine/executor.py +3 -1
- parsl/executors/workqueue/executor.py +5 -2
- parsl/executors/workqueue/parsl_coprocess.py +107 -95
- parsl/jobs/job_status_poller.py +9 -3
- parsl/jobs/strategy.py +4 -3
- parsl/monitoring/db_manager.py +25 -5
- parsl/monitoring/monitoring.py +6 -2
- parsl/monitoring/remote.py +29 -0
- parsl/monitoring/visualization/models.py +7 -0
- parsl/providers/slurm/slurm.py +13 -2
- parsl/tests/configs/ad_hoc_cluster_htex.py +1 -1
- parsl/tests/configs/bluewaters.py +1 -1
- parsl/tests/configs/bridges.py +1 -1
- parsl/tests/configs/cc_in2p3.py +1 -1
- parsl/tests/configs/comet.py +1 -1
- parsl/tests/configs/frontera.py +1 -1
- parsl/tests/configs/midway.py +1 -1
- parsl/tests/configs/nscc_singapore.py +1 -1
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +1 -1
- parsl/tests/configs/summit.py +1 -1
- parsl/tests/configs/theta.py +1 -1
- parsl/tests/configs/user_opts.py +3 -1
- parsl/tests/manual_tests/test_ad_hoc_htex.py +1 -1
- parsl/tests/scaling_tests/htex_local.py +1 -1
- parsl/tests/sites/test_affinity.py +1 -1
- parsl/tests/sites/test_concurrent.py +1 -1
- parsl/tests/sites/test_dynamic_executor.py +1 -1
- parsl/tests/sites/test_worker_info.py +1 -1
- parsl/tests/test_htex/test_basic.py +1 -1
- parsl/tests/test_htex/test_connected_blocks.py +1 -1
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +1 -1
- parsl/tests/test_htex/test_disconnected_blocks.py +1 -1
- parsl/tests/test_htex/test_htex.py +13 -0
- parsl/tests/test_htex/test_manager_failure.py +1 -1
- parsl/tests/test_htex/test_missing_worker.py +1 -1
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +1 -1
- parsl/tests/test_htex/test_worker_failure.py +1 -1
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +1 -1
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +1 -1
- parsl/tests/test_mpi_apps/test_resource_spec.py +1 -1
- parsl/tests/test_scaling/test_scale_down.py +2 -2
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +159 -0
- parsl/usage_tracking/usage.py +5 -9
- parsl/version.py +1 -1
- parsl-2024.3.11.data/scripts/parsl_coprocess.py +166 -0
- {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/process_worker_pool.py +40 -28
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/METADATA +2 -2
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/RECORD +70 -70
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/WHEEL +1 -1
- parsl/configs/bluewaters.py +0 -28
- parsl-2024.2.26.data/scripts/parsl_coprocess.py +0 -154
- {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/LICENSE +0 -0
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/entry_points.txt +0 -0
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/top_level.txt +0 -0
@@ -62,7 +62,7 @@ class Manager:
                  result_port,
                  cores_per_worker,
                  mem_per_worker,
-
+                 max_workers_per_node,
                  prefetch_capacity,
                  uid,
                  block_id,
@@ -100,8 +100,8 @@ class Manager:
         the there's sufficient memory for each worker. If set to None, memory on node is not
         considered in the determination of workers to be launched on node by the manager.

-
-
+        max_workers_per_node : int | float
+             Caps the maximum number of workers that can be launched.

         prefetch_capacity : int
              Number of tasks that could be prefetched over available worker capacity.
@@ -140,7 +140,9 @@ class Manager:
             Path to the certificate directory.
         """

-        logger.info("Manager
+        logger.info("Manager initializing")
+
+        self._start_time = time.time()

         try:
             ix_address = probe_addresses(addresses.split(','), task_port, timeout=address_probe_timeout)
@@ -188,15 +190,15 @@ class Manager:
         else:
             available_mem_on_node = round(psutil.virtual_memory().available / (2**30), 1)

-        self.
+        self.max_workers_per_node = max_workers_per_node
         self.prefetch_capacity = prefetch_capacity

-        mem_slots =
+        mem_slots = max_workers_per_node
         # Avoid a divide by 0 error.
         if mem_per_worker and mem_per_worker > 0:
             mem_slots = math.floor(available_mem_on_node / mem_per_worker)

-        self.worker_count: int = min(
+        self.worker_count: int = min(max_workers_per_node,
                                      mem_slots,
                                      math.floor(cores_on_node / cores_per_worker))

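Note: the last hunk above caps the manager's worker count by taking the minimum of the new max_workers_per_node limit, the number of memory-sized slots, and the number of core-sized slots. A worked example of that arithmetic, using made-up node numbers that are not part of this diff:

```python
import math

# Hypothetical node, for illustration only.
cores_on_node = 32
available_mem_on_node = 60.5      # GB free on the node
cores_per_worker = 1.0
mem_per_worker = 4.0              # GB requested per worker; None/0 disables this cap
max_workers_per_node = 24         # the new cap introduced in this release

mem_slots = max_workers_per_node
if mem_per_worker and mem_per_worker > 0:
    mem_slots = math.floor(available_mem_on_node / mem_per_worker)    # 15

worker_count = min(max_workers_per_node,                              # 24
                   mem_slots,                                         # 15
                   math.floor(cores_on_node / cores_per_worker))      # 32
print(worker_count)  # 15: memory is the binding constraint in this example
```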
@@ -237,7 +239,8 @@ class Manager:
     def create_reg_message(self):
         """ Creates a registration message to identify the worker to the interchange
         """
-        msg = {'
+        msg = {'type': 'registration',
+               'parsl_v': PARSL_VERSION,
                'python_v': "{}.{}.{}".format(sys.version_info.major,
                                              sys.version_info.minor,
                                              sys.version_info.micro),
@@ -258,8 +261,9 @@ class Manager:
     def heartbeat_to_incoming(self):
         """ Send heartbeat to the incoming task queue
         """
-
-
+        msg = {'type': 'heartbeat'}
+        b_msg = json.dumps(msg).encode('utf-8')
+        self.task_incoming.send(b_msg)
         logger.debug("Sent heartbeat")

     @wrap_with_logs
@@ -284,9 +288,17 @@ class Manager:
         last_interchange_contact = time.time()
         task_recv_counter = 0

-        poll_timer = self.poll_period
-
         while not kill_event.is_set():
+
+            # This loop will sit inside poller.poll until either a message
+            # arrives or one of these event times is reached. This code
+            # assumes that the event times won't change except on iteration
+            # of this loop - so will break if a different thread does
+            # anything to bring one of the event times earlier - and that the
+            # time here are correctly copy-pasted from the relevant if
+            # statements.
+            next_interesting_event_time = min(last_beat + self.heartbeat_period,
+                                              last_interchange_contact + self.heartbeat_threshold)
             try:
                 pending_task_count = self.pending_task_queue.qsize()
             except NotImplementedError:
@@ -296,14 +308,14 @@ class Manager:
             logger.debug("ready workers: {}, pending tasks: {}".format(self.ready_worker_count.value,
                                                                        pending_task_count))

-            if time.time()
+            if time.time() >= last_beat + self.heartbeat_period:
                 self.heartbeat_to_incoming()
                 last_beat = time.time()

-
+            poll_duration_s = max(0, next_interesting_event_time - time.time())
+            socks = dict(poller.poll(timeout=poll_duration_s * 1000))

             if self.task_incoming in socks and socks[self.task_incoming] == zmq.POLLIN:
-                poll_timer = 0
                 _, pkl_msg = self.task_incoming.recv_multipart()
                 tasks = pickle.loads(pkl_msg)
                 last_interchange_contact = time.time()
@@ -320,14 +332,9 @@ class Manager:

             else:
                 logger.debug("No incoming tasks")
-                # Limit poll duration to heartbeat_period
-                # heartbeat_period is in s vs poll_timer in ms
-                if not poll_timer:
-                    poll_timer = self.poll_period
-                poll_timer = min(self.heartbeat_period * 1000, poll_timer * 2)

             # Only check if no messages were received.
-            if time.time()
+            if time.time() >= last_interchange_contact + self.heartbeat_threshold:
                 logger.critical("Missing contact with interchange beyond heartbeat_threshold")
                 kill_event.set()
                 logger.critical("Exiting")
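Note: the loop above replaces the old exponential poll_timer backoff with a poll that sleeps only until the next interesting event, either the next outgoing heartbeat falling due or the interchange-silence threshold expiring. A minimal, self-contained sketch of that pattern (the endpoint and the short periods are illustrative, not parsl's defaults):

```python
import time
import zmq

heartbeat_period = 1        # seconds between outgoing heartbeats (illustrative)
heartbeat_threshold = 5     # seconds of silence tolerated before giving up (illustrative)

context = zmq.Context()
task_incoming = context.socket(zmq.DEALER)      # stand-in for the manager's task socket
task_incoming.connect("tcp://127.0.0.1:5555")   # nothing listens here in this sketch

poller = zmq.Poller()
poller.register(task_incoming, zmq.POLLIN)

last_beat = time.time()
last_interchange_contact = time.time()

while True:
    # Wake no later than the next heartbeat due, or the silence threshold expiring.
    next_event = min(last_beat + heartbeat_period,
                     last_interchange_contact + heartbeat_threshold)

    if time.time() >= last_beat + heartbeat_period:
        last_beat = time.time()          # a real manager sends its heartbeat here

    poll_duration_s = max(0, next_event - time.time())
    socks = dict(poller.poll(timeout=poll_duration_s * 1000))   # zmq timeouts are in ms

    if task_incoming in socks and socks[task_incoming] == zmq.POLLIN:
        last_interchange_contact = time.time()   # a real manager unpickles tasks here

    if time.time() >= last_interchange_contact + heartbeat_threshold:
        break    # no contact for too long: shut down, as the worker pool does

context.destroy()
```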
@@ -364,7 +371,8 @@ class Manager:
                 logger.exception("Got an exception: {}".format(e))

             if time.time() > last_result_beat + self.heartbeat_period:
-
+                heartbeat_message = f"last_result_beat={last_result_beat} heartbeat_period={self.heartbeat_period} seconds"
+                logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
                 last_result_beat = time.time()
                 items.append(pickle.dumps({'type': 'heartbeat'}))

@@ -405,7 +413,9 @@ class Manager:
                         raise WorkerLost(worker_id, platform.node())
                     except Exception:
                         logger.info("Putting exception for executor task {} in the pending result queue".format(task['task_id']))
-                        result_package = {'type': 'result',
+                        result_package = {'type': 'result',
+                                          'task_id': task['task_id'],
+                                          'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
                         pkl_package = pickle.dumps(result_package)
                         self.pending_result_queue.put(pkl_package)
                     except KeyError:
@@ -452,7 +462,6 @@ class Manager:

        TODO: Move task receiving to a thread
        """
-        start = time.time()
         self._kill_event = threading.Event()
         self._tasks_in_progress = self._mp_manager.dict()

@@ -502,7 +511,7 @@ class Manager:
         self.task_incoming.close()
         self.result_outgoing.close()
         self.zmq_context.term()
-        delta = time.time() -
+        delta = time.time() - self._start_time
         logger.info("process_worker_pool ran for {} seconds".format(delta))
         return

@@ -787,7 +796,7 @@ if __name__ == "__main__":
                         help="GB of memory assigned to each worker process. Default=0, no assignment")
     parser.add_argument("-t", "--task_port", required=True,
                         help="REQUIRED: Task port for receiving tasks from the interchange")
-    parser.add_argument("--
+    parser.add_argument("--max_workers_per_node", default=float('inf'),
                         help="Caps the maximum workers that can be launched, default:infinity")
     parser.add_argument("-p", "--prefetch_capacity", default=0,
                         help="Number of tasks that can be prefetched to the manager. Default is 0.")
@@ -841,7 +850,7 @@ if __name__ == "__main__":
         logger.info("task_port: {}".format(args.task_port))
         logger.info("result_port: {}".format(args.result_port))
         logger.info("addresses: {}".format(args.addresses))
-        logger.info("
+        logger.info("max_workers_per_node: {}".format(args.max_workers_per_node))
         logger.info("poll_period: {}".format(args.poll))
         logger.info("address_probe_timeout: {}".format(args.address_probe_timeout))
         logger.info("Prefetch capacity: {}".format(args.prefetch_capacity))
@@ -860,7 +869,10 @@ if __name__ == "__main__":
                           block_id=args.block_id,
                           cores_per_worker=float(args.cores_per_worker),
                           mem_per_worker=None if args.mem_per_worker == 'None' else float(args.mem_per_worker),
-
+                          max_workers_per_node=(
+                              args.max_workers_per_node if args.max_workers_per_node == float('inf')
+                              else int(args.max_workers_per_node)
+                          ),
                           prefetch_capacity=int(args.prefetch_capacity),
                           heartbeat_threshold=int(args.hb_threshold),
                           heartbeat_period=int(args.hb_period),
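Note: --max_workers_per_node defaults to float('inf'), and the value is only passed through int() when a finite value was actually supplied, because int(float('inf')) raises OverflowError. A small stand-alone sketch of that parsing pattern (the option name matches the diff; everything else here is illustrative):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--max_workers_per_node", default=float('inf'),
                    help="Caps the maximum workers that can be launched, default:infinity")

args = parser.parse_args(["--max_workers_per_node", "8"])

raw = args.max_workers_per_node
# When the option is omitted, raw is the float('inf') sentinel and is kept as-is;
# when supplied on the command line it arrives as a string, which int() converts.
max_workers_per_node = raw if raw == float('inf') else int(raw)
print(max_workers_per_node)   # 8
```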
parsl/executors/taskvine/executor.py
CHANGED
@@ -228,7 +228,9 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
         # factory logs go with manager logs regardless
         self.factory_config.scratch_dir = self.manager_config.vine_log_dir
         logger.debug(f"Function data directory: {self._function_data_dir}, log directory: {log_dir}")
-        logger.debug(
+        logger.debug(
+            f"TaskVine manager log directory: {self.manager_config.vine_log_dir}, "
+            f"factory log directory: {self.factory_config.scratch_dir}")

     def start(self):
         """Create submit process and collector thread to create, send, and
parsl/executors/workqueue/executor.py
CHANGED
@@ -61,8 +61,11 @@ logger = logging.getLogger(__name__)


 # Support structure to communicate parsl tasks to the work queue submit thread.
-ParslTaskToWq = namedtuple(
-
+ParslTaskToWq = namedtuple(
+    'ParslTaskToWq',
+    'id '
+    'category '
+    'cores memory disk gpus priority running_time_min env_pkg map_file function_file result_file input_files output_files')

 # Support structure to communicate final status of work queue tasks to parsl
 # if result_received is True:
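Note: the namedtuple field list above is a single string built from adjacent literals, which Python concatenates at compile time; the trailing spaces inside 'id ' and 'category ' are what keep the field names separated. A tiny check of that (the definition is copied from the diff, the usage is illustrative):

```python
from collections import namedtuple

ParslTaskToWq = namedtuple(
    'ParslTaskToWq',
    'id '
    'category '
    'cores memory disk gpus priority running_time_min env_pkg map_file function_file result_file input_files output_files')

print(ParslTaskToWq._fields[:4])   # ('id', 'category', 'cores', 'memory')
```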
parsl/executors/workqueue/parsl_coprocess.py
CHANGED
@@ -1,18 +1,29 @@
 #! /usr/bin/env python3

-import sys
-from parsl.app.errors import RemoteExceptionWrapper
-
 import socket
 import json
 import os
 import sys
-
-
+
+# If enabled, coprocess will print to stdout
+debug_mode = False
+
+# Send a message on a binary I/O stream by sending the message length and then the (string) message.
+def send_message(stream, data):
+    size = len(data)
+    size_msg = "{}\n".format(size)
+    stream.write(size_msg)
+    stream.write(data)
+
+# Receive a standard message from a binary I/O stream by reading length and then returning the (string) message
+def recv_message(stream):
+    line = stream.readline()
+    length = int(line)
+    return stream.read(length)
+
+# Decorator for remotely execution functions to package things as json.
 def remote_execute(func):
-    def remote_wrapper(event
-        if q:
-            event = json.loads(event)
+    def remote_wrapper(event):
         kwargs = event["fn_kwargs"]
         args = event["fn_args"]
         try:
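Note: send_message and recv_message above frame each message as a decimal length on its own line followed by exactly that many characters, so either end of a text stream can read one whole message without any other delimiter. A self-contained round trip over an in-memory stream (the two functions are copied from the diff; the io.StringIO usage and payload are only for illustration):

```python
import io
import json

def send_message(stream, data):
    size = len(data)
    size_msg = "{}\n".format(size)
    stream.write(size_msg)
    stream.write(data)

def recv_message(stream):
    line = stream.readline()
    length = int(line)
    return stream.read(length)

buf = io.StringIO()
send_message(buf, json.dumps({"name": "parsl_coprocess", "port": 12345}))
buf.seek(0)
print(json.loads(recv_message(buf)))   # {'name': 'parsl_coprocess', 'port': 12345}
```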
@@ -21,114 +32,115 @@ def remote_execute(func):
                 "StatusCode": 200
             }
         except Exception as e:
-                response = {
+            response = {
                 "Result": str(e),
-                    "StatusCode": 500
+                "StatusCode": 500
             }
-
-            return response
-            q.put(response)
+        return response
     return remote_wrapper
-
-
-def send_configuration(config):
-    config_string = json.dumps(config)
-    config_cmd = f"{len(config_string) + 1}\n{config_string}\n"
-    sys.stdout.write(config_cmd)
-    sys.stdout.flush()
+
+# Main loop of coprocess for executing network functions.
 def main():
+    # Listen on an arbitrary port to be reported to the worker.
     s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     try:
-        # modify the port argument to be 0 to listen on an arbitrary port
         s.bind(('localhost', 0))
     except Exception as e:
         s.close()
-        print(e)
-        exit(1)
-
+        print(e, file=sys.stderr)
+        sys.exit(1)
+
+    # Inform the worker of name and port for later connection.
     config = {
-
-
-
-
+        "name": name(), # noqa: F821
+        "port": s.getsockname()[1],
+    }
+    send_message(sys.stdout, json.dumps(config))
+    sys.stdout.flush()
+
+    # Remember original working directory b/c we change for each invocation.
     abs_working_dir = os.getcwd()
+
+    # Create pipe for communication with child process
+    rpipe, wpipe = os.pipe()
+    rpipestream = os.fdopen(rpipe, "r")
+
     while True:
         s.listen()
         conn, addr = s.accept()
-
+        connstream = conn.makefile("rw", encoding="utf-8")
+
+        if debug_mode:
+            print('Network function: connection from {}'.format(addr), file=sys.stderr)
+
         while True:
-            #
-
-
-
-            if
-
-
-
-
-
-
+            # Read the invocation header from the worker
+            line = connstream.readline()
+
+            # If end of file, then break out and accept again
+            if not line:
+                break
+
+            # Parse the invocation header.
+            input_spec = line.split()
+            function_name = input_spec[0]
+            task_id = int(input_spec[1])
+            event_size = int(input_spec[2])
+
+            # then read the contents of the event itself
+            event_str = connstream.read(event_size)
+            event = json.loads(event_str)
+            exec_method = event.get("remote_task_exec_method", None)
+
             try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                elif
-
+                # First move to target directory (is undone in finally block)
+                os.chdir(os.path.join(abs_working_dir, f't.{task_id}'))
+
+                # Then invoke function by desired method, resulting in
+                # response containing the text representation of the result.
+
+                if exec_method == "direct":
+                    response = json.dumps(globals()[function_name](event))
+                else:
+                    p = os.fork()
+                    if p == 0:
+                        response = globals()[function_name](event)
+                        wpipestream = os.fdopen(wpipe, "w")
+                        send_message(wpipestream, json.dumps(response))
+                        wpipestream.flush()
+                        os._exit(0)
+                    elif p < 0:
+                        if debug_mode:
+                            print(f'Network function: unable to fork to execute {function_name}', file=sys.stderr)
+                        response = {
+                            "Result": "unable to fork",
+                            "StatusCode": 500
+                        }
+                        response = json.dumps(response)
                     else:
-
-
-
-
-
-
-
-                        response = {
-                            "Result": "unable to fork",
-                            "StatusCode": 500
-                        }
-                    else:
-                        chunk = os.read(read, 65536).decode("utf-8")
-                        all_chunks = [chunk]
-                        while (len(chunk) >= 65536):
-                            chunk = os.read(read, 65536).decode("utf-8")
-                            all_chunks.append(chunk)
-                        response = "".join(all_chunks).encode("utf-8")
-                        os.waitid(os.P_PID, p, os.WEXITED)
-                response_size = len(response)
-                size_msg = "{}\n".format(response_size)
-                # send the size of response
-                conn.sendall(size_msg.encode('utf-8'))
-                # send response
-                conn.sendall(response)
-                break
+                        # Get response string from child process.
+                        response = recv_message(rpipestream)
+                        # Wait for child process to complete
+                        os.waitpid(p, 0)
+
+                # At this point, response is set to a value one way or the other
+
             except Exception as e:
-
+                if debug_mode:
+                    print("Network function encountered exception ", str(e), file=sys.stderr)
                 response = {
                     'Result': f'network function encountered exception {e}',
                     'Status Code': 500
                 }
-                response = json.dumps(response)
-                response_size = len(response)
-                size_msg = "{}\n".format(response_size)
-                # send the size of response
-                conn.sendall(size_msg.encode('utf-8'))
-                # send response
-                conn.sendall(response)
+                response = json.dumps(response)
             finally:
+                # Restore the working directory, no matter how the function ended.
                 os.chdir(abs_working_dir)
+
+                # Send response string back to parent worker process.
+                send_message(connstream, response)
+                connstream.flush()
+
     return 0
 def name():
     return 'parsl_coprocess'
@@ -136,9 +148,9 @@ def name():
 def run_parsl_task(a, b, c):
     import parsl.executors.workqueue.exec_parsl_function as epf
     try:
-        map_file, function_file, result_file = (a, b, c)
+        (map_file, function_file, result_file) = (a, b, c)
         try:
-            namespace, function_code, result_name = epf.load_function(map_file, function_file)
+            (namespace, function_code, result_name) = epf.load_function(map_file, function_file)
         except Exception:
             raise
         try:
@@ -150,5 +162,5 @@ def run_parsl_task(a, b, c):
             epf.dump_result_to_file(result_file, result)
     return None
 if __name__ == "__main__":
-
+    main()

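Note: in the non-"direct" path above, the coprocess forks; the child runs the requested function, writes a framed response into a pipe, and exits, while the parent reads the response back with recv_message and reaps the child with waitpid. A stripped-down, POSIX-only sketch of that fork/pipe/waitpid flow (the computed result stands in for the invoked function and is not from the diff):

```python
import json
import os

def send_message(stream, data):
    stream.write("{}\n".format(len(data)))
    stream.write(data)

def recv_message(stream):
    length = int(stream.readline())
    return stream.read(length)

rpipe, wpipe = os.pipe()
rpipestream = os.fdopen(rpipe, "r")

p = os.fork()
if p == 0:
    # Child: compute a result, frame it onto the pipe, exit without cleanup.
    result = {"Result": 42, "StatusCode": 200}      # stand-in for the invoked function
    wpipestream = os.fdopen(wpipe, "w")
    send_message(wpipestream, json.dumps(result))
    wpipestream.flush()
    os._exit(0)
else:
    # Parent: read the framed response, then reap the child.
    response = recv_message(rpipestream)
    os.waitpid(p, 0)
    print(json.loads(response))   # {'Result': 42, 'StatusCode': 200}
```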
parsl/jobs/job_status_poller.py
CHANGED
@@ -72,11 +72,17 @@ class PollItem:
     def executor(self) -> BlockProviderExecutor:
         return self._executor

-    def scale_in(self, n,
-
+    def scale_in(self, n, max_idletime=None):
+
+        if max_idletime is None:
             block_ids = self._executor.scale_in(n)
         else:
-
+            # This is a HighThroughputExecutor-specific interface violation.
+            # This code hopes, through pan-codebase reasoning, that this
+            # scale_in method really does come from HighThroughputExecutor,
+            # and so does have an extra max_idletime parameter not present
+            # in the executor interface.
+            block_ids = self._executor.scale_in(n, max_idletime=max_idletime)
         if block_ids is not None:
             new_status = {}
             for block_id in block_ids:
parsl/jobs/strategy.py
CHANGED
@@ -245,7 +245,8 @@ class Strategy:
                             exec_status.scale_in(active_blocks - min_blocks)

                         else:
-                            logger.debug(
+                            logger.debug(
+                                f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s for executor {label}; not scaling in")

                 # Case 2
                 # More tasks than the available slots.
@@ -288,8 +289,8 @@ class Strategy:
                         excess_slots = math.ceil(active_slots - (active_tasks * parallelism))
                         excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
                         excess_blocks = min(excess_blocks, active_blocks - min_blocks)
-                        logger.debug(f"Requesting scaling in by {excess_blocks} blocks")
-                        exec_status.scale_in(excess_blocks,
+                        logger.debug(f"Requesting scaling in by {excess_blocks} blocks with idle time {self.max_idletime}s")
+                        exec_status.scale_in(excess_blocks, max_idletime=self.max_idletime)
                     else:
                         logger.error("This strategy does not support scaling in except for HighThroughputExecutor - taking no action")
                 else:
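Note: the second hunk above works out how many whole blocks are surplus before asking the executor to scale in, and now forwards max_idletime so only sufficiently idle blocks are candidates. A worked example of the arithmetic with made-up numbers (not taken from the diff):

```python
import math

# Hypothetical snapshot of one HighThroughputExecutor, for illustration only.
active_blocks = 4
min_blocks = 1
nodes_per_block = 2
tasks_per_node = 8                                                   # worker slots per node
active_slots = active_blocks * nodes_per_block * tasks_per_node      # 64
active_tasks = 10
parallelism = 1.0

excess_slots = math.ceil(active_slots - (active_tasks * parallelism))                 # 54
excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))   # 4
excess_blocks = min(excess_blocks, active_blocks - min_blocks)                        # 3
print(excess_blocks)   # scale in by 3 blocks, subject to the max_idletime filter
```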
parsl/monitoring/db_manager.py
CHANGED
@@ -103,7 +103,13 @@ class Database:
     def rollback(self) -> None:
         self.session.rollback()

-    def _generate_mappings(
+    def _generate_mappings(
+        self,
+        table: Table,
+        columns: Optional[List[str]] = None,
+        messages: List[MonitoringMessage] = [],
+    ) -> List[Dict[str, Any]]:
+
         mappings = []
         for msg in messages:
             m = {}
@@ -250,6 +256,12 @@ class Database:
             'psutil_process_disk_write', Float, nullable=True)
         psutil_process_status = Column(
             'psutil_process_status', Text, nullable=True)
+        psutil_cpu_num = Column(
+            'psutil_cpu_num', Text, nullable=True)
+        psutil_process_num_ctx_switches_voluntary = Column(
+            'psutil_process_num_ctx_switches_voluntary', Float, nullable=True)
+        psutil_process_num_ctx_switches_involuntary = Column(
+            'psutil_process_num_ctx_switches_involuntary', Float, nullable=True)
         __table_args__ = (
             PrimaryKeyConstraint('try_id', 'task_id', 'run_id', 'timestamp'),
         )
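Note: the three new resource columns line up with the parsl/monitoring/remote.py change listed above (+29 lines, not shown in this excerpt), which presumably gathers the values. They correspond to per-process figures psutil can report; cpu_num() is only available on some platforms (for example Linux), and num_ctx_switches() returns a named tuple of voluntary and involuntary counts. A plain-psutil sketch of where such values come from:

```python
import psutil

p = psutil.Process()                  # the monitored process
sample = {
    'psutil_cpu_num': p.cpu_num(),    # CPU the process last ran on (platform-dependent)
    'psutil_process_num_ctx_switches_voluntary': p.num_ctx_switches().voluntary,
    'psutil_process_num_ctx_switches_involuntary': p.num_ctx_switches().involuntary,
}
print(sample)
```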
@@ -518,7 +530,10 @@ class DatabaseManager:
                         reprocessable_first_resource_messages.append(msg)
                     else:
                         if task_try_id in deferred_resource_messages:
-                            logger.error(
+                            logger.error(
+                                "Task {} already has a deferred resource message. "
+                                "Discarding previous message.".format(msg['task_id'])
+                            )
                         deferred_resource_messages[task_try_id] = msg
                 elif msg['last_msg']:
                     # This assumes that the primary key has been added
@@ -544,7 +559,10 @@ class DatabaseManager:
                 if reprocessable_last_resource_messages:
                     self._insert(table=STATUS, messages=reprocessable_last_resource_messages)
             except Exception:
-                logger.exception(
+                logger.exception(
+                    "Exception in db loop: this might have been a malformed message, "
+                    "or some other error. monitoring data may have been lost"
+                )
                 exception_happened = True
         if exception_happened:
             raise RuntimeError("An exception happened sometime during database processing and should have been logged in database_manager.log")
@@ -571,8 +589,10 @@ class DatabaseManager:
                 self._dispatch_to_internal(x)
             elif queue_tag == 'resource':
                 assert isinstance(x, tuple), "_migrate_logs_to_internal was expecting a tuple, got {}".format(x)
-                assert x[0] == MessageType.RESOURCE_INFO,
-                    "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue,
+                assert x[0] == MessageType.RESOURCE_INFO, (
+                    "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue, "
+                    "got tag {}, message {}".format(x[0], x)
+                )
                 self._dispatch_to_internal(x)
             elif queue_tag == 'node':
                 assert len(x) == 2, "expected message tuple to have exactly two elements"
parsl/monitoring/monitoring.py
CHANGED
@@ -290,8 +290,12 @@ class MonitoringHub(RepresentationMixin):
             self._dfk_channel.close()
         if exception_msgs:
             for exception_msg in exception_msgs:
-                self.logger.error(
-
+                self.logger.error(
+                    "{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(
+                        exception_msg[0],
+                        exception_msg[1]
+                    )
+                )
         self.router_proc.terminate()
         self.dbm_proc.terminate()
         self.filesystem_proc.terminate()