parsl 2024.2.26__py3-none-any.whl → 2024.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/addresses.py +1 -1
- parsl/configs/ASPIRE1.py +1 -1
- parsl/configs/ad_hoc.py +1 -1
- parsl/configs/bridges.py +1 -1
- parsl/configs/cc_in2p3.py +1 -1
- parsl/configs/expanse.py +1 -1
- parsl/configs/frontera.py +1 -1
- parsl/configs/kubernetes.py +1 -1
- parsl/configs/midway.py +1 -1
- parsl/configs/osg.py +1 -1
- parsl/configs/stampede2.py +1 -1
- parsl/dataflow/dflow.py +11 -6
- parsl/dataflow/taskrecord.py +3 -1
- parsl/executors/high_throughput/executor.py +69 -37
- parsl/executors/high_throughput/interchange.py +78 -59
- parsl/executors/high_throughput/process_worker_pool.py +40 -28
- parsl/executors/taskvine/executor.py +3 -1
- parsl/executors/workqueue/executor.py +5 -2
- parsl/executors/workqueue/parsl_coprocess.py +107 -95
- parsl/jobs/job_status_poller.py +9 -3
- parsl/jobs/strategy.py +4 -3
- parsl/monitoring/db_manager.py +25 -5
- parsl/monitoring/monitoring.py +6 -2
- parsl/monitoring/remote.py +29 -0
- parsl/monitoring/visualization/models.py +7 -0
- parsl/providers/slurm/slurm.py +13 -2
- parsl/tests/configs/ad_hoc_cluster_htex.py +1 -1
- parsl/tests/configs/bluewaters.py +1 -1
- parsl/tests/configs/bridges.py +1 -1
- parsl/tests/configs/cc_in2p3.py +1 -1
- parsl/tests/configs/comet.py +1 -1
- parsl/tests/configs/frontera.py +1 -1
- parsl/tests/configs/midway.py +1 -1
- parsl/tests/configs/nscc_singapore.py +1 -1
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +1 -1
- parsl/tests/configs/summit.py +1 -1
- parsl/tests/configs/theta.py +1 -1
- parsl/tests/configs/user_opts.py +3 -1
- parsl/tests/manual_tests/test_ad_hoc_htex.py +1 -1
- parsl/tests/scaling_tests/htex_local.py +1 -1
- parsl/tests/sites/test_affinity.py +1 -1
- parsl/tests/sites/test_concurrent.py +1 -1
- parsl/tests/sites/test_dynamic_executor.py +1 -1
- parsl/tests/sites/test_worker_info.py +1 -1
- parsl/tests/test_htex/test_basic.py +1 -1
- parsl/tests/test_htex/test_connected_blocks.py +1 -1
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +1 -1
- parsl/tests/test_htex/test_disconnected_blocks.py +1 -1
- parsl/tests/test_htex/test_htex.py +13 -0
- parsl/tests/test_htex/test_manager_failure.py +1 -1
- parsl/tests/test_htex/test_missing_worker.py +1 -1
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +1 -1
- parsl/tests/test_htex/test_worker_failure.py +1 -1
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +1 -1
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +1 -1
- parsl/tests/test_mpi_apps/test_resource_spec.py +1 -1
- parsl/tests/test_scaling/test_scale_down.py +2 -2
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +159 -0
- parsl/usage_tracking/usage.py +5 -9
- parsl/version.py +1 -1
- parsl-2024.3.11.data/scripts/parsl_coprocess.py +166 -0
- {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/process_worker_pool.py +40 -28
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/METADATA +2 -2
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/RECORD +70 -70
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/WHEEL +1 -1
- parsl/configs/bluewaters.py +0 -28
- parsl-2024.2.26.data/scripts/parsl_coprocess.py +0 -154
- {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/LICENSE +0 -0
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/entry_points.txt +0 -0
- {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/top_level.txt +0 -0
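The long run of one-line configuration changes above is consistent with a single rename that runs through this release: the HighThroughputExecutor worker cap `max_workers` becomes `max_workers_per_node`, as seen in the `process_worker_pool.py` hunks below. A minimal before/after sketch of such a config migration, assuming a typical HighThroughputExecutor setup (the label and worker count here are illustrative, not taken from the diff):

# Hypothetical one-line config migration for parsl 2024.3.11.
from parsl.config import Config
from parsl.executors import HighThroughputExecutor

config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_local",        # illustrative label
            max_workers_per_node=4,    # was: max_workers=4 in 2024.2.26
        )
    ]
)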
parsl-2024.3.11.data/scripts/parsl_coprocess.py

@@ -0,0 +1,166 @@
+#!python
+
+import socket
+import json
+import os
+import sys
+
+# If enabled, coprocess will print to stdout
+debug_mode = False
+
+# Send a message on a binary I/O stream by sending the message length and then the (string) message.
+def send_message(stream, data):
+    size = len(data)
+    size_msg = "{}\n".format(size)
+    stream.write(size_msg)
+    stream.write(data)
+
+# Receive a standard message from a binary I/O stream by reading length and then returning the (string) message
+def recv_message(stream):
+    line = stream.readline()
+    length = int(line)
+    return stream.read(length)
+
+# Decorator for remotely execution functions to package things as json.
+def remote_execute(func):
+    def remote_wrapper(event):
+        kwargs = event["fn_kwargs"]
+        args = event["fn_args"]
+        try:
+            response = {
+                "Result": func(*args, **kwargs),
+                "StatusCode": 200
+            }
+        except Exception as e:
+            response = {
+                "Result": str(e),
+                "StatusCode": 500
+            }
+        return response
+    return remote_wrapper
+
+# Main loop of coprocess for executing network functions.
+def main():
+    # Listen on an arbitrary port to be reported to the worker.
+    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        s.bind(('localhost', 0))
+    except Exception as e:
+        s.close()
+        print(e, file=sys.stderr)
+        sys.exit(1)
+
+    # Inform the worker of name and port for later connection.
+    config = {
+        "name": name(),  # noqa: F821
+        "port": s.getsockname()[1],
+    }
+    send_message(sys.stdout, json.dumps(config))
+    sys.stdout.flush()
+
+    # Remember original working directory b/c we change for each invocation.
+    abs_working_dir = os.getcwd()
+
+    # Create pipe for communication with child process
+    rpipe, wpipe = os.pipe()
+    rpipestream = os.fdopen(rpipe, "r")
+
+    while True:
+        s.listen()
+        conn, addr = s.accept()
+        connstream = conn.makefile("rw", encoding="utf-8")
+
+        if debug_mode:
+            print('Network function: connection from {}'.format(addr), file=sys.stderr)
+
+        while True:
+            # Read the invocation header from the worker
+            line = connstream.readline()
+
+            # If end of file, then break out and accept again
+            if not line:
+                break
+
+            # Parse the invocation header.
+            input_spec = line.split()
+            function_name = input_spec[0]
+            task_id = int(input_spec[1])
+            event_size = int(input_spec[2])
+
+            # then read the contents of the event itself
+            event_str = connstream.read(event_size)
+            event = json.loads(event_str)
+            exec_method = event.get("remote_task_exec_method", None)
+
+            try:
+                # First move to target directory (is undone in finally block)
+                os.chdir(os.path.join(abs_working_dir, f't.{task_id}'))
+
+                # Then invoke function by desired method, resulting in
+                # response containing the text representation of the result.
+
+                if exec_method == "direct":
+                    response = json.dumps(globals()[function_name](event))
+                else:
+                    p = os.fork()
+                    if p == 0:
+                        response = globals()[function_name](event)
+                        wpipestream = os.fdopen(wpipe, "w")
+                        send_message(wpipestream, json.dumps(response))
+                        wpipestream.flush()
+                        os._exit(0)
+                    elif p < 0:
+                        if debug_mode:
+                            print(f'Network function: unable to fork to execute {function_name}', file=sys.stderr)
+                        response = {
+                            "Result": "unable to fork",
+                            "StatusCode": 500
+                        }
+                        response = json.dumps(response)
+                    else:
+                        # Get response string from child process.
+                        response = recv_message(rpipestream)
+                        # Wait for child process to complete
+                        os.waitpid(p, 0)
+
+                # At this point, response is set to a value one way or the other
+
+            except Exception as e:
+                if debug_mode:
+                    print("Network function encountered exception ", str(e), file=sys.stderr)
+                response = {
+                    'Result': f'network function encountered exception {e}',
+                    'Status Code': 500
+                }
+                response = json.dumps(response)
+            finally:
+                # Restore the working directory, no matter how the function ended.
+                os.chdir(abs_working_dir)
+
+            # Send response string back to parent worker process.
+            send_message(connstream, response)
+            connstream.flush()
+
+    return 0
+def name():
+    return 'parsl_coprocess'
+@remote_execute
+def run_parsl_task(a, b, c):
+    import parsl.executors.workqueue.exec_parsl_function as epf
+    try:
+        (map_file, function_file, result_file) = (a, b, c)
+        try:
+            (namespace, function_code, result_name) = epf.load_function(map_file, function_file)
+        except Exception:
+            raise
+        try:
+            result = epf.execute_function(namespace, function_code, result_name)
+        except Exception:
+            raise
+    except Exception:
+        result = RemoteExceptionWrapper(*sys.exc_info())
+    epf.dump_result_to_file(result_file, result)
+    return None
+if __name__ == "__main__":
+    main()
+
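Every stream the coprocess above talks over — stdout at registration, the worker socket, and the pipe from the forked child — uses the same framing: `send_message` writes the payload length on its own line and then the payload, and `recv_message` reverses that. A minimal round-trip sketch of this framing over an in-memory stream (the JSON payload is illustrative):

import io

# Framing as defined in parsl_coprocess.py above: a length line, then the payload.
def send_message(stream, data):
    stream.write("{}\n".format(len(data)))
    stream.write(data)

def recv_message(stream):
    length = int(stream.readline())
    return stream.read(length)

# Round-trip through an in-memory text stream.
buf = io.StringIO()
send_message(buf, '{"Result": 42, "StatusCode": 200}')
buf.seek(0)
assert recv_message(buf) == '{"Result": 42, "StatusCode": 200}'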
{parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/process_worker_pool.py

@@ -62,7 +62,7 @@ class Manager:
                  result_port,
                  cores_per_worker,
                  mem_per_worker,
-                 max_workers,
+                 max_workers_per_node,
                  prefetch_capacity,
                  uid,
                  block_id,

@@ -100,8 +100,8 @@ class Manager:
             the there's sufficient memory for each worker. If set to None, memory on node is not
             considered in the determination of workers to be launched on node by the manager.

-        max_workers : int | float
-            Caps the maximum number of workers that can be launched.
+        max_workers_per_node : int | float
+            Caps the maximum number of workers that can be launched.

         prefetch_capacity : int
             Number of tasks that could be prefetched over available worker capacity.

@@ -140,7 +140,9 @@ class Manager:
             Path to the certificate directory.
         """

-        logger.info("Manager started")
+        logger.info("Manager initializing")
+
+        self._start_time = time.time()

        try:
            ix_address = probe_addresses(addresses.split(','), task_port, timeout=address_probe_timeout)

@@ -188,15 +190,15 @@ class Manager:
         else:
             available_mem_on_node = round(psutil.virtual_memory().available / (2**30), 1)

-        self.max_workers = max_workers
+        self.max_workers_per_node = max_workers_per_node
         self.prefetch_capacity = prefetch_capacity

-        mem_slots = max_workers
+        mem_slots = max_workers_per_node
         # Avoid a divide by 0 error.
         if mem_per_worker and mem_per_worker > 0:
             mem_slots = math.floor(available_mem_on_node / mem_per_worker)

-        self.worker_count: int = min(max_workers,
+        self.worker_count: int = min(max_workers_per_node,
                                      mem_slots,
                                      math.floor(cores_on_node / cores_per_worker))

@@ -237,7 +239,8 @@ class Manager:
     def create_reg_message(self):
         """ Creates a registration message to identify the worker to the interchange
         """
-        msg = {'parsl_v': PARSL_VERSION,
+        msg = {'type': 'registration',
+               'parsl_v': PARSL_VERSION,
                'python_v': "{}.{}.{}".format(sys.version_info.major,
                                              sys.version_info.minor,
                                              sys.version_info.micro),

@@ -258,8 +261,9 @@ class Manager:
     def heartbeat_to_incoming(self):
         """ Send heartbeat to the incoming task queue
         """
-        heartbeat = (HEARTBEAT_CODE).to_bytes(4, "little")
-        self.task_incoming.send(heartbeat)
+        msg = {'type': 'heartbeat'}
+        b_msg = json.dumps(msg).encode('utf-8')
+        self.task_incoming.send(b_msg)
         logger.debug("Sent heartbeat")

     @wrap_with_logs

@@ -284,9 +288,17 @@ class Manager:
         last_interchange_contact = time.time()
         task_recv_counter = 0

-        poll_timer = self.poll_period
-
         while not kill_event.is_set():
+
+            # This loop will sit inside poller.poll until either a message
+            # arrives or one of these event times is reached. This code
+            # assumes that the event times won't change except on iteration
+            # of this loop - so will break if a different thread does
+            # anything to bring one of the event times earlier - and that the
+            # time here are correctly copy-pasted from the relevant if
+            # statements.
+            next_interesting_event_time = min(last_beat + self.heartbeat_period,
+                                              last_interchange_contact + self.heartbeat_threshold)
             try:
                 pending_task_count = self.pending_task_queue.qsize()
             except NotImplementedError:

@@ -296,14 +308,14 @@ class Manager:
             logger.debug("ready workers: {}, pending tasks: {}".format(self.ready_worker_count.value,
                                                                        pending_task_count))

-            if time.time() > last_beat + self.heartbeat_period:
+            if time.time() >= last_beat + self.heartbeat_period:
                 self.heartbeat_to_incoming()
                 last_beat = time.time()

-            socks = dict(poller.poll(timeout=poll_timer))
+            poll_duration_s = max(0, next_interesting_event_time - time.time())
+            socks = dict(poller.poll(timeout=poll_duration_s * 1000))

             if self.task_incoming in socks and socks[self.task_incoming] == zmq.POLLIN:
-                poll_timer = 0
                 _, pkl_msg = self.task_incoming.recv_multipart()
                 tasks = pickle.loads(pkl_msg)
                 last_interchange_contact = time.time()

@@ -320,14 +332,9 @@ class Manager:

             else:
                 logger.debug("No incoming tasks")
-                # Limit poll duration to heartbeat_period
-                # heartbeat_period is in s vs poll_timer in ms
-                if not poll_timer:
-                    poll_timer = self.poll_period
-                poll_timer = min(self.heartbeat_period * 1000, poll_timer * 2)

                 # Only check if no messages were received.
-                if time.time() > last_interchange_contact + self.heartbeat_threshold:
+                if time.time() >= last_interchange_contact + self.heartbeat_threshold:
                     logger.critical("Missing contact with interchange beyond heartbeat_threshold")
                     kill_event.set()
                     logger.critical("Exiting")

@@ -364,7 +371,8 @@ class Manager:
                 logger.exception("Got an exception: {}".format(e))

             if time.time() > last_result_beat + self.heartbeat_period:
-                logger.info("Sending heartbeat via results connection")
+                heartbeat_message = f"last_result_beat={last_result_beat} heartbeat_period={self.heartbeat_period} seconds"
+                logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
                 last_result_beat = time.time()
                 items.append(pickle.dumps({'type': 'heartbeat'}))

@@ -405,7 +413,9 @@ class Manager:
                     raise WorkerLost(worker_id, platform.node())
                 except Exception:
                     logger.info("Putting exception for executor task {} in the pending result queue".format(task['task_id']))
-                    result_package = {'type': 'result', 'task_id': task['task_id'], 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
+                    result_package = {'type': 'result',
+                                      'task_id': task['task_id'],
+                                      'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
                     pkl_package = pickle.dumps(result_package)
                     self.pending_result_queue.put(pkl_package)
             except KeyError:

@@ -452,7 +462,6 @@ class Manager:

         TODO: Move task receiving to a thread
         """
-        start = time.time()
         self._kill_event = threading.Event()
         self._tasks_in_progress = self._mp_manager.dict()

@@ -502,7 +511,7 @@ class Manager:
         self.task_incoming.close()
         self.result_outgoing.close()
         self.zmq_context.term()
-        delta = time.time() - start
+        delta = time.time() - self._start_time
         logger.info("process_worker_pool ran for {} seconds".format(delta))
         return

@@ -787,7 +796,7 @@ if __name__ == "__main__":
                         help="GB of memory assigned to each worker process. Default=0, no assignment")
     parser.add_argument("-t", "--task_port", required=True,
                         help="REQUIRED: Task port for receiving tasks from the interchange")
-    parser.add_argument("--max_workers", default=float('inf'),
+    parser.add_argument("--max_workers_per_node", default=float('inf'),
                         help="Caps the maximum workers that can be launched, default:infinity")
     parser.add_argument("-p", "--prefetch_capacity", default=0,
                         help="Number of tasks that can be prefetched to the manager. Default is 0.")

@@ -841,7 +850,7 @@ if __name__ == "__main__":
         logger.info("task_port: {}".format(args.task_port))
         logger.info("result_port: {}".format(args.result_port))
         logger.info("addresses: {}".format(args.addresses))
-        logger.info("max_workers: {}".format(args.max_workers))
+        logger.info("max_workers_per_node: {}".format(args.max_workers_per_node))
         logger.info("poll_period: {}".format(args.poll))
         logger.info("address_probe_timeout: {}".format(args.address_probe_timeout))
         logger.info("Prefetch capacity: {}".format(args.prefetch_capacity))

@@ -860,7 +869,10 @@ if __name__ == "__main__":
                           block_id=args.block_id,
                           cores_per_worker=float(args.cores_per_worker),
                           mem_per_worker=None if args.mem_per_worker == 'None' else float(args.mem_per_worker),
-                          max_workers=args.max_workers if args.max_workers == float('inf') else int(args.max_workers),
+                          max_workers_per_node=(
+                              args.max_workers_per_node if args.max_workers_per_node == float('inf')
+                              else int(args.max_workers_per_node)
+                          ),
                           prefetch_capacity=int(args.prefetch_capacity),
                           heartbeat_threshold=int(args.hb_threshold),
                           heartbeat_period=int(args.hb_period),
{parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/METADATA

@@ -1,9 +1,9 @@
 Metadata-Version: 2.1
 Name: parsl
-Version: 2024.2.26
+Version: 2024.3.11
 Summary: Simple data dependent workflows in Python
 Home-page: https://github.com/Parsl/parsl
-Download-URL: https://github.com/Parsl/parsl/archive/2024.02.26.tar.gz
+Download-URL: https://github.com/Parsl/parsl/archive/2024.03.11.tar.gz
 Author: The Parsl Team
 Author-email: parsl@googlegroups.com
 License: Apache 2.0