parsl 2024.2.26__py3-none-any.whl → 2024.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. parsl/addresses.py +1 -1
  2. parsl/configs/ASPIRE1.py +1 -1
  3. parsl/configs/ad_hoc.py +1 -1
  4. parsl/configs/bridges.py +1 -1
  5. parsl/configs/cc_in2p3.py +1 -1
  6. parsl/configs/expanse.py +1 -1
  7. parsl/configs/frontera.py +1 -1
  8. parsl/configs/kubernetes.py +1 -1
  9. parsl/configs/midway.py +1 -1
  10. parsl/configs/osg.py +1 -1
  11. parsl/configs/stampede2.py +1 -1
  12. parsl/dataflow/dflow.py +11 -6
  13. parsl/dataflow/taskrecord.py +3 -1
  14. parsl/executors/high_throughput/executor.py +69 -37
  15. parsl/executors/high_throughput/interchange.py +78 -59
  16. parsl/executors/high_throughput/process_worker_pool.py +40 -28
  17. parsl/executors/taskvine/executor.py +3 -1
  18. parsl/executors/workqueue/executor.py +5 -2
  19. parsl/executors/workqueue/parsl_coprocess.py +107 -95
  20. parsl/jobs/job_status_poller.py +9 -3
  21. parsl/jobs/strategy.py +4 -3
  22. parsl/monitoring/db_manager.py +25 -5
  23. parsl/monitoring/monitoring.py +6 -2
  24. parsl/monitoring/remote.py +29 -0
  25. parsl/monitoring/visualization/models.py +7 -0
  26. parsl/providers/slurm/slurm.py +13 -2
  27. parsl/tests/configs/ad_hoc_cluster_htex.py +1 -1
  28. parsl/tests/configs/bluewaters.py +1 -1
  29. parsl/tests/configs/bridges.py +1 -1
  30. parsl/tests/configs/cc_in2p3.py +1 -1
  31. parsl/tests/configs/comet.py +1 -1
  32. parsl/tests/configs/frontera.py +1 -1
  33. parsl/tests/configs/midway.py +1 -1
  34. parsl/tests/configs/nscc_singapore.py +1 -1
  35. parsl/tests/configs/osg_htex.py +1 -1
  36. parsl/tests/configs/petrelkube.py +1 -1
  37. parsl/tests/configs/summit.py +1 -1
  38. parsl/tests/configs/theta.py +1 -1
  39. parsl/tests/configs/user_opts.py +3 -1
  40. parsl/tests/manual_tests/test_ad_hoc_htex.py +1 -1
  41. parsl/tests/scaling_tests/htex_local.py +1 -1
  42. parsl/tests/sites/test_affinity.py +1 -1
  43. parsl/tests/sites/test_concurrent.py +1 -1
  44. parsl/tests/sites/test_dynamic_executor.py +1 -1
  45. parsl/tests/sites/test_worker_info.py +1 -1
  46. parsl/tests/test_htex/test_basic.py +1 -1
  47. parsl/tests/test_htex/test_connected_blocks.py +1 -1
  48. parsl/tests/test_htex/test_cpu_affinity_explicit.py +1 -1
  49. parsl/tests/test_htex/test_disconnected_blocks.py +1 -1
  50. parsl/tests/test_htex/test_htex.py +13 -0
  51. parsl/tests/test_htex/test_manager_failure.py +1 -1
  52. parsl/tests/test_htex/test_missing_worker.py +1 -1
  53. parsl/tests/test_htex/test_multiple_disconnected_blocks.py +1 -1
  54. parsl/tests/test_htex/test_worker_failure.py +1 -1
  55. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +1 -1
  56. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +1 -1
  57. parsl/tests/test_mpi_apps/test_resource_spec.py +1 -1
  58. parsl/tests/test_scaling/test_scale_down.py +2 -2
  59. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +159 -0
  60. parsl/usage_tracking/usage.py +5 -9
  61. parsl/version.py +1 -1
  62. parsl-2024.3.11.data/scripts/parsl_coprocess.py +166 -0
  63. {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/process_worker_pool.py +40 -28
  64. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/METADATA +2 -2
  65. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/RECORD +70 -70
  66. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/WHEEL +1 -1
  67. parsl/configs/bluewaters.py +0 -28
  68. parsl-2024.2.26.data/scripts/parsl_coprocess.py +0 -154
  69. {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/exec_parsl_function.py +0 -0
  70. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/LICENSE +0 -0
  71. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/entry_points.txt +0 -0
  72. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,166 @@
+ #!python
+
+ import socket
+ import json
+ import os
+ import sys
+
+ # If enabled, coprocess will print to stdout
+ debug_mode = False
+
+ # Send a message on a binary I/O stream by sending the message length and then the (string) message.
+ def send_message(stream, data):
+     size = len(data)
+     size_msg = "{}\n".format(size)
+     stream.write(size_msg)
+     stream.write(data)
+
+ # Receive a standard message from a binary I/O stream by reading length and then returning the (string) message
+ def recv_message(stream):
+     line = stream.readline()
+     length = int(line)
+     return stream.read(length)
+
+ # Decorator for remotely execution functions to package things as json.
+ def remote_execute(func):
+     def remote_wrapper(event):
+         kwargs = event["fn_kwargs"]
+         args = event["fn_args"]
+         try:
+             response = {
+                 "Result": func(*args, **kwargs),
+                 "StatusCode": 200
+             }
+         except Exception as e:
+             response = {
+                 "Result": str(e),
+                 "StatusCode": 500
+             }
+         return response
+     return remote_wrapper
+
+ # Main loop of coprocess for executing network functions.
+ def main():
+     # Listen on an arbitrary port to be reported to the worker.
+     s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+     try:
+         s.bind(('localhost', 0))
+     except Exception as e:
+         s.close()
+         print(e, file=sys.stderr)
+         sys.exit(1)
+
+     # Inform the worker of name and port for later connection.
+     config = {
+         "name": name(), # noqa: F821
+         "port": s.getsockname()[1],
+     }
+     send_message(sys.stdout, json.dumps(config))
+     sys.stdout.flush()
+
+     # Remember original working directory b/c we change for each invocation.
+     abs_working_dir = os.getcwd()
+
+     # Create pipe for communication with child process
+     rpipe, wpipe = os.pipe()
+     rpipestream = os.fdopen(rpipe, "r")
+
+     while True:
+         s.listen()
+         conn, addr = s.accept()
+         connstream = conn.makefile("rw", encoding="utf-8")
+
+         if debug_mode:
+             print('Network function: connection from {}'.format(addr), file=sys.stderr)
+
+         while True:
+             # Read the invocation header from the worker
+             line = connstream.readline()
+
+             # If end of file, then break out and accept again
+             if not line:
+                 break
+
+             # Parse the invocation header.
+             input_spec = line.split()
+             function_name = input_spec[0]
+             task_id = int(input_spec[1])
+             event_size = int(input_spec[2])
+
+             # then read the contents of the event itself
+             event_str = connstream.read(event_size)
+             event = json.loads(event_str)
+             exec_method = event.get("remote_task_exec_method", None)
+
+             try:
+                 # First move to target directory (is undone in finally block)
+                 os.chdir(os.path.join(abs_working_dir, f't.{task_id}'))
+
+                 # Then invoke function by desired method, resulting in
+                 # response containing the text representation of the result.
+
+                 if exec_method == "direct":
+                     response = json.dumps(globals()[function_name](event))
+                 else:
+                     p = os.fork()
+                     if p == 0:
+                         response = globals()[function_name](event)
+                         wpipestream = os.fdopen(wpipe, "w")
+                         send_message(wpipestream, json.dumps(response))
+                         wpipestream.flush()
+                         os._exit(0)
+                     elif p < 0:
+                         if debug_mode:
+                             print(f'Network function: unable to fork to execute {function_name}', file=sys.stderr)
+                         response = {
+                             "Result": "unable to fork",
+                             "StatusCode": 500
+                         }
+                         response = json.dumps(response)
+                     else:
+                         # Get response string from child process.
+                         response = recv_message(rpipestream)
+                         # Wait for child process to complete
+                         os.waitpid(p, 0)
+
+                 # At this point, response is set to a value one way or the other
+
+             except Exception as e:
+                 if debug_mode:
+                     print("Network function encountered exception ", str(e), file=sys.stderr)
+                 response = {
+                     'Result': f'network function encountered exception {e}',
+                     'Status Code': 500
+                 }
+                 response = json.dumps(response)
+             finally:
+                 # Restore the working directory, no matter how the function ended.
+                 os.chdir(abs_working_dir)
+
+             # Send response string back to parent worker process.
+             send_message(connstream, response)
+             connstream.flush()
+
+     return 0
+ def name():
+     return 'parsl_coprocess'
+ @remote_execute
+ def run_parsl_task(a, b, c):
+     import parsl.executors.workqueue.exec_parsl_function as epf
+     try:
+         (map_file, function_file, result_file) = (a, b, c)
+         try:
+             (namespace, function_code, result_name) = epf.load_function(map_file, function_file)
+         except Exception:
+             raise
+         try:
+             result = epf.execute_function(namespace, function_code, result_name)
+         except Exception:
+             raise
+     except Exception:
+         result = RemoteExceptionWrapper(*sys.exc_info())
+     epf.dump_result_to_file(result_file, result)
+     return None
+ if __name__ == "__main__":
+     main()
+
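The coprocess above frames every message with a text length prefix (`send_message`/`recv_message`). As a quick illustration of that framing, here is a minimal round-trip over an in-memory stream; the two helpers are copied from the script above, and the sample event is invented for the example:

```python
import io
import json

def send_message(stream, data):
    # Write "<length>\n" followed by the payload string, as the coprocess does.
    stream.write("{}\n".format(len(data)))
    stream.write(data)

def recv_message(stream):
    # Read the length line first, then exactly that many characters.
    length = int(stream.readline())
    return stream.read(length)

# Round-trip an invented event through an in-memory text stream.
event = {"fn_args": ["map.pkl", "fn.pkl", "result.pkl"], "fn_kwargs": {}}
buf = io.StringIO()
send_message(buf, json.dumps(event))
buf.seek(0)
assert json.loads(recv_message(buf)) == event
```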
@@ -62,7 +62,7 @@ class Manager:
                   result_port,
                   cores_per_worker,
                   mem_per_worker,
-                  max_workers,
+                  max_workers_per_node,
                   prefetch_capacity,
                   uid,
                   block_id,
@@ -100,8 +100,8 @@ class Manager:
               the there's sufficient memory for each worker. If set to None, memory on node is not
               considered in the determination of workers to be launched on node by the manager.

-         max_workers : int
-              caps the maximum number of workers that can be launched.
+         max_workers_per_node : int | float
+              Caps the maximum number of workers that can be launched.

          prefetch_capacity : int
               Number of tasks that could be prefetched over available worker capacity.
@@ -140,7 +140,9 @@ class Manager:
               Path to the certificate directory.
          """

-         logger.info("Manager started")
+         logger.info("Manager initializing")
+
+         self._start_time = time.time()

          try:
              ix_address = probe_addresses(addresses.split(','), task_port, timeout=address_probe_timeout)
@@ -188,15 +190,15 @@ class Manager:
          else:
              available_mem_on_node = round(psutil.virtual_memory().available / (2**30), 1)

-         self.max_workers = max_workers
+         self.max_workers_per_node = max_workers_per_node
          self.prefetch_capacity = prefetch_capacity

-         mem_slots = max_workers
+         mem_slots = max_workers_per_node
          # Avoid a divide by 0 error.
          if mem_per_worker and mem_per_worker > 0:
              mem_slots = math.floor(available_mem_on_node / mem_per_worker)

-         self.worker_count: int = min(max_workers,
+         self.worker_count: int = min(max_workers_per_node,
                                       mem_slots,
                                       math.floor(cores_on_node / cores_per_worker))

@@ -237,7 +239,8 @@ class Manager:
      def create_reg_message(self):
          """ Creates a registration message to identify the worker to the interchange
          """
-         msg = {'parsl_v': PARSL_VERSION,
+         msg = {'type': 'registration',
+                'parsl_v': PARSL_VERSION,
                 'python_v': "{}.{}.{}".format(sys.version_info.major,
                                               sys.version_info.minor,
                                               sys.version_info.micro),
@@ -258,8 +261,9 @@ class Manager:
      def heartbeat_to_incoming(self):
          """ Send heartbeat to the incoming task queue
          """
-         heartbeat = (HEARTBEAT_CODE).to_bytes(4, "little")
-         self.task_incoming.send(heartbeat)
+         msg = {'type': 'heartbeat'}
+         b_msg = json.dumps(msg).encode('utf-8')
+         self.task_incoming.send(b_msg)
          logger.debug("Sent heartbeat")

      @wrap_with_logs
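This hunk replaces the raw 4-byte `HEARTBEAT_CODE` heartbeat with a JSON message carrying a `type` field, matching the `'type': 'registration'` addition above. A rough sketch of the two encodings and of how a receiver could dispatch on the new field; the `HEARTBEAT_CODE` value below is a placeholder for illustration, not the constant parsl actually uses:

```python
import json

HEARTBEAT_CODE = 0x00000001  # placeholder value, for illustration only

# Old framing: a bare little-endian integer sent on the task channel.
old_heartbeat = (HEARTBEAT_CODE).to_bytes(4, "little")

# New framing: a typed JSON object, so receivers can branch on 'type'.
new_heartbeat = json.dumps({'type': 'heartbeat'}).encode('utf-8')

def message_type(raw: bytes) -> str:
    # With the new framing, the receiver only needs to inspect 'type'.
    return json.loads(raw.decode('utf-8')).get('type', 'unknown')

assert message_type(new_heartbeat) == 'heartbeat'
```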
@@ -284,9 +288,17 @@ class Manager:
          last_interchange_contact = time.time()
          task_recv_counter = 0

-         poll_timer = self.poll_period
-
          while not kill_event.is_set():
+
+             # This loop will sit inside poller.poll until either a message
+             # arrives or one of these event times is reached. This code
+             # assumes that the event times won't change except on iteration
+             # of this loop - so will break if a different thread does
+             # anything to bring one of the event times earlier - and that the
+             # times here are correctly copy-pasted from the relevant if
+             # statements.
+             next_interesting_event_time = min(last_beat + self.heartbeat_period,
+                                               last_interchange_contact + self.heartbeat_threshold)
              try:
                  pending_task_count = self.pending_task_queue.qsize()
              except NotImplementedError:
@@ -296,14 +308,14 @@ class Manager:
              logger.debug("ready workers: {}, pending tasks: {}".format(self.ready_worker_count.value,
                                                                         pending_task_count))

-             if time.time() > last_beat + self.heartbeat_period:
+             if time.time() >= last_beat + self.heartbeat_period:
                  self.heartbeat_to_incoming()
                  last_beat = time.time()

-             socks = dict(poller.poll(timeout=poll_timer))
+             poll_duration_s = max(0, next_interesting_event_time - time.time())
+             socks = dict(poller.poll(timeout=poll_duration_s * 1000))

              if self.task_incoming in socks and socks[self.task_incoming] == zmq.POLLIN:
-                 poll_timer = 0
                  _, pkl_msg = self.task_incoming.recv_multipart()
                  tasks = pickle.loads(pkl_msg)
                  last_interchange_contact = time.time()
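These two hunks drop the adaptive `poll_timer` backoff and instead poll until the next deadline, whichever comes first: the next heartbeat send or the interchange-contact timeout. A standalone sketch of that timeout calculation, with invented example values:

```python
import time

def poll_timeout_ms(last_beat: float,
                    heartbeat_period: float,
                    last_interchange_contact: float,
                    heartbeat_threshold: float) -> float:
    # Poll until the earlier of the two deadlines, never for a negative duration.
    next_event = min(last_beat + heartbeat_period,
                     last_interchange_contact + heartbeat_threshold)
    return max(0, next_event - time.time()) * 1000

# Example with invented values: heartbeat every 30 s, contact threshold 120 s.
now = time.time()
print(poll_timeout_ms(now, 30, now, 120))  # ~30000 ms until the next heartbeat is due
```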
@@ -320,14 +332,9 @@ class Manager:

              else:
                  logger.debug("No incoming tasks")
-                 # Limit poll duration to heartbeat_period
-                 # heartbeat_period is in s vs poll_timer in ms
-                 if not poll_timer:
-                     poll_timer = self.poll_period
-                 poll_timer = min(self.heartbeat_period * 1000, poll_timer * 2)

                  # Only check if no messages were received.
-                 if time.time() > last_interchange_contact + self.heartbeat_threshold:
+                 if time.time() >= last_interchange_contact + self.heartbeat_threshold:
                      logger.critical("Missing contact with interchange beyond heartbeat_threshold")
                      kill_event.set()
                      logger.critical("Exiting")
@@ -364,7 +371,8 @@ class Manager:
                  logger.exception("Got an exception: {}".format(e))

              if time.time() > last_result_beat + self.heartbeat_period:
-                 logger.info(f"Sending heartbeat via results connection: last_result_beat={last_result_beat} heartbeat_period={self.heartbeat_period} seconds")
+                 heartbeat_message = f"last_result_beat={last_result_beat} heartbeat_period={self.heartbeat_period} seconds"
+                 logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
                  last_result_beat = time.time()
                  items.append(pickle.dumps({'type': 'heartbeat'}))

@@ -405,7 +413,9 @@ class Manager:
                              raise WorkerLost(worker_id, platform.node())
                          except Exception:
                              logger.info("Putting exception for executor task {} in the pending result queue".format(task['task_id']))
-                             result_package = {'type': 'result', 'task_id': task['task_id'], 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
+                             result_package = {'type': 'result',
+                                               'task_id': task['task_id'],
+                                               'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
                              pkl_package = pickle.dumps(result_package)
                              self.pending_result_queue.put(pkl_package)
                      except KeyError:
@@ -452,7 +462,6 @@ class Manager:

          TODO: Move task receiving to a thread
          """
-         start = time.time()
          self._kill_event = threading.Event()
          self._tasks_in_progress = self._mp_manager.dict()

@@ -502,7 +511,7 @@ class Manager:
          self.task_incoming.close()
          self.result_outgoing.close()
          self.zmq_context.term()
-         delta = time.time() - start
+         delta = time.time() - self._start_time
          logger.info("process_worker_pool ran for {} seconds".format(delta))
          return

@@ -787,7 +796,7 @@ if __name__ == "__main__":
                          help="GB of memory assigned to each worker process. Default=0, no assignment")
      parser.add_argument("-t", "--task_port", required=True,
                          help="REQUIRED: Task port for receiving tasks from the interchange")
-     parser.add_argument("--max_workers", default=float('inf'),
+     parser.add_argument("--max_workers_per_node", default=float('inf'),
                          help="Caps the maximum workers that can be launched, default:infinity")
      parser.add_argument("-p", "--prefetch_capacity", default=0,
                          help="Number of tasks that can be prefetched to the manager. Default is 0.")
@@ -841,7 +850,7 @@ if __name__ == "__main__":
      logger.info("task_port: {}".format(args.task_port))
      logger.info("result_port: {}".format(args.result_port))
      logger.info("addresses: {}".format(args.addresses))
-     logger.info("max_workers: {}".format(args.max_workers))
+     logger.info("max_workers_per_node: {}".format(args.max_workers_per_node))
      logger.info("poll_period: {}".format(args.poll))
      logger.info("address_probe_timeout: {}".format(args.address_probe_timeout))
      logger.info("Prefetch capacity: {}".format(args.prefetch_capacity))
@@ -860,7 +869,10 @@ if __name__ == "__main__":
                        block_id=args.block_id,
                        cores_per_worker=float(args.cores_per_worker),
                        mem_per_worker=None if args.mem_per_worker == 'None' else float(args.mem_per_worker),
-                       max_workers=args.max_workers if args.max_workers == float('inf') else int(args.max_workers),
+                       max_workers_per_node=(
+                           args.max_workers_per_node if args.max_workers_per_node == float('inf')
+                           else int(args.max_workers_per_node)
+                       ),
                        prefetch_capacity=int(args.prefetch_capacity),
                        heartbeat_threshold=int(args.hb_threshold),
                        heartbeat_period=int(args.hb_period),
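The `max_workers` → `max_workers_per_node` rename also shows up in the one-line edits to the `parsl/configs/*.py` files listed above. A minimal sketch of how a configuration would pass the renamed option; that `HighThroughputExecutor` accepts `max_workers_per_node` in 2024.3.11 is inferred from those config edits, not shown in this hunk:

```python
from parsl.config import Config
from parsl.executors import HighThroughputExecutor

# Hypothetical local configuration illustrating the renamed parameter.
config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_local",
            max_workers_per_node=4,  # formerly max_workers
        ),
    ],
)
```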
@@ -1,9 +1,9 @@
  Metadata-Version: 2.1
  Name: parsl
- Version: 2024.2.26
+ Version: 2024.3.11
  Summary: Simple data dependent workflows in Python
  Home-page: https://github.com/Parsl/parsl
- Download-URL: https://github.com/Parsl/parsl/archive/2024.02.26.tar.gz
+ Download-URL: https://github.com/Parsl/parsl/archive/2024.03.11.tar.gz
  Author: The Parsl Team
  Author-email: parsl@googlegroups.com
  License: Apache 2.0