parsl 2024.2.26__py3-none-any.whl → 2024.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. parsl/addresses.py +1 -1
  2. parsl/configs/ASPIRE1.py +1 -1
  3. parsl/configs/ad_hoc.py +1 -1
  4. parsl/configs/bridges.py +1 -1
  5. parsl/configs/cc_in2p3.py +1 -1
  6. parsl/configs/expanse.py +1 -1
  7. parsl/configs/frontera.py +1 -1
  8. parsl/configs/kubernetes.py +1 -1
  9. parsl/configs/midway.py +1 -1
  10. parsl/configs/osg.py +1 -1
  11. parsl/configs/stampede2.py +1 -1
  12. parsl/dataflow/dflow.py +11 -6
  13. parsl/dataflow/taskrecord.py +3 -1
  14. parsl/executors/high_throughput/executor.py +69 -37
  15. parsl/executors/high_throughput/interchange.py +78 -59
  16. parsl/executors/high_throughput/process_worker_pool.py +40 -28
  17. parsl/executors/taskvine/executor.py +3 -1
  18. parsl/executors/workqueue/executor.py +5 -2
  19. parsl/executors/workqueue/parsl_coprocess.py +107 -95
  20. parsl/jobs/job_status_poller.py +9 -3
  21. parsl/jobs/strategy.py +4 -3
  22. parsl/monitoring/db_manager.py +25 -5
  23. parsl/monitoring/monitoring.py +6 -2
  24. parsl/monitoring/remote.py +29 -0
  25. parsl/monitoring/visualization/models.py +7 -0
  26. parsl/providers/slurm/slurm.py +13 -2
  27. parsl/tests/configs/ad_hoc_cluster_htex.py +1 -1
  28. parsl/tests/configs/bluewaters.py +1 -1
  29. parsl/tests/configs/bridges.py +1 -1
  30. parsl/tests/configs/cc_in2p3.py +1 -1
  31. parsl/tests/configs/comet.py +1 -1
  32. parsl/tests/configs/frontera.py +1 -1
  33. parsl/tests/configs/midway.py +1 -1
  34. parsl/tests/configs/nscc_singapore.py +1 -1
  35. parsl/tests/configs/osg_htex.py +1 -1
  36. parsl/tests/configs/petrelkube.py +1 -1
  37. parsl/tests/configs/summit.py +1 -1
  38. parsl/tests/configs/theta.py +1 -1
  39. parsl/tests/configs/user_opts.py +3 -1
  40. parsl/tests/manual_tests/test_ad_hoc_htex.py +1 -1
  41. parsl/tests/scaling_tests/htex_local.py +1 -1
  42. parsl/tests/sites/test_affinity.py +1 -1
  43. parsl/tests/sites/test_concurrent.py +1 -1
  44. parsl/tests/sites/test_dynamic_executor.py +1 -1
  45. parsl/tests/sites/test_worker_info.py +1 -1
  46. parsl/tests/test_htex/test_basic.py +1 -1
  47. parsl/tests/test_htex/test_connected_blocks.py +1 -1
  48. parsl/tests/test_htex/test_cpu_affinity_explicit.py +1 -1
  49. parsl/tests/test_htex/test_disconnected_blocks.py +1 -1
  50. parsl/tests/test_htex/test_htex.py +13 -0
  51. parsl/tests/test_htex/test_manager_failure.py +1 -1
  52. parsl/tests/test_htex/test_missing_worker.py +1 -1
  53. parsl/tests/test_htex/test_multiple_disconnected_blocks.py +1 -1
  54. parsl/tests/test_htex/test_worker_failure.py +1 -1
  55. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +1 -1
  56. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +1 -1
  57. parsl/tests/test_mpi_apps/test_resource_spec.py +1 -1
  58. parsl/tests/test_scaling/test_scale_down.py +2 -2
  59. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +159 -0
  60. parsl/usage_tracking/usage.py +5 -9
  61. parsl/version.py +1 -1
  62. parsl-2024.3.11.data/scripts/parsl_coprocess.py +166 -0
  63. {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/process_worker_pool.py +40 -28
  64. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/METADATA +2 -2
  65. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/RECORD +70 -70
  66. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/WHEEL +1 -1
  67. parsl/configs/bluewaters.py +0 -28
  68. parsl-2024.2.26.data/scripts/parsl_coprocess.py +0 -154
  69. {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/exec_parsl_function.py +0 -0
  70. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/LICENSE +0 -0
  71. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/entry_points.txt +0 -0
  72. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/top_level.txt +0 -0
parsl/executors/high_throughput/process_worker_pool.py CHANGED
@@ -62,7 +62,7 @@ class Manager:
  result_port,
  cores_per_worker,
  mem_per_worker,
- max_workers,
+ max_workers_per_node,
  prefetch_capacity,
  uid,
  block_id,
@@ -100,8 +100,8 @@ class Manager:
  the there's sufficient memory for each worker. If set to None, memory on node is not
  considered in the determination of workers to be launched on node by the manager.

- max_workers : int
- caps the maximum number of workers that can be launched.
+ max_workers_per_node : int | float
+ Caps the maximum number of workers that can be launched.

  prefetch_capacity : int
  Number of tasks that could be prefetched over available worker capacity.
@@ -140,7 +140,9 @@ class Manager:
  Path to the certificate directory.
  """

- logger.info("Manager started")
+ logger.info("Manager initializing")
+
+ self._start_time = time.time()

  try:
  ix_address = probe_addresses(addresses.split(','), task_port, timeout=address_probe_timeout)
@@ -188,15 +190,15 @@ class Manager:
  else:
  available_mem_on_node = round(psutil.virtual_memory().available / (2**30), 1)

- self.max_workers = max_workers
+ self.max_workers_per_node = max_workers_per_node
  self.prefetch_capacity = prefetch_capacity

- mem_slots = max_workers
+ mem_slots = max_workers_per_node
  # Avoid a divide by 0 error.
  if mem_per_worker and mem_per_worker > 0:
  mem_slots = math.floor(available_mem_on_node / mem_per_worker)

- self.worker_count: int = min(max_workers,
+ self.worker_count: int = min(max_workers_per_node,
  mem_slots,
  math.floor(cores_on_node / cores_per_worker))

@@ -237,7 +239,8 @@ class Manager:
  def create_reg_message(self):
  """ Creates a registration message to identify the worker to the interchange
  """
- msg = {'parsl_v': PARSL_VERSION,
+ msg = {'type': 'registration',
+ 'parsl_v': PARSL_VERSION,
  'python_v': "{}.{}.{}".format(sys.version_info.major,
  sys.version_info.minor,
  sys.version_info.micro),
@@ -258,8 +261,9 @@ class Manager:
  def heartbeat_to_incoming(self):
  """ Send heartbeat to the incoming task queue
  """
- heartbeat = (HEARTBEAT_CODE).to_bytes(4, "little")
- self.task_incoming.send(heartbeat)
+ msg = {'type': 'heartbeat'}
+ b_msg = json.dumps(msg).encode('utf-8')
+ self.task_incoming.send(b_msg)
  logger.debug("Sent heartbeat")

  @wrap_with_logs
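Note: with this change every message the manager sends on the task channel is a JSON dict carrying a 'type' key ('registration' or 'heartbeat') instead of a bare 4-byte HEARTBEAT_CODE. A minimal sketch of how a receiver could dispatch on that field (the real dispatch lives in interchange.py; classify_manager_message is a hypothetical helper for illustration only):

import json

def classify_manager_message(raw: bytes) -> str:
    # Both registration and heartbeat messages are now typed JSON dicts.
    msg = json.loads(raw.decode('utf-8'))
    if msg.get('type') == 'heartbeat':
        return 'heartbeat'      # only liveness bookkeeping needed
    elif msg.get('type') == 'registration':
        return 'registration'   # record capacity, versions, block id
    else:
        raise ValueError("unknown message type: {}".format(msg.get('type')))

# The heartbeat sent by heartbeat_to_incoming() above is just {'type': 'heartbeat'}:
assert classify_manager_message(json.dumps({'type': 'heartbeat'}).encode('utf-8')) == 'heartbeat'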
@@ -284,9 +288,17 @@ class Manager:
  last_interchange_contact = time.time()
  task_recv_counter = 0

- poll_timer = self.poll_period
-
  while not kill_event.is_set():
+
+ # This loop will sit inside poller.poll until either a message
+ # arrives or one of these event times is reached. This code
+ # assumes that the event times won't change except on iteration
+ # of this loop - so will break if a different thread does
+ # anything to bring one of the event times earlier - and that the
+ # time here are correctly copy-pasted from the relevant if
+ # statements.
+ next_interesting_event_time = min(last_beat + self.heartbeat_period,
+ last_interchange_contact + self.heartbeat_threshold)
  try:
  pending_task_count = self.pending_task_queue.qsize()
  except NotImplementedError:
@@ -296,14 +308,14 @@ class Manager:
  logger.debug("ready workers: {}, pending tasks: {}".format(self.ready_worker_count.value,
  pending_task_count))

- if time.time() > last_beat + self.heartbeat_period:
+ if time.time() >= last_beat + self.heartbeat_period:
  self.heartbeat_to_incoming()
  last_beat = time.time()

- socks = dict(poller.poll(timeout=poll_timer))
+ poll_duration_s = max(0, next_interesting_event_time - time.time())
+ socks = dict(poller.poll(timeout=poll_duration_s * 1000))

  if self.task_incoming in socks and socks[self.task_incoming] == zmq.POLLIN:
- poll_timer = 0
  _, pkl_msg = self.task_incoming.recv_multipart()
  tasks = pickle.loads(pkl_msg)
  last_interchange_contact = time.time()
@@ -320,14 +332,9 @@ class Manager:

  else:
  logger.debug("No incoming tasks")
- # Limit poll duration to heartbeat_period
- # heartbeat_period is in s vs poll_timer in ms
- if not poll_timer:
- poll_timer = self.poll_period
- poll_timer = min(self.heartbeat_period * 1000, poll_timer * 2)

  # Only check if no messages were received.
- if time.time() > last_interchange_contact + self.heartbeat_threshold:
+ if time.time() >= last_interchange_contact + self.heartbeat_threshold:
  logger.critical("Missing contact with interchange beyond heartbeat_threshold")
  kill_event.set()
  logger.critical("Exiting")
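Note: the adaptive poll_timer backoff is replaced by a deadline computed from the next time-driven event: either the manager's own next heartbeat send, or the point at which missing interchange contact becomes fatal. A small standalone sketch of that computation, assuming the same variable meanings as the task-pulling loop above (the function name is illustrative, not parsl API):

import time

def next_poll_timeout_ms(last_beat, last_interchange_contact,
                         heartbeat_period, heartbeat_threshold):
    # The next moment something time-driven must happen.
    next_interesting_event_time = min(last_beat + heartbeat_period,
                                      last_interchange_contact + heartbeat_threshold)
    # Poll only until that deadline; clamp at zero so the timeout is never negative.
    poll_duration_s = max(0, next_interesting_event_time - time.time())
    return poll_duration_s * 1000   # zmq poller timeouts are in milliseconds

# With a 30s heartbeat period and 120s threshold, freshly updated timestamps
# yield a timeout of roughly 30000 ms.
now = time.time()
print(next_poll_timeout_ms(now, now, 30, 120))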
@@ -364,7 +371,8 @@ class Manager:
  logger.exception("Got an exception: {}".format(e))

  if time.time() > last_result_beat + self.heartbeat_period:
- logger.info(f"Sending heartbeat via results connection: last_result_beat={last_result_beat} heartbeat_period={self.heartbeat_period} seconds")
+ heartbeat_message = f"last_result_beat={last_result_beat} heartbeat_period={self.heartbeat_period} seconds"
+ logger.info(f"Sending heartbeat via results connection: {heartbeat_message}")
  last_result_beat = time.time()
  items.append(pickle.dumps({'type': 'heartbeat'}))

@@ -405,7 +413,9 @@ class Manager:
  raise WorkerLost(worker_id, platform.node())
  except Exception:
  logger.info("Putting exception for executor task {} in the pending result queue".format(task['task_id']))
- result_package = {'type': 'result', 'task_id': task['task_id'], 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
+ result_package = {'type': 'result',
+ 'task_id': task['task_id'],
+ 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
  pkl_package = pickle.dumps(result_package)
  self.pending_result_queue.put(pkl_package)
  except KeyError:
@@ -452,7 +462,6 @@ class Manager:

  TODO: Move task receiving to a thread
  """
- start = time.time()
  self._kill_event = threading.Event()
  self._tasks_in_progress = self._mp_manager.dict()

@@ -502,7 +511,7 @@ class Manager:
  self.task_incoming.close()
  self.result_outgoing.close()
  self.zmq_context.term()
- delta = time.time() - start
+ delta = time.time() - self._start_time
  logger.info("process_worker_pool ran for {} seconds".format(delta))
  return

@@ -787,7 +796,7 @@ if __name__ == "__main__":
  help="GB of memory assigned to each worker process. Default=0, no assignment")
  parser.add_argument("-t", "--task_port", required=True,
  help="REQUIRED: Task port for receiving tasks from the interchange")
- parser.add_argument("--max_workers", default=float('inf'),
+ parser.add_argument("--max_workers_per_node", default=float('inf'),
  help="Caps the maximum workers that can be launched, default:infinity")
  parser.add_argument("-p", "--prefetch_capacity", default=0,
  help="Number of tasks that can be prefetched to the manager. Default is 0.")
@@ -841,7 +850,7 @@ if __name__ == "__main__":
  logger.info("task_port: {}".format(args.task_port))
  logger.info("result_port: {}".format(args.result_port))
  logger.info("addresses: {}".format(args.addresses))
- logger.info("max_workers: {}".format(args.max_workers))
+ logger.info("max_workers_per_node: {}".format(args.max_workers_per_node))
  logger.info("poll_period: {}".format(args.poll))
  logger.info("address_probe_timeout: {}".format(args.address_probe_timeout))
  logger.info("Prefetch capacity: {}".format(args.prefetch_capacity))
@@ -860,7 +869,10 @@ if __name__ == "__main__":
  block_id=args.block_id,
  cores_per_worker=float(args.cores_per_worker),
  mem_per_worker=None if args.mem_per_worker == 'None' else float(args.mem_per_worker),
- max_workers=args.max_workers if args.max_workers == float('inf') else int(args.max_workers),
+ max_workers_per_node=(
+ args.max_workers_per_node if args.max_workers_per_node == float('inf')
+ else int(args.max_workers_per_node)
+ ),
  prefetch_capacity=int(args.prefetch_capacity),
  heartbeat_threshold=int(args.hb_threshold),
  heartbeat_period=int(args.hb_period),
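Note: the max_workers → max_workers_per_node rename in the worker pool CLI above matches the one-line edits in every bundled config file listed at the top of this diff, which suggests the HighThroughputExecutor keyword was renamed the same way. A hedged illustration of what a user-side config update would look like, assuming the executor accepts the new keyword in 2024.3.11 (the old spelling may remain as a deprecated alias; check the release notes before relying on it):

from parsl.config import Config
from parsl.executors import HighThroughputExecutor

config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_local",
            # previously: max_workers=4
            max_workers_per_node=4,
        )
    ]
)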
parsl/executors/taskvine/executor.py CHANGED
@@ -228,7 +228,9 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
  # factory logs go with manager logs regardless
  self.factory_config.scratch_dir = self.manager_config.vine_log_dir
  logger.debug(f"Function data directory: {self._function_data_dir}, log directory: {log_dir}")
- logger.debug(f"TaskVine manager log directory: {self.manager_config.vine_log_dir}, factory log directory: {self.factory_config.scratch_dir}")
+ logger.debug(
+ f"TaskVine manager log directory: {self.manager_config.vine_log_dir}, "
+ f"factory log directory: {self.factory_config.scratch_dir}")

  def start(self):
  """Create submit process and collector thread to create, send, and
parsl/executors/workqueue/executor.py CHANGED
@@ -61,8 +61,11 @@ logger = logging.getLogger(__name__)


  # Support structure to communicate parsl tasks to the work queue submit thread.
- ParslTaskToWq = namedtuple('ParslTaskToWq',
- 'id category cores memory disk gpus priority running_time_min env_pkg map_file function_file result_file input_files output_files')
+ ParslTaskToWq = namedtuple(
+ 'ParslTaskToWq',
+ 'id '
+ 'category '
+ 'cores memory disk gpus priority running_time_min env_pkg map_file function_file result_file input_files output_files')

  # Support structure to communicate final status of work queue tasks to parsl
  # if result_received is True:
parsl/executors/workqueue/parsl_coprocess.py CHANGED
@@ -1,18 +1,29 @@
  #! /usr/bin/env python3

- import sys
- from parsl.app.errors import RemoteExceptionWrapper
-
  import socket
  import json
  import os
  import sys
- import threading
- import queue
+
+ # If enabled, coprocess will print to stdout
+ debug_mode = False
+
+ # Send a message on a binary I/O stream by sending the message length and then the (string) message.
+ def send_message(stream, data):
+ size = len(data)
+ size_msg = "{}\n".format(size)
+ stream.write(size_msg)
+ stream.write(data)
+
+ # Receive a standard message from a binary I/O stream by reading length and then returning the (string) message
+ def recv_message(stream):
+ line = stream.readline()
+ length = int(line)
+ return stream.read(length)
+
+ # Decorator for remotely execution functions to package things as json.
  def remote_execute(func):
- def remote_wrapper(event, q=None):
- if q:
- event = json.loads(event)
+ def remote_wrapper(event):
  kwargs = event["fn_kwargs"]
  args = event["fn_args"]
  try:
@@ -21,114 +32,115 @@ def remote_execute(func):
  "StatusCode": 200
  }
  except Exception as e:
- response = {
+ response = {
  "Result": str(e),
- "StatusCode": 500
+ "StatusCode": 500
  }
- if not q:
- return response
- q.put(response)
+ return response
  return remote_wrapper
-
- read, write = os.pipe()
- def send_configuration(config):
- config_string = json.dumps(config)
- config_cmd = f"{len(config_string) + 1}\n{config_string}\n"
- sys.stdout.write(config_cmd)
- sys.stdout.flush()
+
+ # Main loop of coprocess for executing network functions.
  def main():
+ # Listen on an arbitrary port to be reported to the worker.
  s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  try:
- # modify the port argument to be 0 to listen on an arbitrary port
  s.bind(('localhost', 0))
  except Exception as e:
  s.close()
- print(e)
- exit(1)
- # information to print to stdout for worker
+ print(e, file=sys.stderr)
+ sys.exit(1)
+
+ # Inform the worker of name and port for later connection.
  config = {
- "name": name(),
- "port": s.getsockname()[1],
- }
- send_configuration(config)
+ "name": name(), # noqa: F821
+ "port": s.getsockname()[1],
+ }
+ send_message(sys.stdout, json.dumps(config))
+ sys.stdout.flush()
+
+ # Remember original working directory b/c we change for each invocation.
  abs_working_dir = os.getcwd()
+
+ # Create pipe for communication with child process
+ rpipe, wpipe = os.pipe()
+ rpipestream = os.fdopen(rpipe, "r")
+
  while True:
  s.listen()
  conn, addr = s.accept()
- print('Network function: connection from {}'.format(addr), file=sys.stderr)
+ connstream = conn.makefile("rw", encoding="utf-8")
+
+ if debug_mode:
+ print('Network function: connection from {}'.format(addr), file=sys.stderr)
+
  while True:
- # peek at message to find newline to get the size
- event_size = None
- line = conn.recv(100, socket.MSG_PEEK)
- eol = line.find(b'\n')
- if eol >= 0:
- size = eol+1
- # actually read the size of the event
- input_spec = conn.recv(size).decode('utf-8').split()
- function_name = input_spec[0]
- task_id = int(input_spec[1])
- event_size = int(input_spec[2])
+ # Read the invocation header from the worker
+ line = connstream.readline()
+
+ # If end of file, then break out and accept again
+ if not line:
+ break
+
+ # Parse the invocation header.
+ input_spec = line.split()
+ function_name = input_spec[0]
+ task_id = int(input_spec[1])
+ event_size = int(input_spec[2])
+
+ # then read the contents of the event itself
+ event_str = connstream.read(event_size)
+ event = json.loads(event_str)
+ exec_method = event.get("remote_task_exec_method", None)
+
  try:
- if event_size:
- # receive the bytes containing the event and turn it into a string
- event_str = conn.recv(event_size).decode("utf-8")
- # turn the event into a python dictionary
- event = json.loads(event_str)
- # see if the user specified an execution method
- exec_method = event.get("remote_task_exec_method", None)
- print('Network function: recieved event: {}'.format(event), file=sys.stderr)
- os.chdir(os.path.join(abs_working_dir, f't.{task_id}'))
- if exec_method == "thread":
- # create a forked process for function handler
- q = queue.Queue()
- p = threading.Thread(target=globals()[function_name], args=(event_str, q))
- p.start()
- p.join()
- response = json.dumps(q.get()).encode("utf-8")
- elif exec_method == "direct":
- response = json.dumps(globals()[function_name](event)).encode("utf-8")
+ # First move to target directory (is undone in finally block)
+ os.chdir(os.path.join(abs_working_dir, f't.{task_id}'))
+
+ # Then invoke function by desired method, resulting in
+ # response containing the text representation of the result.
+
+ if exec_method == "direct":
+ response = json.dumps(globals()[function_name](event))
+ else:
+ p = os.fork()
+ if p == 0:
+ response = globals()[function_name](event)
+ wpipestream = os.fdopen(wpipe, "w")
+ send_message(wpipestream, json.dumps(response))
+ wpipestream.flush()
+ os._exit(0)
+ elif p < 0:
+ if debug_mode:
+ print(f'Network function: unable to fork to execute {function_name}', file=sys.stderr)
+ response = {
+ "Result": "unable to fork",
+ "StatusCode": 500
+ }
+ response = json.dumps(response)
  else:
- p = os.fork()
- if p == 0:
- response =globals()[function_name](event)
- os.write(write, json.dumps(response).encode("utf-8"))
- os._exit(0)
- elif p < 0:
- print('Network function: unable to fork', file=sys.stderr)
- response = {
- "Result": "unable to fork",
- "StatusCode": 500
- }
- else:
- chunk = os.read(read, 65536).decode("utf-8")
- all_chunks = [chunk]
- while (len(chunk) >= 65536):
- chunk = os.read(read, 65536).decode("utf-8")
- all_chunks.append(chunk)
- response = "".join(all_chunks).encode("utf-8")
- os.waitid(os.P_PID, p, os.WEXITED)
- response_size = len(response)
- size_msg = "{}\n".format(response_size)
- # send the size of response
- conn.sendall(size_msg.encode('utf-8'))
- # send response
- conn.sendall(response)
- break
+ # Get response string from child process.
+ response = recv_message(rpipestream)
+ # Wait for child process to complete
+ os.waitpid(p, 0)
+
+ # At this point, response is set to a value one way or the other
+
  except Exception as e:
- print("Network function encountered exception ", str(e), file=sys.stderr)
+ if debug_mode:
+ print("Network function encountered exception ", str(e), file=sys.stderr)
  response = {
  'Result': f'network function encountered exception {e}',
  'Status Code': 500
  }
- response = json.dumps(response).encode('utf-8')
- response_size = len(response)
- size_msg = "{}\n".format(response_size)
- # send the size of response
- conn.sendall(size_msg.encode('utf-8'))
- # send response
- conn.sendall(response)
+ response = json.dumps(response)
  finally:
+ # Restore the working directory, no matter how the function ended.
  os.chdir(abs_working_dir)
+
+ # Send response string back to parent worker process.
+ send_message(connstream, response)
+ connstream.flush()
+
  return 0
  def name():
  return 'parsl_coprocess'
@@ -136,9 +148,9 @@ def name():
  def run_parsl_task(a, b, c):
  import parsl.executors.workqueue.exec_parsl_function as epf
  try:
- map_file, function_file, result_file = (a, b, c)
+ (map_file, function_file, result_file) = (a, b, c)
  try:
- namespace, function_code, result_name = epf.load_function(map_file, function_file)
+ (namespace, function_code, result_name) = epf.load_function(map_file, function_file)
  except Exception:
  raise
  try:
@@ -150,5 +162,5 @@ def run_parsl_task(a, b, c):
  epf.dump_result_to_file(result_file, result)
  return None
  if __name__ == "__main__":
- main()
+ main()

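Note: the coprocess now uses one length-prefixed framing scheme everywhere it exchanges text: on stdout when reporting its name and port, on the per-connection socket stream, and on the pipe between the forked child and its parent. send_message writes a decimal length followed by a newline and then exactly that many characters; recv_message reads them back. An illustrative round-trip over an in-memory text stream, using the two helpers exactly as defined in the diff above (the payload values are made up):

import io
import json

def send_message(stream, data):
    # Write "<length>\n" followed by the payload itself.
    stream.write("{}\n".format(len(data)))
    stream.write(data)

def recv_message(stream):
    # Read the length line, then exactly that many characters of payload.
    length = int(stream.readline())
    return stream.read(length)

buf = io.StringIO()
send_message(buf, json.dumps({"name": "parsl_coprocess", "port": 9000}))
buf.seek(0)
print(json.loads(recv_message(buf)))   # {'name': 'parsl_coprocess', 'port': 9000}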
parsl/jobs/job_status_poller.py CHANGED
@@ -72,11 +72,17 @@ class PollItem:
  def executor(self) -> BlockProviderExecutor:
  return self._executor

- def scale_in(self, n, force=True, max_idletime=None):
- if force and not max_idletime:
+ def scale_in(self, n, max_idletime=None):
+
+ if max_idletime is None:
  block_ids = self._executor.scale_in(n)
  else:
- block_ids = self._executor.scale_in(n, force=force, max_idletime=max_idletime)
+ # This is a HighThroughputExecutor-specific interface violation.
+ # This code hopes, through pan-codebase reasoning, that this
+ # scale_in method really does come from HighThroughputExecutor,
+ # and so does have an extra max_idletime parameter not present
+ # in the executor interface.
+ block_ids = self._executor.scale_in(n, max_idletime=max_idletime)
  if block_ids is not None:
  new_status = {}
  for block_id in block_ids:
parsl/jobs/strategy.py CHANGED
@@ -245,7 +245,8 @@ class Strategy:
  exec_status.scale_in(active_blocks - min_blocks)

  else:
- logger.debug(f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s for executor {label}; not scaling in")
+ logger.debug(
+ f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s for executor {label}; not scaling in")

  # Case 2
  # More tasks than the available slots.
@@ -288,8 +289,8 @@ class Strategy:
  excess_slots = math.ceil(active_slots - (active_tasks * parallelism))
  excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
  excess_blocks = min(excess_blocks, active_blocks - min_blocks)
- logger.debug(f"Requesting scaling in by {excess_blocks} blocks")
- exec_status.scale_in(excess_blocks, force=False, max_idletime=self.max_idletime)
+ logger.debug(f"Requesting scaling in by {excess_blocks} blocks with idle time {self.max_idletime}s")
+ exec_status.scale_in(excess_blocks, max_idletime=self.max_idletime)
  else:
  logger.error("This strategy does not support scaling in except for HighThroughputExecutor - taking no action")
  else:
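Note: together, the job_status_poller.py and strategy.py changes drop the force= flag from the scale-in path; the caller now signals intent with max_idletime alone. A hedged sketch of the resulting dispatch, mirroring PollItem.scale_in above (scale_in_blocks is a hypothetical wrapper, not parsl API):

def scale_in_blocks(executor, n, max_idletime=None):
    if max_idletime is None:
        # Generic BlockProviderExecutor path: remove n blocks unconditionally.
        return executor.scale_in(n)
    else:
        # HighThroughputExecutor-specific path: only blocks idle for at least
        # max_idletime seconds are candidates for removal.
        return executor.scale_in(n, max_idletime=max_idletime)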
parsl/monitoring/db_manager.py CHANGED
@@ -103,7 +103,13 @@ class Database:
  def rollback(self) -> None:
  self.session.rollback()

- def _generate_mappings(self, table: Table, columns: Optional[List[str]] = None, messages: List[MonitoringMessage] = []) -> List[Dict[str, Any]]:
+ def _generate_mappings(
+ self,
+ table: Table,
+ columns: Optional[List[str]] = None,
+ messages: List[MonitoringMessage] = [],
+ ) -> List[Dict[str, Any]]:
+
  mappings = []
  for msg in messages:
  m = {}
@@ -250,6 +256,12 @@ class Database:
  'psutil_process_disk_write', Float, nullable=True)
  psutil_process_status = Column(
  'psutil_process_status', Text, nullable=True)
+ psutil_cpu_num = Column(
+ 'psutil_cpu_num', Text, nullable=True)
+ psutil_process_num_ctx_switches_voluntary = Column(
+ 'psutil_process_num_ctx_switches_voluntary', Float, nullable=True)
+ psutil_process_num_ctx_switches_involuntary = Column(
+ 'psutil_process_num_ctx_switches_involuntary', Float, nullable=True)
  __table_args__ = (
  PrimaryKeyConstraint('try_id', 'task_id', 'run_id', 'timestamp'),
  )
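Note: the three new resource columns (together with the parsl/monitoring/remote.py and visualization/models.py changes listed at the top) extend per-try resource records with the CPU the process last ran on and its context-switch counts. A hedged sketch of how such values could be sampled with psutil, assuming the column names map directly onto psutil calls (the actual collection code lives in parsl/monitoring/remote.py and may differ; cpu_num() is not available on every platform):

import os
import psutil

def sample_new_fields(pid: int) -> dict:
    p = psutil.Process(pid)
    ctx = p.num_ctx_switches()
    return {
        'psutil_cpu_num': p.cpu_num(),  # CPU the process is currently running on
        'psutil_process_num_ctx_switches_voluntary': ctx.voluntary,
        'psutil_process_num_ctx_switches_involuntary': ctx.involuntary,
    }

print(sample_new_fields(os.getpid()))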
@@ -518,7 +530,10 @@ class DatabaseManager:
  reprocessable_first_resource_messages.append(msg)
  else:
  if task_try_id in deferred_resource_messages:
- logger.error("Task {} already has a deferred resource message. Discarding previous message.".format(msg['task_id']))
+ logger.error(
+ "Task {} already has a deferred resource message. "
+ "Discarding previous message.".format(msg['task_id'])
+ )
  deferred_resource_messages[task_try_id] = msg
  elif msg['last_msg']:
  # This assumes that the primary key has been added
@@ -544,7 +559,10 @@ class DatabaseManager:
  if reprocessable_last_resource_messages:
  self._insert(table=STATUS, messages=reprocessable_last_resource_messages)
  except Exception:
- logger.exception("Exception in db loop: this might have been a malformed message, or some other error. monitoring data may have been lost")
+ logger.exception(
+ "Exception in db loop: this might have been a malformed message, "
+ "or some other error. monitoring data may have been lost"
+ )
  exception_happened = True
  if exception_happened:
  raise RuntimeError("An exception happened sometime during database processing and should have been logged in database_manager.log")
@@ -571,8 +589,10 @@ class DatabaseManager:
  self._dispatch_to_internal(x)
  elif queue_tag == 'resource':
  assert isinstance(x, tuple), "_migrate_logs_to_internal was expecting a tuple, got {}".format(x)
- assert x[0] == MessageType.RESOURCE_INFO, \
- "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue, got tag {}, message {}".format(x[0], x)
+ assert x[0] == MessageType.RESOURCE_INFO, (
+ "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue, "
+ "got tag {}, message {}".format(x[0], x)
+ )
  self._dispatch_to_internal(x)
  elif queue_tag == 'node':
  assert len(x) == 2, "expected message tuple to have exactly two elements"
parsl/monitoring/monitoring.py CHANGED
@@ -290,8 +290,12 @@ class MonitoringHub(RepresentationMixin):
  self._dfk_channel.close()
  if exception_msgs:
  for exception_msg in exception_msgs:
- self.logger.error("{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(exception_msg[0],
- exception_msg[1]))
+ self.logger.error(
+ "{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(
+ exception_msg[0],
+ exception_msg[1]
+ )
+ )
  self.router_proc.terminate()
  self.dbm_proc.terminate()
  self.filesystem_proc.terminate()