parsl 2024.3.4__py3-none-any.whl → 2024.3.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. parsl/addresses.py +3 -1
  2. parsl/config.py +4 -0
  3. parsl/dataflow/dflow.py +14 -7
  4. parsl/dataflow/taskrecord.py +3 -1
  5. parsl/executors/high_throughput/executor.py +34 -10
  6. parsl/executors/high_throughput/interchange.py +43 -10
  7. parsl/executors/high_throughput/manager_record.py +1 -0
  8. parsl/executors/high_throughput/process_worker_pool.py +48 -7
  9. parsl/executors/taskvine/executor.py +6 -3
  10. parsl/executors/taskvine/manager.py +1 -0
  11. parsl/executors/taskvine/manager_config.py +3 -4
  12. parsl/jobs/job_status_poller.py +4 -3
  13. parsl/jobs/strategy.py +2 -1
  14. parsl/launchers/launchers.py +6 -6
  15. parsl/log_utils.py +8 -4
  16. parsl/monitoring/db_manager.py +29 -7
  17. parsl/monitoring/monitoring.py +15 -54
  18. parsl/monitoring/remote.py +29 -0
  19. parsl/monitoring/visualization/models.py +7 -0
  20. parsl/monitoring/visualization/plots/default/workflow_plots.py +3 -0
  21. parsl/monitoring/visualization/views.py +2 -1
  22. parsl/providers/cluster_provider.py +1 -3
  23. parsl/providers/slurm/slurm.py +13 -2
  24. parsl/tests/configs/user_opts.py +5 -2
  25. parsl/tests/test_htex/test_drain.py +78 -0
  26. parsl/tests/test_monitoring/test_app_names.py +86 -0
  27. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +3 -11
  28. parsl/usage_tracking/usage.py +5 -9
  29. parsl/utils.py +2 -2
  30. parsl/version.py +1 -1
  31. {parsl-2024.3.4.data → parsl-2024.3.18.data}/scripts/process_worker_pool.py +48 -7
  32. {parsl-2024.3.4.dist-info → parsl-2024.3.18.dist-info}/METADATA +2 -2
  33. {parsl-2024.3.4.dist-info → parsl-2024.3.18.dist-info}/RECORD +39 -38
  34. {parsl-2024.3.4.dist-info → parsl-2024.3.18.dist-info}/WHEEL +1 -1
  35. parsl/configs/bluewaters.py +0 -28
  36. {parsl-2024.3.4.data → parsl-2024.3.18.data}/scripts/exec_parsl_function.py +0 -0
  37. {parsl-2024.3.4.data → parsl-2024.3.18.data}/scripts/parsl_coprocess.py +0 -0
  38. {parsl-2024.3.4.dist-info → parsl-2024.3.18.dist-info}/LICENSE +0 -0
  39. {parsl-2024.3.4.dist-info → parsl-2024.3.18.dist-info}/entry_points.txt +0 -0
  40. {parsl-2024.3.4.dist-info → parsl-2024.3.18.dist-info}/top_level.txt +0 -0
parsl/launchers/launchers.py CHANGED
@@ -8,16 +8,16 @@ logger = logging.getLogger(__name__)
8
8
  class SimpleLauncher(Launcher):
9
9
  """ Does no wrapping. Just returns the command as-is
10
10
  """
11
- def __init_(self, debug: bool = True) -> None:
11
+ def __init__(self, debug: bool = True) -> None:
12
12
  super().__init__(debug=debug)
13
13
 
14
14
  def __call__(self, command: str, tasks_per_node: int, nodes_per_block: int) -> str:
15
- """
16
- Args:
17
- - command (string): The command string to be launched
18
- - task_block (string) : bash evaluated string.
19
15
 
20
- """
16
+ if nodes_per_block > 1:
17
+ logger.warning('Simple Launcher only supports single node per block. '
18
+ f'Requested nodes: {nodes_per_block}. '
19
+ 'You may be getting fewer workers than expected')
20
+
21
21
  return command
22
22
 
23
23
 
parsl/log_utils.py CHANGED
@@ -28,7 +28,7 @@ DEFAULT_FORMAT = (
28
28
  def set_stream_logger(name: str = 'parsl',
29
29
  level: int = logging.DEBUG,
30
30
  format_string: Optional[str] = None,
31
- stream: Optional[io.TextIOWrapper] = None) -> None:
31
+ stream: Optional[io.TextIOWrapper] = None) -> logging.Logger:
32
32
  """Add a stream log handler.
33
33
 
34
34
  Args:
@@ -39,7 +39,7 @@ def set_stream_logger(name: str = 'parsl',
39
39
  If not specified, the default stream for logging.StreamHandler is used.
40
40
 
41
41
  Returns:
42
- - None
42
+ - logger for specified name
43
43
  """
44
44
  if format_string is None:
45
45
  # format_string = "%(asctime)s %(name)s [%(levelname)s] Thread:%(thread)d %(message)s"
@@ -59,12 +59,14 @@ def set_stream_logger(name: str = 'parsl',
59
59
  futures_logger = logging.getLogger("concurrent.futures")
60
60
  futures_logger.addHandler(handler)
61
61
 
62
+ return logger
63
+
62
64
 
63
65
  @typeguard.typechecked
64
66
  def set_file_logger(filename: str,
65
67
  name: str = 'parsl',
66
68
  level: int = logging.DEBUG,
67
- format_string: Optional[str] = None) -> None:
69
+ format_string: Optional[str] = None) -> logging.Logger:
68
70
  """Add a file log handler.
69
71
 
70
72
  Args:
@@ -74,7 +76,7 @@ def set_file_logger(filename: str,
74
76
  - format_string (string): Set the format string
75
77
 
76
78
  Returns:
77
- - None
79
+ - logger for specified name
78
80
  """
79
81
  if format_string is None:
80
82
  format_string = DEFAULT_FORMAT
@@ -91,3 +93,5 @@ def set_file_logger(filename: str,
91
93
  # concurrent.futures
92
94
  futures_logger = logging.getLogger("concurrent.futures")
93
95
  futures_logger.addHandler(handler)
96
+
97
+ return logger
parsl/monitoring/db_manager.py CHANGED
@@ -103,7 +103,13 @@ class Database:
103
103
  def rollback(self) -> None:
104
104
  self.session.rollback()
105
105
 
106
- def _generate_mappings(self, table: Table, columns: Optional[List[str]] = None, messages: List[MonitoringMessage] = []) -> List[Dict[str, Any]]:
106
+ def _generate_mappings(
107
+ self,
108
+ table: Table,
109
+ columns: Optional[List[str]] = None,
110
+ messages: List[MonitoringMessage] = [],
111
+ ) -> List[Dict[str, Any]]:
112
+
107
113
  mappings = []
108
114
  for msg in messages:
109
115
  m = {}
@@ -250,6 +256,12 @@ class Database:
250
256
  'psutil_process_disk_write', Float, nullable=True)
251
257
  psutil_process_status = Column(
252
258
  'psutil_process_status', Text, nullable=True)
259
+ psutil_cpu_num = Column(
260
+ 'psutil_cpu_num', Text, nullable=True)
261
+ psutil_process_num_ctx_switches_voluntary = Column(
262
+ 'psutil_process_num_ctx_switches_voluntary', Float, nullable=True)
263
+ psutil_process_num_ctx_switches_involuntary = Column(
264
+ 'psutil_process_num_ctx_switches_involuntary', Float, nullable=True)
253
265
  __table_args__ = (
254
266
  PrimaryKeyConstraint('try_id', 'task_id', 'run_id', 'timestamp'),
255
267
  )
@@ -518,7 +530,10 @@ class DatabaseManager:
518
530
  reprocessable_first_resource_messages.append(msg)
519
531
  else:
520
532
  if task_try_id in deferred_resource_messages:
521
- logger.error("Task {} already has a deferred resource message. Discarding previous message.".format(msg['task_id']))
533
+ logger.error(
534
+ "Task {} already has a deferred resource message. "
535
+ "Discarding previous message.".format(msg['task_id'])
536
+ )
522
537
  deferred_resource_messages[task_try_id] = msg
523
538
  elif msg['last_msg']:
524
539
  # This assumes that the primary key has been added
@@ -544,7 +559,10 @@ class DatabaseManager:
544
559
  if reprocessable_last_resource_messages:
545
560
  self._insert(table=STATUS, messages=reprocessable_last_resource_messages)
546
561
  except Exception:
547
- logger.exception("Exception in db loop: this might have been a malformed message, or some other error. monitoring data may have been lost")
562
+ logger.exception(
563
+ "Exception in db loop: this might have been a malformed message, "
564
+ "or some other error. monitoring data may have been lost"
565
+ )
548
566
  exception_happened = True
549
567
  if exception_happened:
550
568
  raise RuntimeError("An exception happened sometime during database processing and should have been logged in database_manager.log")
@@ -571,8 +589,10 @@ class DatabaseManager:
571
589
  self._dispatch_to_internal(x)
572
590
  elif queue_tag == 'resource':
573
591
  assert isinstance(x, tuple), "_migrate_logs_to_internal was expecting a tuple, got {}".format(x)
574
- assert x[0] == MessageType.RESOURCE_INFO, \
575
- "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue, got tag {}, message {}".format(x[0], x)
592
+ assert x[0] == MessageType.RESOURCE_INFO, (
593
+ "_migrate_logs_to_internal can only migrate RESOURCE_INFO message from resource queue, "
594
+ "got tag {}, message {}".format(x[0], x)
595
+ )
576
596
  self._dispatch_to_internal(x)
577
597
  elif queue_tag == 'node':
578
598
  assert len(x) == 2, "expected message tuple to have exactly two elements"
@@ -613,7 +633,8 @@ class DatabaseManager:
613
633
  # if retried - for example, the database being locked because someone else is readying
614
634
  # the tables we are trying to write to. If that assumption is wrong, then this loop
615
635
  # may go on forever.
616
- logger.warning("Got a database OperationalError. Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
636
+ logger.warning("Got a database OperationalError. "
637
+ "Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
617
638
  self.db.rollback()
618
639
  time.sleep(1) # hard coded 1s wait - this should be configurable or exponential backoff or something
619
640
 
@@ -640,7 +661,8 @@ class DatabaseManager:
640
661
  done = True
641
662
  except sa.exc.OperationalError as e:
642
663
  # hoping that this is a database locked error during _update, not some other problem
643
- logger.warning("Got a database OperationalError. Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
664
+ logger.warning("Got a database OperationalError. "
665
+ "Ignoring and retrying on the assumption that it is recoverable: {}".format(e))
644
666
  self.db.rollback()
645
667
  time.sleep(1) # hard coded 1s wait - this should be configurable or exponential backoff or something
646
668
  except KeyboardInterrupt:
parsl/monitoring/monitoring.py CHANGED
@@ -15,6 +15,7 @@ import parsl.monitoring.remote
15
15
  from parsl.multiprocessing import ForkProcess, SizedQueue
16
16
  from multiprocessing import Process
17
17
  from multiprocessing.queues import Queue
18
+ from parsl.log_utils import set_file_logger
18
19
  from parsl.utils import RepresentationMixin
19
20
  from parsl.process_loggers import wrap_with_logs
20
21
  from parsl.utils import setproctitle
@@ -38,40 +39,6 @@ else:
38
39
  logger = logging.getLogger(__name__)
39
40
 
40
41
 
41
- def start_file_logger(filename: str, name: str = 'monitoring', level: int = logging.DEBUG, format_string: Optional[str] = None) -> logging.Logger:
42
- """Add a stream log handler.
43
-
44
- Parameters
45
- ---------
46
-
47
- filename: string
48
- Name of the file to write logs to. Required.
49
- name: string
50
- Logger name.
51
- level: logging.LEVEL
52
- Set the logging level. Default=logging.DEBUG
53
- - format_string (string): Set the format string
54
- format_string: string
55
- Format string to use.
56
-
57
- Returns
58
- -------
59
- None.
60
- """
61
- if format_string is None:
62
- format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s] %(message)s"
63
-
64
- logger = logging.getLogger(name)
65
- logger.setLevel(level)
66
- logger.propagate = False
67
- handler = logging.FileHandler(filename)
68
- handler.setLevel(level)
69
- formatter = logging.Formatter(format_string, datefmt='%Y-%m-%d %H:%M:%S')
70
- handler.setFormatter(formatter)
71
- logger.addHandler(handler)
72
- return logger
73
-
74
-
75
42
  @typeguard.typechecked
76
43
  class MonitoringHub(RepresentationMixin):
77
44
  def __init__(self,
@@ -79,9 +46,6 @@ class MonitoringHub(RepresentationMixin):
79
46
  hub_port: Optional[int] = None,
80
47
  hub_port_range: Tuple[int, int] = (55050, 56000),
81
48
 
82
- client_address: str = "127.0.0.1",
83
- client_port_range: Tuple[int, int] = (55000, 56000),
84
-
85
49
  workflow_name: Optional[str] = None,
86
50
  workflow_version: Optional[str] = None,
87
51
  logging_endpoint: Optional[str] = None,
@@ -106,11 +70,6 @@ class MonitoringHub(RepresentationMixin):
106
70
  to deliver monitoring messages to the monitoring router.
107
71
  Note that despite the similar name, this is not related to hub_port.
108
72
  Default: (55050, 56000)
109
- client_address : str
110
- The ip address at which the dfk will be able to reach Hub. Default: "127.0.0.1"
111
- client_port_range : tuple(int, int)
112
- The MonitoringHub picks ports at random from the range which will be used by Hub.
113
- Default: (55000, 56000)
114
73
  workflow_name : str
115
74
  The name for the workflow. Default to the name of the parsl script
116
75
  workflow_version : str
@@ -145,9 +104,6 @@ class MonitoringHub(RepresentationMixin):
145
104
  if _db_manager_excepts:
146
105
  raise _db_manager_excepts
147
106
 
148
- self.client_address = client_address
149
- self.client_port_range = client_port_range
150
-
151
107
  self.hub_address = hub_address
152
108
  self.hub_port = hub_port
153
109
  self.hub_port_range = hub_port_range
@@ -290,8 +246,12 @@ class MonitoringHub(RepresentationMixin):
290
246
  self._dfk_channel.close()
291
247
  if exception_msgs:
292
248
  for exception_msg in exception_msgs:
293
- self.logger.error("{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(exception_msg[0],
294
- exception_msg[1]))
249
+ self.logger.error(
250
+ "{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(
251
+ exception_msg[0],
252
+ exception_msg[1]
253
+ )
254
+ )
295
255
  self.router_proc.terminate()
296
256
  self.dbm_proc.terminate()
297
257
  self.filesystem_proc.terminate()
@@ -333,9 +293,9 @@ class MonitoringHub(RepresentationMixin):
333
293
 
334
294
  @wrap_with_logs
335
295
  def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
336
- logger = start_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
337
- name="monitoring_filesystem_radio",
338
- level=logging.INFO)
296
+ logger = set_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
297
+ name="monitoring_filesystem_radio",
298
+ level=logging.INFO)
339
299
 
340
300
  logger.info("Starting filesystem radio receiver")
341
301
  setproctitle("parsl: monitoring filesystem receiver")
@@ -401,9 +361,9 @@ class MonitoringRouter:
401
361
 
402
362
  """
403
363
  os.makedirs(logdir, exist_ok=True)
404
- self.logger = start_file_logger("{}/monitoring_router.log".format(logdir),
405
- name="monitoring_router",
406
- level=logging_level)
364
+ self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
365
+ name="monitoring_router",
366
+ level=logging_level)
407
367
  self.logger.debug("Monitoring router starting")
408
368
 
409
369
  self.hub_address = hub_address
@@ -489,7 +449,8 @@ class MonitoringRouter:
489
449
  # but there is no verification that the message
490
450
  # received from ic_channel.recv_pyobj() is actually
491
451
  # of that type.
492
- self.logger.error(f"Discarding message from interchange with unknown type {msg[0].value}") # type: ignore[unreachable]
452
+ self.logger.error("Discarding message " # type: ignore[unreachable]
453
+ f"from interchange with unknown type {msg[0].value}")
493
454
  except zmq.Again:
494
455
  pass
495
456
  except Exception:
parsl/monitoring/remote.py CHANGED
@@ -201,6 +201,8 @@ def monitor(pid: int,
201
201
 
202
202
  children_user_time = {} # type: Dict[int, float]
203
203
  children_system_time = {} # type: Dict[int, float]
204
+ children_num_ctx_switches_voluntary = {} # type: Dict[int, float]
205
+ children_num_ctx_switches_involuntary = {} # type: Dict[int, float]
204
206
 
205
207
  def accumulate_and_prepare() -> Dict[str, Any]:
206
208
  d = {"psutil_process_" + str(k): v for k, v in pm.as_dict().items() if k in simple}
@@ -218,6 +220,15 @@ def monitor(pid: int,
218
220
  logging.debug("got children")
219
221
 
220
222
  d["psutil_cpu_count"] = psutil.cpu_count()
223
+
224
+ # note that this will be the CPU number of the base process, not anything launched by it
225
+ d["psutil_cpu_num"] = pm.cpu_num()
226
+
227
+ pctxsw = pm.num_ctx_switches()
228
+
229
+ d["psutil_process_num_ctx_switches_voluntary"] = pctxsw.voluntary
230
+ d["psutil_process_num_ctx_switches_involuntary"] = pctxsw.involuntary
231
+
221
232
  d['psutil_process_memory_virtual'] = pm.memory_info().vms
222
233
  d['psutil_process_memory_resident'] = pm.memory_info().rss
223
234
  d['psutil_process_time_user'] = pm.cpu_times().user
@@ -238,6 +249,11 @@ def monitor(pid: int,
238
249
  child_system_time = child.cpu_times().system
239
250
  children_user_time[child.pid] = child_user_time
240
251
  children_system_time[child.pid] = child_system_time
252
+
253
+ pctxsw = child.num_ctx_switches()
254
+ children_num_ctx_switches_voluntary[child.pid] = pctxsw.voluntary
255
+ children_num_ctx_switches_involuntary[child.pid] = pctxsw.involuntary
256
+
241
257
  d['psutil_process_memory_virtual'] += child.memory_info().vms
242
258
  d['psutil_process_memory_resident'] += child.memory_info().rss
243
259
  try:
@@ -248,14 +264,27 @@ def monitor(pid: int,
248
264
  logging.exception("Exception reading IO counters for child {k}. Recorded IO usage may be incomplete".format(k=k), exc_info=True)
249
265
  d['psutil_process_disk_write'] += 0
250
266
  d['psutil_process_disk_read'] += 0
267
+
251
268
  total_children_user_time = 0.0
252
269
  for child_pid in children_user_time:
253
270
  total_children_user_time += children_user_time[child_pid]
271
+
254
272
  total_children_system_time = 0.0
255
273
  for child_pid in children_system_time:
256
274
  total_children_system_time += children_system_time[child_pid]
275
+
276
+ total_children_num_ctx_switches_voluntary = 0.0
277
+ for child_pid in children_num_ctx_switches_voluntary:
278
+ total_children_num_ctx_switches_voluntary += children_num_ctx_switches_voluntary[child_pid]
279
+
280
+ total_children_num_ctx_switches_involuntary = 0.0
281
+ for child_pid in children_num_ctx_switches_involuntary:
282
+ total_children_num_ctx_switches_involuntary += children_num_ctx_switches_involuntary[child_pid]
283
+
257
284
  d['psutil_process_time_user'] += total_children_user_time
258
285
  d['psutil_process_time_system'] += total_children_system_time
286
+ d['psutil_process_num_ctx_switches_voluntary'] += total_children_num_ctx_switches_voluntary
287
+ d['psutil_process_num_ctx_switches_involuntary'] += total_children_num_ctx_switches_involuntary
259
288
  logging.debug("sending message")
260
289
  return d
261
290
 
parsl/monitoring/visualization/models.py CHANGED
@@ -102,5 +102,12 @@ class Resource(db.Model):
102
102
  'psutil_process_disk_write', db.Float, nullable=True)
103
103
  psutil_process_status = db.Column(
104
104
  'psutil_process_status', db.Text, nullable=True)
105
+ psutil_cpu_num = db.Column(
106
+ 'psutil_cpu_num', db.Text, nullable=True)
107
+ psutil_process_num_ctx_switches_voluntary = db.Column(
108
+ 'psutil_process_num_ctx_switches_voluntary', db.Float, nullable=True)
109
+ psutil_process_num_ctx_switches_involuntary = db.Column(
110
+ 'psutil_process_num_ctx_switches_involuntary', db.Float, nullable=True)
111
+
105
112
  __table_args__ = (
106
113
  db.PrimaryKeyConstraint('task_id', 'run_id', 'timestamp'),)
parsl/monitoring/visualization/plots/default/workflow_plots.py CHANGED
@@ -27,6 +27,9 @@ gantt_colors = {'unsched': 'rgb(240, 240, 240)',
27
27
 
28
28
  def task_gantt_plot(df_task, df_status, time_completed=None):
29
29
 
30
+ if df_task.empty:
31
+ return None
32
+
30
33
  # if the workflow is not recorded as completed, then assume
31
34
  # that tasks should continue in their last state until now,
32
35
  # rather than the workflow end time.
parsl/monitoring/visualization/views.py CHANGED
@@ -8,7 +8,8 @@ from parsl.monitoring.visualization.models import Workflow, Task, Status, db
8
8
 
9
9
  from parsl.monitoring.visualization.plots.default.workflow_plots import task_gantt_plot, task_per_app_plot, workflow_dag_plot
10
10
  from parsl.monitoring.visualization.plots.default.task_plots import time_series_memory_per_task_plot
11
- from parsl.monitoring.visualization.plots.default.workflow_resource_plots import resource_distribution_plot, resource_efficiency, worker_efficiency
11
+ from parsl.monitoring.visualization.plots.default.workflow_resource_plots import (resource_distribution_plot,
12
+ resource_efficiency, worker_efficiency)
12
13
 
13
14
  dummy = True
14
15
 
parsl/providers/cluster_provider.py CHANGED
@@ -91,7 +91,7 @@ class ClusterProvider(ExecutionProvider):
91
91
  - configs (dict) : configs that get pushed into the template
92
92
 
93
93
  Returns:
94
- - True: on success
94
+ - None
95
95
 
96
96
  Raises:
97
97
  SchedulerMissingArgs : If template is missing args
@@ -117,8 +117,6 @@ class ClusterProvider(ExecutionProvider):
117
117
  logger.error("Uncategorized error: %s", e)
118
118
  raise e
119
119
 
120
- return True
121
-
122
120
  @abstractmethod
123
121
  def _status(self):
124
122
  pass
parsl/providers/slurm/slurm.py CHANGED
@@ -280,11 +280,22 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
280
280
  else:
281
281
  logger.error("Could not read job ID from submit command standard output.")
282
282
  logger.error("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip())
283
- raise SubmitException(job_name, "Could not read job ID from submit command standard output", stdout=stdout, stderr=stderr, retcode=retcode)
283
+ raise SubmitException(
284
+ job_name,
285
+ "Could not read job ID from submit command standard output",
286
+ stdout=stdout,
287
+ stderr=stderr,
288
+ retcode=retcode
289
+ )
284
290
  else:
285
291
  logger.error("Submit command failed")
286
292
  logger.error("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip())
287
- raise SubmitException(job_name, "Could not read job ID from submit command standard output", stdout=stdout, stderr=stderr, retcode=retcode)
293
+ raise SubmitException(
294
+ job_name, "Could not read job ID from submit command standard output",
295
+ stdout=stdout,
296
+ stderr=stderr,
297
+ retcode=retcode
298
+ )
288
299
 
289
300
  def cancel(self, job_ids):
290
301
  ''' Cancels the jobs specified by a list of job ids
parsl/tests/configs/user_opts.py CHANGED
@@ -52,13 +52,16 @@ user_opts = {
52
52
  # 'username': MIDWAY_USERNAME,
53
53
  # 'script_dir': '/scratch/midway2/{}/parsl_scripts'.format(MIDWAY_USERNAME),
54
54
  # 'scheduler_options': "",
55
- # 'worker_init': 'cd /scratch/midway2/{}/parsl_scripts; module load Anaconda3/5.1.0; source activate parsl_testing;'.format(MIDWAY_USERNAME),
55
+ # 'worker_init': 'cd /scratch/midway2/{}/parsl_scripts; '
56
+ # 'module load Anaconda3/5.1.0; source activate parsl_testing;'
57
+ # .format(MIDWAY_USERNAME),
56
58
  # },
57
59
  # 'osg': {
58
60
  # 'username': OSG_USERNAME,
59
61
  # 'script_dir': '/home/{}/parsl_scripts'.format(OSG_USERNAME),
60
62
  # 'scheduler_options': "",
61
- # 'worker_init' : 'module load python/3.5.2; python3 -m venv parsl_env; source parsl_env/bin/activate; python3 -m pip install parsl==0.5.2'
63
+ # 'worker_init' : 'module load python/3.5.2; python3 -m venv parsl_env;
64
+ # source parsl_env/bin/activate; python3 -m pip install parsl==0.5.2'
62
65
  # },
63
66
  # 'swan': {
64
67
  # 'username': SWAN_USERNAME,
parsl/tests/test_htex/test_drain.py ADDED
@@ -0,0 +1,78 @@
1
+ import parsl
2
+ import pytest
3
+ import time
4
+
5
+ from parsl.providers import LocalProvider
6
+ from parsl.channels import LocalChannel
7
+ from parsl.launchers import SimpleLauncher
8
+
9
+ from parsl.config import Config
10
+ from parsl.executors import HighThroughputExecutor
11
+
12
+ # this constant is used to scale some durations that happen
13
+ # based around the expected drain period: the drain period
14
+ # is TIME_CONST seconds, and the single executed task will
15
+ # last twice that many number of seconds.
16
+ TIME_CONST = 1
17
+
18
+
19
+ def local_config():
20
+ return Config(
21
+ executors=[
22
+ HighThroughputExecutor(
23
+ label="htex_local",
24
+ drain_period=TIME_CONST,
25
+ worker_debug=True,
26
+ cores_per_worker=1,
27
+ encrypted=True,
28
+ provider=LocalProvider(
29
+ channel=LocalChannel(),
30
+ init_blocks=1,
31
+ min_blocks=0,
32
+ max_blocks=0,
33
+ launcher=SimpleLauncher(),
34
+ ),
35
+ )
36
+ ],
37
+ strategy='none',
38
+ )
39
+
40
+
41
+ @parsl.python_app
42
+ def f(n):
43
+ import time
44
+ time.sleep(n)
45
+
46
+
47
+ @pytest.mark.local
48
+ def test_drain(try_assert):
49
+
50
+ htex = parsl.dfk().executors['htex_local']
51
+
52
+ # wait till we have a block running...
53
+
54
+ try_assert(lambda: len(htex.connected_managers()) == 1)
55
+
56
+ managers = htex.connected_managers()
57
+ assert managers[0]['active'], "The manager should be active"
58
+ assert not managers[0]['draining'], "The manager should not be draining"
59
+
60
+ fut = f(TIME_CONST * 2)
61
+
62
+ time.sleep(TIME_CONST)
63
+
64
+ # this assert should happen *very fast* after the above delay...
65
+ try_assert(lambda: htex.connected_managers()[0]['draining'], timeout_ms=500)
66
+
67
+ # and the test task should still be running...
68
+ assert not fut.done(), "The test task should still be running"
69
+
70
+ fut.result()
71
+
72
+ # and now we should see the manager disappear...
73
+ # ... with strategy='none', this should be coming from draining but
74
+ # that information isn't immediately obvious from the absence in
75
+ # connected managers.
76
+ # As with the above draining assert, this should happen very fast after
77
+ # the task ends.
78
+ try_assert(lambda: len(htex.connected_managers()) == 0, timeout_ms=500)
parsl/tests/test_monitoring/test_app_names.py ADDED
@@ -0,0 +1,86 @@
1
+ """Tests monitoring records app name under various decoration patterns.
2
+ """
3
+
4
+ import os
5
+ import parsl
6
+ import pytest
7
+ import time
8
+
9
+ from parsl.tests.configs.htex_local_alternate import fresh_config
10
+
11
+
12
+ @parsl.python_app
13
+ def regular_decorated_app():
14
+ return 5
15
+
16
+
17
+ @pytest.mark.local
18
+ def get_regular_decorated_app():
19
+ return regular_decorated_app
20
+
21
+
22
+ def for_decoration_later():
23
+ return 77
24
+
25
+
26
+ def get_for_decoration_later():
27
+ return parsl.python_app(for_decoration_later)
28
+
29
+
30
+ def get_decorated_closure():
31
+
32
+ r = 53
33
+
34
+ @parsl.python_app
35
+ def decorated_closure():
36
+ return r
37
+
38
+ return decorated_closure
39
+
40
+
41
+ @pytest.mark.local
42
+ @pytest.mark.parametrize("get_app,expected_name,expected_result",
43
+ [(get_regular_decorated_app, "regular_decorated_app", 5),
44
+ (get_for_decoration_later, "for_decoration_later", 77),
45
+ (get_decorated_closure, "decorated_closure", 53)
46
+ ])
47
+ def test_app_name(get_app, expected_name, expected_result, tmpd_cwd):
48
+
49
+ # this is imported here rather than at module level because
50
+ # it isn't available in a plain parsl install, so this module
51
+ # would otherwise fail to import and break even a basic test
52
+ # run.
53
+ import sqlalchemy
54
+
55
+ c = fresh_config()
56
+ c.run_dir = tmpd_cwd
57
+ c.monitoring.logging_endpoint = f"sqlite:///{tmpd_cwd}/monitoring.db"
58
+ parsl.load(c)
59
+
60
+ app = get_app()
61
+ assert app().result() == expected_result
62
+
63
+ parsl.dfk().cleanup()
64
+ parsl.clear()
65
+
66
+ engine = sqlalchemy.create_engine(c.monitoring.logging_endpoint)
67
+ with engine.begin() as connection:
68
+
69
+ def count_rows(table: str):
70
+ result = connection.execute(f"SELECT COUNT(*) FROM {table}")
71
+ (c, ) = result.first()
72
+ return c
73
+
74
+ # one workflow...
75
+ assert count_rows("workflow") == 1
76
+
77
+ # ... with one task ...
78
+ assert count_rows("task") == 1
79
+
80
+ # ... that was tried once ...
81
+ assert count_rows("try") == 1
82
+
83
+ # ... and has the expected name.
84
+ result = connection.execute("SELECT task_func_name FROM task")
85
+ (c, ) = result.first()
86
+ assert c == expected_name
parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py CHANGED
@@ -37,6 +37,7 @@ def local_config():
37
37
  ],
38
38
  max_idletime=0.5,
39
39
  strategy='htex_auto_scale',
40
+ strategy_period=0.1
40
41
  )
41
42
 
42
43
 
@@ -62,16 +63,6 @@ def waiting_app(ident: int, outputs=(), inputs=()):
62
63
  def test_scale_out(tmpd_cwd, try_assert):
63
64
  dfk = parsl.dfk()
64
65
 
65
- # reconfigure scaling strategy to run faster than usual. This allows
66
- # this test to complete faster - at time of writing 27s with default
67
- # 5s strategy, vs XXXX with 0.5s strategy.
68
-
69
- # check this attribute still exists, in the presence of ongoing
70
- # development, so we have some belief that setting it will not be
71
- # setting a now-ignored parameter.
72
- assert hasattr(dfk.job_status_poller, 'interval')
73
- dfk.job_status_poller.interval = 0.1
74
-
75
66
  num_managers = len(dfk.executors['htex_local'].connected_managers())
76
67
 
77
68
  assert num_managers == 0, "Expected 0 managers at start"
@@ -98,7 +89,8 @@ def test_scale_out(tmpd_cwd, try_assert):
98
89
 
99
90
  assert dfk.executors['htex_local'].outstanding == 0
100
91
 
101
- # now we can launch one "long" task - and what should happen is that the connected_managers count "eventually" (?) converges to 1 and stays there.
92
+ # now we can launch one "long" task -
93
+ # and what should happen is that the connected_managers count "eventually" (?) converges to 1 and stays there.
102
94
 
103
95
  finish_path = tmpd_cwd / "stage2_workers_may_continue"
104
96