parsl 2024.2.26__py3-none-any.whl → 2024.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. parsl/addresses.py +1 -1
  2. parsl/configs/ASPIRE1.py +1 -1
  3. parsl/configs/ad_hoc.py +1 -1
  4. parsl/configs/bridges.py +1 -1
  5. parsl/configs/cc_in2p3.py +1 -1
  6. parsl/configs/expanse.py +1 -1
  7. parsl/configs/frontera.py +1 -1
  8. parsl/configs/kubernetes.py +1 -1
  9. parsl/configs/midway.py +1 -1
  10. parsl/configs/osg.py +1 -1
  11. parsl/configs/stampede2.py +1 -1
  12. parsl/dataflow/dflow.py +11 -6
  13. parsl/dataflow/taskrecord.py +3 -1
  14. parsl/executors/high_throughput/executor.py +69 -37
  15. parsl/executors/high_throughput/interchange.py +78 -59
  16. parsl/executors/high_throughput/process_worker_pool.py +40 -28
  17. parsl/executors/taskvine/executor.py +3 -1
  18. parsl/executors/workqueue/executor.py +5 -2
  19. parsl/executors/workqueue/parsl_coprocess.py +107 -95
  20. parsl/jobs/job_status_poller.py +9 -3
  21. parsl/jobs/strategy.py +4 -3
  22. parsl/monitoring/db_manager.py +25 -5
  23. parsl/monitoring/monitoring.py +6 -2
  24. parsl/monitoring/remote.py +29 -0
  25. parsl/monitoring/visualization/models.py +7 -0
  26. parsl/providers/slurm/slurm.py +13 -2
  27. parsl/tests/configs/ad_hoc_cluster_htex.py +1 -1
  28. parsl/tests/configs/bluewaters.py +1 -1
  29. parsl/tests/configs/bridges.py +1 -1
  30. parsl/tests/configs/cc_in2p3.py +1 -1
  31. parsl/tests/configs/comet.py +1 -1
  32. parsl/tests/configs/frontera.py +1 -1
  33. parsl/tests/configs/midway.py +1 -1
  34. parsl/tests/configs/nscc_singapore.py +1 -1
  35. parsl/tests/configs/osg_htex.py +1 -1
  36. parsl/tests/configs/petrelkube.py +1 -1
  37. parsl/tests/configs/summit.py +1 -1
  38. parsl/tests/configs/theta.py +1 -1
  39. parsl/tests/configs/user_opts.py +3 -1
  40. parsl/tests/manual_tests/test_ad_hoc_htex.py +1 -1
  41. parsl/tests/scaling_tests/htex_local.py +1 -1
  42. parsl/tests/sites/test_affinity.py +1 -1
  43. parsl/tests/sites/test_concurrent.py +1 -1
  44. parsl/tests/sites/test_dynamic_executor.py +1 -1
  45. parsl/tests/sites/test_worker_info.py +1 -1
  46. parsl/tests/test_htex/test_basic.py +1 -1
  47. parsl/tests/test_htex/test_connected_blocks.py +1 -1
  48. parsl/tests/test_htex/test_cpu_affinity_explicit.py +1 -1
  49. parsl/tests/test_htex/test_disconnected_blocks.py +1 -1
  50. parsl/tests/test_htex/test_htex.py +13 -0
  51. parsl/tests/test_htex/test_manager_failure.py +1 -1
  52. parsl/tests/test_htex/test_missing_worker.py +1 -1
  53. parsl/tests/test_htex/test_multiple_disconnected_blocks.py +1 -1
  54. parsl/tests/test_htex/test_worker_failure.py +1 -1
  55. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +1 -1
  56. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +1 -1
  57. parsl/tests/test_mpi_apps/test_resource_spec.py +1 -1
  58. parsl/tests/test_scaling/test_scale_down.py +2 -2
  59. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +159 -0
  60. parsl/usage_tracking/usage.py +5 -9
  61. parsl/version.py +1 -1
  62. parsl-2024.3.11.data/scripts/parsl_coprocess.py +166 -0
  63. {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/process_worker_pool.py +40 -28
  64. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/METADATA +2 -2
  65. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/RECORD +70 -70
  66. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/WHEEL +1 -1
  67. parsl/configs/bluewaters.py +0 -28
  68. parsl-2024.2.26.data/scripts/parsl_coprocess.py +0 -154
  69. {parsl-2024.2.26.data → parsl-2024.3.11.data}/scripts/exec_parsl_function.py +0 -0
  70. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/LICENSE +0 -0
  71. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/entry_points.txt +0 -0
  72. {parsl-2024.2.26.dist-info → parsl-2024.3.11.dist-info}/top_level.txt +0 -0
parsl/monitoring/remote.py CHANGED
@@ -201,6 +201,8 @@ def monitor(pid: int,

  children_user_time = {} # type: Dict[int, float]
  children_system_time = {} # type: Dict[int, float]
+ children_num_ctx_switches_voluntary = {} # type: Dict[int, float]
+ children_num_ctx_switches_involuntary = {} # type: Dict[int, float]

  def accumulate_and_prepare() -> Dict[str, Any]:
  d = {"psutil_process_" + str(k): v for k, v in pm.as_dict().items() if k in simple}
@@ -218,6 +220,15 @@ def monitor(pid: int,
  logging.debug("got children")

  d["psutil_cpu_count"] = psutil.cpu_count()
+
+ # note that this will be the CPU number of the base process, not anything launched by it
+ d["psutil_cpu_num"] = pm.cpu_num()
+
+ pctxsw = pm.num_ctx_switches()
+
+ d["psutil_process_num_ctx_switches_voluntary"] = pctxsw.voluntary
+ d["psutil_process_num_ctx_switches_involuntary"] = pctxsw.involuntary
+
  d['psutil_process_memory_virtual'] = pm.memory_info().vms
  d['psutil_process_memory_resident'] = pm.memory_info().rss
  d['psutil_process_time_user'] = pm.cpu_times().user
@@ -238,6 +249,11 @@ def monitor(pid: int,
  child_system_time = child.cpu_times().system
  children_user_time[child.pid] = child_user_time
  children_system_time[child.pid] = child_system_time
+
+ pctxsw = child.num_ctx_switches()
+ children_num_ctx_switches_voluntary[child.pid] = pctxsw.voluntary
+ children_num_ctx_switches_involuntary[child.pid] = pctxsw.involuntary
+
  d['psutil_process_memory_virtual'] += child.memory_info().vms
  d['psutil_process_memory_resident'] += child.memory_info().rss
  try:
@@ -248,14 +264,27 @@ def monitor(pid: int,
  logging.exception("Exception reading IO counters for child {k}. Recorded IO usage may be incomplete".format(k=k), exc_info=True)
  d['psutil_process_disk_write'] += 0
  d['psutil_process_disk_read'] += 0
+
  total_children_user_time = 0.0
  for child_pid in children_user_time:
  total_children_user_time += children_user_time[child_pid]
+
  total_children_system_time = 0.0
  for child_pid in children_system_time:
  total_children_system_time += children_system_time[child_pid]
+
+ total_children_num_ctx_switches_voluntary = 0.0
+ for child_pid in children_num_ctx_switches_voluntary:
+ total_children_num_ctx_switches_voluntary += children_num_ctx_switches_voluntary[child_pid]
+
+ total_children_num_ctx_switches_involuntary = 0.0
+ for child_pid in children_num_ctx_switches_involuntary:
+ total_children_num_ctx_switches_involuntary += children_num_ctx_switches_involuntary[child_pid]
+
  d['psutil_process_time_user'] += total_children_user_time
  d['psutil_process_time_system'] += total_children_system_time
+ d['psutil_process_num_ctx_switches_voluntary'] += total_children_num_ctx_switches_voluntary
+ d['psutil_process_num_ctx_switches_involuntary'] += total_children_num_ctx_switches_involuntary
  logging.debug("sending message")
  return d
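The fields added above come directly from psutil. As a rough standalone sketch of where those values come from (illustrative only, not Parsl's monitor loop; Process.cpu_num() is not available on every platform, and the recursive child walk here is an assumption rather than a copy of the diff):

    import psutil

    pm = psutil.Process()  # the process being sampled

    d = {}
    d["psutil_cpu_count"] = psutil.cpu_count()
    d["psutil_cpu_num"] = pm.cpu_num()  # CPU the base process last ran on

    pctxsw = pm.num_ctx_switches()  # named tuple with voluntary/involuntary fields
    d["psutil_process_num_ctx_switches_voluntary"] = pctxsw.voluntary
    d["psutil_process_num_ctx_switches_involuntary"] = pctxsw.involuntary

    # Children are sampled individually and folded into the parent's totals,
    # in the same spirit as the accumulation in the hunks above.
    for child in pm.children(recursive=True):
        c = child.num_ctx_switches()
        d["psutil_process_num_ctx_switches_voluntary"] += c.voluntary
        d["psutil_process_num_ctx_switches_involuntary"] += c.involuntary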
parsl/monitoring/visualization/models.py CHANGED
@@ -102,5 +102,12 @@ class Resource(db.Model):
  'psutil_process_disk_write', db.Float, nullable=True)
  psutil_process_status = db.Column(
  'psutil_process_status', db.Text, nullable=True)
+ psutil_cpu_num = db.Column(
+ 'psutil_cpu_num', db.Text, nullable=True)
+ psutil_process_num_ctx_switches_voluntary = db.Column(
+ 'psutil_process_num_ctx_switches_voluntary', db.Float, nullable=True)
+ psutil_process_num_ctx_switches_involuntary = db.Column(
+ 'psutil_process_num_ctx_switches_involuntary', db.Float, nullable=True)
+
  __table_args__ = (
  db.PrimaryKeyConstraint('task_id', 'run_id', 'timestamp'),)
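These columns expose the new counters through the monitoring visualization's Resource model. A minimal sketch of reading them back with plain sqlite3 follows; the database path and the resource table name are assumptions based on a default monitoring setup, not something stated in this diff:

    import sqlite3

    # Path is illustrative; point this at wherever your monitoring database was written.
    conn = sqlite3.connect("runinfo/monitoring.db")
    rows = conn.execute(
        "SELECT task_id, psutil_cpu_num, "
        "psutil_process_num_ctx_switches_voluntary, "
        "psutil_process_num_ctx_switches_involuntary "
        "FROM resource ORDER BY timestamp"
    ).fetchall()
    for task_id, cpu_num, voluntary, involuntary in rows:
        print(task_id, cpu_num, voluntary, involuntary)
    conn.close()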
parsl/providers/slurm/slurm.py CHANGED
@@ -280,11 +280,22 @@ class SlurmProvider(ClusterProvider, RepresentationMixin):
  else:
  logger.error("Could not read job ID from submit command standard output.")
  logger.error("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip())
- raise SubmitException(job_name, "Could not read job ID from submit command standard output", stdout=stdout, stderr=stderr, retcode=retcode)
+ raise SubmitException(
+ job_name,
+ "Could not read job ID from submit command standard output",
+ stdout=stdout,
+ stderr=stderr,
+ retcode=retcode
+ )
  else:
  logger.error("Submit command failed")
  logger.error("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip())
- raise SubmitException(job_name, "Could not read job ID from submit command standard output", stdout=stdout, stderr=stderr, retcode=retcode)
+ raise SubmitException(
+ job_name, "Could not read job ID from submit command standard output",
+ stdout=stdout,
+ stderr=stderr,
+ retcode=retcode
+ )

  def cancel(self, job_ids):
  ''' Cancels the jobs specified by a list of job ids
parsl/tests/configs/ad_hoc_cluster_htex.py CHANGED
@@ -16,7 +16,7 @@ config = Config(
  executors=[
  HighThroughputExecutor(
  label='remote_htex',
- max_workers=2,
+ max_workers_per_node=2,
  worker_logdir_root=user_opts['adhoc']['script_dir'],
  encrypted=True,
  provider=AdHocProvider(
parsl/tests/configs/bluewaters.py CHANGED
@@ -13,7 +13,7 @@ def fresh_config():
  label="bw_htex",
  cores_per_worker=1,
  worker_debug=False,
- max_workers=1,
+ max_workers_per_node=1,
  encrypted=True,
  provider=TorqueProvider(
  queue='normal',
parsl/tests/configs/bridges.py CHANGED
@@ -13,7 +13,7 @@ def fresh_config():
  # This is the network interface on the login node to
  # which compute nodes can communicate
  # address=address_by_interface('bond0.144'),
- max_workers=1,
+ max_workers_per_node=1,
  encrypted=True,
  provider=SlurmProvider(
  user_opts['bridges']['partition'], # Partition / QOS
parsl/tests/configs/cc_in2p3.py CHANGED
@@ -11,7 +11,7 @@ def fresh_config():
  executors=[
  HighThroughputExecutor(
  label='cc_in2p3_htex',
- max_workers=1,
+ max_workers_per_node=1,
  encrypted=True,
  provider=GridEngineProvider(
  channel=LocalChannel(),
parsl/tests/configs/comet.py CHANGED
@@ -10,7 +10,7 @@ def fresh_config():
  executors=[
  HighThroughputExecutor(
  label='Comet_HTEX_multinode',
- max_workers=1,
+ max_workers_per_node=1,
  encrypted=True,
  provider=SlurmProvider(
  'debug',
parsl/tests/configs/frontera.py CHANGED
@@ -15,7 +15,7 @@ def fresh_config():
  executors=[
  HighThroughputExecutor(
  label="frontera_htex",
- max_workers=1,
+ max_workers_per_node=1,
  encrypted=True,
  provider=SlurmProvider(
  cmd_timeout=60, # Add extra time for slow scheduler responses
parsl/tests/configs/midway.py CHANGED
@@ -12,7 +12,7 @@ def fresh_config():
  HighThroughputExecutor(
  label='Midway_HTEX_multinode',
  worker_debug=False,
- max_workers=1,
+ max_workers_per_node=1,
  encrypted=True,
  provider=SlurmProvider(
  'broadwl', # Partition name, e.g 'broadwl'
parsl/tests/configs/nscc_singapore.py CHANGED
@@ -15,7 +15,7 @@ def fresh_config():
  heartbeat_period=15,
  heartbeat_threshold=120,
  worker_debug=False,
- max_workers=1,
+ max_workers_per_node=1,
  address=address_by_interface('ib0'),
  encrypted=True,
  provider=PBSProProvider(
parsl/tests/configs/osg_htex.py CHANGED
@@ -13,7 +13,7 @@ config = Config(
  executors=[
  HighThroughputExecutor(
  label='OSG_HTEX',
- max_workers=1,
+ max_workers_per_node=1,
  encrypted=True,
  provider=CondorProvider(
  nodes_per_block=1,
parsl/tests/configs/petrelkube.py CHANGED
@@ -18,7 +18,7 @@ def fresh_config():
  HighThroughputExecutor(
  label='kube-htex',
  cores_per_worker=1,
- max_workers=1,
+ max_workers_per_node=1,
  worker_logdir_root='.',

  # Address for the pod worker to connect back
parsl/tests/configs/summit.py CHANGED
@@ -20,7 +20,7 @@ def fresh_config():

  # address=address_by_interface('ib0'), # This assumes Parsl is running on login node
  worker_port_range=(50000, 55000),
- max_workers=1,
+ max_workers_per_node=1,
  encrypted=True,
  provider=LSFProvider(
  launcher=JsrunLauncher(),
parsl/tests/configs/theta.py CHANGED
@@ -11,7 +11,7 @@ def fresh_config():
  executors=[
  HighThroughputExecutor(
  label='theta_local_htex_multinode',
- max_workers=1,
+ max_workers_per_node=1,
  encrypted=True,
  provider=CobaltProvider(
  queue=user_opts['theta']['queue'],
parsl/tests/configs/user_opts.py CHANGED
@@ -52,7 +52,9 @@ user_opts = {
  # 'username': MIDWAY_USERNAME,
  # 'script_dir': '/scratch/midway2/{}/parsl_scripts'.format(MIDWAY_USERNAME),
  # 'scheduler_options': "",
- # 'worker_init': 'cd /scratch/midway2/{}/parsl_scripts; module load Anaconda3/5.1.0; source activate parsl_testing;'.format(MIDWAY_USERNAME),
+ # 'worker_init': 'cd /scratch/midway2/{}/parsl_scripts; '
+ # 'module load Anaconda3/5.1.0; source activate parsl_testing;'
+ # .format(MIDWAY_USERNAME),
  # },
  # 'osg': {
  # 'username': OSG_USERNAME,
parsl/tests/manual_tests/test_ad_hoc_htex.py CHANGED
@@ -13,7 +13,7 @@ config = Config(
  executors=[
  HighThroughputExecutor(
  label='AdHoc',
- max_workers=2,
+ max_workers_per_node=2,
  worker_logdir_root="/scratch/midway2/yadunand/parsl_scripts",
  encrypted=True,
  provider=AdHocProvider(
parsl/tests/scaling_tests/htex_local.py CHANGED
@@ -9,7 +9,7 @@ config = Config(
  HighThroughputExecutor(
  label="htex_local",
  cores_per_worker=1,
- max_workers=8,
+ max_workers_per_node=8,
  encrypted=True,
  provider=LocalProvider(
  channel=LocalChannel(),
parsl/tests/sites/test_affinity.py CHANGED
@@ -15,7 +15,7 @@ def local_config():
  HighThroughputExecutor(
  label="htex_Local",
  worker_debug=True,
- max_workers=2,
+ max_workers_per_node=2,
  cpu_affinity='block',
  available_accelerators=2,
  encrypted=True,
parsl/tests/sites/test_concurrent.py CHANGED
@@ -14,7 +14,7 @@ def make_config():
  executors=[
  HighThroughputExecutor(
  address="127.0.0.1",
- max_workers=2,
+ max_workers_per_node=2,
  heartbeat_period=2,
  heartbeat_threshold=4,
  encrypted=True,
parsl/tests/sites/test_dynamic_executor.py CHANGED
@@ -59,7 +59,7 @@ def test_dynamic_executor():
  HighThroughputExecutor(
  label='htex_local',
  cores_per_worker=1,
- max_workers=5,
+ max_workers_per_node=5,
  encrypted=True,
  provider=LocalProvider(
  init_blocks=1,
parsl/tests/sites/test_worker_info.py CHANGED
@@ -14,7 +14,7 @@ def local_config():
  HighThroughputExecutor(
  label="htex_Local",
  worker_debug=True,
- max_workers=4,
+ max_workers_per_node=4,
  encrypted=True,
  provider=LocalProvider(
  channel=LocalChannel(),
parsl/tests/test_htex/test_basic.py CHANGED
@@ -8,7 +8,7 @@ from parsl.tests.configs.htex_local import fresh_config
  def local_setup():
  config = fresh_config()
  config.executors[0].poll_period = 1
- config.executors[0].max_workers = 1
+ config.executors[0].max_workers_per_node = 1
  parsl.load(config)


parsl/tests/test_htex/test_connected_blocks.py CHANGED
@@ -14,7 +14,7 @@ def local_config():
  heartbeat_threshold=2,
  poll_period=100,
  address="127.0.0.1",
- max_workers=1,
+ max_workers_per_node=1,
  provider=LocalProvider(
  init_blocks=0,
  max_blocks=2,
parsl/tests/test_htex/test_cpu_affinity_explicit.py CHANGED
@@ -34,7 +34,7 @@ def test_cpu_affinity_explicit():

  config = fresh_config()
  config.executors[0].cpu_affinity = affinity
- config.executors[0].max_workers = 1
+ config.executors[0].max_workers_per_node = 1

  logger.debug(f"config: {config}")
  # TODO: is there a `with` style for this, to properly deal with exceptions?
parsl/tests/test_htex/test_disconnected_blocks.py CHANGED
@@ -17,7 +17,7 @@ def local_config():
  heartbeat_period=1,
  heartbeat_threshold=2,
  poll_period=100,
- max_workers=1,
+ max_workers_per_node=1,
  provider=LocalProvider(
  worker_init="conda deactivate; export PATH=''; which python; exit 0",
  init_blocks=0,
parsl/tests/test_htex/test_htex.py CHANGED
@@ -1,4 +1,5 @@
  import pathlib
+ import warnings
  from unittest import mock

  import pytest
@@ -107,3 +108,15 @@ def test_htex_shutdown(
  assert not mock_ix_proc.terminate.called
  assert not mock_ix_proc.join.called
  assert "has not started" in mock_logs[0][0][0]
+
+
+ @pytest.mark.local
+ def test_max_workers_per_node():
+ with pytest.warns(DeprecationWarning) as record:
+ htex = HighThroughputExecutor(max_workers_per_node=1, max_workers=2)
+
+ warning_msg = "max_workers is deprecated"
+ assert any(warning_msg in str(warning.message) for warning in record)
+
+ # Ensure max_workers_per_node takes precedence
+ assert htex.max_workers_per_node == htex.max_workers == 1
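Most of the one-line changes that follow stem from HighThroughputExecutor renaming max_workers to max_workers_per_node, with the old name kept as a deprecated alias (which the new test above exercises). A minimal sketch of a config written against the new name; the label, provider, and worker count here are illustrative, not taken from any file in this diff:

    from parsl.config import Config
    from parsl.executors import HighThroughputExecutor
    from parsl.providers import LocalProvider

    config = Config(
        executors=[
            HighThroughputExecutor(
                label="htex_local",
                max_workers_per_node=2,  # formerly max_workers=2, which now warns
                provider=LocalProvider(),
            )
        ]
    )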
parsl/tests/test_htex/test_manager_failure.py CHANGED
@@ -13,7 +13,7 @@ from parsl.tests.configs.htex_local import fresh_config
  def load_config():
  config = fresh_config()
  config.executors[0].poll_period = 1
- config.executors[0].max_workers = 1
+ config.executors[0].max_workers_per_node = 1
  config.executors[0].heartbeat_period = 1

  parsl.load(config)
parsl/tests/test_htex/test_missing_worker.py CHANGED
@@ -8,7 +8,7 @@ from parsl.tests.configs.htex_local import fresh_config
  def local_setup():
  config = fresh_config()
  config.executors[0].poll_period = 1
- config.executors[0].max_workers = 1
+ config.executors[0].max_workers_per_node = 1
  config.executors[0].launch_cmd = "executable_that_hopefully_does_not_exist_1030509.py"
  parsl.load(config)

parsl/tests/test_htex/test_multiple_disconnected_blocks.py CHANGED
@@ -17,7 +17,7 @@ def local_config():
  heartbeat_period=1,
  heartbeat_threshold=2,
  poll_period=100,
- max_workers=1,
+ max_workers_per_node=1,
  provider=LocalProvider(
  worker_init="conda deactivate; export PATH=''; which python; exit 0",
  init_blocks=2,
parsl/tests/test_htex/test_worker_failure.py CHANGED
@@ -8,7 +8,7 @@ def local_config():
  from parsl.tests.configs.htex_local import fresh_config
  config = fresh_config()
  config.executors[0].poll_period = 1
- config.executors[0].max_workers = 1
+ config.executors[0].max_workers_per_node = 1
  config.executors[0].heartbeat_period = 1
  return config

parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py CHANGED
@@ -11,7 +11,7 @@ EXECUTOR_LABEL = "MPI_TEST"
  def local_setup():
  config = fresh_config()
  config.executors[0].label = EXECUTOR_LABEL
- config.executors[0].max_workers = 1
+ config.executors[0].max_workers_per_node = 1
  config.executors[0].enable_mpi_mode = False
  parsl.load(config)

parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py CHANGED
@@ -14,7 +14,7 @@ EXECUTOR_LABEL = "MPI_TEST"
  def local_setup():
  config = fresh_config()
  config.executors[0].label = EXECUTOR_LABEL
- config.executors[0].max_workers = 2
+ config.executors[0].max_workers_per_node = 2
  config.executors[0].enable_mpi_mode = True
  config.executors[0].mpi_launcher = "mpiexec"

parsl/tests/test_mpi_apps/test_resource_spec.py CHANGED
@@ -28,7 +28,7 @@ EXECUTOR_LABEL = "MPI_TEST"
  def local_setup():
  config = fresh_config()
  config.executors[0].label = EXECUTOR_LABEL
- config.executors[0].max_workers = 1
+ config.executors[0].max_workers_per_node = 1
  parsl.load(config)

parsl/tests/test_scaling/test_scale_down.py CHANGED
@@ -27,7 +27,7 @@ def local_config():
  poll_period=100,
  label="htex_local",
  address="127.0.0.1",
- max_workers=1,
+ max_workers_per_node=1,
  encrypted=True,
  provider=LocalProvider(
  channel=LocalChannel(),
@@ -39,7 +39,7 @@ def local_config():
  )
  ],
  max_idletime=0.5,
- strategy='htex_auto_scale',
+ strategy='simple',
  )

parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py ADDED
@@ -0,0 +1,159 @@
+ import pytest
+
+ import parsl
+
+ from parsl import File, python_app
+ from parsl.providers import LocalProvider
+ from parsl.channels import LocalChannel
+ from parsl.launchers import SingleNodeLauncher
+ from parsl.config import Config
+ from parsl.executors import HighThroughputExecutor
+
+ from threading import Event
+
+ _max_blocks = 5
+ _min_blocks = 0
+
+
+ def local_config():
+ return Config(
+ executors=[
+ HighThroughputExecutor(
+ heartbeat_period=1,
+ heartbeat_threshold=2,
+ poll_period=100,
+ label="htex_local",
+ address="127.0.0.1",
+ max_workers=1,
+ encrypted=True,
+ provider=LocalProvider(
+ channel=LocalChannel(),
+ init_blocks=0,
+ max_blocks=_max_blocks,
+ min_blocks=_min_blocks,
+ launcher=SingleNodeLauncher(),
+ ),
+ )
+ ],
+ max_idletime=0.5,
+ strategy='htex_auto_scale',
+ )
+
+
+ @python_app
+ def waiting_app(ident: int, outputs=(), inputs=()):
+ import pathlib
+ import time
+
+ # Approximate an Event by writing to files; the test logic will poll this file
+ with open(outputs[0], "a") as f:
+ f.write(f"Ready: {ident}\n")
+
+ # Similarly, use Event approximation (file check!) by polling.
+ may_finish_file = pathlib.Path(inputs[0])
+ while not may_finish_file.exists():
+ time.sleep(0.01)
+
+
+ # see issue #1885 for details of failures of this test.
+ # at the time of issue #1885 this test was failing frequently
+ # in CI.
+ @pytest.mark.local
+ def test_scale_out(tmpd_cwd, try_assert):
+ dfk = parsl.dfk()
+
+ # reconfigure scaling strategy to run faster than usual. This allows
+ # this test to complete faster - at time of writing 27s with default
+ # 5s strategy, vs XXXX with 0.5s strategy.
+
+ # check this attribute still exists, in the presence of ongoing
+ # development, so we have some belief that setting it will not be
+ # setting a now-ignored parameter.
+ assert hasattr(dfk.job_status_poller, 'interval')
+ dfk.job_status_poller.interval = 0.1
+
+ num_managers = len(dfk.executors['htex_local'].connected_managers())
+
+ assert num_managers == 0, "Expected 0 managers at start"
+ assert dfk.executors['htex_local'].outstanding == 0, "Expected 0 tasks at start"
+
+ ntasks = _max_blocks * 2
+ ready_path = tmpd_cwd / "workers_ready"
+ finish_path = tmpd_cwd / "stage1_workers_may_continue"
+ ready_path.touch()
+ inputs = [File(finish_path)]
+ outputs = [File(ready_path)]
+
+ futs = [waiting_app(i, outputs=outputs, inputs=inputs) for i in range(ntasks)]
+
+ try_assert(lambda: ready_path.read_text().count("\n") == _max_blocks, "Wait for _max_blocks tasks to be running", timeout_ms=15000)
+
+ # This should be true immediately, because the previous try_assert should
+ # wait until there are max_blocks tasks running, and his test should be
+ # configured to use 1 worker per block.
+ assert len(dfk.executors['htex_local'].connected_managers()) == _max_blocks
+
+ finish_path.touch() # Approximation of Event, via files
+ [x.result() for x in futs]
+
+ assert dfk.executors['htex_local'].outstanding == 0
+
+ # now we can launch one "long" task -
+ # and what should happen is that the connected_managers count "eventually" (?) converges to 1 and stays there.
+
+ finish_path = tmpd_cwd / "stage2_workers_may_continue"
+
+ fut = waiting_app(0, outputs=outputs, inputs=[File(finish_path)])
+
+ def check_one_block():
+ return len(dfk.executors['htex_local'].connected_managers()) == 1
+
+ try_assert(
+ check_one_block,
+ fail_msg="Expected 1 managers during a single long task",
+ )
+
+ # the task should not have finished by the time we end up with 1 manager
+ assert not fut.done()
+
+ # This section wait for the strategy to run again, with the above single
+ # task outstanding, and check that the strategy has not scaled up or
+ # down more on those subsequent iterations.
+
+ # It does this by hooking the callback of the job status poller, and
+ # waiting until it has run.
+
+ old_cb = dfk.job_status_poller.callback
+
+ strategy_iterated = Event()
+
+ def hook_cb(*args, **kwargs):
+ r = old_cb(*args, **kwargs)
+ strategy_iterated.set()
+ return r
+
+ dfk.job_status_poller.callback = hook_cb
+
+ # hack strategies to run more frequently. this allo
+ # dfk.job_status_poller.
+
+ try_assert(
+ strategy_iterated.is_set,
+ fail_msg="Expected strategy to have run within this period",
+ )
+
+ assert check_one_block()
+
+ finish_path.touch() # now we can end the single stage-2 task
+
+ fut.result()
+
+ # now we should expect min_blocks scale down
+
+ def check_min_blocks():
+ return len(dfk.executors['htex_local'].connected_managers()) == _min_blocks
+
+ try_assert(
+ check_min_blocks,
+ fail_msg=f"Expected {_min_blocks} managers when no tasks (min_blocks)",
+ )
parsl/usage_tracking/usage.py CHANGED
@@ -109,7 +109,6 @@ class UsageTracker:
  sys.version_info.micro)
  self.tracking_enabled = self.check_tracking_enabled()
  logger.debug("Tracking status: {}".format(self.tracking_enabled))
- self.initialized = False # Once first message is sent this will be True

  def check_tracking_enabled(self):
  """Check if tracking is enabled.
@@ -176,15 +175,12 @@ class UsageTracker:
  except Exception as e:
  logger.debug("Usage tracking failed: {}".format(e))

- def send_message(self) -> None:
- """Send message over UDP.
- """
- if not self.initialized:
- message = self.construct_start_message()
- self.initialized = True
- else:
- message = self.construct_end_message()
+ def send_start_message(self) -> None:
+ message = self.construct_start_message()
+ self.send_UDP_message(message)

+ def send_end_message(self) -> None:
+ message = self.construct_end_message()
  self.send_UDP_message(message)

  def close(self, timeout: float = 10.0) -> None:
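The usage-tracking change replaces the single send_message(), which relied on an initialized flag to decide between start and end messages, with explicit send_start_message() and send_end_message() methods. A toy stand-in class sketching the resulting calling pattern (not Parsl's actual wiring; the UDP send is replaced with a print):

    class Tracker:
        """Stand-in for UsageTracker, reduced to the start/end message split."""

        def construct_start_message(self) -> str:
            return "start"

        def construct_end_message(self) -> str:
            return "end"

        def send_UDP_message(self, message: str) -> None:
            print("would send over UDP:", message)

        def send_start_message(self) -> None:
            self.send_UDP_message(self.construct_start_message())

        def send_end_message(self) -> None:
            self.send_UDP_message(self.construct_end_message())


    tracker = Tracker()
    tracker.send_start_message()  # explicitly at workflow start
    tracker.send_end_message()    # explicitly at shutdown, no internal state toggle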
parsl/version.py CHANGED
@@ -3,4 +3,4 @@
  Year.Month.Day[alpha/beta/..]
  Alphas will be numbered like this -> 2024.12.10a0
  """
- VERSION = '2024.02.26'
+ VERSION = '2024.03.11'