PyPI - parsl - Versions diffs - 2024.3.11__py3-none-any.whl → 2025.1.13__py3-none-any.whl - Mend

parsl-2024.3.11-py3-none-any.whl → parsl-2025.1.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (369)

parsl/__init__.py +9 -10
parsl/addresses.py +29 -7
parsl/app/app.py +7 -8
parsl/app/bash.py +15 -8
parsl/app/errors.py +10 -13
parsl/app/futures.py +8 -10
parsl/app/python.py +2 -1
parsl/benchmark/perf.py +2 -1
parsl/concurrent/__init__.py +2 -2
parsl/config.py +57 -10
parsl/configs/ASPIRE1.py +6 -5
parsl/configs/Azure.py +9 -8
parsl/configs/bridges.py +6 -4
parsl/configs/cc_in2p3.py +3 -3
parsl/configs/ec2.py +3 -1
parsl/configs/expanse.py +4 -3
parsl/configs/frontera.py +3 -4
parsl/configs/htex_local.py +3 -4
parsl/configs/illinoiscluster.py +3 -1
parsl/configs/improv.py +34 -0
parsl/configs/kubernetes.py +4 -3
parsl/configs/local_threads.py +5 -1
parsl/configs/midway.py +5 -3
parsl/configs/osg.py +4 -2
parsl/configs/polaris.py +4 -2
parsl/configs/stampede2.py +6 -5
parsl/configs/summit.py +3 -3
parsl/configs/toss3_llnl.py +4 -3
parsl/configs/vineex_local.py +6 -4
parsl/configs/wqex_local.py +5 -3
parsl/curvezmq.py +4 -0
parsl/data_provider/data_manager.py +4 -3
parsl/data_provider/file_noop.py +1 -2
parsl/data_provider/files.py +3 -3
parsl/data_provider/ftp.py +1 -3
parsl/data_provider/globus.py +7 -6
parsl/data_provider/http.py +2 -2
parsl/data_provider/rsync.py +1 -1
parsl/data_provider/staging.py +2 -2
parsl/data_provider/zip.py +135 -0
parsl/dataflow/dependency_resolvers.py +115 -0
parsl/dataflow/dflow.py +262 -224
parsl/dataflow/errors.py +3 -5
parsl/dataflow/futures.py +27 -14
parsl/dataflow/memoization.py +5 -5
parsl/dataflow/rundirs.py +5 -6
parsl/dataflow/taskrecord.py +4 -5
parsl/executors/__init__.py +4 -2
parsl/executors/base.py +45 -15
parsl/executors/errors.py +13 -0
parsl/executors/execute_task.py +37 -0
parsl/executors/flux/execute_parsl_task.py +3 -3
parsl/executors/flux/executor.py +18 -19
parsl/executors/flux/flux_instance_manager.py +26 -27
parsl/executors/high_throughput/errors.py +43 -3
parsl/executors/high_throughput/executor.py +316 -282
parsl/executors/high_throughput/interchange.py +158 -167
parsl/executors/high_throughput/manager_record.py +5 -0
parsl/executors/high_throughput/manager_selector.py +55 -0
parsl/executors/high_throughput/monitoring_info.py +2 -1
parsl/executors/high_throughput/mpi_executor.py +113 -0
parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
parsl/executors/high_throughput/mpi_resource_management.py +6 -17
parsl/executors/high_throughput/probe.py +9 -7
parsl/executors/high_throughput/process_worker_pool.py +115 -77
parsl/executors/high_throughput/zmq_pipes.py +81 -23
parsl/executors/radical/executor.py +130 -79
parsl/executors/radical/rpex_resources.py +17 -15
parsl/executors/radical/rpex_worker.py +4 -3
parsl/executors/status_handling.py +157 -51
parsl/executors/taskvine/__init__.py +1 -1
parsl/executors/taskvine/errors.py +1 -1
parsl/executors/taskvine/exec_parsl_function.py +2 -2
parsl/executors/taskvine/executor.py +41 -57
parsl/executors/taskvine/factory.py +1 -1
parsl/executors/taskvine/factory_config.py +1 -1
parsl/executors/taskvine/manager.py +18 -13
parsl/executors/taskvine/manager_config.py +9 -5
parsl/executors/threads.py +6 -6
parsl/executors/workqueue/errors.py +1 -1
parsl/executors/workqueue/exec_parsl_function.py +6 -5
parsl/executors/workqueue/executor.py +64 -63
parsl/executors/workqueue/parsl_coprocess.py +1 -1
parsl/jobs/error_handlers.py +2 -2
parsl/jobs/job_status_poller.py +30 -113
parsl/jobs/states.py +7 -2
parsl/jobs/strategy.py +43 -31
parsl/launchers/__init__.py +12 -3
parsl/launchers/errors.py +1 -1
parsl/launchers/launchers.py +6 -12
parsl/log_utils.py +9 -6
parsl/monitoring/db_manager.py +59 -95
parsl/monitoring/errors.py +6 -0
parsl/monitoring/monitoring.py +87 -356
parsl/monitoring/queries/pandas.py +1 -2
parsl/monitoring/radios/base.py +13 -0
parsl/monitoring/radios/filesystem.py +52 -0
parsl/monitoring/radios/htex.py +57 -0
parsl/monitoring/radios/multiprocessing.py +17 -0
parsl/monitoring/radios/udp.py +56 -0
parsl/monitoring/radios/zmq.py +17 -0
parsl/monitoring/remote.py +33 -37
parsl/monitoring/router.py +212 -0
parsl/monitoring/types.py +5 -6
parsl/monitoring/visualization/app.py +4 -2
parsl/monitoring/visualization/models.py +0 -1
parsl/monitoring/visualization/plots/default/workflow_plots.py +11 -4
parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
parsl/monitoring/visualization/utils.py +0 -1
parsl/monitoring/visualization/views.py +16 -8
parsl/multiprocessing.py +0 -1
parsl/process_loggers.py +1 -2
parsl/providers/__init__.py +8 -17
parsl/providers/aws/aws.py +2 -3
parsl/providers/azure/azure.py +4 -5
parsl/providers/base.py +2 -18
parsl/providers/cluster_provider.py +4 -12
parsl/providers/condor/condor.py +7 -17
parsl/providers/errors.py +2 -2
parsl/providers/googlecloud/googlecloud.py +2 -1
parsl/providers/grid_engine/grid_engine.py +5 -14
parsl/providers/kubernetes/kube.py +80 -40
parsl/providers/local/local.py +13 -26
parsl/providers/lsf/lsf.py +5 -23
parsl/providers/pbspro/pbspro.py +5 -17
parsl/providers/slurm/slurm.py +81 -39
parsl/providers/torque/torque.py +3 -14
parsl/serialize/__init__.py +8 -3
parsl/serialize/base.py +1 -2
parsl/serialize/concretes.py +5 -4
parsl/serialize/facade.py +3 -3
parsl/serialize/proxystore.py +3 -2
parsl/tests/__init__.py +1 -1
parsl/tests/configs/azure_single_node.py +4 -5
parsl/tests/configs/bridges.py +3 -2
parsl/tests/configs/cc_in2p3.py +1 -3
parsl/tests/configs/comet.py +2 -1
parsl/tests/configs/ec2_single_node.py +1 -2
parsl/tests/configs/ec2_spot.py +1 -2
parsl/tests/configs/flux_local.py +11 -0
parsl/tests/configs/frontera.py +2 -3
parsl/tests/configs/htex_local.py +3 -5
parsl/tests/configs/htex_local_alternate.py +11 -15
parsl/tests/configs/htex_local_intask_staging.py +5 -9
parsl/tests/configs/htex_local_rsync_staging.py +4 -8
parsl/tests/configs/local_radical.py +1 -3
parsl/tests/configs/local_radical_mpi.py +2 -2
parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
parsl/tests/configs/local_threads_monitoring.py +0 -1
parsl/tests/configs/midway.py +2 -2
parsl/tests/configs/nscc_singapore.py +3 -3
parsl/tests/configs/osg_htex.py +1 -1
parsl/tests/configs/petrelkube.py +3 -2
parsl/tests/configs/slurm_local.py +24 -0
parsl/tests/configs/summit.py +1 -0
parsl/tests/configs/taskvine_ex.py +4 -7
parsl/tests/configs/user_opts.py +2 -8
parsl/tests/configs/workqueue_ex.py +4 -6
parsl/tests/conftest.py +27 -13
parsl/tests/integration/test_stress/test_python_simple.py +3 -4
parsl/tests/integration/test_stress/test_python_threads.py +3 -5
parsl/tests/manual_tests/htex_local.py +4 -6
parsl/tests/manual_tests/test_basic.py +1 -0
parsl/tests/manual_tests/test_log_filter.py +3 -1
parsl/tests/manual_tests/test_memory_limits.py +6 -8
parsl/tests/manual_tests/test_regression_220.py +2 -1
parsl/tests/manual_tests/test_udp_simple.py +4 -4
parsl/tests/manual_tests/test_worker_count.py +3 -2
parsl/tests/scaling_tests/htex_local.py +2 -4
parsl/tests/scaling_tests/test_scale.py +0 -9
parsl/tests/scaling_tests/vineex_condor.py +1 -2
parsl/tests/scaling_tests/vineex_local.py +1 -2
parsl/tests/site_tests/site_config_selector.py +1 -6
parsl/tests/site_tests/test_provider.py +4 -2
parsl/tests/site_tests/test_site.py +2 -0
parsl/tests/sites/test_affinity.py +7 -7
parsl/tests/sites/test_dynamic_executor.py +3 -4
parsl/tests/sites/test_ec2.py +3 -2
parsl/tests/sites/test_worker_info.py +4 -5
parsl/tests/test_aalst_patterns.py +0 -1
parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
parsl/tests/test_bash_apps/test_basic.py +10 -4
parsl/tests/test_bash_apps/test_error_codes.py +5 -7
parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
parsl/tests/test_bash_apps/test_memoize.py +2 -8
parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
parsl/tests/test_bash_apps/test_multiline.py +1 -1
parsl/tests/test_bash_apps/test_pipeline.py +1 -1
parsl/tests/test_bash_apps/test_std_uri.py +123 -0
parsl/tests/test_bash_apps/test_stdout.py +33 -8
parsl/tests/test_callables.py +2 -2
parsl/tests/test_checkpointing/test_periodic.py +21 -39
parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
parsl/tests/test_checkpointing/test_regression_239.py +1 -1
parsl/tests/test_checkpointing/test_task_exit.py +2 -3
parsl/tests/test_docs/test_from_slides.py +5 -2
parsl/tests/test_docs/test_kwargs.py +4 -1
parsl/tests/test_docs/test_tutorial_1.py +1 -2
parsl/tests/test_docs/test_workflow1.py +2 -2
parsl/tests/test_docs/test_workflow2.py +0 -1
parsl/tests/test_error_handling/test_rand_fail.py +2 -2
parsl/tests/test_error_handling/test_resource_spec.py +10 -12
parsl/tests/test_error_handling/test_retries.py +6 -16
parsl/tests/test_error_handling/test_retry_handler.py +1 -0
parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
parsl/tests/test_execute_task.py +29 -0
parsl/tests/test_flux.py +1 -1
parsl/tests/test_htex/test_basic.py +2 -3
parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
parsl/tests/test_htex/test_command_client_timeout.py +66 -0
parsl/tests/test_htex/test_connected_blocks.py +3 -2
parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
parsl/tests/test_htex/test_drain.py +79 -0
parsl/tests/test_htex/test_htex.py +51 -25
parsl/tests/test_htex/test_manager_failure.py +0 -1
parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
parsl/tests/test_htex/test_managers_command.py +36 -0
parsl/tests/test_htex/test_missing_worker.py +2 -12
parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
parsl/tests/test_htex/test_zmq_binding.py +29 -8
parsl/tests/test_monitoring/test_app_names.py +86 -0
parsl/tests/test_monitoring/test_basic.py +73 -25
parsl/tests/test_monitoring/test_db_locks.py +6 -4
parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
parsl/tests/test_monitoring/test_stdouterr.py +134 -0
parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
parsl/tests/test_providers/test_local_provider.py +3 -132
parsl/tests/test_providers/test_pbspro_template.py +2 -3
parsl/tests/test_providers/test_slurm_template.py +2 -3
parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
parsl/tests/test_python_apps/test_context_manager.py +128 -0
parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
parsl/tests/test_python_apps/test_fail.py +0 -25
parsl/tests/test_python_apps/test_futures.py +2 -1
parsl/tests/test_python_apps/test_inputs_default.py +22 -0
parsl/tests/test_python_apps/test_join.py +0 -1
parsl/tests/test_python_apps/test_lifted.py +11 -7
parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
parsl/tests/test_python_apps/test_outputs.py +1 -1
parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
parsl/tests/test_radical/test_mpi_funcs.py +1 -2
parsl/tests/test_regression/test_1480.py +2 -1
parsl/tests/test_regression/test_1653.py +2 -1
parsl/tests/test_regression/test_226.py +1 -0
parsl/tests/test_regression/test_2652.py +1 -0
parsl/tests/test_regression/test_69a.py +0 -1
parsl/tests/test_regression/test_854.py +4 -2
parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
parsl/tests/test_regression/test_98.py +0 -1
parsl/tests/test_scaling/test_block_error_handler.py +9 -4
parsl/tests/test_scaling/test_regression_1621.py +11 -15
parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
parsl/tests/test_scaling/test_scale_down.py +2 -5
parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +6 -18
parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
parsl/tests/test_serialization/test_basic.py +2 -1
parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
parsl/tests/test_staging/staging_provider.py +2 -2
parsl/tests/test_staging/test_1316.py +3 -4
parsl/tests/test_staging/test_docs_1.py +2 -1
parsl/tests/test_staging/test_docs_2.py +2 -1
parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
parsl/tests/{test_data → test_staging}/test_file.py +6 -6
parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
parsl/tests/test_staging/test_staging_ftp.py +1 -0
parsl/tests/test_staging/test_staging_https.py +5 -2
parsl/tests/test_staging/test_staging_stdout.py +64 -0
parsl/tests/test_staging/test_zip_in.py +39 -0
parsl/tests/test_staging/test_zip_out.py +110 -0
parsl/tests/test_staging/test_zip_to_zip.py +41 -0
parsl/tests/test_summary.py +2 -2
parsl/tests/test_thread_parallelism.py +0 -1
parsl/tests/test_threads/test_configs.py +1 -2
parsl/tests/test_threads/test_lazy_errors.py +2 -2
parsl/tests/test_utils/test_execute_wait.py +35 -0
parsl/tests/test_utils/test_sanitize_dns.py +76 -0
parsl/tests/unit/test_address.py +20 -0
parsl/tests/unit/test_file.py +99 -0
parsl/tests/unit/test_usage_tracking.py +66 -0
parsl/usage_tracking/api.py +65 -0
parsl/usage_tracking/levels.py +6 -0
parsl/usage_tracking/usage.py +104 -62
parsl/utils.py +139 -6
parsl/version.py +1 -1
{parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
parsl-2025.1.13.data/scripts/interchange.py +649 -0
{parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +115 -77
parsl-2025.1.13.dist-info/METADATA +96 -0
parsl-2025.1.13.dist-info/RECORD +462 -0
{parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
parsl/channels/__init__.py +0 -7
parsl/channels/base.py +0 -141
parsl/channels/errors.py +0 -113
parsl/channels/local/local.py +0 -164
parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
parsl/channels/ssh/ssh.py +0 -276
parsl/channels/ssh_il/__init__.py +0 -0
parsl/channels/ssh_il/ssh_il.py +0 -74
parsl/configs/ad_hoc.py +0 -35
parsl/executors/radical/rpex_master.py +0 -42
parsl/monitoring/radios.py +0 -175
parsl/providers/ad_hoc/__init__.py +0 -0
parsl/providers/ad_hoc/ad_hoc.py +0 -248
parsl/providers/cobalt/__init__.py +0 -0
parsl/providers/cobalt/cobalt.py +0 -236
parsl/providers/cobalt/template.py +0 -17
parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
parsl/tests/configs/cooley_htex.py +0 -37
parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
parsl/tests/configs/local_adhoc.py +0 -18
parsl/tests/configs/swan_htex.py +0 -43
parsl/tests/configs/theta.py +0 -37
parsl/tests/integration/test_channels/__init__.py +0 -0
parsl/tests/integration/test_channels/test_channels.py +0 -17
parsl/tests/integration/test_channels/test_local_channel.py +0 -42
parsl/tests/integration/test_channels/test_scp_1.py +0 -45
parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
parsl/tests/sites/test_local_adhoc.py +0 -61
parsl/tests/test_channels/__init__.py +0 -0
parsl/tests/test_channels/test_large_output.py +0 -22
parsl/tests/test_data/__init__.py +0 -0
parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
parsl-2024.3.11.dist-info/METADATA +0 -98
parsl-2024.3.11.dist-info/RECORD +0 -447
parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
{parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
{parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
{parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
{parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0

parsl/executors/high_throughput/executor.py CHANGED

@@ -1,43 +1,41 @@
-import typing
-from concurrent.futures import Future
-import typeguard
 import logging
-import threading
-import queue
-import datetime
-import pickle
-from dataclasses import dataclass
-from multiprocessing import Process, Queue
-from typing import Dict, Sequence
-from typing import List, Optional, Tuple, Union, Callable
 import math
+import pickle
+import subprocess
+import threading
+import typing
 import warnings
+from collections import defaultdict
+from concurrent.futures import Future
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
+import typeguard
-import parsl.launchers
-from parsl.serialize import pack_res_spec_apply_message, deserialize
-from parsl.serialize.errors import SerializationError, DeserializationError
+from parsl import curvezmq
+from parsl.addresses import get_all_addresses
 from parsl.app.errors import RemoteExceptionWrapper
-from parsl.jobs.states import JobStatus, JobState
-from parsl.executors.high_throughput import zmq_pipes
-from parsl.executors.high_throughput import interchange
+from parsl.data_provider.staging import Staging
 from parsl.executors.errors import (
-    BadMessage, ScalingFailed,
+    BadMessage,
+    InvalidResourceSpecification,
+    ScalingFailed,
 )
-from parsl.executors.high_throughput.mpi_prefix_composer import (
-    VALID_LAUNCHERS,
-    validate_resource_spec
+from parsl.executors.high_throughput import zmq_pipes
+from parsl.executors.high_throughput.errors import CommandClientTimeoutError
+from parsl.executors.high_throughput.manager_selector import (
+    ManagerSelector,
+    RandomManagerSelector,
 )
-from parsl import curvezmq
 from parsl.executors.status_handling import BlockProviderExecutor
-from parsl.providers.base import ExecutionProvider
-from parsl.data_provider.staging import Staging
-from parsl.addresses import get_all_addresses
+from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
 from parsl.process_loggers import wrap_with_logs
-from parsl.multiprocessing import ForkProcess
-from parsl.utils import RepresentationMixin
 from parsl.providers import LocalProvider
+from parsl.providers.base import ExecutionProvider
+from parsl.serialize import deserialize, pack_res_spec_apply_message
+from parsl.serialize.errors import DeserializationError, SerializationError
+from parsl.usage_tracking.api import UsageInformation
+from parsl.utils import RepresentationMixin
 logger = logging.getLogger(__name__)
@@ -55,54 +53,16 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
                       "--hb_period={heartbeat_period} "
                       "{address_probe_timeout_string} "
                       "--hb_threshold={heartbeat_threshold} "
+                      "--drain_period={drain_period} "
                       "--cpu-affinity {cpu_affinity} "
                       "{enable_mpi_mode} "
                       "--mpi-launcher={mpi_launcher} "
                       "--available-accelerators {accelerators}")
+DEFAULT_INTERCHANGE_LAUNCH_CMD = ["interchange.py"]
-class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
-    """Executor designed for cluster-scale
-    The HighThroughputExecutor system has the following components:
-      1. The HighThroughputExecutor instance which is run as part of the Parsl script.
-      2. The Interchange which acts as a load-balancing proxy between workers and Parsl
-      3. The multiprocessing based worker pool which coordinates task execution over several
-         cores on a node.
-      4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
-    Here is a diagram
-    .. code:: python
-                        |  Data   |  Executor   |  Interchange  | External Process(es)
-                        |  Flow   |             |               |
-                   Task | Kernel  |             |               |
-                 +----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
-                 |      |         |             | batching      |    |         |
-           Parsl<---Fut-|         |             | load-balancing|  result   exception
-                     ^  |         |             | watchdogs     |    |         |
-                     |  |         |   Q_mngmnt  |               |    V         V
-                     |  |         |    Thread<--|-incoming_q<---|--- +---------+
-                     |  |         |      |      |               |
-                     |  |         |      |      |               |
-                     +----update_fut-----+
-    Each of the workers in each process_worker_pool has access to its local rank through
-    an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
-    and is an integer in the range from 0 to the number of workers per in the pool minus 1.
-    The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
-    and the size of the worker pool as ``PARSL_WORKER_COUNT``.
-    Parameters
-    ----------
-    provider : :class:`~parsl.providers.base.ExecutionProvider`
+GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
        Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
-        :class:`~parsl.providers.cobalt.cobalt.Cobalt`,
         :class:`~parsl.providers.condor.condor.Condor`,
         :class:`~parsl.providers.googlecloud.googlecloud.GoogleCloud`,
         :class:`~parsl.providers.gridEngine.gridEngine.GridEngine`,
@@ -120,9 +80,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         cores_per_worker, nodes_per_block, heartbeat_period ,heartbeat_threshold, logdir). For example:
         launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}"
+    interchange_launch_cmd : Sequence[str]
+        Custom sequence of command line tokens to launch the interchange process from the executor. If
+        undefined, the executor will use the default "interchange.py" command.
     address : string
         An address to connect to the main Parsl process which is reachable from the network in which
-        workers will be running. This field expects an IPv4 address (xxx.xxx.xxx.xxx).
+        workers will be running. This field expects an IPv4 or IPv6 address.
         Most login nodes on clusters have several network interfaces available, only some of which
         can be reached from the compute nodes. This field can be used to limit the executor to listen
         only on a specific interface, and limiting connections to the internal network.
@@ -130,6 +94,11 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         Setting an address here overrides the default behavior.
         default=None
+    loopback_address: string
+        Specify address used for internal communication between executor and interchange.
+        Supports IPv4 and IPv6 addresses
+        default=127.0.0.1
     worker_ports : (int, int)
         Specify the ports to be used by workers to connect to Parsl. If this option is specified,
         worker_port_range will not be honored.
@@ -146,6 +115,91 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
     worker_debug : Bool
         Enables worker debug logging.
+    prefetch_capacity : int
+        Number of tasks that could be prefetched over available worker capacity.
+        When there are a few tasks (<100) or when tasks are long running, this option should
+        be set to 0 for better load balancing. Default is 0.
+    address_probe_timeout : int | None
+        Managers attempt connecting over many different addresses to determine a viable address.
+        This option sets a time limit in seconds on the connection attempt.
+        Default of None implies 30s timeout set on worker.
+    heartbeat_threshold : int
+        Seconds since the last message from the counterpart in the communication pair:
+        (interchange, manager) after which the counterpart is assumed to be un-available. Default: 120s
+    heartbeat_period : int
+        Number of seconds after which a heartbeat message indicating liveness is sent to the
+        counterpart (interchange, manager). Default: 30s
+    poll_period : int
+        Timeout period to be used by the executor components in milliseconds. Increasing poll_periods
+        trades performance for cpu efficiency. Default: 10ms
+    drain_period : int
+        The number of seconds after start when workers will begin to drain
+        and then exit. Set this to a time that is slightly less than the
+        maximum walltime of batch jobs to avoid killing tasks while they
+        execute. For example, you could set this to the walltime minus a grace
+        period for the batch job to start the workers, minus the expected
+        maximum length of an individual task.
+    worker_logdir_root : string
+        In case of a remote file system, specify the path to where logs will be kept.
+    encrypted : bool
+        Flag to enable/disable encryption (CurveZMQ). Default is False.
+    manager_selector: ManagerSelector
+        Determines what strategy the interchange uses to select managers during task distribution.
+        See API reference under "Manager Selectors" regarding the various manager selectors.
+        Default: 'RandomManagerSelector'
+"""  # Documentation for params used by both HTEx and MPIEx
+class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
+    __doc__ = f"""Executor designed for cluster-scale
+    The HighThroughputExecutor system has the following components:
+      1. The HighThroughputExecutor instance which is run as part of the Parsl script.
+      2. The Interchange which acts as a load-balancing proxy between workers and Parsl
+      3. The multiprocessing based worker pool which coordinates task execution over several
+         cores on a node.
+      4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
+    Here is a diagram
+    .. code:: python
+                        |  Data   |  Executor   |  Interchange  | External Process(es)
+                        |  Flow   |             |               |
+                   Task | Kernel  |             |               |
+                 +----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
+                 |      |         |             | batching      |    |         |
+           Parsl<---Fut-|         |             | load-balancing|  result   exception
+                     ^  |         |             | watchdogs     |    |         |
+                     |  |         |    Result   |               |    |         |
+                     |  |         |    Queue    |               |    V         V
+                     |  |         |    Thread<--|-incoming_q<---|--- +---------+
+                     |  |         |      |      |               |
+                     |  |         |      |      |               |
+                     +----update_fut-----+
+    Each of the workers in each process_worker_pool has access to its local rank through
+    an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
+    and is an integer in the range from 0 to the number of workers per in the pool minus 1.
+    The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
+    and the size of the worker pool as ``PARSL_WORKER_COUNT``.
+    Parameters
+    ----------
+    {GENERAL_HTEX_PARAM_DOCS}
     cores_per_worker : float
         cores to be assigned to each worker. Oversubscription is possible
         by setting cores_per_worker < 1.0. Default=1
@@ -155,9 +209,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         will check the available memory at startup and limit the number of workers such that
         the there's sufficient memory for each worker. Default: None
-    max_workers : int
-        Deprecated. Please use max_workers_per_node instead.
     max_workers_per_node : int
         Caps the number of workers launched per node. Default: None
@@ -179,44 +230,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         default: empty list
-    prefetch_capacity : int
-        Number of tasks that could be prefetched over available worker capacity.
-        When there are a few tasks (<100) or when tasks are long running, this option should
-        be set to 0 for better load balancing. Default is 0.
-    address_probe_timeout : int | None
-        Managers attempt connecting over many different addresses to determine a viable address.
-        This option sets a time limit in seconds on the connection attempt.
-        Default of None implies 30s timeout set on worker.
-    heartbeat_threshold : int
-        Seconds since the last message from the counterpart in the communication pair:
-        (interchange, manager) after which the counterpart is assumed to be un-available. Default: 120s
-    heartbeat_period : int
-        Number of seconds after which a heartbeat message indicating liveness is sent to the
-        counterpart (interchange, manager). Default: 30s
-    poll_period : int
-        Timeout period to be used by the executor components in milliseconds. Increasing poll_periods
-        trades performance for cpu efficiency. Default: 10ms
-    worker_logdir_root : string
-        In case of a remote file system, specify the path to where logs will be kept.
-    enable_mpi_mode: bool
-        If enabled, MPI launch prefixes will be composed for the batch scheduler based on
-        the nodes available in each batch job and the resource_specification dict passed
-        from the app. This is an experimental feature, please refer to the following doc section
-        before use:  https://parsl.readthedocs.io/en/stable/userguide/mpi_apps.html
-    mpi_launcher: str
-        This field is only used if enable_mpi_mode is set. Select one from the
-        list of supported MPI launchers = ("srun", "aprun", "mpiexec").
-        default: "mpiexec"
-    encrypted : bool
-        Flag to enable/disable encryption (CurveZMQ). Default is False.
     """
     @typeguard.typechecked
@@ -224,7 +237,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                  label: str = 'HighThroughputExecutor',
                  provider: ExecutionProvider = LocalProvider(),
                  launch_cmd: Optional[str] = None,
+                 interchange_launch_cmd: Optional[Sequence[str]] = None,
                  address: Optional[str] = None,
+                 loopback_address: str = "127.0.0.1",
                  worker_ports: Optional[Tuple[int, int]] = None,
                  worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
                  interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
@@ -233,18 +248,17 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                  worker_debug: bool = False,
                  cores_per_worker: float = 1.0,
                  mem_per_worker: Optional[float] = None,
-                 max_workers: Optional[Union[int, float]] = None,
                  max_workers_per_node: Optional[Union[int, float]] = None,
                  cpu_affinity: str = 'none',
                  available_accelerators: Union[int, Sequence[str]] = (),
                  prefetch_capacity: int = 0,
                  heartbeat_threshold: int = 120,
                  heartbeat_period: int = 30,
+                 drain_period: Optional[int] = None,
                  poll_period: int = 10,
                  address_probe_timeout: Optional[int] = None,
                  worker_logdir_root: Optional[str] = None,
-                 enable_mpi_mode: bool = False,
-                 mpi_launcher: str = "mpiexec",
+                 manager_selector: ManagerSelector = RandomManagerSelector(),
                  block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
                  encrypted: bool = False):
@@ -260,14 +274,15 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         self.prefetch_capacity = prefetch_capacity
         self.address = address
         self.address_probe_timeout = address_probe_timeout
+        self.manager_selector = manager_selector
+        self.loopback_address = loopback_address
         if self.address:
             self.all_addresses = address
         else:
             self.all_addresses = ','.join(get_all_addresses())
-        if max_workers:
-            self._warn_deprecated("max_workers", "max_workers_per_node")
-        self.max_workers_per_node = max_workers_per_node or max_workers or float("inf")
+        self.max_workers_per_node = max_workers_per_node or float("inf")
         mem_slots = self.max_workers_per_node
         cpu_slots = self.max_workers_per_node
@@ -294,15 +309,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
             self._workers_per_node = 1  # our best guess-- we do not have any provider hints
         self._task_counter = 0
-        self.run_id = None  # set to the correct run_id in dfk
-        self.hub_address = None  # set to the correct hub address in dfk
-        self.hub_port = None  # set to the correct hub port in dfk
         self.worker_ports = worker_ports
         self.worker_port_range = worker_port_range
-        self.interchange_proc: Optional[Process] = None
+        self.interchange_proc: Optional[subprocess.Popen] = None
         self.interchange_port_range = interchange_port_range
         self.heartbeat_threshold = heartbeat_threshold
         self.heartbeat_period = heartbeat_period
+        self.drain_period = drain_period
         self.poll_period = poll_period
         self.run_dir = '.'
         self.worker_logdir_root = worker_logdir_root
@@ -310,20 +323,20 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         self.encrypted = encrypted
         self.cert_dir = None
-        self.enable_mpi_mode = enable_mpi_mode
-        assert mpi_launcher in VALID_LAUNCHERS, \
-            f"mpi_launcher must be set to one of {VALID_LAUNCHERS}"
-        if self.enable_mpi_mode:
-            assert isinstance(self.provider.launcher, parsl.launchers.SingleNodeLauncher), \
-                "mpi_mode requires the provider to be configured to use a SingleNodeLauncher"
-        self.mpi_launcher = mpi_launcher
         if not launch_cmd:
             launch_cmd = DEFAULT_LAUNCH_CMD
         self.launch_cmd = launch_cmd
+        if not interchange_launch_cmd:
+            interchange_launch_cmd = DEFAULT_INTERCHANGE_LAUNCH_CMD
+        self.interchange_launch_cmd = interchange_launch_cmd
+        self._result_queue_thread_exit = threading.Event()
+        self._result_queue_thread: Optional[threading.Thread] = None
     radio_mode = "htex"
+    enable_mpi_mode: bool = False
+    mpi_launcher: str = "mpiexec"
     def _warn_deprecated(self, old: str, new: str):
         warnings.warn(
@@ -333,16 +346,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
             stacklevel=2
         )
-    @property
-    def max_workers(self):
-        self._warn_deprecated("max_workers", "max_workers_per_node")
-        return self.max_workers_per_node
-    @max_workers.setter
-    def max_workers(self, val: Union[int, float]):
-        self._warn_deprecated("max_workers", "max_workers_per_node")
-        self.max_workers_per_node = val
     @property
     def logdir(self):
         return "{}/{}".format(self.run_dir, self.label)
@@ -353,6 +356,20 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
             return "{}/{}".format(self.worker_logdir_root, self.label)
         return self.logdir
+    def validate_resource_spec(self, resource_specification: dict):
+        """HTEX supports the following *Optional* resource specifications:
+        priority: lower value is higher priority"""
+        if resource_specification:
+            acceptable_fields = {'priority'}
+            keys = set(resource_specification.keys())
+            invalid_keys = keys - acceptable_fields
+            if invalid_keys:
+                message = "Task resource specification only accepts these types of resources: {}".format(
+                    ', '.join(acceptable_fields))
+                logger.error(message)
+                raise InvalidResourceSpecification(set(invalid_keys), message)
+        return
     def initialize_scaling(self):
         """Compose the launch command and scale out the initial blocks.
         """
@@ -376,6 +393,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                                        nodes_per_block=self.provider.nodes_per_block,
                                        heartbeat_period=self.heartbeat_period,
                                        heartbeat_threshold=self.heartbeat_threshold,
+                                       drain_period=self.drain_period,
                                        poll_period=self.poll_period,
                                        cert_dir=self.cert_dir,
                                        logdir=self.worker_logdir,
@@ -388,16 +406,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         logger.debug("Starting HighThroughputExecutor with provider:\n%s", self.provider)
-        # TODO: why is this a provider property?
-        block_ids = []
-        if hasattr(self.provider, 'init_blocks'):
-            try:
-                block_ids = self.scale_out(blocks=self.provider.init_blocks)
-            except Exception as e:
-                logger.error("Scaling out failed: {}".format(e))
-                raise e
-        return block_ids
     def start(self):
         """Create the Interchange process and connect to it.
         """
@@ -412,30 +420,28 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
             )
         self.outgoing_q = zmq_pipes.TasksOutgoing(
-            curvezmq.ClientContext(self.cert_dir), "127.0.0.1", self.interchange_port_range
+            self.loopback_address, self.interchange_port_range, self.cert_dir
         )
         self.incoming_q = zmq_pipes.ResultsIncoming(
-            curvezmq.ClientContext(self.cert_dir), "127.0.0.1", self.interchange_port_range
+            self.loopback_address, self.interchange_port_range, self.cert_dir
         )
         self.command_client = zmq_pipes.CommandClient(
-            curvezmq.ClientContext(self.cert_dir), "127.0.0.1", self.interchange_port_range
+            self.loopback_address, self.interchange_port_range, self.cert_dir
         )
-        self._queue_management_thread = None
-        self._start_queue_management_thread()
+        self._result_queue_thread = None
+        self._start_result_queue_thread()
         self._start_local_interchange_process()
-        logger.debug("Created management thread: {}".format(self._queue_management_thread))
+        logger.debug("Created result queue thread: %s", self._result_queue_thread)
-        block_ids = self.initialize_scaling()
-        return block_ids
+        self.initialize_scaling()
     @wrap_with_logs
-    def _queue_management_worker(self):
-        """Listen to the queue for task status messages and handle them.
+    def _result_queue_worker(self):
+        """Listen to the queue for task result messages and handle them.
-        Depending on the message, tasks will be updated with results, exceptions,
-        or updates. It expects the following messages:
+        Depending on the message, tasks will be updated with results or exceptions.
         .. code:: python
@@ -449,14 +455,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
                "task_id" : <task_id>
                "exception" : serialized exception object, on failure
             }
-        The `None` message is a die request.
         """
-        logger.debug("Queue management worker starting")
+        logger.debug("Result queue worker starting")
-        while not self.bad_state_is_set:
+        while not self.bad_state_is_set and not self._result_queue_thread_exit.is_set():
             try:
-                msgs = self.incoming_q.get()
+                msgs = self.incoming_q.get(timeout_ms=self.poll_period)
+                if msgs is None:  # timeout
+                    continue
             except IOError as e:
                 logger.exception("Caught broken queue with exception code {}: {}".format(e.errno, e))
@@ -468,109 +474,114 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
             else:
-                if msgs is None:
-                    logger.debug("Got None, exiting")
-                    return
+                for serialized_msg in msgs:
+                    try:
+                        msg = pickle.loads(serialized_msg)
+                    except pickle.UnpicklingError:
+                        raise BadMessage("Message received could not be unpickled")
-                else:
-                    for serialized_msg in msgs:
+                    if msg['type'] == 'result':
                         try:
-                            msg = pickle.loads(serialized_msg)
-                        except pickle.UnpicklingError:
-                            raise BadMessage("Message received could not be unpickled")
+                            tid = msg['task_id']
+                        except Exception:
+                            raise BadMessage("Message received does not contain 'task_id' field")
+                        if tid == -1 and 'exception' in msg:
+                            logger.warning("Executor shutting down due to exception from interchange")
+                            exception = deserialize(msg['exception'])
+                            self.set_bad_state_and_fail_all(exception)
+                            break
+                        task_fut = self.tasks.pop(tid)
-                        if msg['type'] == 'heartbeat':
-                            continue
-                        elif msg['type'] == 'result':
+                        if 'result' in msg:
+                            result = deserialize(msg['result'])
+                            task_fut.set_result(result)
+                        elif 'exception' in msg:
                             try:
-                                tid = msg['task_id']
-                            except Exception:
-                                raise BadMessage("Message received does not contain 'task_id' field")
-                            if tid == -1 and 'exception' in msg:
-                                logger.warning("Executor shutting down due to exception from interchange")
-                                exception = deserialize(msg['exception'])
-                                self.set_bad_state_and_fail_all(exception)
-                                break
-                            task_fut = self.tasks.pop(tid)
-                            if 'result' in msg:
-                                result = deserialize(msg['result'])
-                                task_fut.set_result(result)
-                            elif 'exception' in msg:
-                                try:
-                                    s = deserialize(msg['exception'])
-                                    # s should be a RemoteExceptionWrapper... so we can reraise it
-                                    if isinstance(s, RemoteExceptionWrapper):
-                                        try:
-                                            s.reraise()
-                                        except Exception as e:
-                                            task_fut.set_exception(e)
-                                    elif isinstance(s, Exception):
-                                        task_fut.set_exception(s)
-                                    else:
-                                        raise ValueError("Unknown exception-like type received: {}".format(type(s)))
-                                except Exception as e:
-                                    # TODO could be a proper wrapped exception?
-                                    task_fut.set_exception(
-                                        DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
-                            else:
-                                raise BadMessage("Message received is neither result or exception")
+                                s = deserialize(msg['exception'])
+                                # s should be a RemoteExceptionWrapper... so we can reraise it
+                                if isinstance(s, RemoteExceptionWrapper):
+                                    try:
+                                        s.reraise()
+                                    except Exception as e:
+                                        task_fut.set_exception(e)
+                                elif isinstance(s, Exception):
+                                    task_fut.set_exception(s)
+                                else:
+                                    raise ValueError("Unknown exception-like type received: {}".format(type(s)))
+                            except Exception as e:
+                                # TODO could be a proper wrapped exception?
+                                task_fut.set_exception(
+                                    DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
                         else:
-                            raise BadMessage("Message received with unknown type {}".format(msg['type']))
+                            raise BadMessage("Message received is neither result or exception")
+                    else:
+                        raise BadMessage("Message received with unknown type {}".format(msg['type']))
-        logger.info("Queue management worker finished")
+        logger.info("Closing result ZMQ pipe")
+        self.incoming_q.close()
+        logger.info("Result queue worker finished")
-    def _start_local_interchange_process(self):
+    def _start_local_interchange_process(self) -> None:
         """ Starts the interchange process locally
-        Starts the interchange process locally and uses an internal command queue to
+        Starts the interchange process locally and uses the command queue to
         get the worker task and result ports that the interchange has bound to.
         """
-        comm_q = Queue(maxsize=10)
-        self.interchange_proc = ForkProcess(target=interchange.starter,
-                                            args=(comm_q,),
-                                            kwargs={"client_ports": (self.outgoing_q.port,
-                                                                     self.incoming_q.port,
-                                                                     self.command_client.port),
-                                                    "interchange_address": self.address,
-                                                    "worker_ports": self.worker_ports,
-                                                    "worker_port_range": self.worker_port_range,
-                                                    "hub_address": self.hub_address,
-                                                    "hub_port": self.hub_port,
-                                                    "logdir": self.logdir,
-                                                    "heartbeat_threshold": self.heartbeat_threshold,
-                                                    "poll_period": self.poll_period,
-                                                    "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
-                                                    "cert_dir": self.cert_dir,
-                                                    },
-                                            daemon=True,
-                                            name="HTEX-Interchange"
-                                            )
-        self.interchange_proc.start()
+        interchange_config = {"client_address": self.loopback_address,
+                              "client_ports": (self.outgoing_q.port,
+                                               self.incoming_q.port,
+                                               self.command_client.port),
+                              "interchange_address": self.address,
+                              "worker_ports": self.worker_ports,
+                              "worker_port_range": self.worker_port_range,
+                              "hub_address": self.hub_address,
+                              "hub_zmq_port": self.hub_zmq_port,
+                              "logdir": self.logdir,
+                              "heartbeat_threshold": self.heartbeat_threshold,
+                              "poll_period": self.poll_period,
+                              "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
+                              "cert_dir": self.cert_dir,
+                              "manager_selector": self.manager_selector,
+                              "run_id": self.run_id,
+                              }
+        config_pickle = pickle.dumps(interchange_config)
+        self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd, stdin=subprocess.PIPE)
+        stdin = self.interchange_proc.stdin
+        assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
+        logger.debug("Popened interchange process. Writing config object")
+        stdin.write(config_pickle)
+        stdin.flush()
+        stdin.close()
+        logger.debug("Sent config object. Requesting worker ports")
         try:
-            (self.worker_task_port, self.worker_result_port) = comm_q.get(block=True, timeout=120)
-        except queue.Empty:
-            logger.error("Interchange has not completed initialization in 120s. Aborting")
+            (self.worker_task_port, self.worker_result_port) = self.command_client.run("WORKER_PORTS", timeout_s=120)
+        except CommandClientTimeoutError:
+            logger.error("Interchange has not completed initialization. Aborting")
             raise Exception("Interchange failed to start")
+        logger.debug("Got worker ports")
-    def _start_queue_management_thread(self):
-        """Method to start the management thread as a daemon.
+    def _start_result_queue_thread(self):
+        """Method to start the result queue thread as a daemon.
         Checks if a thread already exists, then starts it.
-        Could be used later as a restart if the management thread dies.
+        Could be used later as a restart if the result queue thread dies.
         """
-        if self._queue_management_thread is None:
-            logger.debug("Starting queue management thread")
-            self._queue_management_thread = threading.Thread(target=self._queue_management_worker, name="HTEX-Queue-Management-Thread")
-            self._queue_management_thread.daemon = True
-            self._queue_management_thread.start()
-            logger.debug("Started queue management thread")
+        if self._result_queue_thread is None:
+            logger.debug("Starting result queue thread")
+            self._result_queue_thread = threading.Thread(target=self._result_queue_worker, name="HTEX-Result-Queue-Thread")
+            self._result_queue_thread.daemon = True
+            self._result_queue_thread.start()
+            logger.debug("Started result queue thread")
         else:
-            logger.error("Management thread already exists, returning")
+            logger.error("Result queue thread already exists, returning")
     def hold_worker(self, worker_id: str) -> None:
         """Puts a worker on hold, preventing scheduling of additional tasks to it.
@@ -591,7 +602,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
     def outstanding(self) -> int:
         """Returns the count of tasks outstanding across the interchange
         and managers"""
-        return self.command_client.run("OUTSTANDING_C")
+        return len(self.tasks)
     @property
     def connected_workers(self) -> int:
@@ -643,7 +654,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         Returns:
               Future
         """
-        validate_resource_spec(resource_specification)
+        self.validate_resource_spec(resource_specification)
         if self.bad_state_is_set:
             raise self.executor_exception
@@ -667,7 +679,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         except TypeError:
             raise SerializationError(func.__name__)
-        msg = {"task_id": task_id, "buffer": fn_buf}
+        msg = {"task_id": task_id, "resource_spec": resource_specification, "buffer": fn_buf}
         # Post task to the outgoing queue
         self.outgoing_q.put(msg)
@@ -675,22 +687,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         # Return the future
         return fut
-    def create_monitoring_info(self, status):
-        """ Create a msg for monitoring based on the poll status
-        """
-        msg = []
-        for bid, s in status.items():
-            d = {}
-            d['run_id'] = self.run_id
-            d['status'] = s.status_name
-            d['timestamp'] = datetime.datetime.now()
-            d['executor_label'] = self.label
-            d['job_id'] = self.blocks.get(bid, None)
-            d['block_id'] = bid
-            msg.append(d)
-        return msg
     @property
     def workers_per_node(self) -> Union[int, float]:
         return self._workers_per_node
@@ -728,14 +724,24 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
             tasks: int  # sum of tasks in this block
             idle: float  # shortest idle time of any manager in this block
+        # block_info will be populated from two sources:
+        # the Job Status Poller mutable block list, and the list of blocks
+        # which have connected to the interchange.
+        def new_block_info():
+            return BlockInfo(tasks=0, idle=float('inf'))
+        block_info: Dict[str, BlockInfo] = defaultdict(new_block_info)
+        for block_id, job_status in self._status.items():
+            if job_status.state not in TERMINAL_STATES:
+                block_info[block_id] = new_block_info()
         managers = self.connected_managers()
-        block_info: Dict[str, BlockInfo] = {}
         for manager in managers:
             if not manager['active']:
                 continue
             b_id = manager['block_id']
-            if b_id not in block_info:
-                block_info[b_id] = BlockInfo(tasks=0, idle=float('inf'))
             block_info[b_id].tasks += manager['tasks']
             block_info[b_id].idle = min(block_info[b_id].idle, manager['idle_duration'])
@@ -767,14 +773,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         # Now kill via provider
         # Potential issue with multiple threads trying to remove the same blocks
-        to_kill = [self.blocks[bid] for bid in block_ids_to_kill if bid in self.blocks]
+        to_kill = [self.blocks_to_job_id[bid] for bid in block_ids_to_kill if bid in self.blocks_to_job_id]
         r = self.provider.cancel(to_kill)
         job_ids = self._filter_scale_in_ids(to_kill, r)
-        # to_kill block_ids are fetched from self.blocks
-        # If a block_id is in self.block, it must exist in self.block_mapping
-        block_ids_killed = [self.block_mapping[jid] for jid in job_ids]
+        # to_kill block_ids are fetched from self.blocks_to_job_id
+        # If a block_id is in self.blocks_to_job_id, it must exist in self.job_ids_to_block
+        block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
         return block_ids_killed
@@ -789,7 +795,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         connected_blocks = self.connected_blocks()
         for job_id in job_status:
             job_info = job_status[job_id]
-            if job_info.terminal and job_id not in connected_blocks:
+            if job_info.terminal and job_id not in connected_blocks and job_info.state != JobState.SCALED_IN:
+                logger.debug("Rewriting job %s from status %s to MISSING", job_id, job_info)
                 job_status[job_id].state = JobState.MISSING
                 if job_status[job_id].message is None:
                     job_status[job_id].message = (
@@ -817,10 +824,37 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
         logger.info("Attempting HighThroughputExecutor shutdown")
+        logger.info("Terminating interchange and result queue thread")
+        self._result_queue_thread_exit.set()
         self.interchange_proc.terminate()
-        self.interchange_proc.join(timeout=timeout)
-        if self.interchange_proc.is_alive():
-            logger.info("Unable to terminate Interchange process; sending SIGKILL")
+        try:
+            self.interchange_proc.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            logger.warning("Unable to terminate Interchange process; sending SIGKILL")
             self.interchange_proc.kill()
+        logger.info("Closing ZMQ pipes")
+        # These pipes are used in a thread unsafe manner. If you have traced a
+        # problem to this block of code, you might consider what is happening
+        # with other threads that access these.
+        # incoming_q is not closed here because it is used by the results queue
+        # worker which is not shut down at this point.
+        if hasattr(self, 'outgoing_q'):
+            logger.info("Closing outgoing_q")
+            self.outgoing_q.close()
+        if hasattr(self, 'command_client'):
+            logger.info("Closing command client")
+            self.command_client.close()
+        logger.info("Waiting for result queue thread exit")
+        if self._result_queue_thread:
+            self._result_queue_thread.join()
         logger.info("Finished HighThroughputExecutor shutdown attempt")
+    def get_usage_information(self):
+        return {"mpi": self.enable_mpi_mode}

parsl 2024.3.11__py3-none-any.whl → 2025.1.13__py3-none-any.whl

parsl 2024.3.11py3-none-any.whl → 2025.1.13py3-none-any.whl