parsl-2024.3.18-py3-none-any.whl → parsl-2025.1.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/__init__.py +9 -10
- parsl/addresses.py +26 -6
- parsl/app/app.py +7 -8
- parsl/app/bash.py +15 -8
- parsl/app/errors.py +10 -13
- parsl/app/futures.py +8 -10
- parsl/app/python.py +2 -1
- parsl/benchmark/perf.py +2 -1
- parsl/concurrent/__init__.py +2 -2
- parsl/config.py +53 -10
- parsl/configs/ASPIRE1.py +6 -5
- parsl/configs/Azure.py +9 -8
- parsl/configs/bridges.py +6 -4
- parsl/configs/cc_in2p3.py +3 -3
- parsl/configs/ec2.py +3 -1
- parsl/configs/expanse.py +4 -3
- parsl/configs/frontera.py +3 -4
- parsl/configs/htex_local.py +3 -4
- parsl/configs/illinoiscluster.py +3 -1
- parsl/configs/improv.py +34 -0
- parsl/configs/kubernetes.py +4 -3
- parsl/configs/local_threads.py +5 -1
- parsl/configs/midway.py +5 -3
- parsl/configs/osg.py +4 -2
- parsl/configs/polaris.py +4 -2
- parsl/configs/stampede2.py +6 -5
- parsl/configs/summit.py +3 -3
- parsl/configs/toss3_llnl.py +4 -3
- parsl/configs/vineex_local.py +6 -4
- parsl/configs/wqex_local.py +5 -3
- parsl/curvezmq.py +4 -0
- parsl/data_provider/data_manager.py +4 -3
- parsl/data_provider/file_noop.py +1 -2
- parsl/data_provider/files.py +3 -3
- parsl/data_provider/ftp.py +1 -3
- parsl/data_provider/globus.py +7 -6
- parsl/data_provider/http.py +2 -2
- parsl/data_provider/rsync.py +1 -1
- parsl/data_provider/staging.py +2 -2
- parsl/data_provider/zip.py +135 -0
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +259 -223
- parsl/dataflow/errors.py +3 -5
- parsl/dataflow/futures.py +27 -14
- parsl/dataflow/memoization.py +5 -5
- parsl/dataflow/rundirs.py +5 -6
- parsl/dataflow/taskrecord.py +4 -5
- parsl/executors/__init__.py +4 -2
- parsl/executors/base.py +45 -15
- parsl/executors/errors.py +13 -0
- parsl/executors/execute_task.py +37 -0
- parsl/executors/flux/execute_parsl_task.py +3 -3
- parsl/executors/flux/executor.py +18 -19
- parsl/executors/flux/flux_instance_manager.py +26 -27
- parsl/executors/high_throughput/errors.py +43 -3
- parsl/executors/high_throughput/executor.py +307 -285
- parsl/executors/high_throughput/interchange.py +137 -168
- parsl/executors/high_throughput/manager_record.py +4 -0
- parsl/executors/high_throughput/manager_selector.py +55 -0
- parsl/executors/high_throughput/monitoring_info.py +2 -1
- parsl/executors/high_throughput/mpi_executor.py +113 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
- parsl/executors/high_throughput/mpi_resource_management.py +6 -17
- parsl/executors/high_throughput/probe.py +9 -7
- parsl/executors/high_throughput/process_worker_pool.py +77 -75
- parsl/executors/high_throughput/zmq_pipes.py +81 -23
- parsl/executors/radical/executor.py +130 -79
- parsl/executors/radical/rpex_resources.py +17 -15
- parsl/executors/radical/rpex_worker.py +4 -3
- parsl/executors/status_handling.py +157 -51
- parsl/executors/taskvine/__init__.py +1 -1
- parsl/executors/taskvine/errors.py +1 -1
- parsl/executors/taskvine/exec_parsl_function.py +2 -2
- parsl/executors/taskvine/executor.py +38 -55
- parsl/executors/taskvine/factory.py +1 -1
- parsl/executors/taskvine/factory_config.py +1 -1
- parsl/executors/taskvine/manager.py +17 -13
- parsl/executors/taskvine/manager_config.py +7 -2
- parsl/executors/threads.py +6 -6
- parsl/executors/workqueue/errors.py +1 -1
- parsl/executors/workqueue/exec_parsl_function.py +6 -5
- parsl/executors/workqueue/executor.py +64 -63
- parsl/executors/workqueue/parsl_coprocess.py +1 -1
- parsl/jobs/error_handlers.py +2 -2
- parsl/jobs/job_status_poller.py +28 -112
- parsl/jobs/states.py +7 -2
- parsl/jobs/strategy.py +43 -31
- parsl/launchers/__init__.py +12 -3
- parsl/launchers/errors.py +1 -1
- parsl/launchers/launchers.py +0 -6
- parsl/log_utils.py +1 -2
- parsl/monitoring/db_manager.py +55 -93
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +85 -311
- parsl/monitoring/queries/pandas.py +1 -2
- parsl/monitoring/radios/base.py +13 -0
- parsl/monitoring/radios/filesystem.py +52 -0
- parsl/monitoring/radios/htex.py +57 -0
- parsl/monitoring/radios/multiprocessing.py +17 -0
- parsl/monitoring/radios/udp.py +56 -0
- parsl/monitoring/radios/zmq.py +17 -0
- parsl/monitoring/remote.py +33 -37
- parsl/monitoring/router.py +212 -0
- parsl/monitoring/types.py +5 -6
- parsl/monitoring/visualization/app.py +4 -2
- parsl/monitoring/visualization/models.py +0 -1
- parsl/monitoring/visualization/plots/default/workflow_plots.py +8 -4
- parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
- parsl/monitoring/visualization/utils.py +0 -1
- parsl/monitoring/visualization/views.py +16 -9
- parsl/multiprocessing.py +0 -1
- parsl/process_loggers.py +1 -2
- parsl/providers/__init__.py +8 -17
- parsl/providers/aws/aws.py +2 -3
- parsl/providers/azure/azure.py +4 -5
- parsl/providers/base.py +2 -18
- parsl/providers/cluster_provider.py +3 -9
- parsl/providers/condor/condor.py +7 -17
- parsl/providers/errors.py +2 -2
- parsl/providers/googlecloud/googlecloud.py +2 -1
- parsl/providers/grid_engine/grid_engine.py +5 -14
- parsl/providers/kubernetes/kube.py +80 -40
- parsl/providers/local/local.py +13 -26
- parsl/providers/lsf/lsf.py +5 -23
- parsl/providers/pbspro/pbspro.py +5 -17
- parsl/providers/slurm/slurm.py +81 -39
- parsl/providers/torque/torque.py +3 -14
- parsl/serialize/__init__.py +8 -3
- parsl/serialize/base.py +1 -2
- parsl/serialize/concretes.py +5 -4
- parsl/serialize/facade.py +3 -3
- parsl/serialize/proxystore.py +3 -2
- parsl/tests/__init__.py +1 -1
- parsl/tests/configs/azure_single_node.py +4 -5
- parsl/tests/configs/bridges.py +3 -2
- parsl/tests/configs/cc_in2p3.py +1 -3
- parsl/tests/configs/comet.py +2 -1
- parsl/tests/configs/ec2_single_node.py +1 -2
- parsl/tests/configs/ec2_spot.py +1 -2
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/configs/frontera.py +2 -3
- parsl/tests/configs/htex_local.py +3 -5
- parsl/tests/configs/htex_local_alternate.py +11 -15
- parsl/tests/configs/htex_local_intask_staging.py +5 -9
- parsl/tests/configs/htex_local_rsync_staging.py +4 -8
- parsl/tests/configs/local_radical.py +1 -3
- parsl/tests/configs/local_radical_mpi.py +2 -2
- parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/configs/midway.py +2 -2
- parsl/tests/configs/nscc_singapore.py +3 -3
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +3 -2
- parsl/tests/configs/slurm_local.py +24 -0
- parsl/tests/configs/summit.py +1 -0
- parsl/tests/configs/taskvine_ex.py +4 -7
- parsl/tests/configs/user_opts.py +0 -7
- parsl/tests/configs/workqueue_ex.py +4 -6
- parsl/tests/conftest.py +27 -13
- parsl/tests/integration/test_stress/test_python_simple.py +3 -4
- parsl/tests/integration/test_stress/test_python_threads.py +3 -5
- parsl/tests/manual_tests/htex_local.py +4 -6
- parsl/tests/manual_tests/test_basic.py +1 -0
- parsl/tests/manual_tests/test_log_filter.py +3 -1
- parsl/tests/manual_tests/test_memory_limits.py +6 -8
- parsl/tests/manual_tests/test_regression_220.py +2 -1
- parsl/tests/manual_tests/test_udp_simple.py +4 -4
- parsl/tests/manual_tests/test_worker_count.py +3 -2
- parsl/tests/scaling_tests/htex_local.py +2 -4
- parsl/tests/scaling_tests/test_scale.py +0 -9
- parsl/tests/scaling_tests/vineex_condor.py +1 -2
- parsl/tests/scaling_tests/vineex_local.py +1 -2
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/site_tests/test_provider.py +4 -2
- parsl/tests/site_tests/test_site.py +2 -0
- parsl/tests/sites/test_affinity.py +7 -7
- parsl/tests/sites/test_dynamic_executor.py +3 -4
- parsl/tests/sites/test_ec2.py +3 -2
- parsl/tests/sites/test_worker_info.py +4 -5
- parsl/tests/test_aalst_patterns.py +0 -1
- parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
- parsl/tests/test_bash_apps/test_basic.py +10 -4
- parsl/tests/test_bash_apps/test_error_codes.py +5 -7
- parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
- parsl/tests/test_bash_apps/test_memoize.py +2 -8
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
- parsl/tests/test_bash_apps/test_multiline.py +1 -1
- parsl/tests/test_bash_apps/test_pipeline.py +1 -1
- parsl/tests/test_bash_apps/test_std_uri.py +123 -0
- parsl/tests/test_bash_apps/test_stdout.py +33 -8
- parsl/tests/test_callables.py +2 -2
- parsl/tests/test_checkpointing/test_periodic.py +21 -39
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_regression_239.py +1 -1
- parsl/tests/test_checkpointing/test_task_exit.py +2 -3
- parsl/tests/test_docs/test_from_slides.py +5 -2
- parsl/tests/test_docs/test_kwargs.py +4 -1
- parsl/tests/test_docs/test_tutorial_1.py +1 -2
- parsl/tests/test_docs/test_workflow1.py +2 -2
- parsl/tests/test_docs/test_workflow2.py +0 -1
- parsl/tests/test_error_handling/test_rand_fail.py +2 -2
- parsl/tests/test_error_handling/test_resource_spec.py +10 -12
- parsl/tests/test_error_handling/test_retries.py +6 -16
- parsl/tests/test_error_handling/test_retry_handler.py +1 -0
- parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
- parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
- parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
- parsl/tests/test_execute_task.py +29 -0
- parsl/tests/test_flux.py +1 -1
- parsl/tests/test_htex/test_basic.py +2 -3
- parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
- parsl/tests/test_htex/test_command_client_timeout.py +66 -0
- parsl/tests/test_htex/test_connected_blocks.py +3 -2
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
- parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_drain.py +11 -10
- parsl/tests/test_htex/test_htex.py +51 -25
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
- parsl/tests/test_htex/test_managers_command.py +36 -0
- parsl/tests/test_htex/test_missing_worker.py +2 -12
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
- parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
- parsl/tests/test_htex/test_zmq_binding.py +29 -8
- parsl/tests/test_monitoring/test_app_names.py +5 -5
- parsl/tests/test_monitoring/test_basic.py +73 -25
- parsl/tests/test_monitoring/test_db_locks.py +6 -4
- parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
- parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
- parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
- parsl/tests/test_monitoring/test_stdouterr.py +134 -0
- parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
- parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
- parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
- parsl/tests/test_providers/test_local_provider.py +3 -132
- parsl/tests/test_providers/test_pbspro_template.py +2 -3
- parsl/tests/test_providers/test_slurm_template.py +2 -3
- parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
- parsl/tests/test_python_apps/test_context_manager.py +128 -0
- parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_python_apps/test_fail.py +0 -25
- parsl/tests/test_python_apps/test_futures.py +2 -1
- parsl/tests/test_python_apps/test_inputs_default.py +22 -0
- parsl/tests/test_python_apps/test_join.py +0 -1
- parsl/tests/test_python_apps/test_lifted.py +11 -7
- parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
- parsl/tests/test_python_apps/test_outputs.py +1 -1
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_radical/test_mpi_funcs.py +1 -2
- parsl/tests/test_regression/test_1480.py +2 -1
- parsl/tests/test_regression/test_1653.py +2 -1
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_regression/test_2652.py +1 -0
- parsl/tests/test_regression/test_69a.py +0 -1
- parsl/tests/test_regression/test_854.py +4 -2
- parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
- parsl/tests/test_regression/test_98.py +0 -1
- parsl/tests/test_scaling/test_block_error_handler.py +9 -4
- parsl/tests/test_scaling/test_regression_1621.py +11 -15
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
- parsl/tests/test_scaling/test_scale_down.py +2 -5
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +5 -8
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
- parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
- parsl/tests/test_serialization/test_basic.py +2 -1
- parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
- parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
- parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
- parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
- parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
- parsl/tests/test_staging/staging_provider.py +2 -2
- parsl/tests/test_staging/test_1316.py +3 -4
- parsl/tests/test_staging/test_docs_1.py +2 -1
- parsl/tests/test_staging/test_docs_2.py +2 -1
- parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
- parsl/tests/{test_data → test_staging}/test_file.py +6 -6
- parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +5 -2
- parsl/tests/test_staging/test_staging_stdout.py +64 -0
- parsl/tests/test_staging/test_zip_in.py +39 -0
- parsl/tests/test_staging/test_zip_out.py +110 -0
- parsl/tests/test_staging/test_zip_to_zip.py +41 -0
- parsl/tests/test_summary.py +2 -2
- parsl/tests/test_thread_parallelism.py +0 -1
- parsl/tests/test_threads/test_configs.py +1 -2
- parsl/tests/test_threads/test_lazy_errors.py +2 -2
- parsl/tests/test_utils/test_execute_wait.py +35 -0
- parsl/tests/test_utils/test_sanitize_dns.py +76 -0
- parsl/tests/unit/test_address.py +20 -0
- parsl/tests/unit/test_file.py +99 -0
- parsl/tests/unit/test_usage_tracking.py +66 -0
- parsl/usage_tracking/api.py +65 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +104 -62
- parsl/utils.py +137 -4
- parsl/version.py +1 -1
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
- parsl-2025.1.13.data/scripts/interchange.py +649 -0
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +77 -75
- parsl-2025.1.13.dist-info/METADATA +96 -0
- parsl-2025.1.13.dist-info/RECORD +462 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
- parsl/channels/__init__.py +0 -7
- parsl/channels/base.py +0 -141
- parsl/channels/errors.py +0 -113
- parsl/channels/local/local.py +0 -164
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
- parsl/channels/ssh/ssh.py +0 -276
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -74
- parsl/configs/ad_hoc.py +0 -35
- parsl/executors/radical/rpex_master.py +0 -42
- parsl/monitoring/radios.py +0 -175
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -248
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/integration/test_channels/__init__.py +0 -0
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- parsl/tests/integration/test_channels/test_local_channel.py +0 -42
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl/tests/sites/test_local_adhoc.py +0 -61
- parsl/tests/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/test_large_output.py +0 -22
- parsl/tests/test_data/__init__.py +0 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
- parsl-2024.3.18.dist-info/METADATA +0 -98
- parsl-2024.3.18.dist-info/RECORD +0 -449
- parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
- parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
parsl/executors/high_throughput/manager_selector.py (new file):

```diff
@@ -0,0 +1,55 @@
+import random
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List, Set
+
+from parsl.executors.high_throughput.manager_record import ManagerRecord
+
+
+class ManagerSelector(metaclass=ABCMeta):
+
+    @abstractmethod
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        """ Sort a given list of managers.
+
+        Any operations pertaining to the sorting and rearrangement of the
+        interesting_managers Set should be performed here.
+        """
+        pass
+
+
+class RandomManagerSelector(ManagerSelector):
+
+    """Returns a shuffled list of interesting_managers
+
+    By default this strategy is used by the interchange. Works well
+    in distributing workloads equally across all availble compute
+    resources. The random workload strategy is not effective in
+    conjunction with elastic scaling behavior as the even task
+    distribution does not allow the scaling down of blocks, leading
+    to wasted resource consumption.
+    """
+
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        c_manager_list = list(manager_list)
+        random.shuffle(c_manager_list)
+        return c_manager_list
+
+
+class BlockIdManagerSelector(ManagerSelector):
+
+    """Returns an interesting_managers list sorted by block ID
+
+    Observations:
+    1. BlockID manager selector helps with workloads that see a varying
+       amount of tasks over time. New blocks are prioritized with the
+       blockID manager selector, when used with 'htex_auto_scaling', results
+       in compute cost savings.
+
+    2. Doesn't really work with bag-of-tasks workloads. When all the tasks
+       are put into the queue upfront, all blocks operate at near full
+       utilization for the majority of the workload, which task goes where
+       doesn't really matter.
+    """
+
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        return sorted(manager_list, key=lambda x: (ready_managers[x]['block_id'] is not None, ready_managers[x]['block_id']))
```
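The selector plugs into the executor through the new `manager_selector` parameter on `HighThroughputExecutor` (exercised by `parsl/tests/test_htex/test_manager_selector_by_block.py` in the file list above). A minimal configuration sketch, assuming that parameter name:

```python
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.executors.high_throughput.manager_selector import BlockIdManagerSelector

# Assumption: HighThroughputExecutor accepts a manager_selector argument.
# BlockIdManagerSelector concentrates tasks by block ID so that, under
# htex_auto_scale, lightly used blocks can drain and be scaled in.
config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex",
            manager_selector=BlockIdManagerSelector(),
        )
    ]
)
```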
parsl/executors/high_throughput/mpi_executor.py (new file):

```diff
@@ -0,0 +1,113 @@
+"""A simplified interface for HTEx when running in MPI mode"""
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import typeguard
+
+from parsl.data_provider.staging import Staging
+from parsl.executors.high_throughput.executor import (
+    GENERAL_HTEX_PARAM_DOCS,
+    HighThroughputExecutor,
+)
+from parsl.executors.high_throughput.mpi_prefix_composer import (
+    VALID_LAUNCHERS,
+    validate_resource_spec,
+)
+from parsl.executors.status_handling import BlockProviderExecutor
+from parsl.jobs.states import JobStatus
+from parsl.launchers import SimpleLauncher
+from parsl.providers import LocalProvider
+from parsl.providers.base import ExecutionProvider
+
+
+class MPIExecutor(HighThroughputExecutor):
+    __doc__ = f"""A version of :class:`~parsl.HighThroughputExecutor` tuned for executing multi-node (e.g., MPI) tasks.
+
+    The Provider _must_ use the :class:`~parsl.launchers.SimpleLauncher`,
+    which places a single pool of workers on the first node of a block.
+    Each worker can then make system calls which use an MPI launcher (e.g., ``mpirun``, ``srun``)
+    to spawn multi-node tasks.
+
+    Specify the maximum number of multi-node tasks to run at once using ``max_workers_per_block``.
+    The value should be less than or equal to the ``nodes_per_block`` in the Provider.
+
+    Parameters
+    ----------
+    max_workers_per_block: int
+        Maximum number of MPI applications to run at once per block
+
+    mpi_launcher: str
+        Select one from the list of supported MPI launchers:
+        ("srun", "aprun", "mpiexec").
+        default: "mpiexec"
+
+    {GENERAL_HTEX_PARAM_DOCS}
+    """
+
+    @typeguard.typechecked
+    def __init__(self,
+                 label: str = 'MPIExecutor',
+                 provider: ExecutionProvider = LocalProvider(),
+                 launch_cmd: Optional[str] = None,
+                 interchange_launch_cmd: Optional[str] = None,
+                 address: Optional[str] = None,
+                 loopback_address: str = "127.0.0.1",
+                 worker_ports: Optional[Tuple[int, int]] = None,
+                 worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
+                 interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
+                 storage_access: Optional[List[Staging]] = None,
+                 working_dir: Optional[str] = None,
+                 worker_debug: bool = False,
+                 max_workers_per_block: int = 1,
+                 prefetch_capacity: int = 0,
+                 heartbeat_threshold: int = 120,
+                 heartbeat_period: int = 30,
+                 drain_period: Optional[int] = None,
+                 poll_period: int = 10,
+                 address_probe_timeout: Optional[int] = None,
+                 worker_logdir_root: Optional[str] = None,
+                 mpi_launcher: str = "mpiexec",
+                 block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
+                 encrypted: bool = False):
+        super().__init__(
+            # Hard-coded settings
+            cores_per_worker=1e-9,  # Ensures there will be at least an absurd number of workers
+            max_workers_per_node=max_workers_per_block,
+
+            # Everything else
+            label=label,
+            provider=provider,
+            launch_cmd=launch_cmd,
+            interchange_launch_cmd=interchange_launch_cmd,
+            address=address,
+            loopback_address=loopback_address,
+            worker_ports=worker_ports,
+            worker_port_range=worker_port_range,
+            interchange_port_range=interchange_port_range,
+            storage_access=storage_access,
+            working_dir=working_dir,
+            worker_debug=worker_debug,
+            prefetch_capacity=prefetch_capacity,
+            heartbeat_threshold=heartbeat_threshold,
+            heartbeat_period=heartbeat_period,
+            drain_period=drain_period,
+            poll_period=poll_period,
+            address_probe_timeout=address_probe_timeout,
+            worker_logdir_root=worker_logdir_root,
+            block_error_handler=block_error_handler,
+            encrypted=encrypted
+        )
+        self.enable_mpi_mode = True
+        self.mpi_launcher = mpi_launcher
+
+        self.max_workers_per_block = max_workers_per_block
+
+        if not isinstance(self.provider.launcher, SimpleLauncher):
+            raise TypeError("mpi_mode requires the provider to be configured to use a SimpleLauncher")
+
+        if mpi_launcher not in VALID_LAUNCHERS:
+            raise ValueError(f"mpi_launcher set to:{mpi_launcher} must be set to one of {VALID_LAUNCHERS}")
+
+        self.mpi_launcher = mpi_launcher
+
+    def validate_resource_spec(self, resource_specification: dict):
+        return validate_resource_spec(resource_specification)
```
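A configuration sketch for the new executor; the `SlurmProvider` values below are illustrative assumptions, not taken from this diff:

```python
from parsl.config import Config
from parsl.executors.high_throughput.mpi_executor import MPIExecutor
from parsl.launchers import SimpleLauncher
from parsl.providers import SlurmProvider

config = Config(
    executors=[
        MPIExecutor(
            label="mpi",
            max_workers_per_block=2,  # at most two concurrent MPI apps per block
            mpi_launcher="srun",      # must be one of VALID_LAUNCHERS
            provider=SlurmProvider(
                nodes_per_block=4,    # >= nodes requested by any single task
                launcher=SimpleLauncher(),  # required, per the TypeError check above
            ),
        )
    ]
)
```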
parsl/executors/high_throughput/mpi_prefix_composer.py:

```diff
@@ -1,5 +1,7 @@
 import logging
-from typing import Dict, List, Set, Tuple
+from typing import Dict, List, Tuple
+
+from parsl.executors.errors import InvalidResourceSpecification
 
 logger = logging.getLogger(__name__)
 
@@ -8,16 +10,6 @@ VALID_LAUNCHERS = ('srun',
                    'mpiexec')
 
 
-class InvalidResourceSpecification(Exception):
-    """Exception raised when Invalid keys are supplied via resource specification"""
-
-    def __init__(self, invalid_keys: Set[str]):
-        self.invalid_keys = invalid_keys
-
-    def __str__(self):
-        return f"Invalid resource specification options supplied: {self.invalid_keys}"
-
-
 def validate_resource_spec(resource_spec: Dict[str, str]):
     """Basic validation of keys in the resource_spec
 
@@ -25,6 +17,13 @@ def validate_resource_spec(resource_spec: Dict[str, str]):
     is invalid (e.g, contains invalid keys)
     """
     user_keys = set(resource_spec.keys())
+
+    # empty resource_spec when mpi_mode is set causes parsl to hang
+    # ref issue #3427
+    if len(user_keys) == 0:
+        raise InvalidResourceSpecification(user_keys,
+                                           'MPI mode requires optional parsl_resource_specification keyword argument to be configured')
+
     legal_keys = set(("ranks_per_node",
                       "num_nodes",
                       "num_ranks",
```
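The effect of the tightened validation, assuming the relocated `InvalidResourceSpecification` keeps the two-argument form used in the `raise` above:

```python
from parsl.executors.errors import InvalidResourceSpecification
from parsl.executors.high_throughput.mpi_prefix_composer import validate_resource_spec

# An empty spec now fails fast instead of hanging (issue #3427)...
try:
    validate_resource_spec({})
except InvalidResourceSpecification as e:
    print(e)

# ...while a spec built from the legal keys passes silently.
validate_resource_spec({"num_nodes": "2", "ranks_per_node": "4"})
```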
parsl/executors/high_throughput/mpi_resource_management.py:

```diff
@@ -8,8 +8,7 @@ from enum import Enum
 from typing import Dict, List
 
 from parsl.multiprocessing import SpawnContext
-from parsl.serialize import (pack_res_spec_apply_message,
-                             unpack_res_spec_apply_message)
+from parsl.serialize import pack_res_spec_apply_message, unpack_res_spec_apply_message
 
 logger = logging.getLogger(__name__)
 
@@ -18,7 +17,6 @@ class Scheduler(Enum):
     Unknown = 0
     Slurm = 1
     PBS = 2
-    Cobalt = 3
 
 
 def get_slurm_hosts_list() -> List[str]:
@@ -38,13 +36,6 @@ def get_pbs_hosts_list() -> List[str]:
         return [line.strip() for line in f.readlines()]
 
 
-def get_cobalt_hosts_list() -> List[str]:
-    """Get list of COBALT hosts from envvar: COBALT_NODEFILE"""
-    nodefile_name = os.environ["COBALT_NODEFILE"]
-    with open(nodefile_name) as f:
-        return [line.strip() for line in f.readlines()]
-
-
 def get_nodes_in_batchjob(scheduler: Scheduler) -> List[str]:
     """Get nodelist from all supported schedulers"""
     nodelist = []
@@ -52,8 +43,6 @@ def get_nodes_in_batchjob(scheduler: Scheduler) -> List[str]:
         nodelist = get_slurm_hosts_list()
     elif scheduler == Scheduler.PBS:
         nodelist = get_pbs_hosts_list()
-    elif scheduler == Scheduler.Cobalt:
-        nodelist = get_cobalt_hosts_list()
     else:
         raise RuntimeError(f"mpi_mode does not support scheduler:{scheduler}")
     return nodelist
@@ -65,8 +54,6 @@ def identify_scheduler() -> Scheduler:
         return Scheduler.Slurm
     elif os.environ.get("PBS_NODEFILE"):
         return Scheduler.PBS
-    elif os.environ.get("COBALT_NODEFILE"):
-        return Scheduler.Cobalt
     else:
         return Scheduler.Unknown
 
@@ -173,9 +160,7 @@ class MPITaskScheduler(TaskScheduler):
         """Schedule task if resources are available otherwise backlog the task"""
        user_ns = locals()
        user_ns.update({"__builtins__": __builtins__})
-        _f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(
-            task_package["buffer"], user_ns, copy=False
-        )
+        _f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(task_package["buffer"])
 
         nodes_needed = resource_spec.get("num_nodes")
         if nodes_needed:
@@ -190,6 +175,7 @@ class MPITaskScheduler(TaskScheduler):
             self._map_tasks_to_nodes[task_package["task_id"]] = allocated_nodes
             buffer = pack_res_spec_apply_message(_f, _args, _kwargs, resource_spec)
             task_package["buffer"] = buffer
+            task_package["resource_spec"] = resource_spec
 
         self.pending_task_q.put(task_package)
 
@@ -208,8 +194,11 @@ class MPITaskScheduler(TaskScheduler):
         """Return result and relinquish provisioned nodes"""
         result_pkl = self.pending_result_q.get(block, timeout=timeout)
         result_dict = pickle.loads(result_pkl)
+        # TODO (wardlt): If the task did not request nodes, it won't be in `self._map_tasks_to_nodes`.
+        # Causes Parsl to hang. See Issue #3427
         if result_dict["type"] == "result":
             task_id = result_dict["task_id"]
+            assert task_id in self._map_tasks_to_nodes, "You are about to experience issue #3427"
             nodes_to_reallocate = self._map_tasks_to_nodes[task_id]
             self._return_nodes(nodes_to_reallocate)
             self._schedule_backlog_tasks()
```
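The resource specification that `MPITaskScheduler` unpacks here travels with each task. A sketch of an MPI app under the usual parsl convention; `$PARSL_MPI_PREFIX` is an assumption based on the `PARSL_`-prefixed environment variables set elsewhere in this diff:

```python
from parsl import bash_app

@bash_app
def mpi_hello(parsl_resource_specification={"num_nodes": 2, "ranks_per_node": 4}):
    # The worker composes an MPI launch prefix from the spec and exports
    # it into the task environment along with the allocated node list.
    return '$PARSL_MPI_PREFIX hostname'
```

Note the new assert above: a task whose specification omits `num_nodes` never enters `_map_tasks_to_nodes`, which is exactly the hang described in issue #3427.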
parsl/executors/high_throughput/probe.py:

```diff
@@ -1,11 +1,13 @@
-import zmq
 import argparse
-import uuid
-import time
 import logging
-
+import time
+import uuid
+
+import zmq
 from zmq.utils.monitor import recv_monitor_message
 
+from parsl.addresses import get_all_addresses, tcp_url
+
 logger = logging.getLogger(__name__)
 
 
@@ -30,7 +32,8 @@ def probe_addresses(addresses, task_port, timeout=120):
     for addr in addresses:
         socket = context.socket(zmq.DEALER)
         socket.setsockopt(zmq.LINGER, 0)
-        url = "tcp://{}:{}".format(addr, task_port)
+        socket.setsockopt(zmq.IPV6, True)
+        url = tcp_url(addr, task_port)
         logger.debug("Trying to connect back on {}".format(url))
         socket.connect(url)
         addr_map[addr] = {'sock': socket,
@@ -69,8 +72,7 @@ class TestWorker:
 
         address = probe_addresses(addresses, port)
         print("Viable address :", address)
-        self.task_incoming.connect("tcp://{}:{}".format(address, port))
-        print("Here")
+        self.task_incoming.connect(tcp_url(address, port))
 
     def heartbeat(self):
         """ Send heartbeat to the incoming task queue
```
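Both probe.py hunks replace hand-built `"tcp://{}:{}"` strings with `parsl.addresses.tcp_url` and enable `zmq.IPV6` on the socket. A sketch of what such a helper must handle — an illustration, not the shipped implementation in `parsl/addresses.py`:

```python
import ipaddress

def tcp_url_sketch(address: str, port) -> str:
    """Build a ZMQ tcp:// URL that is also safe for IPv6 literals."""
    try:
        if ipaddress.ip_address(address).version == 6:
            # IPv6 literals must be bracketed: tcp://[::1]:9000
            return f"tcp://[{address}]:{port}"
    except ValueError:
        pass  # hostname or wildcard, not an IP literal
    return f"tcp://{address}:{port}"

assert tcp_url_sketch("127.0.0.1", 9000) == "tcp://127.0.0.1:9000"
assert tcp_url_sketch("::1", 9000) == "tcp://[::1]:9000"
```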
parsl/executors/high_throughput/process_worker_pool.py:

```diff
@@ -1,39 +1,44 @@
 #!/usr/bin/env python3
 
 import argparse
+import json
 import logging
+import math
+import multiprocessing
 import os
-import sys
+import pickle
 import platform
+import queue
+import subprocess
+import sys
 import threading
-import pickle
 import time
-import queue
 import uuid
-from typing import Sequence, Optional, Dict, List
-
-import zmq
-import math
-import json
-import psutil
-import multiprocessing
 from multiprocessing.managers import DictProxy
 from multiprocessing.sharedctypes import Synchronized
+from typing import Dict, List, Optional, Sequence
+
+import psutil
+import zmq
 
 from parsl import curvezmq
-from parsl.
-from parsl.version import VERSION as PARSL_VERSION
+from parsl.addresses import tcp_url
 from parsl.app.errors import RemoteExceptionWrapper
+from parsl.executors.execute_task import execute_task
 from parsl.executors.high_throughput.errors import WorkerLost
-from parsl.executors.high_throughput.
-
-
+from parsl.executors.high_throughput.mpi_prefix_composer import (
+    VALID_LAUNCHERS,
+    compose_all,
+)
 from parsl.executors.high_throughput.mpi_resource_management import (
+    MPITaskScheduler,
     TaskScheduler,
-    MPITaskScheduler
 )
-
-from parsl.
+from parsl.executors.high_throughput.probe import probe_addresses
+from parsl.multiprocessing import SpawnContext
+from parsl.process_loggers import wrap_with_logs
+from parsl.serialize import serialize
+from parsl.version import VERSION as PARSL_VERSION
 
 HEARTBEAT_CODE = (2 ** 32) - 1
 DRAINED_CODE = (2 ** 32) - 2
@@ -155,8 +160,8 @@ class Manager:
                 raise Exception("No viable address found")
             else:
                 logger.info("Connection to Interchange successful on {}".format(ix_address))
-                task_q_url = "tcp://{}:{}".format(ix_address, task_port)
-                result_q_url = "tcp://{}:{}".format(ix_address, result_port)
+                task_q_url = tcp_url(ix_address, task_port)
+                result_q_url = tcp_url(ix_address, result_port)
                 logger.info("Task url : {}".format(task_q_url))
                 logger.info("Result url : {}".format(result_q_url))
         except Exception:
@@ -181,6 +186,7 @@ class Manager:
 
         self.uid = uid
         self.block_id = block_id
+        self.start_time = time.time()
 
         self.enable_mpi_mode = enable_mpi_mode
         self.mpi_launcher = mpi_launcher
@@ -260,6 +266,7 @@ class Manager:
                'worker_count': self.worker_count,
                'uid': self.uid,
                'block_id': self.block_id,
+               'start_time': self.start_time,
                'prefetch_capacity': self.prefetch_capacity,
                'max_capacity': self.worker_count + self.prefetch_capacity,
                'os': platform.system(),
@@ -335,14 +342,17 @@ class Manager:
                 self.heartbeat_to_incoming()
                 last_beat = time.time()
 
-            if self.drain_time and time.time() > self.drain_time:
+            if time.time() > self.drain_time:
                 logger.info("Requesting drain")
                 self.drain_to_incoming()
-                self.drain_time = None
                 # This will start the pool draining...
                 # Drained exit behaviour does not happen here. It will be
                 # driven by the interchange sending a DRAINED_CODE message.
 
+                # now set drain time to the far future so we don't send a drain
+                # message every iteration.
+                self.drain_time = float('inf')
+
             poll_duration_s = max(0, next_interesting_event_time - time.time())
             socks = dict(poller.poll(timeout=poll_duration_s * 1000))
 
```
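The `float('inf')` sentinel replaces the old reset-to-`None`, so the comparison stays valid on every pass through the poll loop without a separate null check. The pattern in isolation:

```python
import time

drain_time = float('inf')        # no drain scheduled
# drain_time = time.time() + 60  # or: request a drain one minute from now

def drain_requested() -> bool:
    global drain_time
    if time.time() > drain_time:   # always comparable, no None guard
        drain_time = float('inf')  # fire once, then push to the far future
        return True
    return False
```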
parsl/executors/high_throughput/process_worker_pool.py (continued):

```diff
@@ -354,11 +364,13 @@ class Manager:
                 if tasks == HEARTBEAT_CODE:
                     logger.debug("Got heartbeat from interchange")
                 elif tasks == DRAINED_CODE:
-                    logger.info("Got
+                    logger.info("Got fully drained message from interchange - setting kill flag")
                     kill_event.set()
                 else:
                     task_recv_counter += len(tasks)
-                    logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format(
+                    logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format(
+                        [t['task_id'] for t in tasks], task_recv_counter
+                    ))
 
                 for task in tasks:
                     self.task_scheduler.put_task(task)
```
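The two codes matched here are out-of-band markers on the task channel (defined near the top of the file, visible at the end of the imports hunk); an ordinary message is a list of tasks:

```python
HEARTBEAT_CODE = (2 ** 32) - 1
DRAINED_CODE = (2 ** 32) - 2

def classify(msg):
    # Sketch of the dispatch in the hunk above.
    if msg == HEARTBEAT_CODE:
        return "heartbeat"
    elif msg == DRAINED_CODE:
        return "drained - pool should exit"
    else:
        return f"batch of {len(msg)} task(s)"
```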
parsl/executors/high_throughput/process_worker_pool.py (continued):

```diff
@@ -580,45 +592,13 @@ def update_resource_spec_env_vars(mpi_launcher: str, resource_spec: Dict, node_info
         os.environ[key] = prefix_table[key]
 
 
-def execute_task(bufs, mpi_launcher: Optional[str] = None):
-    """Deserialize the buffer and execute the task.
-
-    Returns the result or throws exception.
-    """
-    user_ns = locals()
-    user_ns.update({'__builtins__': __builtins__})
-
-    f, args, kwargs, resource_spec = unpack_res_spec_apply_message(bufs, user_ns, copy=False)
-
-    for varname in resource_spec:
-        envname = "PARSL_" + str(varname).upper()
-        os.environ[envname] = str(resource_spec[varname])
-
-    if resource_spec.get("MPI_NODELIST"):
-        worker_id = os.environ['PARSL_WORKER_RANK']
-        nodes_for_task = resource_spec["MPI_NODELIST"].split(',')
-        logger.info(f"Launching task on provisioned nodes: {nodes_for_task}")
-        assert mpi_launcher
-        update_resource_spec_env_vars(mpi_launcher,
-                                      resource_spec=resource_spec,
-                                      node_info=nodes_for_task)
-    # We might need to look into callability of the function from itself
-    # since we change it's name in the new namespace
-    prefix = "parsl_"
-    fname = prefix + "f"
-    argname = prefix + "args"
-    kwargname = prefix + "kwargs"
-    resultname = prefix + "result"
-
-    user_ns.update({fname: f,
-                    argname: args,
-                    kwargname: kwargs,
-                    resultname: resultname})
-
-    code = "{0} = {1}(*{2}, **{3})".format(resultname, fname,
-                                           argname, kwargname)
-    exec(code, user_ns, user_ns)
-    return user_ns.get(resultname)
+def _init_mpi_env(mpi_launcher: str, resource_spec: Dict):
+    node_list = resource_spec.get("MPI_NODELIST")
+    if node_list is None:
+        return
+    nodes_for_task = node_list.split(',')
+    logger.info(f"Launching task on provisioned nodes: {nodes_for_task}")
+    update_resource_spec_env_vars(mpi_launcher=mpi_launcher, resource_spec=resource_spec, node_info=nodes_for_task)
 
 
 @wrap_with_logs(target="worker_log")
```
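The loop removed here implemented a simple contract that the new shared `execute_task` module presumably carries forward: each resource-spec entry surfaces to the task as a `PARSL_`-prefixed environment variable. In isolation:

```python
resource_spec = {"num_nodes": 2, "ranks_per_node": 4,
                 "MPI_NODELIST": "node1,node2"}

env = {"PARSL_" + str(k).upper(): str(v) for k, v in resource_spec.items()}
assert env["PARSL_NUM_NODES"] == "2"
assert env["PARSL_MPI_NODELIST"] == "node1,node2"
```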
parsl/executors/high_throughput/process_worker_pool.py (continued):

```diff
@@ -640,14 +620,6 @@ def worker(
         debug: bool,
         mpi_launcher: str,
 ):
-    """
-
-    Put request token into queue
-    Get task from task_queue
-    Pop request from queue
-    Put result into result_queue
-    """
-
     # override the global logger inherited from the __main__ process (which
     # usually logs to manager.log) with one specific to this worker.
     global logger
@@ -672,7 +644,8 @@ def worker(
     # If desired, set process affinity
     if cpu_affinity != "none":
         # Count the number of cores per worker
-        avail_cores = sorted(os.sched_getaffinity(0))
+        # OSX does not implement os.sched_getaffinity
+        avail_cores = sorted(os.sched_getaffinity(0))  # type: ignore[attr-defined, unused-ignore]
         cores_per_worker = len(avail_cores) // pool_size
         assert cores_per_worker > 0, "Affinity does not work if there are more workers than cores"
 
@@ -712,12 +685,39 @@ def worker(
             os.environ["KMP_AFFINITY"] = f"explicit,proclist=[{proc_list}]"  # For Intel OpenMP
 
         # Set the affinity for this worker
-        os.sched_setaffinity(0, my_cores)
+        # OSX does not implement os.sched_setaffinity so type checking
+        # is ignored here in two ways:
+        # On a platform without sched_setaffinity, that attribute will not
+        # be defined, so ignore[attr-defined] will tell mypy to ignore this
+        # incorrect-for-OS X attribute access.
+        # On a platform with sched_setaffinity, that type: ignore message
+        # will be redundant, and ignore[unused-ignore] tells mypy to ignore
+        # that this ignore is unneeded.
+        os.sched_setaffinity(0, my_cores)  # type: ignore[attr-defined, unused-ignore]
         logger.info("Set worker CPU affinity to {}".format(my_cores))
 
     # If desired, pin to accelerator
     if accelerator is not None:
-        os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
+
+        # If CUDA devices, find total number of devices to allow for MPS
+        # See: https://developer.nvidia.com/system-management-interface
+        nvidia_smi_cmd = "nvidia-smi -L > /dev/null && nvidia-smi -L | wc -l"
+        nvidia_smi_ret = subprocess.run(nvidia_smi_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        if nvidia_smi_ret.returncode == 0:
+            num_cuda_devices = int(nvidia_smi_ret.stdout.split()[0])
+        else:
+            num_cuda_devices = None
+
+        try:
+            if num_cuda_devices is not None:
+                procs_per_cuda_device = pool_size // num_cuda_devices
+                partitioned_accelerator = str(int(accelerator) // procs_per_cuda_device)  # multiple workers will share a GPU
+                os.environ["CUDA_VISIBLE_DEVICES"] = partitioned_accelerator
+                logger.info(f'Pinned worker to partitioned cuda device: {partitioned_accelerator}')
+            else:
+                os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
+        except (TypeError, ValueError, ZeroDivisionError):
+            os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
         os.environ["ROCR_VISIBLE_DEVICES"] = accelerator
         os.environ["ZE_AFFINITY_MASK"] = accelerator
         os.environ["ZE_ENABLE_PCI_ID_DEVICE_ORDER"] = '1'
```
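A worked example of the GPU-partitioning arithmetic above, assuming a pool of 8 workers and 4 CUDA devices reported by `nvidia-smi -L`:

```python
pool_size = 8          # workers in this pool
num_cuda_devices = 4   # from: nvidia-smi -L | wc -l
accelerator = "5"      # this worker's assigned accelerator index

procs_per_cuda_device = pool_size // num_cuda_devices         # 2 workers per GPU
partitioned = str(int(accelerator) // procs_per_cuda_device)  # "2"
# Workers 0-1 share device 0, 2-3 device 1, 4-5 device 2, 6-7 device 3,
# e.g. via CUDA MPS; the except clause falls back to one worker per device.
```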
parsl/executors/high_throughput/process_worker_pool.py (continued):

```diff
@@ -756,8 +756,10 @@ def worker(
             ready_worker_count.value -= 1
             worker_enqueued = False
 
+            _init_mpi_env(mpi_launcher=mpi_launcher, resource_spec=req["resource_spec"])
+
             try:
-                result = execute_task(req['buffer'], mpi_launcher=mpi_launcher)
+                result = execute_task(req['buffer'])
                 serialized_result = serialize(result, buffer_threshold=1000000)
             except Exception as e:
                 logger.info('Caught an exception: {}'.format(e))
```