parsl 2024.3.18__py3-none-any.whl → 2025.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/__init__.py +9 -10
- parsl/addresses.py +26 -6
- parsl/app/app.py +7 -8
- parsl/app/bash.py +15 -8
- parsl/app/errors.py +10 -13
- parsl/app/futures.py +8 -10
- parsl/app/python.py +2 -1
- parsl/benchmark/perf.py +2 -1
- parsl/concurrent/__init__.py +2 -2
- parsl/config.py +53 -10
- parsl/configs/ASPIRE1.py +6 -5
- parsl/configs/Azure.py +9 -8
- parsl/configs/bridges.py +6 -4
- parsl/configs/cc_in2p3.py +3 -3
- parsl/configs/ec2.py +3 -1
- parsl/configs/expanse.py +4 -3
- parsl/configs/frontera.py +3 -4
- parsl/configs/htex_local.py +3 -4
- parsl/configs/illinoiscluster.py +3 -1
- parsl/configs/improv.py +34 -0
- parsl/configs/kubernetes.py +4 -3
- parsl/configs/local_threads.py +5 -1
- parsl/configs/midway.py +5 -3
- parsl/configs/osg.py +4 -2
- parsl/configs/polaris.py +4 -2
- parsl/configs/stampede2.py +6 -5
- parsl/configs/summit.py +3 -3
- parsl/configs/toss3_llnl.py +4 -3
- parsl/configs/vineex_local.py +6 -4
- parsl/configs/wqex_local.py +5 -3
- parsl/curvezmq.py +4 -0
- parsl/data_provider/data_manager.py +4 -3
- parsl/data_provider/file_noop.py +1 -2
- parsl/data_provider/files.py +3 -3
- parsl/data_provider/ftp.py +1 -3
- parsl/data_provider/globus.py +7 -6
- parsl/data_provider/http.py +2 -2
- parsl/data_provider/rsync.py +1 -1
- parsl/data_provider/staging.py +2 -2
- parsl/data_provider/zip.py +135 -0
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +259 -223
- parsl/dataflow/errors.py +3 -5
- parsl/dataflow/futures.py +27 -14
- parsl/dataflow/memoization.py +5 -5
- parsl/dataflow/rundirs.py +5 -6
- parsl/dataflow/taskrecord.py +4 -5
- parsl/executors/__init__.py +4 -2
- parsl/executors/base.py +45 -15
- parsl/executors/errors.py +13 -0
- parsl/executors/execute_task.py +37 -0
- parsl/executors/flux/execute_parsl_task.py +3 -3
- parsl/executors/flux/executor.py +18 -19
- parsl/executors/flux/flux_instance_manager.py +26 -27
- parsl/executors/high_throughput/errors.py +43 -3
- parsl/executors/high_throughput/executor.py +307 -285
- parsl/executors/high_throughput/interchange.py +137 -168
- parsl/executors/high_throughput/manager_record.py +4 -0
- parsl/executors/high_throughput/manager_selector.py +55 -0
- parsl/executors/high_throughput/monitoring_info.py +2 -1
- parsl/executors/high_throughput/mpi_executor.py +113 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
- parsl/executors/high_throughput/mpi_resource_management.py +6 -17
- parsl/executors/high_throughput/probe.py +9 -7
- parsl/executors/high_throughput/process_worker_pool.py +77 -75
- parsl/executors/high_throughput/zmq_pipes.py +81 -23
- parsl/executors/radical/executor.py +130 -79
- parsl/executors/radical/rpex_resources.py +17 -15
- parsl/executors/radical/rpex_worker.py +4 -3
- parsl/executors/status_handling.py +157 -51
- parsl/executors/taskvine/__init__.py +1 -1
- parsl/executors/taskvine/errors.py +1 -1
- parsl/executors/taskvine/exec_parsl_function.py +2 -2
- parsl/executors/taskvine/executor.py +38 -55
- parsl/executors/taskvine/factory.py +1 -1
- parsl/executors/taskvine/factory_config.py +1 -1
- parsl/executors/taskvine/manager.py +17 -13
- parsl/executors/taskvine/manager_config.py +7 -2
- parsl/executors/threads.py +6 -6
- parsl/executors/workqueue/errors.py +1 -1
- parsl/executors/workqueue/exec_parsl_function.py +6 -5
- parsl/executors/workqueue/executor.py +64 -63
- parsl/executors/workqueue/parsl_coprocess.py +1 -1
- parsl/jobs/error_handlers.py +2 -2
- parsl/jobs/job_status_poller.py +28 -112
- parsl/jobs/states.py +7 -2
- parsl/jobs/strategy.py +43 -31
- parsl/launchers/__init__.py +12 -3
- parsl/launchers/errors.py +1 -1
- parsl/launchers/launchers.py +0 -6
- parsl/log_utils.py +1 -2
- parsl/monitoring/db_manager.py +55 -93
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +85 -311
- parsl/monitoring/queries/pandas.py +1 -2
- parsl/monitoring/radios/base.py +13 -0
- parsl/monitoring/radios/filesystem.py +52 -0
- parsl/monitoring/radios/htex.py +57 -0
- parsl/monitoring/radios/multiprocessing.py +17 -0
- parsl/monitoring/radios/udp.py +56 -0
- parsl/monitoring/radios/zmq.py +17 -0
- parsl/monitoring/remote.py +33 -37
- parsl/monitoring/router.py +212 -0
- parsl/monitoring/types.py +5 -6
- parsl/monitoring/visualization/app.py +4 -2
- parsl/monitoring/visualization/models.py +0 -1
- parsl/monitoring/visualization/plots/default/workflow_plots.py +8 -4
- parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
- parsl/monitoring/visualization/utils.py +0 -1
- parsl/monitoring/visualization/views.py +16 -9
- parsl/multiprocessing.py +0 -1
- parsl/process_loggers.py +1 -2
- parsl/providers/__init__.py +8 -17
- parsl/providers/aws/aws.py +2 -3
- parsl/providers/azure/azure.py +4 -5
- parsl/providers/base.py +2 -18
- parsl/providers/cluster_provider.py +3 -9
- parsl/providers/condor/condor.py +7 -17
- parsl/providers/errors.py +2 -2
- parsl/providers/googlecloud/googlecloud.py +2 -1
- parsl/providers/grid_engine/grid_engine.py +5 -14
- parsl/providers/kubernetes/kube.py +80 -40
- parsl/providers/local/local.py +13 -26
- parsl/providers/lsf/lsf.py +5 -23
- parsl/providers/pbspro/pbspro.py +5 -17
- parsl/providers/slurm/slurm.py +81 -39
- parsl/providers/torque/torque.py +3 -14
- parsl/serialize/__init__.py +8 -3
- parsl/serialize/base.py +1 -2
- parsl/serialize/concretes.py +5 -4
- parsl/serialize/facade.py +3 -3
- parsl/serialize/proxystore.py +3 -2
- parsl/tests/__init__.py +1 -1
- parsl/tests/configs/azure_single_node.py +4 -5
- parsl/tests/configs/bridges.py +3 -2
- parsl/tests/configs/cc_in2p3.py +1 -3
- parsl/tests/configs/comet.py +2 -1
- parsl/tests/configs/ec2_single_node.py +1 -2
- parsl/tests/configs/ec2_spot.py +1 -2
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/configs/frontera.py +2 -3
- parsl/tests/configs/htex_local.py +3 -5
- parsl/tests/configs/htex_local_alternate.py +11 -15
- parsl/tests/configs/htex_local_intask_staging.py +5 -9
- parsl/tests/configs/htex_local_rsync_staging.py +4 -8
- parsl/tests/configs/local_radical.py +1 -3
- parsl/tests/configs/local_radical_mpi.py +2 -2
- parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/configs/midway.py +2 -2
- parsl/tests/configs/nscc_singapore.py +3 -3
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +3 -2
- parsl/tests/configs/slurm_local.py +24 -0
- parsl/tests/configs/summit.py +1 -0
- parsl/tests/configs/taskvine_ex.py +4 -7
- parsl/tests/configs/user_opts.py +0 -7
- parsl/tests/configs/workqueue_ex.py +4 -6
- parsl/tests/conftest.py +27 -13
- parsl/tests/integration/test_stress/test_python_simple.py +3 -4
- parsl/tests/integration/test_stress/test_python_threads.py +3 -5
- parsl/tests/manual_tests/htex_local.py +4 -6
- parsl/tests/manual_tests/test_basic.py +1 -0
- parsl/tests/manual_tests/test_log_filter.py +3 -1
- parsl/tests/manual_tests/test_memory_limits.py +6 -8
- parsl/tests/manual_tests/test_regression_220.py +2 -1
- parsl/tests/manual_tests/test_udp_simple.py +4 -4
- parsl/tests/manual_tests/test_worker_count.py +3 -2
- parsl/tests/scaling_tests/htex_local.py +2 -4
- parsl/tests/scaling_tests/test_scale.py +0 -9
- parsl/tests/scaling_tests/vineex_condor.py +1 -2
- parsl/tests/scaling_tests/vineex_local.py +1 -2
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/site_tests/test_provider.py +4 -2
- parsl/tests/site_tests/test_site.py +2 -0
- parsl/tests/sites/test_affinity.py +7 -7
- parsl/tests/sites/test_dynamic_executor.py +3 -4
- parsl/tests/sites/test_ec2.py +3 -2
- parsl/tests/sites/test_worker_info.py +4 -5
- parsl/tests/test_aalst_patterns.py +0 -1
- parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
- parsl/tests/test_bash_apps/test_basic.py +10 -4
- parsl/tests/test_bash_apps/test_error_codes.py +5 -7
- parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
- parsl/tests/test_bash_apps/test_memoize.py +2 -8
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
- parsl/tests/test_bash_apps/test_multiline.py +1 -1
- parsl/tests/test_bash_apps/test_pipeline.py +1 -1
- parsl/tests/test_bash_apps/test_std_uri.py +123 -0
- parsl/tests/test_bash_apps/test_stdout.py +33 -8
- parsl/tests/test_callables.py +2 -2
- parsl/tests/test_checkpointing/test_periodic.py +21 -39
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_regression_239.py +1 -1
- parsl/tests/test_checkpointing/test_task_exit.py +2 -3
- parsl/tests/test_docs/test_from_slides.py +5 -2
- parsl/tests/test_docs/test_kwargs.py +4 -1
- parsl/tests/test_docs/test_tutorial_1.py +1 -2
- parsl/tests/test_docs/test_workflow1.py +2 -2
- parsl/tests/test_docs/test_workflow2.py +0 -1
- parsl/tests/test_error_handling/test_rand_fail.py +2 -2
- parsl/tests/test_error_handling/test_resource_spec.py +10 -12
- parsl/tests/test_error_handling/test_retries.py +6 -16
- parsl/tests/test_error_handling/test_retry_handler.py +1 -0
- parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
- parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
- parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
- parsl/tests/test_execute_task.py +29 -0
- parsl/tests/test_flux.py +1 -1
- parsl/tests/test_htex/test_basic.py +2 -3
- parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
- parsl/tests/test_htex/test_command_client_timeout.py +66 -0
- parsl/tests/test_htex/test_connected_blocks.py +3 -2
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
- parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_drain.py +11 -10
- parsl/tests/test_htex/test_htex.py +51 -25
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
- parsl/tests/test_htex/test_managers_command.py +36 -0
- parsl/tests/test_htex/test_missing_worker.py +2 -12
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
- parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
- parsl/tests/test_htex/test_zmq_binding.py +29 -8
- parsl/tests/test_monitoring/test_app_names.py +5 -5
- parsl/tests/test_monitoring/test_basic.py +73 -25
- parsl/tests/test_monitoring/test_db_locks.py +6 -4
- parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
- parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
- parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
- parsl/tests/test_monitoring/test_stdouterr.py +134 -0
- parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
- parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
- parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
- parsl/tests/test_providers/test_local_provider.py +3 -132
- parsl/tests/test_providers/test_pbspro_template.py +2 -3
- parsl/tests/test_providers/test_slurm_template.py +2 -3
- parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
- parsl/tests/test_python_apps/test_context_manager.py +128 -0
- parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_python_apps/test_fail.py +0 -25
- parsl/tests/test_python_apps/test_futures.py +2 -1
- parsl/tests/test_python_apps/test_inputs_default.py +22 -0
- parsl/tests/test_python_apps/test_join.py +0 -1
- parsl/tests/test_python_apps/test_lifted.py +11 -7
- parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
- parsl/tests/test_python_apps/test_outputs.py +1 -1
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_radical/test_mpi_funcs.py +1 -2
- parsl/tests/test_regression/test_1480.py +2 -1
- parsl/tests/test_regression/test_1653.py +2 -1
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_regression/test_2652.py +1 -0
- parsl/tests/test_regression/test_69a.py +0 -1
- parsl/tests/test_regression/test_854.py +4 -2
- parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
- parsl/tests/test_regression/test_98.py +0 -1
- parsl/tests/test_scaling/test_block_error_handler.py +9 -4
- parsl/tests/test_scaling/test_regression_1621.py +11 -15
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
- parsl/tests/test_scaling/test_scale_down.py +2 -5
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +5 -8
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
- parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
- parsl/tests/test_serialization/test_basic.py +2 -1
- parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
- parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
- parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
- parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
- parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
- parsl/tests/test_staging/staging_provider.py +2 -2
- parsl/tests/test_staging/test_1316.py +3 -4
- parsl/tests/test_staging/test_docs_1.py +2 -1
- parsl/tests/test_staging/test_docs_2.py +2 -1
- parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
- parsl/tests/{test_data → test_staging}/test_file.py +6 -6
- parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +5 -2
- parsl/tests/test_staging/test_staging_stdout.py +64 -0
- parsl/tests/test_staging/test_zip_in.py +39 -0
- parsl/tests/test_staging/test_zip_out.py +110 -0
- parsl/tests/test_staging/test_zip_to_zip.py +41 -0
- parsl/tests/test_summary.py +2 -2
- parsl/tests/test_thread_parallelism.py +0 -1
- parsl/tests/test_threads/test_configs.py +1 -2
- parsl/tests/test_threads/test_lazy_errors.py +2 -2
- parsl/tests/test_utils/test_execute_wait.py +35 -0
- parsl/tests/test_utils/test_sanitize_dns.py +76 -0
- parsl/tests/unit/test_address.py +20 -0
- parsl/tests/unit/test_file.py +99 -0
- parsl/tests/unit/test_usage_tracking.py +66 -0
- parsl/usage_tracking/api.py +65 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +104 -62
- parsl/utils.py +137 -4
- parsl/version.py +1 -1
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
- parsl-2025.1.13.data/scripts/interchange.py +649 -0
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +77 -75
- parsl-2025.1.13.dist-info/METADATA +96 -0
- parsl-2025.1.13.dist-info/RECORD +462 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
- parsl/channels/__init__.py +0 -7
- parsl/channels/base.py +0 -141
- parsl/channels/errors.py +0 -113
- parsl/channels/local/local.py +0 -164
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
- parsl/channels/ssh/ssh.py +0 -276
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -74
- parsl/configs/ad_hoc.py +0 -35
- parsl/executors/radical/rpex_master.py +0 -42
- parsl/monitoring/radios.py +0 -175
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -248
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/integration/test_channels/__init__.py +0 -0
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- parsl/tests/integration/test_channels/test_local_channel.py +0 -42
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl/tests/sites/test_local_adhoc.py +0 -61
- parsl/tests/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/test_large_output.py +0 -22
- parsl/tests/test_data/__init__.py +0 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
- parsl-2024.3.18.dist-info/METADATA +0 -98
- parsl-2024.3.18.dist-info/RECORD +0 -449
- parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
- parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
@@ -1,43 +1,41 @@
|
|
1
|
-
import typing
|
2
|
-
from concurrent.futures import Future
|
3
|
-
import typeguard
|
4
1
|
import logging
|
5
|
-
import threading
|
6
|
-
import queue
|
7
|
-
import datetime
|
8
|
-
import pickle
|
9
|
-
from dataclasses import dataclass
|
10
|
-
from multiprocessing import Process, Queue
|
11
|
-
from typing import Dict, Sequence
|
12
|
-
from typing import List, Optional, Tuple, Union, Callable
|
13
2
|
import math
|
3
|
+
import pickle
|
4
|
+
import subprocess
|
5
|
+
import threading
|
6
|
+
import typing
|
14
7
|
import warnings
|
8
|
+
from collections import defaultdict
|
9
|
+
from concurrent.futures import Future
|
10
|
+
from dataclasses import dataclass
|
11
|
+
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
|
12
|
+
|
13
|
+
import typeguard
|
15
14
|
|
16
|
-
import
|
17
|
-
from parsl.
|
18
|
-
from parsl.serialize.errors import SerializationError, DeserializationError
|
15
|
+
from parsl import curvezmq
|
16
|
+
from parsl.addresses import get_all_addresses
|
19
17
|
from parsl.app.errors import RemoteExceptionWrapper
|
20
|
-
from parsl.
|
21
|
-
from parsl.executors.high_throughput import zmq_pipes
|
22
|
-
from parsl.executors.high_throughput import interchange
|
18
|
+
from parsl.data_provider.staging import Staging
|
23
19
|
from parsl.executors.errors import (
|
24
|
-
BadMessage,
|
20
|
+
BadMessage,
|
21
|
+
InvalidResourceSpecification,
|
22
|
+
ScalingFailed,
|
25
23
|
)
|
26
|
-
from parsl.executors.high_throughput
|
27
|
-
|
28
|
-
|
24
|
+
from parsl.executors.high_throughput import zmq_pipes
|
25
|
+
from parsl.executors.high_throughput.errors import CommandClientTimeoutError
|
26
|
+
from parsl.executors.high_throughput.manager_selector import (
|
27
|
+
ManagerSelector,
|
28
|
+
RandomManagerSelector,
|
29
29
|
)
|
30
|
-
|
31
|
-
from parsl import curvezmq
|
32
30
|
from parsl.executors.status_handling import BlockProviderExecutor
|
33
|
-
from parsl.
|
34
|
-
from parsl.data_provider.staging import Staging
|
35
|
-
from parsl.addresses import get_all_addresses
|
31
|
+
from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
|
36
32
|
from parsl.process_loggers import wrap_with_logs
|
37
|
-
|
38
|
-
from parsl.multiprocessing import ForkProcess
|
39
|
-
from parsl.utils import RepresentationMixin
|
40
33
|
from parsl.providers import LocalProvider
|
34
|
+
from parsl.providers.base import ExecutionProvider
|
35
|
+
from parsl.serialize import deserialize, pack_res_spec_apply_message
|
36
|
+
from parsl.serialize.errors import DeserializationError, SerializationError
|
37
|
+
from parsl.usage_tracking.api import UsageInformation
|
38
|
+
from parsl.utils import RepresentationMixin
|
41
39
|
|
42
40
|
logger = logging.getLogger(__name__)
|
43
41
|
|
@@ -61,49 +59,10 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
|
|
61
59
|
"--mpi-launcher={mpi_launcher} "
|
62
60
|
"--available-accelerators {accelerators}")
|
63
61
|
|
62
|
+
DEFAULT_INTERCHANGE_LAUNCH_CMD = ["interchange.py"]
|
64
63
|
|
65
|
-
|
66
|
-
"""Executor designed for cluster-scale
|
67
|
-
|
68
|
-
The HighThroughputExecutor system has the following components:
|
69
|
-
1. The HighThroughputExecutor instance which is run as part of the Parsl script.
|
70
|
-
2. The Interchange which acts as a load-balancing proxy between workers and Parsl
|
71
|
-
3. The multiprocessing based worker pool which coordinates task execution over several
|
72
|
-
cores on a node.
|
73
|
-
4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
|
74
|
-
|
75
|
-
Here is a diagram
|
76
|
-
|
77
|
-
.. code:: python
|
78
|
-
|
79
|
-
|
80
|
-
| Data | Executor | Interchange | External Process(es)
|
81
|
-
| Flow | | |
|
82
|
-
Task | Kernel | | |
|
83
|
-
+----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
|
84
|
-
| | | | batching | | |
|
85
|
-
Parsl<---Fut-| | | load-balancing| result exception
|
86
|
-
^ | | | watchdogs | | |
|
87
|
-
| | | Q_mngmnt | | V V
|
88
|
-
| | | Thread<--|-incoming_q<---|--- +---------+
|
89
|
-
| | | | | |
|
90
|
-
| | | | | |
|
91
|
-
+----update_fut-----+
|
92
|
-
|
93
|
-
|
94
|
-
Each of the workers in each process_worker_pool has access to its local rank through
|
95
|
-
an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
|
96
|
-
and is an integer in the range from 0 to the number of workers per in the pool minus 1.
|
97
|
-
The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
|
98
|
-
and the size of the worker pool as ``PARSL_WORKER_COUNT``.
|
99
|
-
|
100
|
-
|
101
|
-
Parameters
|
102
|
-
----------
|
103
|
-
|
104
|
-
provider : :class:`~parsl.providers.base.ExecutionProvider`
|
64
|
+
GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
|
105
65
|
Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
|
106
|
-
:class:`~parsl.providers.cobalt.cobalt.Cobalt`,
|
107
66
|
:class:`~parsl.providers.condor.condor.Condor`,
|
108
67
|
:class:`~parsl.providers.googlecloud.googlecloud.GoogleCloud`,
|
109
68
|
:class:`~parsl.providers.gridEngine.gridEngine.GridEngine`,
|
@@ -121,9 +80,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
121
80
|
cores_per_worker, nodes_per_block, heartbeat_period ,heartbeat_threshold, logdir). For example:
|
122
81
|
launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}"
|
123
82
|
|
83
|
+
interchange_launch_cmd : Sequence[str]
|
84
|
+
Custom sequence of command line tokens to launch the interchange process from the executor. If
|
85
|
+
undefined, the executor will use the default "interchange.py" command.
|
86
|
+
|
124
87
|
address : string
|
125
88
|
An address to connect to the main Parsl process which is reachable from the network in which
|
126
|
-
workers will be running. This field expects an IPv4 address
|
89
|
+
workers will be running. This field expects an IPv4 or IPv6 address.
|
127
90
|
Most login nodes on clusters have several network interfaces available, only some of which
|
128
91
|
can be reached from the compute nodes. This field can be used to limit the executor to listen
|
129
92
|
only on a specific interface, and limiting connections to the internal network.
|
@@ -131,6 +94,11 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
131
94
|
Setting an address here overrides the default behavior.
|
132
95
|
default=None
|
133
96
|
|
97
|
+
loopback_address: string
|
98
|
+
Specify address used for internal communication between executor and interchange.
|
99
|
+
Supports IPv4 and IPv6 addresses
|
100
|
+
default=127.0.0.1
|
101
|
+
|
134
102
|
worker_ports : (int, int)
|
135
103
|
Specify the ports to be used by workers to connect to Parsl. If this option is specified,
|
136
104
|
worker_port_range will not be honored.
|
@@ -147,39 +115,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
147
115
|
worker_debug : Bool
|
148
116
|
Enables worker debug logging.
|
149
117
|
|
150
|
-
cores_per_worker : float
|
151
|
-
cores to be assigned to each worker. Oversubscription is possible
|
152
|
-
by setting cores_per_worker < 1.0. Default=1
|
153
|
-
|
154
|
-
mem_per_worker : float
|
155
|
-
GB of memory required per worker. If this option is specified, the node manager
|
156
|
-
will check the available memory at startup and limit the number of workers such that
|
157
|
-
the there's sufficient memory for each worker. Default: None
|
158
|
-
|
159
|
-
max_workers : int
|
160
|
-
Deprecated. Please use max_workers_per_node instead.
|
161
|
-
|
162
|
-
max_workers_per_node : int
|
163
|
-
Caps the number of workers launched per node. Default: None
|
164
|
-
|
165
|
-
cpu_affinity: string
|
166
|
-
Whether or how each worker process sets thread affinity. Options include "none" to forgo
|
167
|
-
any CPU affinity configuration, "block" to assign adjacent cores to workers
|
168
|
-
(ex: assign 0-1 to worker 0, 2-3 to worker 1), and
|
169
|
-
"alternating" to assign cores to workers in round-robin
|
170
|
-
(ex: assign 0,2 to worker 0, 1,3 to worker 1).
|
171
|
-
The "block-reverse" option assigns adjacent cores to workers, but assigns
|
172
|
-
the CPUs with large indices to low index workers (ex: assign 2-3 to worker 1, 0,1 to worker 2)
|
173
|
-
|
174
|
-
available_accelerators: int | list
|
175
|
-
Accelerators available for workers to use. Each worker will be pinned to exactly one of the provided
|
176
|
-
accelerators, and no more workers will be launched than the number of accelerators.
|
177
|
-
|
178
|
-
Either provide the list of accelerator names or the number available. If a number is provided,
|
179
|
-
Parsl will create names as integers starting with 0.
|
180
|
-
|
181
|
-
default: empty list
|
182
|
-
|
183
118
|
prefetch_capacity : int
|
184
119
|
Number of tasks that could be prefetched over available worker capacity.
|
185
120
|
When there are a few tasks (<100) or when tasks are long running, this option should
|
@@ -213,19 +148,88 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
213
148
|
worker_logdir_root : string
|
214
149
|
In case of a remote file system, specify the path to where logs will be kept.
|
215
150
|
|
216
|
-
enable_mpi_mode: bool
|
217
|
-
If enabled, MPI launch prefixes will be composed for the batch scheduler based on
|
218
|
-
the nodes available in each batch job and the resource_specification dict passed
|
219
|
-
from the app. This is an experimental feature, please refer to the following doc section
|
220
|
-
before use: https://parsl.readthedocs.io/en/stable/userguide/mpi_apps.html
|
221
|
-
|
222
|
-
mpi_launcher: str
|
223
|
-
This field is only used if enable_mpi_mode is set. Select one from the
|
224
|
-
list of supported MPI launchers = ("srun", "aprun", "mpiexec").
|
225
|
-
default: "mpiexec"
|
226
|
-
|
227
151
|
encrypted : bool
|
228
152
|
Flag to enable/disable encryption (CurveZMQ). Default is False.
|
153
|
+
|
154
|
+
manager_selector: ManagerSelector
|
155
|
+
Determines what strategy the interchange uses to select managers during task distribution.
|
156
|
+
See API reference under "Manager Selectors" regarding the various manager selectors.
|
157
|
+
Default: 'RandomManagerSelector'
|
158
|
+
""" # Documentation for params used by both HTEx and MPIEx
|
159
|
+
|
160
|
+
|
161
|
+
class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
|
162
|
+
__doc__ = f"""Executor designed for cluster-scale
|
163
|
+
|
164
|
+
The HighThroughputExecutor system has the following components:
|
165
|
+
1. The HighThroughputExecutor instance which is run as part of the Parsl script.
|
166
|
+
2. The Interchange which acts as a load-balancing proxy between workers and Parsl
|
167
|
+
3. The multiprocessing based worker pool which coordinates task execution over several
|
168
|
+
cores on a node.
|
169
|
+
4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
|
170
|
+
|
171
|
+
Here is a diagram
|
172
|
+
|
173
|
+
.. code:: python
|
174
|
+
|
175
|
+
|
176
|
+
| Data | Executor | Interchange | External Process(es)
|
177
|
+
| Flow | | |
|
178
|
+
Task | Kernel | | |
|
179
|
+
+----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
|
180
|
+
| | | | batching | | |
|
181
|
+
Parsl<---Fut-| | | load-balancing| result exception
|
182
|
+
^ | | | watchdogs | | |
|
183
|
+
| | | Result | | | |
|
184
|
+
| | | Queue | | V V
|
185
|
+
| | | Thread<--|-incoming_q<---|--- +---------+
|
186
|
+
| | | | | |
|
187
|
+
| | | | | |
|
188
|
+
+----update_fut-----+
|
189
|
+
|
190
|
+
|
191
|
+
Each of the workers in each process_worker_pool has access to its local rank through
|
192
|
+
an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
|
193
|
+
and is an integer in the range from 0 to the number of workers per in the pool minus 1.
|
194
|
+
The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
|
195
|
+
and the size of the worker pool as ``PARSL_WORKER_COUNT``.
|
196
|
+
|
197
|
+
|
198
|
+
Parameters
|
199
|
+
----------
|
200
|
+
|
201
|
+
{GENERAL_HTEX_PARAM_DOCS}
|
202
|
+
|
203
|
+
cores_per_worker : float
|
204
|
+
cores to be assigned to each worker. Oversubscription is possible
|
205
|
+
by setting cores_per_worker < 1.0. Default=1
|
206
|
+
|
207
|
+
mem_per_worker : float
|
208
|
+
GB of memory required per worker. If this option is specified, the node manager
|
209
|
+
will check the available memory at startup and limit the number of workers such that
|
210
|
+
the there's sufficient memory for each worker. Default: None
|
211
|
+
|
212
|
+
max_workers_per_node : int
|
213
|
+
Caps the number of workers launched per node. Default: None
|
214
|
+
|
215
|
+
cpu_affinity: string
|
216
|
+
Whether or how each worker process sets thread affinity. Options include "none" to forgo
|
217
|
+
any CPU affinity configuration, "block" to assign adjacent cores to workers
|
218
|
+
(ex: assign 0-1 to worker 0, 2-3 to worker 1), and
|
219
|
+
"alternating" to assign cores to workers in round-robin
|
220
|
+
(ex: assign 0,2 to worker 0, 1,3 to worker 1).
|
221
|
+
The "block-reverse" option assigns adjacent cores to workers, but assigns
|
222
|
+
the CPUs with large indices to low index workers (ex: assign 2-3 to worker 1, 0,1 to worker 2)
|
223
|
+
|
224
|
+
available_accelerators: int | list
|
225
|
+
Accelerators available for workers to use. Each worker will be pinned to exactly one of the provided
|
226
|
+
accelerators, and no more workers will be launched than the number of accelerators.
|
227
|
+
|
228
|
+
Either provide the list of accelerator names or the number available. If a number is provided,
|
229
|
+
Parsl will create names as integers starting with 0.
|
230
|
+
|
231
|
+
default: empty list
|
232
|
+
|
229
233
|
"""
|
230
234
|
|
231
235
|
@typeguard.typechecked
|
@@ -233,7 +237,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
233
237
|
label: str = 'HighThroughputExecutor',
|
234
238
|
provider: ExecutionProvider = LocalProvider(),
|
235
239
|
launch_cmd: Optional[str] = None,
|
240
|
+
interchange_launch_cmd: Optional[Sequence[str]] = None,
|
236
241
|
address: Optional[str] = None,
|
242
|
+
loopback_address: str = "127.0.0.1",
|
237
243
|
worker_ports: Optional[Tuple[int, int]] = None,
|
238
244
|
worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
|
239
245
|
interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
|
@@ -242,7 +248,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
242
248
|
worker_debug: bool = False,
|
243
249
|
cores_per_worker: float = 1.0,
|
244
250
|
mem_per_worker: Optional[float] = None,
|
245
|
-
max_workers: Optional[Union[int, float]] = None,
|
246
251
|
max_workers_per_node: Optional[Union[int, float]] = None,
|
247
252
|
cpu_affinity: str = 'none',
|
248
253
|
available_accelerators: Union[int, Sequence[str]] = (),
|
@@ -253,8 +258,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
253
258
|
poll_period: int = 10,
|
254
259
|
address_probe_timeout: Optional[int] = None,
|
255
260
|
worker_logdir_root: Optional[str] = None,
|
256
|
-
|
257
|
-
mpi_launcher: str = "mpiexec",
|
261
|
+
manager_selector: ManagerSelector = RandomManagerSelector(),
|
258
262
|
block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
|
259
263
|
encrypted: bool = False):
|
260
264
|
|
@@ -270,14 +274,15 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
270
274
|
self.prefetch_capacity = prefetch_capacity
|
271
275
|
self.address = address
|
272
276
|
self.address_probe_timeout = address_probe_timeout
|
277
|
+
self.manager_selector = manager_selector
|
278
|
+
self.loopback_address = loopback_address
|
279
|
+
|
273
280
|
if self.address:
|
274
281
|
self.all_addresses = address
|
275
282
|
else:
|
276
283
|
self.all_addresses = ','.join(get_all_addresses())
|
277
284
|
|
278
|
-
|
279
|
-
self._warn_deprecated("max_workers", "max_workers_per_node")
|
280
|
-
self.max_workers_per_node = max_workers_per_node or max_workers or float("inf")
|
285
|
+
self.max_workers_per_node = max_workers_per_node or float("inf")
|
281
286
|
|
282
287
|
mem_slots = self.max_workers_per_node
|
283
288
|
cpu_slots = self.max_workers_per_node
|
@@ -304,12 +309,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
304
309
|
self._workers_per_node = 1 # our best guess-- we do not have any provider hints
|
305
310
|
|
306
311
|
self._task_counter = 0
|
307
|
-
self.run_id = None # set to the correct run_id in dfk
|
308
|
-
self.hub_address = None # set to the correct hub address in dfk
|
309
|
-
self.hub_port = None # set to the correct hub port in dfk
|
310
312
|
self.worker_ports = worker_ports
|
311
313
|
self.worker_port_range = worker_port_range
|
312
|
-
self.interchange_proc: Optional[
|
314
|
+
self.interchange_proc: Optional[subprocess.Popen] = None
|
313
315
|
self.interchange_port_range = interchange_port_range
|
314
316
|
self.heartbeat_threshold = heartbeat_threshold
|
315
317
|
self.heartbeat_period = heartbeat_period
|
@@ -321,20 +323,20 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
321
323
|
self.encrypted = encrypted
|
322
324
|
self.cert_dir = None
|
323
325
|
|
324
|
-
self.enable_mpi_mode = enable_mpi_mode
|
325
|
-
assert mpi_launcher in VALID_LAUNCHERS, \
|
326
|
-
f"mpi_launcher must be set to one of {VALID_LAUNCHERS}"
|
327
|
-
if self.enable_mpi_mode:
|
328
|
-
assert isinstance(self.provider.launcher, parsl.launchers.SingleNodeLauncher), \
|
329
|
-
"mpi_mode requires the provider to be configured to use a SingleNodeLauncher"
|
330
|
-
|
331
|
-
self.mpi_launcher = mpi_launcher
|
332
|
-
|
333
326
|
if not launch_cmd:
|
334
327
|
launch_cmd = DEFAULT_LAUNCH_CMD
|
335
328
|
self.launch_cmd = launch_cmd
|
336
329
|
|
330
|
+
if not interchange_launch_cmd:
|
331
|
+
interchange_launch_cmd = DEFAULT_INTERCHANGE_LAUNCH_CMD
|
332
|
+
self.interchange_launch_cmd = interchange_launch_cmd
|
333
|
+
|
334
|
+
self._result_queue_thread_exit = threading.Event()
|
335
|
+
self._result_queue_thread: Optional[threading.Thread] = None
|
336
|
+
|
337
337
|
radio_mode = "htex"
|
338
|
+
enable_mpi_mode: bool = False
|
339
|
+
mpi_launcher: str = "mpiexec"
|
338
340
|
|
339
341
|
def _warn_deprecated(self, old: str, new: str):
|
340
342
|
warnings.warn(
|
@@ -344,16 +346,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
344
346
|
stacklevel=2
|
345
347
|
)
|
346
348
|
|
347
|
-
@property
|
348
|
-
def max_workers(self):
|
349
|
-
self._warn_deprecated("max_workers", "max_workers_per_node")
|
350
|
-
return self.max_workers_per_node
|
351
|
-
|
352
|
-
@max_workers.setter
|
353
|
-
def max_workers(self, val: Union[int, float]):
|
354
|
-
self._warn_deprecated("max_workers", "max_workers_per_node")
|
355
|
-
self.max_workers_per_node = val
|
356
|
-
|
357
349
|
@property
|
358
350
|
def logdir(self):
|
359
351
|
return "{}/{}".format(self.run_dir, self.label)
|
@@ -364,6 +356,20 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
364
356
|
return "{}/{}".format(self.worker_logdir_root, self.label)
|
365
357
|
return self.logdir
|
366
358
|
|
359
|
+
def validate_resource_spec(self, resource_specification: dict):
|
360
|
+
"""HTEX supports the following *Optional* resource specifications:
|
361
|
+
priority: lower value is higher priority"""
|
362
|
+
if resource_specification:
|
363
|
+
acceptable_fields = {'priority'}
|
364
|
+
keys = set(resource_specification.keys())
|
365
|
+
invalid_keys = keys - acceptable_fields
|
366
|
+
if invalid_keys:
|
367
|
+
message = "Task resource specification only accepts these types of resources: {}".format(
|
368
|
+
', '.join(acceptable_fields))
|
369
|
+
logger.error(message)
|
370
|
+
raise InvalidResourceSpecification(set(invalid_keys), message)
|
371
|
+
return
|
372
|
+
|
367
373
|
def initialize_scaling(self):
|
368
374
|
"""Compose the launch command and scale out the initial blocks.
|
369
375
|
"""
|
@@ -400,16 +406,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
400
406
|
|
401
407
|
logger.debug("Starting HighThroughputExecutor with provider:\n%s", self.provider)
|
402
408
|
|
403
|
-
# TODO: why is this a provider property?
|
404
|
-
block_ids = []
|
405
|
-
if hasattr(self.provider, 'init_blocks'):
|
406
|
-
try:
|
407
|
-
block_ids = self.scale_out(blocks=self.provider.init_blocks)
|
408
|
-
except Exception as e:
|
409
|
-
logger.error("Scaling out failed: {}".format(e))
|
410
|
-
raise e
|
411
|
-
return block_ids
|
412
|
-
|
413
409
|
def start(self):
|
414
410
|
"""Create the Interchange process and connect to it.
|
415
411
|
"""
|
@@ -424,30 +420,28 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
424
420
|
)
|
425
421
|
|
426
422
|
self.outgoing_q = zmq_pipes.TasksOutgoing(
|
427
|
-
|
423
|
+
self.loopback_address, self.interchange_port_range, self.cert_dir
|
428
424
|
)
|
429
425
|
self.incoming_q = zmq_pipes.ResultsIncoming(
|
430
|
-
|
426
|
+
self.loopback_address, self.interchange_port_range, self.cert_dir
|
431
427
|
)
|
432
428
|
self.command_client = zmq_pipes.CommandClient(
|
433
|
-
|
429
|
+
self.loopback_address, self.interchange_port_range, self.cert_dir
|
434
430
|
)
|
435
431
|
|
436
|
-
self.
|
437
|
-
self.
|
432
|
+
self._result_queue_thread = None
|
433
|
+
self._start_result_queue_thread()
|
438
434
|
self._start_local_interchange_process()
|
439
435
|
|
440
|
-
logger.debug("Created
|
436
|
+
logger.debug("Created result queue thread: %s", self._result_queue_thread)
|
441
437
|
|
442
|
-
|
443
|
-
return block_ids
|
438
|
+
self.initialize_scaling()
|
444
439
|
|
445
440
|
@wrap_with_logs
|
446
|
-
def
|
447
|
-
"""Listen to the queue for task
|
441
|
+
def _result_queue_worker(self):
|
442
|
+
"""Listen to the queue for task result messages and handle them.
|
448
443
|
|
449
|
-
Depending on the message, tasks will be updated with results
|
450
|
-
or updates. It expects the following messages:
|
444
|
+
Depending on the message, tasks will be updated with results or exceptions.
|
451
445
|
|
452
446
|
.. code:: python
|
453
447
|
|
@@ -461,14 +455,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
461
455
|
"task_id" : <task_id>
|
462
456
|
"exception" : serialized exception object, on failure
|
463
457
|
}
|
464
|
-
|
465
|
-
The `None` message is a die request.
|
466
458
|
"""
|
467
|
-
logger.debug("
|
459
|
+
logger.debug("Result queue worker starting")
|
468
460
|
|
469
|
-
while not self.bad_state_is_set:
|
461
|
+
while not self.bad_state_is_set and not self._result_queue_thread_exit.is_set():
|
470
462
|
try:
|
471
|
-
msgs = self.incoming_q.get()
|
463
|
+
msgs = self.incoming_q.get(timeout_ms=self.poll_period)
|
464
|
+
if msgs is None: # timeout
|
465
|
+
continue
|
472
466
|
|
473
467
|
except IOError as e:
|
474
468
|
logger.exception("Caught broken queue with exception code {}: {}".format(e.errno, e))
|
@@ -480,109 +474,114 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
480
474
|
|
481
475
|
else:
|
482
476
|
|
483
|
-
|
484
|
-
|
485
|
-
|
477
|
+
for serialized_msg in msgs:
|
478
|
+
try:
|
479
|
+
msg = pickle.loads(serialized_msg)
|
480
|
+
except pickle.UnpicklingError:
|
481
|
+
raise BadMessage("Message received could not be unpickled")
|
486
482
|
|
487
|
-
|
488
|
-
for serialized_msg in msgs:
|
483
|
+
if msg['type'] == 'result':
|
489
484
|
try:
|
490
|
-
|
491
|
-
except
|
492
|
-
raise BadMessage("Message received
|
485
|
+
tid = msg['task_id']
|
486
|
+
except Exception:
|
487
|
+
raise BadMessage("Message received does not contain 'task_id' field")
|
488
|
+
|
489
|
+
if tid == -1 and 'exception' in msg:
|
490
|
+
logger.warning("Executor shutting down due to exception from interchange")
|
491
|
+
exception = deserialize(msg['exception'])
|
492
|
+
self.set_bad_state_and_fail_all(exception)
|
493
|
+
break
|
494
|
+
|
495
|
+
task_fut = self.tasks.pop(tid)
|
493
496
|
|
494
|
-
if
|
495
|
-
|
496
|
-
|
497
|
+
if 'result' in msg:
|
498
|
+
result = deserialize(msg['result'])
|
499
|
+
task_fut.set_result(result)
|
500
|
+
|
501
|
+
elif 'exception' in msg:
|
497
502
|
try:
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
elif 'exception' in msg:
|
515
|
-
try:
|
516
|
-
s = deserialize(msg['exception'])
|
517
|
-
# s should be a RemoteExceptionWrapper... so we can reraise it
|
518
|
-
if isinstance(s, RemoteExceptionWrapper):
|
519
|
-
try:
|
520
|
-
s.reraise()
|
521
|
-
except Exception as e:
|
522
|
-
task_fut.set_exception(e)
|
523
|
-
elif isinstance(s, Exception):
|
524
|
-
task_fut.set_exception(s)
|
525
|
-
else:
|
526
|
-
raise ValueError("Unknown exception-like type received: {}".format(type(s)))
|
527
|
-
except Exception as e:
|
528
|
-
# TODO could be a proper wrapped exception?
|
529
|
-
task_fut.set_exception(
|
530
|
-
DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
|
531
|
-
else:
|
532
|
-
raise BadMessage("Message received is neither result or exception")
|
503
|
+
s = deserialize(msg['exception'])
|
504
|
+
# s should be a RemoteExceptionWrapper... so we can reraise it
|
505
|
+
if isinstance(s, RemoteExceptionWrapper):
|
506
|
+
try:
|
507
|
+
s.reraise()
|
508
|
+
except Exception as e:
|
509
|
+
task_fut.set_exception(e)
|
510
|
+
elif isinstance(s, Exception):
|
511
|
+
task_fut.set_exception(s)
|
512
|
+
else:
|
513
|
+
raise ValueError("Unknown exception-like type received: {}".format(type(s)))
|
514
|
+
except Exception as e:
|
515
|
+
# TODO could be a proper wrapped exception?
|
516
|
+
task_fut.set_exception(
|
517
|
+
DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
|
533
518
|
else:
|
534
|
-
raise BadMessage("Message received
|
519
|
+
raise BadMessage("Message received is neither result or exception")
|
520
|
+
else:
|
521
|
+
raise BadMessage("Message received with unknown type {}".format(msg['type']))
|
535
522
|
|
536
|
-
logger.info("
|
523
|
+
logger.info("Closing result ZMQ pipe")
|
524
|
+
self.incoming_q.close()
|
525
|
+
logger.info("Result queue worker finished")
|
537
526
|
|
538
|
-
def _start_local_interchange_process(self):
|
527
|
+
def _start_local_interchange_process(self) -> None:
|
539
528
|
""" Starts the interchange process locally
|
540
529
|
|
541
|
-
Starts the interchange process locally and uses
|
530
|
+
Starts the interchange process locally and uses the command queue to
|
542
531
|
get the worker task and result ports that the interchange has bound to.
|
543
532
|
"""
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
533
|
+
|
534
|
+
interchange_config = {"client_address": self.loopback_address,
|
535
|
+
"client_ports": (self.outgoing_q.port,
|
536
|
+
self.incoming_q.port,
|
537
|
+
self.command_client.port),
|
538
|
+
"interchange_address": self.address,
|
539
|
+
"worker_ports": self.worker_ports,
|
540
|
+
"worker_port_range": self.worker_port_range,
|
541
|
+
"hub_address": self.hub_address,
|
542
|
+
"hub_zmq_port": self.hub_zmq_port,
|
543
|
+
"logdir": self.logdir,
|
544
|
+
"heartbeat_threshold": self.heartbeat_threshold,
|
545
|
+
"poll_period": self.poll_period,
|
546
|
+
"logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
|
547
|
+
"cert_dir": self.cert_dir,
|
548
|
+
"manager_selector": self.manager_selector,
|
549
|
+
"run_id": self.run_id,
|
550
|
+
}
|
551
|
+
|
552
|
+
config_pickle = pickle.dumps(interchange_config)
|
553
|
+
|
554
|
+
self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd, stdin=subprocess.PIPE)
|
555
|
+
stdin = self.interchange_proc.stdin
|
556
|
+
assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
|
557
|
+
|
558
|
+
logger.debug("Popened interchange process. Writing config object")
|
559
|
+
stdin.write(config_pickle)
|
560
|
+
stdin.flush()
|
561
|
+
stdin.close()
|
562
|
+
logger.debug("Sent config object. Requesting worker ports")
|
565
563
|
try:
|
566
|
-
(self.worker_task_port, self.worker_result_port) =
|
567
|
-
except
|
568
|
-
logger.error("Interchange has not completed initialization
|
564
|
+
(self.worker_task_port, self.worker_result_port) = self.command_client.run("WORKER_PORTS", timeout_s=120)
|
565
|
+
except CommandClientTimeoutError:
|
566
|
+
logger.error("Interchange has not completed initialization. Aborting")
|
569
567
|
raise Exception("Interchange failed to start")
|
568
|
+
logger.debug("Got worker ports")
|
570
569
|
|
571
|
-
def
|
572
|
-
"""Method to start the
|
570
|
+
def _start_result_queue_thread(self):
|
571
|
+
"""Method to start the result queue thread as a daemon.
|
573
572
|
|
574
573
|
Checks if a thread already exists, then starts it.
|
575
|
-
Could be used later as a restart if the
|
574
|
+
Could be used later as a restart if the result queue thread dies.
|
576
575
|
"""
|
577
|
-
if self.
|
578
|
-
logger.debug("Starting queue
|
579
|
-
self.
|
580
|
-
self.
|
581
|
-
self.
|
582
|
-
logger.debug("Started queue
|
576
|
+
if self._result_queue_thread is None:
|
577
|
+
logger.debug("Starting result queue thread")
|
578
|
+
self._result_queue_thread = threading.Thread(target=self._result_queue_worker, name="HTEX-Result-Queue-Thread")
|
579
|
+
self._result_queue_thread.daemon = True
|
580
|
+
self._result_queue_thread.start()
|
581
|
+
logger.debug("Started result queue thread")
|
583
582
|
|
584
583
|
else:
|
585
|
-
logger.error("
|
584
|
+
logger.error("Result queue thread already exists, returning")
|
586
585
|
|
587
586
|
def hold_worker(self, worker_id: str) -> None:
|
588
587
|
"""Puts a worker on hold, preventing scheduling of additional tasks to it.
|
@@ -603,7 +602,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
603
602
|
def outstanding(self) -> int:
|
604
603
|
"""Returns the count of tasks outstanding across the interchange
|
605
604
|
and managers"""
|
606
|
-
return self.
|
605
|
+
return len(self.tasks)
|
607
606
|
|
608
607
|
@property
|
609
608
|
def connected_workers(self) -> int:
|
@@ -655,7 +654,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
655
654
|
Returns:
|
656
655
|
Future
|
657
656
|
"""
|
658
|
-
|
657
|
+
|
658
|
+
self.validate_resource_spec(resource_specification)
|
659
659
|
|
660
660
|
if self.bad_state_is_set:
|
661
661
|
raise self.executor_exception
|
@@ -679,7 +679,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
679
679
|
except TypeError:
|
680
680
|
raise SerializationError(func.__name__)
|
681
681
|
|
682
|
-
msg = {"task_id": task_id, "buffer": fn_buf}
|
682
|
+
msg = {"task_id": task_id, "resource_spec": resource_specification, "buffer": fn_buf}
|
683
683
|
|
684
684
|
# Post task to the outgoing queue
|
685
685
|
self.outgoing_q.put(msg)
|
@@ -687,22 +687,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
687
687
|
# Return the future
|
688
688
|
return fut
|
689
689
|
|
690
|
-
def create_monitoring_info(self, status):
|
691
|
-
""" Create a msg for monitoring based on the poll status
|
692
|
-
|
693
|
-
"""
|
694
|
-
msg = []
|
695
|
-
for bid, s in status.items():
|
696
|
-
d = {}
|
697
|
-
d['run_id'] = self.run_id
|
698
|
-
d['status'] = s.status_name
|
699
|
-
d['timestamp'] = datetime.datetime.now()
|
700
|
-
d['executor_label'] = self.label
|
701
|
-
d['job_id'] = self.blocks.get(bid, None)
|
702
|
-
d['block_id'] = bid
|
703
|
-
msg.append(d)
|
704
|
-
return msg
|
705
|
-
|
706
690
|
@property
|
707
691
|
def workers_per_node(self) -> Union[int, float]:
|
708
692
|
return self._workers_per_node
|
@@ -740,14 +724,24 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
740
724
|
tasks: int # sum of tasks in this block
|
741
725
|
idle: float # shortest idle time of any manager in this block
|
742
726
|
|
727
|
+
# block_info will be populated from two sources:
|
728
|
+
# the Job Status Poller mutable block list, and the list of blocks
|
729
|
+
# which have connected to the interchange.
|
730
|
+
|
731
|
+
def new_block_info():
|
732
|
+
return BlockInfo(tasks=0, idle=float('inf'))
|
733
|
+
|
734
|
+
block_info: Dict[str, BlockInfo] = defaultdict(new_block_info)
|
735
|
+
|
736
|
+
for block_id, job_status in self._status.items():
|
737
|
+
if job_status.state not in TERMINAL_STATES:
|
738
|
+
block_info[block_id] = new_block_info()
|
739
|
+
|
743
740
|
managers = self.connected_managers()
|
744
|
-
block_info: Dict[str, BlockInfo] = {}
|
745
741
|
for manager in managers:
|
746
742
|
if not manager['active']:
|
747
743
|
continue
|
748
744
|
b_id = manager['block_id']
|
749
|
-
if b_id not in block_info:
|
750
|
-
block_info[b_id] = BlockInfo(tasks=0, idle=float('inf'))
|
751
745
|
block_info[b_id].tasks += manager['tasks']
|
752
746
|
block_info[b_id].idle = min(block_info[b_id].idle, manager['idle_duration'])
|
753
747
|
|
@@ -779,14 +773,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
779
773
|
|
780
774
|
# Now kill via provider
|
781
775
|
# Potential issue with multiple threads trying to remove the same blocks
|
782
|
-
to_kill = [self.
|
776
|
+
to_kill = [self.blocks_to_job_id[bid] for bid in block_ids_to_kill if bid in self.blocks_to_job_id]
|
783
777
|
|
784
778
|
r = self.provider.cancel(to_kill)
|
785
779
|
job_ids = self._filter_scale_in_ids(to_kill, r)
|
786
780
|
|
787
|
-
# to_kill block_ids are fetched from self.
|
788
|
-
# If a block_id is in self.
|
789
|
-
block_ids_killed = [self.
|
781
|
+
# to_kill block_ids are fetched from self.blocks_to_job_id
|
782
|
+
# If a block_id is in self.blocks_to_job_id, it must exist in self.job_ids_to_block
|
783
|
+
block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
|
790
784
|
|
791
785
|
return block_ids_killed
|
792
786
|
|
@@ -801,7 +795,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
801
795
|
connected_blocks = self.connected_blocks()
|
802
796
|
for job_id in job_status:
|
803
797
|
job_info = job_status[job_id]
|
804
|
-
if job_info.terminal and job_id not in connected_blocks:
|
798
|
+
if job_info.terminal and job_id not in connected_blocks and job_info.state != JobState.SCALED_IN:
|
799
|
+
logger.debug("Rewriting job %s from status %s to MISSING", job_id, job_info)
|
805
800
|
job_status[job_id].state = JobState.MISSING
|
806
801
|
if job_status[job_id].message is None:
|
807
802
|
job_status[job_id].message = (
|
@@ -829,10 +824,37 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
829
824
|
|
830
825
|
logger.info("Attempting HighThroughputExecutor shutdown")
|
831
826
|
|
827
|
+
logger.info("Terminating interchange and result queue thread")
|
828
|
+
self._result_queue_thread_exit.set()
|
832
829
|
self.interchange_proc.terminate()
|
833
|
-
|
834
|
-
|
835
|
-
|
830
|
+
try:
|
831
|
+
self.interchange_proc.wait(timeout=timeout)
|
832
|
+
except subprocess.TimeoutExpired:
|
833
|
+
logger.warning("Unable to terminate Interchange process; sending SIGKILL")
|
836
834
|
self.interchange_proc.kill()
|
837
835
|
|
836
|
+
logger.info("Closing ZMQ pipes")
|
837
|
+
|
838
|
+
# These pipes are used in a thread unsafe manner. If you have traced a
|
839
|
+
# problem to this block of code, you might consider what is happening
|
840
|
+
# with other threads that access these.
|
841
|
+
|
842
|
+
# incoming_q is not closed here because it is used by the results queue
|
843
|
+
# worker which is not shut down at this point.
|
844
|
+
|
845
|
+
if hasattr(self, 'outgoing_q'):
|
846
|
+
logger.info("Closing outgoing_q")
|
847
|
+
self.outgoing_q.close()
|
848
|
+
|
849
|
+
if hasattr(self, 'command_client'):
|
850
|
+
logger.info("Closing command client")
|
851
|
+
self.command_client.close()
|
852
|
+
|
853
|
+
logger.info("Waiting for result queue thread exit")
|
854
|
+
if self._result_queue_thread:
|
855
|
+
self._result_queue_thread.join()
|
856
|
+
|
838
857
|
logger.info("Finished HighThroughputExecutor shutdown attempt")
|
858
|
+
|
859
|
+
def get_usage_information(self):
|
860
|
+
return {"mpi": self.enable_mpi_mode}
|