parsl 2024.3.11__py3-none-any.whl → 2025.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/__init__.py +9 -10
- parsl/addresses.py +29 -7
- parsl/app/app.py +7 -8
- parsl/app/bash.py +15 -8
- parsl/app/errors.py +10 -13
- parsl/app/futures.py +8 -10
- parsl/app/python.py +2 -1
- parsl/benchmark/perf.py +2 -1
- parsl/concurrent/__init__.py +2 -2
- parsl/config.py +57 -10
- parsl/configs/ASPIRE1.py +6 -5
- parsl/configs/Azure.py +9 -8
- parsl/configs/bridges.py +6 -4
- parsl/configs/cc_in2p3.py +3 -3
- parsl/configs/ec2.py +3 -1
- parsl/configs/expanse.py +4 -3
- parsl/configs/frontera.py +3 -4
- parsl/configs/htex_local.py +3 -4
- parsl/configs/illinoiscluster.py +3 -1
- parsl/configs/improv.py +34 -0
- parsl/configs/kubernetes.py +4 -3
- parsl/configs/local_threads.py +5 -1
- parsl/configs/midway.py +5 -3
- parsl/configs/osg.py +4 -2
- parsl/configs/polaris.py +4 -2
- parsl/configs/stampede2.py +6 -5
- parsl/configs/summit.py +3 -3
- parsl/configs/toss3_llnl.py +4 -3
- parsl/configs/vineex_local.py +6 -4
- parsl/configs/wqex_local.py +5 -3
- parsl/curvezmq.py +4 -0
- parsl/data_provider/data_manager.py +4 -3
- parsl/data_provider/file_noop.py +1 -2
- parsl/data_provider/files.py +3 -3
- parsl/data_provider/ftp.py +1 -3
- parsl/data_provider/globus.py +7 -6
- parsl/data_provider/http.py +2 -2
- parsl/data_provider/rsync.py +1 -1
- parsl/data_provider/staging.py +2 -2
- parsl/data_provider/zip.py +135 -0
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +262 -224
- parsl/dataflow/errors.py +3 -5
- parsl/dataflow/futures.py +27 -14
- parsl/dataflow/memoization.py +5 -5
- parsl/dataflow/rundirs.py +5 -6
- parsl/dataflow/taskrecord.py +4 -5
- parsl/executors/__init__.py +4 -2
- parsl/executors/base.py +45 -15
- parsl/executors/errors.py +13 -0
- parsl/executors/execute_task.py +37 -0
- parsl/executors/flux/execute_parsl_task.py +3 -3
- parsl/executors/flux/executor.py +18 -19
- parsl/executors/flux/flux_instance_manager.py +26 -27
- parsl/executors/high_throughput/errors.py +43 -3
- parsl/executors/high_throughput/executor.py +316 -282
- parsl/executors/high_throughput/interchange.py +158 -167
- parsl/executors/high_throughput/manager_record.py +5 -0
- parsl/executors/high_throughput/manager_selector.py +55 -0
- parsl/executors/high_throughput/monitoring_info.py +2 -1
- parsl/executors/high_throughput/mpi_executor.py +113 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
- parsl/executors/high_throughput/mpi_resource_management.py +6 -17
- parsl/executors/high_throughput/probe.py +9 -7
- parsl/executors/high_throughput/process_worker_pool.py +115 -77
- parsl/executors/high_throughput/zmq_pipes.py +81 -23
- parsl/executors/radical/executor.py +130 -79
- parsl/executors/radical/rpex_resources.py +17 -15
- parsl/executors/radical/rpex_worker.py +4 -3
- parsl/executors/status_handling.py +157 -51
- parsl/executors/taskvine/__init__.py +1 -1
- parsl/executors/taskvine/errors.py +1 -1
- parsl/executors/taskvine/exec_parsl_function.py +2 -2
- parsl/executors/taskvine/executor.py +41 -57
- parsl/executors/taskvine/factory.py +1 -1
- parsl/executors/taskvine/factory_config.py +1 -1
- parsl/executors/taskvine/manager.py +18 -13
- parsl/executors/taskvine/manager_config.py +9 -5
- parsl/executors/threads.py +6 -6
- parsl/executors/workqueue/errors.py +1 -1
- parsl/executors/workqueue/exec_parsl_function.py +6 -5
- parsl/executors/workqueue/executor.py +64 -63
- parsl/executors/workqueue/parsl_coprocess.py +1 -1
- parsl/jobs/error_handlers.py +2 -2
- parsl/jobs/job_status_poller.py +30 -113
- parsl/jobs/states.py +7 -2
- parsl/jobs/strategy.py +43 -31
- parsl/launchers/__init__.py +12 -3
- parsl/launchers/errors.py +1 -1
- parsl/launchers/launchers.py +6 -12
- parsl/log_utils.py +9 -6
- parsl/monitoring/db_manager.py +59 -95
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +87 -356
- parsl/monitoring/queries/pandas.py +1 -2
- parsl/monitoring/radios/base.py +13 -0
- parsl/monitoring/radios/filesystem.py +52 -0
- parsl/monitoring/radios/htex.py +57 -0
- parsl/monitoring/radios/multiprocessing.py +17 -0
- parsl/monitoring/radios/udp.py +56 -0
- parsl/monitoring/radios/zmq.py +17 -0
- parsl/monitoring/remote.py +33 -37
- parsl/monitoring/router.py +212 -0
- parsl/monitoring/types.py +5 -6
- parsl/monitoring/visualization/app.py +4 -2
- parsl/monitoring/visualization/models.py +0 -1
- parsl/monitoring/visualization/plots/default/workflow_plots.py +11 -4
- parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
- parsl/monitoring/visualization/utils.py +0 -1
- parsl/monitoring/visualization/views.py +16 -8
- parsl/multiprocessing.py +0 -1
- parsl/process_loggers.py +1 -2
- parsl/providers/__init__.py +8 -17
- parsl/providers/aws/aws.py +2 -3
- parsl/providers/azure/azure.py +4 -5
- parsl/providers/base.py +2 -18
- parsl/providers/cluster_provider.py +4 -12
- parsl/providers/condor/condor.py +7 -17
- parsl/providers/errors.py +2 -2
- parsl/providers/googlecloud/googlecloud.py +2 -1
- parsl/providers/grid_engine/grid_engine.py +5 -14
- parsl/providers/kubernetes/kube.py +80 -40
- parsl/providers/local/local.py +13 -26
- parsl/providers/lsf/lsf.py +5 -23
- parsl/providers/pbspro/pbspro.py +5 -17
- parsl/providers/slurm/slurm.py +81 -39
- parsl/providers/torque/torque.py +3 -14
- parsl/serialize/__init__.py +8 -3
- parsl/serialize/base.py +1 -2
- parsl/serialize/concretes.py +5 -4
- parsl/serialize/facade.py +3 -3
- parsl/serialize/proxystore.py +3 -2
- parsl/tests/__init__.py +1 -1
- parsl/tests/configs/azure_single_node.py +4 -5
- parsl/tests/configs/bridges.py +3 -2
- parsl/tests/configs/cc_in2p3.py +1 -3
- parsl/tests/configs/comet.py +2 -1
- parsl/tests/configs/ec2_single_node.py +1 -2
- parsl/tests/configs/ec2_spot.py +1 -2
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/configs/frontera.py +2 -3
- parsl/tests/configs/htex_local.py +3 -5
- parsl/tests/configs/htex_local_alternate.py +11 -15
- parsl/tests/configs/htex_local_intask_staging.py +5 -9
- parsl/tests/configs/htex_local_rsync_staging.py +4 -8
- parsl/tests/configs/local_radical.py +1 -3
- parsl/tests/configs/local_radical_mpi.py +2 -2
- parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/configs/midway.py +2 -2
- parsl/tests/configs/nscc_singapore.py +3 -3
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +3 -2
- parsl/tests/configs/slurm_local.py +24 -0
- parsl/tests/configs/summit.py +1 -0
- parsl/tests/configs/taskvine_ex.py +4 -7
- parsl/tests/configs/user_opts.py +2 -8
- parsl/tests/configs/workqueue_ex.py +4 -6
- parsl/tests/conftest.py +27 -13
- parsl/tests/integration/test_stress/test_python_simple.py +3 -4
- parsl/tests/integration/test_stress/test_python_threads.py +3 -5
- parsl/tests/manual_tests/htex_local.py +4 -6
- parsl/tests/manual_tests/test_basic.py +1 -0
- parsl/tests/manual_tests/test_log_filter.py +3 -1
- parsl/tests/manual_tests/test_memory_limits.py +6 -8
- parsl/tests/manual_tests/test_regression_220.py +2 -1
- parsl/tests/manual_tests/test_udp_simple.py +4 -4
- parsl/tests/manual_tests/test_worker_count.py +3 -2
- parsl/tests/scaling_tests/htex_local.py +2 -4
- parsl/tests/scaling_tests/test_scale.py +0 -9
- parsl/tests/scaling_tests/vineex_condor.py +1 -2
- parsl/tests/scaling_tests/vineex_local.py +1 -2
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/site_tests/test_provider.py +4 -2
- parsl/tests/site_tests/test_site.py +2 -0
- parsl/tests/sites/test_affinity.py +7 -7
- parsl/tests/sites/test_dynamic_executor.py +3 -4
- parsl/tests/sites/test_ec2.py +3 -2
- parsl/tests/sites/test_worker_info.py +4 -5
- parsl/tests/test_aalst_patterns.py +0 -1
- parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
- parsl/tests/test_bash_apps/test_basic.py +10 -4
- parsl/tests/test_bash_apps/test_error_codes.py +5 -7
- parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
- parsl/tests/test_bash_apps/test_memoize.py +2 -8
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
- parsl/tests/test_bash_apps/test_multiline.py +1 -1
- parsl/tests/test_bash_apps/test_pipeline.py +1 -1
- parsl/tests/test_bash_apps/test_std_uri.py +123 -0
- parsl/tests/test_bash_apps/test_stdout.py +33 -8
- parsl/tests/test_callables.py +2 -2
- parsl/tests/test_checkpointing/test_periodic.py +21 -39
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_regression_239.py +1 -1
- parsl/tests/test_checkpointing/test_task_exit.py +2 -3
- parsl/tests/test_docs/test_from_slides.py +5 -2
- parsl/tests/test_docs/test_kwargs.py +4 -1
- parsl/tests/test_docs/test_tutorial_1.py +1 -2
- parsl/tests/test_docs/test_workflow1.py +2 -2
- parsl/tests/test_docs/test_workflow2.py +0 -1
- parsl/tests/test_error_handling/test_rand_fail.py +2 -2
- parsl/tests/test_error_handling/test_resource_spec.py +10 -12
- parsl/tests/test_error_handling/test_retries.py +6 -16
- parsl/tests/test_error_handling/test_retry_handler.py +1 -0
- parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
- parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
- parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
- parsl/tests/test_execute_task.py +29 -0
- parsl/tests/test_flux.py +1 -1
- parsl/tests/test_htex/test_basic.py +2 -3
- parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
- parsl/tests/test_htex/test_command_client_timeout.py +66 -0
- parsl/tests/test_htex/test_connected_blocks.py +3 -2
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
- parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_drain.py +79 -0
- parsl/tests/test_htex/test_htex.py +51 -25
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
- parsl/tests/test_htex/test_managers_command.py +36 -0
- parsl/tests/test_htex/test_missing_worker.py +2 -12
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
- parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
- parsl/tests/test_htex/test_zmq_binding.py +29 -8
- parsl/tests/test_monitoring/test_app_names.py +86 -0
- parsl/tests/test_monitoring/test_basic.py +73 -25
- parsl/tests/test_monitoring/test_db_locks.py +6 -4
- parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
- parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
- parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
- parsl/tests/test_monitoring/test_stdouterr.py +134 -0
- parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
- parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
- parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
- parsl/tests/test_providers/test_local_provider.py +3 -132
- parsl/tests/test_providers/test_pbspro_template.py +2 -3
- parsl/tests/test_providers/test_slurm_template.py +2 -3
- parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
- parsl/tests/test_python_apps/test_context_manager.py +128 -0
- parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_python_apps/test_fail.py +0 -25
- parsl/tests/test_python_apps/test_futures.py +2 -1
- parsl/tests/test_python_apps/test_inputs_default.py +22 -0
- parsl/tests/test_python_apps/test_join.py +0 -1
- parsl/tests/test_python_apps/test_lifted.py +11 -7
- parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
- parsl/tests/test_python_apps/test_outputs.py +1 -1
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_radical/test_mpi_funcs.py +1 -2
- parsl/tests/test_regression/test_1480.py +2 -1
- parsl/tests/test_regression/test_1653.py +2 -1
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_regression/test_2652.py +1 -0
- parsl/tests/test_regression/test_69a.py +0 -1
- parsl/tests/test_regression/test_854.py +4 -2
- parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
- parsl/tests/test_regression/test_98.py +0 -1
- parsl/tests/test_scaling/test_block_error_handler.py +9 -4
- parsl/tests/test_scaling/test_regression_1621.py +11 -15
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
- parsl/tests/test_scaling/test_scale_down.py +2 -5
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +6 -18
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
- parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
- parsl/tests/test_serialization/test_basic.py +2 -1
- parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
- parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
- parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
- parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
- parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
- parsl/tests/test_staging/staging_provider.py +2 -2
- parsl/tests/test_staging/test_1316.py +3 -4
- parsl/tests/test_staging/test_docs_1.py +2 -1
- parsl/tests/test_staging/test_docs_2.py +2 -1
- parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
- parsl/tests/{test_data → test_staging}/test_file.py +6 -6
- parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +5 -2
- parsl/tests/test_staging/test_staging_stdout.py +64 -0
- parsl/tests/test_staging/test_zip_in.py +39 -0
- parsl/tests/test_staging/test_zip_out.py +110 -0
- parsl/tests/test_staging/test_zip_to_zip.py +41 -0
- parsl/tests/test_summary.py +2 -2
- parsl/tests/test_thread_parallelism.py +0 -1
- parsl/tests/test_threads/test_configs.py +1 -2
- parsl/tests/test_threads/test_lazy_errors.py +2 -2
- parsl/tests/test_utils/test_execute_wait.py +35 -0
- parsl/tests/test_utils/test_sanitize_dns.py +76 -0
- parsl/tests/unit/test_address.py +20 -0
- parsl/tests/unit/test_file.py +99 -0
- parsl/tests/unit/test_usage_tracking.py +66 -0
- parsl/usage_tracking/api.py +65 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +104 -62
- parsl/utils.py +139 -6
- parsl/version.py +1 -1
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
- parsl-2025.1.13.data/scripts/interchange.py +649 -0
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +115 -77
- parsl-2025.1.13.dist-info/METADATA +96 -0
- parsl-2025.1.13.dist-info/RECORD +462 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
- parsl/channels/__init__.py +0 -7
- parsl/channels/base.py +0 -141
- parsl/channels/errors.py +0 -113
- parsl/channels/local/local.py +0 -164
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
- parsl/channels/ssh/ssh.py +0 -276
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -74
- parsl/configs/ad_hoc.py +0 -35
- parsl/executors/radical/rpex_master.py +0 -42
- parsl/monitoring/radios.py +0 -175
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -248
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/integration/test_channels/__init__.py +0 -0
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- parsl/tests/integration/test_channels/test_local_channel.py +0 -42
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl/tests/sites/test_local_adhoc.py +0 -61
- parsl/tests/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/test_large_output.py +0 -22
- parsl/tests/test_data/__init__.py +0 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
- parsl-2024.3.11.dist-info/METADATA +0 -98
- parsl-2024.3.11.dist-info/RECORD +0 -447
- parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
- parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
@@ -1,43 +1,41 @@
|
|
1
|
-
import typing
|
2
|
-
from concurrent.futures import Future
|
3
|
-
import typeguard
|
4
1
|
import logging
|
5
|
-
import threading
|
6
|
-
import queue
|
7
|
-
import datetime
|
8
|
-
import pickle
|
9
|
-
from dataclasses import dataclass
|
10
|
-
from multiprocessing import Process, Queue
|
11
|
-
from typing import Dict, Sequence
|
12
|
-
from typing import List, Optional, Tuple, Union, Callable
|
13
2
|
import math
|
3
|
+
import pickle
|
4
|
+
import subprocess
|
5
|
+
import threading
|
6
|
+
import typing
|
14
7
|
import warnings
|
8
|
+
from collections import defaultdict
|
9
|
+
from concurrent.futures import Future
|
10
|
+
from dataclasses import dataclass
|
11
|
+
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
|
12
|
+
|
13
|
+
import typeguard
|
15
14
|
|
16
|
-
import
|
17
|
-
from parsl.
|
18
|
-
from parsl.serialize.errors import SerializationError, DeserializationError
|
15
|
+
from parsl import curvezmq
|
16
|
+
from parsl.addresses import get_all_addresses
|
19
17
|
from parsl.app.errors import RemoteExceptionWrapper
|
20
|
-
from parsl.
|
21
|
-
from parsl.executors.high_throughput import zmq_pipes
|
22
|
-
from parsl.executors.high_throughput import interchange
|
18
|
+
from parsl.data_provider.staging import Staging
|
23
19
|
from parsl.executors.errors import (
|
24
|
-
BadMessage,
|
20
|
+
BadMessage,
|
21
|
+
InvalidResourceSpecification,
|
22
|
+
ScalingFailed,
|
25
23
|
)
|
26
|
-
from parsl.executors.high_throughput
|
27
|
-
|
28
|
-
|
24
|
+
from parsl.executors.high_throughput import zmq_pipes
|
25
|
+
from parsl.executors.high_throughput.errors import CommandClientTimeoutError
|
26
|
+
from parsl.executors.high_throughput.manager_selector import (
|
27
|
+
ManagerSelector,
|
28
|
+
RandomManagerSelector,
|
29
29
|
)
|
30
|
-
|
31
|
-
from parsl import curvezmq
|
32
30
|
from parsl.executors.status_handling import BlockProviderExecutor
|
33
|
-
from parsl.
|
34
|
-
from parsl.data_provider.staging import Staging
|
35
|
-
from parsl.addresses import get_all_addresses
|
31
|
+
from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
|
36
32
|
from parsl.process_loggers import wrap_with_logs
|
37
|
-
|
38
|
-
from parsl.multiprocessing import ForkProcess
|
39
|
-
from parsl.utils import RepresentationMixin
|
40
33
|
from parsl.providers import LocalProvider
|
34
|
+
from parsl.providers.base import ExecutionProvider
|
35
|
+
from parsl.serialize import deserialize, pack_res_spec_apply_message
|
36
|
+
from parsl.serialize.errors import DeserializationError, SerializationError
|
37
|
+
from parsl.usage_tracking.api import UsageInformation
|
38
|
+
from parsl.utils import RepresentationMixin
|
41
39
|
|
42
40
|
logger = logging.getLogger(__name__)
|
43
41
|
|
@@ -55,54 +53,16 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
|
|
55
53
|
"--hb_period={heartbeat_period} "
|
56
54
|
"{address_probe_timeout_string} "
|
57
55
|
"--hb_threshold={heartbeat_threshold} "
|
56
|
+
"--drain_period={drain_period} "
|
58
57
|
"--cpu-affinity {cpu_affinity} "
|
59
58
|
"{enable_mpi_mode} "
|
60
59
|
"--mpi-launcher={mpi_launcher} "
|
61
60
|
"--available-accelerators {accelerators}")
|
62
61
|
|
62
|
+
DEFAULT_INTERCHANGE_LAUNCH_CMD = ["interchange.py"]
|
63
63
|
|
64
|
-
|
65
|
-
"""Executor designed for cluster-scale
|
66
|
-
|
67
|
-
The HighThroughputExecutor system has the following components:
|
68
|
-
1. The HighThroughputExecutor instance which is run as part of the Parsl script.
|
69
|
-
2. The Interchange which acts as a load-balancing proxy between workers and Parsl
|
70
|
-
3. The multiprocessing based worker pool which coordinates task execution over several
|
71
|
-
cores on a node.
|
72
|
-
4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
|
73
|
-
|
74
|
-
Here is a diagram
|
75
|
-
|
76
|
-
.. code:: python
|
77
|
-
|
78
|
-
|
79
|
-
| Data | Executor | Interchange | External Process(es)
|
80
|
-
| Flow | | |
|
81
|
-
Task | Kernel | | |
|
82
|
-
+----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
|
83
|
-
| | | | batching | | |
|
84
|
-
Parsl<---Fut-| | | load-balancing| result exception
|
85
|
-
^ | | | watchdogs | | |
|
86
|
-
| | | Q_mngmnt | | V V
|
87
|
-
| | | Thread<--|-incoming_q<---|--- +---------+
|
88
|
-
| | | | | |
|
89
|
-
| | | | | |
|
90
|
-
+----update_fut-----+
|
91
|
-
|
92
|
-
|
93
|
-
Each of the workers in each process_worker_pool has access to its local rank through
|
94
|
-
an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
|
95
|
-
and is an integer in the range from 0 to the number of workers per in the pool minus 1.
|
96
|
-
The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
|
97
|
-
and the size of the worker pool as ``PARSL_WORKER_COUNT``.
|
98
|
-
|
99
|
-
|
100
|
-
Parameters
|
101
|
-
----------
|
102
|
-
|
103
|
-
provider : :class:`~parsl.providers.base.ExecutionProvider`
|
64
|
+
GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
|
104
65
|
Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
|
105
|
-
:class:`~parsl.providers.cobalt.cobalt.Cobalt`,
|
106
66
|
:class:`~parsl.providers.condor.condor.Condor`,
|
107
67
|
:class:`~parsl.providers.googlecloud.googlecloud.GoogleCloud`,
|
108
68
|
:class:`~parsl.providers.gridEngine.gridEngine.GridEngine`,
|
@@ -120,9 +80,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
120
80
|
cores_per_worker, nodes_per_block, heartbeat_period ,heartbeat_threshold, logdir). For example:
|
121
81
|
launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}"
|
122
82
|
|
83
|
+
interchange_launch_cmd : Sequence[str]
|
84
|
+
Custom sequence of command line tokens to launch the interchange process from the executor. If
|
85
|
+
undefined, the executor will use the default "interchange.py" command.
|
86
|
+
|
123
87
|
address : string
|
124
88
|
An address to connect to the main Parsl process which is reachable from the network in which
|
125
|
-
workers will be running. This field expects an IPv4 address
|
89
|
+
workers will be running. This field expects an IPv4 or IPv6 address.
|
126
90
|
Most login nodes on clusters have several network interfaces available, only some of which
|
127
91
|
can be reached from the compute nodes. This field can be used to limit the executor to listen
|
128
92
|
only on a specific interface, and limiting connections to the internal network.
|
@@ -130,6 +94,11 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
130
94
|
Setting an address here overrides the default behavior.
|
131
95
|
default=None
|
132
96
|
|
97
|
+
loopback_address: string
|
98
|
+
Specify address used for internal communication between executor and interchange.
|
99
|
+
Supports IPv4 and IPv6 addresses
|
100
|
+
default=127.0.0.1
|
101
|
+
|
133
102
|
worker_ports : (int, int)
|
134
103
|
Specify the ports to be used by workers to connect to Parsl. If this option is specified,
|
135
104
|
worker_port_range will not be honored.
|
@@ -146,6 +115,91 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
146
115
|
worker_debug : Bool
|
147
116
|
Enables worker debug logging.
|
148
117
|
|
118
|
+
prefetch_capacity : int
|
119
|
+
Number of tasks that could be prefetched over available worker capacity.
|
120
|
+
When there are a few tasks (<100) or when tasks are long running, this option should
|
121
|
+
be set to 0 for better load balancing. Default is 0.
|
122
|
+
|
123
|
+
address_probe_timeout : int | None
|
124
|
+
Managers attempt connecting over many different addresses to determine a viable address.
|
125
|
+
This option sets a time limit in seconds on the connection attempt.
|
126
|
+
Default of None implies 30s timeout set on worker.
|
127
|
+
|
128
|
+
heartbeat_threshold : int
|
129
|
+
Seconds since the last message from the counterpart in the communication pair:
|
130
|
+
(interchange, manager) after which the counterpart is assumed to be un-available. Default: 120s
|
131
|
+
|
132
|
+
heartbeat_period : int
|
133
|
+
Number of seconds after which a heartbeat message indicating liveness is sent to the
|
134
|
+
counterpart (interchange, manager). Default: 30s
|
135
|
+
|
136
|
+
poll_period : int
|
137
|
+
Timeout period to be used by the executor components in milliseconds. Increasing poll_periods
|
138
|
+
trades performance for cpu efficiency. Default: 10ms
|
139
|
+
|
140
|
+
drain_period : int
|
141
|
+
The number of seconds after start when workers will begin to drain
|
142
|
+
and then exit. Set this to a time that is slightly less than the
|
143
|
+
maximum walltime of batch jobs to avoid killing tasks while they
|
144
|
+
execute. For example, you could set this to the walltime minus a grace
|
145
|
+
period for the batch job to start the workers, minus the expected
|
146
|
+
maximum length of an individual task.
|
147
|
+
|
148
|
+
worker_logdir_root : string
|
149
|
+
In case of a remote file system, specify the path to where logs will be kept.
|
150
|
+
|
151
|
+
encrypted : bool
|
152
|
+
Flag to enable/disable encryption (CurveZMQ). Default is False.
|
153
|
+
|
154
|
+
manager_selector: ManagerSelector
|
155
|
+
Determines what strategy the interchange uses to select managers during task distribution.
|
156
|
+
See API reference under "Manager Selectors" regarding the various manager selectors.
|
157
|
+
Default: 'RandomManagerSelector'
|
158
|
+
""" # Documentation for params used by both HTEx and MPIEx
|
159
|
+
|
160
|
+
|
161
|
+
class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
|
162
|
+
__doc__ = f"""Executor designed for cluster-scale
|
163
|
+
|
164
|
+
The HighThroughputExecutor system has the following components:
|
165
|
+
1. The HighThroughputExecutor instance which is run as part of the Parsl script.
|
166
|
+
2. The Interchange which acts as a load-balancing proxy between workers and Parsl
|
167
|
+
3. The multiprocessing based worker pool which coordinates task execution over several
|
168
|
+
cores on a node.
|
169
|
+
4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
|
170
|
+
|
171
|
+
Here is a diagram
|
172
|
+
|
173
|
+
.. code:: python
|
174
|
+
|
175
|
+
|
176
|
+
| Data | Executor | Interchange | External Process(es)
|
177
|
+
| Flow | | |
|
178
|
+
Task | Kernel | | |
|
179
|
+
+----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
|
180
|
+
| | | | batching | | |
|
181
|
+
Parsl<---Fut-| | | load-balancing| result exception
|
182
|
+
^ | | | watchdogs | | |
|
183
|
+
| | | Result | | | |
|
184
|
+
| | | Queue | | V V
|
185
|
+
| | | Thread<--|-incoming_q<---|--- +---------+
|
186
|
+
| | | | | |
|
187
|
+
| | | | | |
|
188
|
+
+----update_fut-----+
|
189
|
+
|
190
|
+
|
191
|
+
Each of the workers in each process_worker_pool has access to its local rank through
|
192
|
+
an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
|
193
|
+
and is an integer in the range from 0 to the number of workers per in the pool minus 1.
|
194
|
+
The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
|
195
|
+
and the size of the worker pool as ``PARSL_WORKER_COUNT``.
|
196
|
+
|
197
|
+
|
198
|
+
Parameters
|
199
|
+
----------
|
200
|
+
|
201
|
+
{GENERAL_HTEX_PARAM_DOCS}
|
202
|
+
|
149
203
|
cores_per_worker : float
|
150
204
|
cores to be assigned to each worker. Oversubscription is possible
|
151
205
|
by setting cores_per_worker < 1.0. Default=1
|
@@ -155,9 +209,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
155
209
|
will check the available memory at startup and limit the number of workers such that
|
156
210
|
the there's sufficient memory for each worker. Default: None
|
157
211
|
|
158
|
-
max_workers : int
|
159
|
-
Deprecated. Please use max_workers_per_node instead.
|
160
|
-
|
161
212
|
max_workers_per_node : int
|
162
213
|
Caps the number of workers launched per node. Default: None
|
163
214
|
|
@@ -179,44 +230,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
179
230
|
|
180
231
|
default: empty list
|
181
232
|
|
182
|
-
prefetch_capacity : int
|
183
|
-
Number of tasks that could be prefetched over available worker capacity.
|
184
|
-
When there are a few tasks (<100) or when tasks are long running, this option should
|
185
|
-
be set to 0 for better load balancing. Default is 0.
|
186
|
-
|
187
|
-
address_probe_timeout : int | None
|
188
|
-
Managers attempt connecting over many different addresses to determine a viable address.
|
189
|
-
This option sets a time limit in seconds on the connection attempt.
|
190
|
-
Default of None implies 30s timeout set on worker.
|
191
|
-
|
192
|
-
heartbeat_threshold : int
|
193
|
-
Seconds since the last message from the counterpart in the communication pair:
|
194
|
-
(interchange, manager) after which the counterpart is assumed to be un-available. Default: 120s
|
195
|
-
|
196
|
-
heartbeat_period : int
|
197
|
-
Number of seconds after which a heartbeat message indicating liveness is sent to the
|
198
|
-
counterpart (interchange, manager). Default: 30s
|
199
|
-
|
200
|
-
poll_period : int
|
201
|
-
Timeout period to be used by the executor components in milliseconds. Increasing poll_periods
|
202
|
-
trades performance for cpu efficiency. Default: 10ms
|
203
|
-
|
204
|
-
worker_logdir_root : string
|
205
|
-
In case of a remote file system, specify the path to where logs will be kept.
|
206
|
-
|
207
|
-
enable_mpi_mode: bool
|
208
|
-
If enabled, MPI launch prefixes will be composed for the batch scheduler based on
|
209
|
-
the nodes available in each batch job and the resource_specification dict passed
|
210
|
-
from the app. This is an experimental feature, please refer to the following doc section
|
211
|
-
before use: https://parsl.readthedocs.io/en/stable/userguide/mpi_apps.html
|
212
|
-
|
213
|
-
mpi_launcher: str
|
214
|
-
This field is only used if enable_mpi_mode is set. Select one from the
|
215
|
-
list of supported MPI launchers = ("srun", "aprun", "mpiexec").
|
216
|
-
default: "mpiexec"
|
217
|
-
|
218
|
-
encrypted : bool
|
219
|
-
Flag to enable/disable encryption (CurveZMQ). Default is False.
|
220
233
|
"""
|
221
234
|
|
222
235
|
@typeguard.typechecked
|
@@ -224,7 +237,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
224
237
|
label: str = 'HighThroughputExecutor',
|
225
238
|
provider: ExecutionProvider = LocalProvider(),
|
226
239
|
launch_cmd: Optional[str] = None,
|
240
|
+
interchange_launch_cmd: Optional[Sequence[str]] = None,
|
227
241
|
address: Optional[str] = None,
|
242
|
+
loopback_address: str = "127.0.0.1",
|
228
243
|
worker_ports: Optional[Tuple[int, int]] = None,
|
229
244
|
worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
|
230
245
|
interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
|
@@ -233,18 +248,17 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
233
248
|
worker_debug: bool = False,
|
234
249
|
cores_per_worker: float = 1.0,
|
235
250
|
mem_per_worker: Optional[float] = None,
|
236
|
-
max_workers: Optional[Union[int, float]] = None,
|
237
251
|
max_workers_per_node: Optional[Union[int, float]] = None,
|
238
252
|
cpu_affinity: str = 'none',
|
239
253
|
available_accelerators: Union[int, Sequence[str]] = (),
|
240
254
|
prefetch_capacity: int = 0,
|
241
255
|
heartbeat_threshold: int = 120,
|
242
256
|
heartbeat_period: int = 30,
|
257
|
+
drain_period: Optional[int] = None,
|
243
258
|
poll_period: int = 10,
|
244
259
|
address_probe_timeout: Optional[int] = None,
|
245
260
|
worker_logdir_root: Optional[str] = None,
|
246
|
-
|
247
|
-
mpi_launcher: str = "mpiexec",
|
261
|
+
manager_selector: ManagerSelector = RandomManagerSelector(),
|
248
262
|
block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
|
249
263
|
encrypted: bool = False):
|
250
264
|
|
@@ -260,14 +274,15 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
260
274
|
self.prefetch_capacity = prefetch_capacity
|
261
275
|
self.address = address
|
262
276
|
self.address_probe_timeout = address_probe_timeout
|
277
|
+
self.manager_selector = manager_selector
|
278
|
+
self.loopback_address = loopback_address
|
279
|
+
|
263
280
|
if self.address:
|
264
281
|
self.all_addresses = address
|
265
282
|
else:
|
266
283
|
self.all_addresses = ','.join(get_all_addresses())
|
267
284
|
|
268
|
-
|
269
|
-
self._warn_deprecated("max_workers", "max_workers_per_node")
|
270
|
-
self.max_workers_per_node = max_workers_per_node or max_workers or float("inf")
|
285
|
+
self.max_workers_per_node = max_workers_per_node or float("inf")
|
271
286
|
|
272
287
|
mem_slots = self.max_workers_per_node
|
273
288
|
cpu_slots = self.max_workers_per_node
|
@@ -294,15 +309,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
294
309
|
self._workers_per_node = 1 # our best guess-- we do not have any provider hints
|
295
310
|
|
296
311
|
self._task_counter = 0
|
297
|
-
self.run_id = None # set to the correct run_id in dfk
|
298
|
-
self.hub_address = None # set to the correct hub address in dfk
|
299
|
-
self.hub_port = None # set to the correct hub port in dfk
|
300
312
|
self.worker_ports = worker_ports
|
301
313
|
self.worker_port_range = worker_port_range
|
302
|
-
self.interchange_proc: Optional[
|
314
|
+
self.interchange_proc: Optional[subprocess.Popen] = None
|
303
315
|
self.interchange_port_range = interchange_port_range
|
304
316
|
self.heartbeat_threshold = heartbeat_threshold
|
305
317
|
self.heartbeat_period = heartbeat_period
|
318
|
+
self.drain_period = drain_period
|
306
319
|
self.poll_period = poll_period
|
307
320
|
self.run_dir = '.'
|
308
321
|
self.worker_logdir_root = worker_logdir_root
|
@@ -310,20 +323,20 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
310
323
|
self.encrypted = encrypted
|
311
324
|
self.cert_dir = None
|
312
325
|
|
313
|
-
self.enable_mpi_mode = enable_mpi_mode
|
314
|
-
assert mpi_launcher in VALID_LAUNCHERS, \
|
315
|
-
f"mpi_launcher must be set to one of {VALID_LAUNCHERS}"
|
316
|
-
if self.enable_mpi_mode:
|
317
|
-
assert isinstance(self.provider.launcher, parsl.launchers.SingleNodeLauncher), \
|
318
|
-
"mpi_mode requires the provider to be configured to use a SingleNodeLauncher"
|
319
|
-
|
320
|
-
self.mpi_launcher = mpi_launcher
|
321
|
-
|
322
326
|
if not launch_cmd:
|
323
327
|
launch_cmd = DEFAULT_LAUNCH_CMD
|
324
328
|
self.launch_cmd = launch_cmd
|
325
329
|
|
330
|
+
if not interchange_launch_cmd:
|
331
|
+
interchange_launch_cmd = DEFAULT_INTERCHANGE_LAUNCH_CMD
|
332
|
+
self.interchange_launch_cmd = interchange_launch_cmd
|
333
|
+
|
334
|
+
self._result_queue_thread_exit = threading.Event()
|
335
|
+
self._result_queue_thread: Optional[threading.Thread] = None
|
336
|
+
|
326
337
|
radio_mode = "htex"
|
338
|
+
enable_mpi_mode: bool = False
|
339
|
+
mpi_launcher: str = "mpiexec"
|
327
340
|
|
328
341
|
def _warn_deprecated(self, old: str, new: str):
|
329
342
|
warnings.warn(
|
@@ -333,16 +346,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
333
346
|
stacklevel=2
|
334
347
|
)
|
335
348
|
|
336
|
-
@property
|
337
|
-
def max_workers(self):
|
338
|
-
self._warn_deprecated("max_workers", "max_workers_per_node")
|
339
|
-
return self.max_workers_per_node
|
340
|
-
|
341
|
-
@max_workers.setter
|
342
|
-
def max_workers(self, val: Union[int, float]):
|
343
|
-
self._warn_deprecated("max_workers", "max_workers_per_node")
|
344
|
-
self.max_workers_per_node = val
|
345
|
-
|
346
349
|
@property
|
347
350
|
def logdir(self):
|
348
351
|
return "{}/{}".format(self.run_dir, self.label)
|
@@ -353,6 +356,20 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
353
356
|
return "{}/{}".format(self.worker_logdir_root, self.label)
|
354
357
|
return self.logdir
|
355
358
|
|
359
|
+
def validate_resource_spec(self, resource_specification: dict):
|
360
|
+
"""HTEX supports the following *Optional* resource specifications:
|
361
|
+
priority: lower value is higher priority"""
|
362
|
+
if resource_specification:
|
363
|
+
acceptable_fields = {'priority'}
|
364
|
+
keys = set(resource_specification.keys())
|
365
|
+
invalid_keys = keys - acceptable_fields
|
366
|
+
if invalid_keys:
|
367
|
+
message = "Task resource specification only accepts these types of resources: {}".format(
|
368
|
+
', '.join(acceptable_fields))
|
369
|
+
logger.error(message)
|
370
|
+
raise InvalidResourceSpecification(set(invalid_keys), message)
|
371
|
+
return
|
372
|
+
|
356
373
|
def initialize_scaling(self):
|
357
374
|
"""Compose the launch command and scale out the initial blocks.
|
358
375
|
"""
|
@@ -376,6 +393,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
376
393
|
nodes_per_block=self.provider.nodes_per_block,
|
377
394
|
heartbeat_period=self.heartbeat_period,
|
378
395
|
heartbeat_threshold=self.heartbeat_threshold,
|
396
|
+
drain_period=self.drain_period,
|
379
397
|
poll_period=self.poll_period,
|
380
398
|
cert_dir=self.cert_dir,
|
381
399
|
logdir=self.worker_logdir,
|
@@ -388,16 +406,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
388
406
|
|
389
407
|
logger.debug("Starting HighThroughputExecutor with provider:\n%s", self.provider)
|
390
408
|
|
391
|
-
# TODO: why is this a provider property?
|
392
|
-
block_ids = []
|
393
|
-
if hasattr(self.provider, 'init_blocks'):
|
394
|
-
try:
|
395
|
-
block_ids = self.scale_out(blocks=self.provider.init_blocks)
|
396
|
-
except Exception as e:
|
397
|
-
logger.error("Scaling out failed: {}".format(e))
|
398
|
-
raise e
|
399
|
-
return block_ids
|
400
|
-
|
401
409
|
def start(self):
|
402
410
|
"""Create the Interchange process and connect to it.
|
403
411
|
"""
|
@@ -412,30 +420,28 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
412
420
|
)
|
413
421
|
|
414
422
|
self.outgoing_q = zmq_pipes.TasksOutgoing(
|
415
|
-
|
423
|
+
self.loopback_address, self.interchange_port_range, self.cert_dir
|
416
424
|
)
|
417
425
|
self.incoming_q = zmq_pipes.ResultsIncoming(
|
418
|
-
|
426
|
+
self.loopback_address, self.interchange_port_range, self.cert_dir
|
419
427
|
)
|
420
428
|
self.command_client = zmq_pipes.CommandClient(
|
421
|
-
|
429
|
+
self.loopback_address, self.interchange_port_range, self.cert_dir
|
422
430
|
)
|
423
431
|
|
424
|
-
self.
|
425
|
-
self.
|
432
|
+
self._result_queue_thread = None
|
433
|
+
self._start_result_queue_thread()
|
426
434
|
self._start_local_interchange_process()
|
427
435
|
|
428
|
-
logger.debug("Created
|
436
|
+
logger.debug("Created result queue thread: %s", self._result_queue_thread)
|
429
437
|
|
430
|
-
|
431
|
-
return block_ids
|
438
|
+
self.initialize_scaling()
|
432
439
|
|
433
440
|
@wrap_with_logs
|
434
|
-
def
|
435
|
-
"""Listen to the queue for task
|
441
|
+
def _result_queue_worker(self):
|
442
|
+
"""Listen to the queue for task result messages and handle them.
|
436
443
|
|
437
|
-
Depending on the message, tasks will be updated with results
|
438
|
-
or updates. It expects the following messages:
|
444
|
+
Depending on the message, tasks will be updated with results or exceptions.
|
439
445
|
|
440
446
|
.. code:: python
|
441
447
|
|
@@ -449,14 +455,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
449
455
|
"task_id" : <task_id>
|
450
456
|
"exception" : serialized exception object, on failure
|
451
457
|
}
|
452
|
-
|
453
|
-
The `None` message is a die request.
|
454
458
|
"""
|
455
|
-
logger.debug("
|
459
|
+
logger.debug("Result queue worker starting")
|
456
460
|
|
457
|
-
while not self.bad_state_is_set:
|
461
|
+
while not self.bad_state_is_set and not self._result_queue_thread_exit.is_set():
|
458
462
|
try:
|
459
|
-
msgs = self.incoming_q.get()
|
463
|
+
msgs = self.incoming_q.get(timeout_ms=self.poll_period)
|
464
|
+
if msgs is None: # timeout
|
465
|
+
continue
|
460
466
|
|
461
467
|
except IOError as e:
|
462
468
|
logger.exception("Caught broken queue with exception code {}: {}".format(e.errno, e))
|
@@ -468,109 +474,114 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
468
474
|
|
469
475
|
else:
|
470
476
|
|
471
|
-
|
472
|
-
|
473
|
-
|
477
|
+
for serialized_msg in msgs:
|
478
|
+
try:
|
479
|
+
msg = pickle.loads(serialized_msg)
|
480
|
+
except pickle.UnpicklingError:
|
481
|
+
raise BadMessage("Message received could not be unpickled")
|
474
482
|
|
475
|
-
|
476
|
-
for serialized_msg in msgs:
|
483
|
+
if msg['type'] == 'result':
|
477
484
|
try:
|
478
|
-
|
479
|
-
except
|
480
|
-
raise BadMessage("Message received
|
485
|
+
tid = msg['task_id']
|
486
|
+
except Exception:
|
487
|
+
raise BadMessage("Message received does not contain 'task_id' field")
|
488
|
+
|
489
|
+
if tid == -1 and 'exception' in msg:
|
490
|
+
logger.warning("Executor shutting down due to exception from interchange")
|
491
|
+
exception = deserialize(msg['exception'])
|
492
|
+
self.set_bad_state_and_fail_all(exception)
|
493
|
+
break
|
494
|
+
|
495
|
+
task_fut = self.tasks.pop(tid)
|
481
496
|
|
482
|
-
if
|
483
|
-
|
484
|
-
|
497
|
+
if 'result' in msg:
|
498
|
+
result = deserialize(msg['result'])
|
499
|
+
task_fut.set_result(result)
|
500
|
+
|
501
|
+
elif 'exception' in msg:
|
485
502
|
try:
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
elif 'exception' in msg:
|
503
|
-
try:
|
504
|
-
s = deserialize(msg['exception'])
|
505
|
-
# s should be a RemoteExceptionWrapper... so we can reraise it
|
506
|
-
if isinstance(s, RemoteExceptionWrapper):
|
507
|
-
try:
|
508
|
-
s.reraise()
|
509
|
-
except Exception as e:
|
510
|
-
task_fut.set_exception(e)
|
511
|
-
elif isinstance(s, Exception):
|
512
|
-
task_fut.set_exception(s)
|
513
|
-
else:
|
514
|
-
raise ValueError("Unknown exception-like type received: {}".format(type(s)))
|
515
|
-
except Exception as e:
|
516
|
-
# TODO could be a proper wrapped exception?
|
517
|
-
task_fut.set_exception(
|
518
|
-
DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
|
519
|
-
else:
|
520
|
-
raise BadMessage("Message received is neither result or exception")
|
503
|
+
s = deserialize(msg['exception'])
|
504
|
+
# s should be a RemoteExceptionWrapper... so we can reraise it
|
505
|
+
if isinstance(s, RemoteExceptionWrapper):
|
506
|
+
try:
|
507
|
+
s.reraise()
|
508
|
+
except Exception as e:
|
509
|
+
task_fut.set_exception(e)
|
510
|
+
elif isinstance(s, Exception):
|
511
|
+
task_fut.set_exception(s)
|
512
|
+
else:
|
513
|
+
raise ValueError("Unknown exception-like type received: {}".format(type(s)))
|
514
|
+
except Exception as e:
|
515
|
+
# TODO could be a proper wrapped exception?
|
516
|
+
task_fut.set_exception(
|
517
|
+
DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
|
521
518
|
else:
|
522
|
-
raise BadMessage("Message received
|
519
|
+
raise BadMessage("Message received is neither result or exception")
|
520
|
+
else:
|
521
|
+
raise BadMessage("Message received with unknown type {}".format(msg['type']))
|
523
522
|
|
524
|
-
logger.info("
|
523
|
+
logger.info("Closing result ZMQ pipe")
|
524
|
+
self.incoming_q.close()
|
525
|
+
logger.info("Result queue worker finished")
|
525
526
|
|
526
|
-
def _start_local_interchange_process(self):
|
527
|
+
def _start_local_interchange_process(self) -> None:
|
527
528
|
""" Starts the interchange process locally
|
528
529
|
|
529
|
-
Starts the interchange process locally and uses
|
530
|
+
Starts the interchange process locally and uses the command queue to
|
530
531
|
get the worker task and result ports that the interchange has bound to.
|
531
532
|
"""
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
533
|
+
|
534
|
+
interchange_config = {"client_address": self.loopback_address,
|
535
|
+
"client_ports": (self.outgoing_q.port,
|
536
|
+
self.incoming_q.port,
|
537
|
+
self.command_client.port),
|
538
|
+
"interchange_address": self.address,
|
539
|
+
"worker_ports": self.worker_ports,
|
540
|
+
"worker_port_range": self.worker_port_range,
|
541
|
+
"hub_address": self.hub_address,
|
542
|
+
"hub_zmq_port": self.hub_zmq_port,
|
543
|
+
"logdir": self.logdir,
|
544
|
+
"heartbeat_threshold": self.heartbeat_threshold,
|
545
|
+
"poll_period": self.poll_period,
|
546
|
+
"logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
|
547
|
+
"cert_dir": self.cert_dir,
|
548
|
+
"manager_selector": self.manager_selector,
|
549
|
+
"run_id": self.run_id,
|
550
|
+
}
|
551
|
+
|
552
|
+
config_pickle = pickle.dumps(interchange_config)
|
553
|
+
|
554
|
+
self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd, stdin=subprocess.PIPE)
|
555
|
+
stdin = self.interchange_proc.stdin
|
556
|
+
assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
|
557
|
+
|
558
|
+
logger.debug("Popened interchange process. Writing config object")
|
559
|
+
stdin.write(config_pickle)
|
560
|
+
stdin.flush()
|
561
|
+
stdin.close()
|
562
|
+
logger.debug("Sent config object. Requesting worker ports")
|
553
563
|
try:
|
554
|
-
(self.worker_task_port, self.worker_result_port) =
|
555
|
-
except
|
556
|
-
logger.error("Interchange has not completed initialization
|
564
|
+
(self.worker_task_port, self.worker_result_port) = self.command_client.run("WORKER_PORTS", timeout_s=120)
|
565
|
+
except CommandClientTimeoutError:
|
566
|
+
logger.error("Interchange has not completed initialization. Aborting")
|
557
567
|
raise Exception("Interchange failed to start")
|
568
|
+
logger.debug("Got worker ports")
|
558
569
|
|
559
|
-
def
|
560
|
-
"""Method to start the
|
570
|
+
def _start_result_queue_thread(self):
|
571
|
+
"""Method to start the result queue thread as a daemon.
|
561
572
|
|
562
573
|
Checks if a thread already exists, then starts it.
|
563
|
-
Could be used later as a restart if the
|
574
|
+
Could be used later as a restart if the result queue thread dies.
|
564
575
|
"""
|
565
|
-
if self.
|
566
|
-
logger.debug("Starting queue
|
567
|
-
self.
|
568
|
-
self.
|
569
|
-
self.
|
570
|
-
logger.debug("Started queue
|
576
|
+
if self._result_queue_thread is None:
|
577
|
+
logger.debug("Starting result queue thread")
|
578
|
+
self._result_queue_thread = threading.Thread(target=self._result_queue_worker, name="HTEX-Result-Queue-Thread")
|
579
|
+
self._result_queue_thread.daemon = True
|
580
|
+
self._result_queue_thread.start()
|
581
|
+
logger.debug("Started result queue thread")
|
571
582
|
|
572
583
|
else:
|
573
|
-
logger.error("
|
584
|
+
logger.error("Result queue thread already exists, returning")
|
574
585
|
|
575
586
|
def hold_worker(self, worker_id: str) -> None:
|
576
587
|
"""Puts a worker on hold, preventing scheduling of additional tasks to it.
|
@@ -591,7 +602,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
591
602
|
def outstanding(self) -> int:
|
592
603
|
"""Returns the count of tasks outstanding across the interchange
|
593
604
|
and managers"""
|
594
|
-
return self.
|
605
|
+
return len(self.tasks)
|
595
606
|
|
596
607
|
@property
|
597
608
|
def connected_workers(self) -> int:
|
@@ -643,7 +654,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
643
654
|
Returns:
|
644
655
|
Future
|
645
656
|
"""
|
646
|
-
|
657
|
+
|
658
|
+
self.validate_resource_spec(resource_specification)
|
647
659
|
|
648
660
|
if self.bad_state_is_set:
|
649
661
|
raise self.executor_exception
|
@@ -667,7 +679,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
667
679
|
except TypeError:
|
668
680
|
raise SerializationError(func.__name__)
|
669
681
|
|
670
|
-
msg = {"task_id": task_id, "buffer": fn_buf}
|
682
|
+
msg = {"task_id": task_id, "resource_spec": resource_specification, "buffer": fn_buf}
|
671
683
|
|
672
684
|
# Post task to the outgoing queue
|
673
685
|
self.outgoing_q.put(msg)
|
@@ -675,22 +687,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
675
687
|
# Return the future
|
676
688
|
return fut
|
677
689
|
|
678
|
-
def create_monitoring_info(self, status):
|
679
|
-
""" Create a msg for monitoring based on the poll status
|
680
|
-
|
681
|
-
"""
|
682
|
-
msg = []
|
683
|
-
for bid, s in status.items():
|
684
|
-
d = {}
|
685
|
-
d['run_id'] = self.run_id
|
686
|
-
d['status'] = s.status_name
|
687
|
-
d['timestamp'] = datetime.datetime.now()
|
688
|
-
d['executor_label'] = self.label
|
689
|
-
d['job_id'] = self.blocks.get(bid, None)
|
690
|
-
d['block_id'] = bid
|
691
|
-
msg.append(d)
|
692
|
-
return msg
|
693
|
-
|
694
690
|
@property
|
695
691
|
def workers_per_node(self) -> Union[int, float]:
|
696
692
|
return self._workers_per_node
|
@@ -728,14 +724,24 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
728
724
|
tasks: int # sum of tasks in this block
|
729
725
|
idle: float # shortest idle time of any manager in this block
|
730
726
|
|
727
|
+
# block_info will be populated from two sources:
|
728
|
+
# the Job Status Poller mutable block list, and the list of blocks
|
729
|
+
# which have connected to the interchange.
|
730
|
+
|
731
|
+
def new_block_info():
|
732
|
+
return BlockInfo(tasks=0, idle=float('inf'))
|
733
|
+
|
734
|
+
block_info: Dict[str, BlockInfo] = defaultdict(new_block_info)
|
735
|
+
|
736
|
+
for block_id, job_status in self._status.items():
|
737
|
+
if job_status.state not in TERMINAL_STATES:
|
738
|
+
block_info[block_id] = new_block_info()
|
739
|
+
|
731
740
|
managers = self.connected_managers()
|
732
|
-
block_info: Dict[str, BlockInfo] = {}
|
733
741
|
for manager in managers:
|
734
742
|
if not manager['active']:
|
735
743
|
continue
|
736
744
|
b_id = manager['block_id']
|
737
|
-
if b_id not in block_info:
|
738
|
-
block_info[b_id] = BlockInfo(tasks=0, idle=float('inf'))
|
739
745
|
block_info[b_id].tasks += manager['tasks']
|
740
746
|
block_info[b_id].idle = min(block_info[b_id].idle, manager['idle_duration'])
|
741
747
|
|
@@ -767,14 +773,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
767
773
|
|
768
774
|
# Now kill via provider
|
769
775
|
# Potential issue with multiple threads trying to remove the same blocks
|
770
|
-
to_kill = [self.
|
776
|
+
to_kill = [self.blocks_to_job_id[bid] for bid in block_ids_to_kill if bid in self.blocks_to_job_id]
|
771
777
|
|
772
778
|
r = self.provider.cancel(to_kill)
|
773
779
|
job_ids = self._filter_scale_in_ids(to_kill, r)
|
774
780
|
|
775
|
-
# to_kill block_ids are fetched from self.
|
776
|
-
# If a block_id is in self.
|
777
|
-
block_ids_killed = [self.
|
781
|
+
# to_kill block_ids are fetched from self.blocks_to_job_id
|
782
|
+
# If a block_id is in self.blocks_to_job_id, it must exist in self.job_ids_to_block
|
783
|
+
block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
|
778
784
|
|
779
785
|
return block_ids_killed
|
780
786
|
|
@@ -789,7 +795,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
789
795
|
connected_blocks = self.connected_blocks()
|
790
796
|
for job_id in job_status:
|
791
797
|
job_info = job_status[job_id]
|
792
|
-
if job_info.terminal and job_id not in connected_blocks:
|
798
|
+
if job_info.terminal and job_id not in connected_blocks and job_info.state != JobState.SCALED_IN:
|
799
|
+
logger.debug("Rewriting job %s from status %s to MISSING", job_id, job_info)
|
793
800
|
job_status[job_id].state = JobState.MISSING
|
794
801
|
if job_status[job_id].message is None:
|
795
802
|
job_status[job_id].message = (
|
@@ -817,10 +824,37 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
|
|
817
824
|
|
818
825
|
logger.info("Attempting HighThroughputExecutor shutdown")
|
819
826
|
|
827
|
+
logger.info("Terminating interchange and result queue thread")
|
828
|
+
self._result_queue_thread_exit.set()
|
820
829
|
self.interchange_proc.terminate()
|
821
|
-
|
822
|
-
|
823
|
-
|
830
|
+
try:
|
831
|
+
self.interchange_proc.wait(timeout=timeout)
|
832
|
+
except subprocess.TimeoutExpired:
|
833
|
+
logger.warning("Unable to terminate Interchange process; sending SIGKILL")
|
824
834
|
self.interchange_proc.kill()
|
825
835
|
|
836
|
+
logger.info("Closing ZMQ pipes")
|
837
|
+
|
838
|
+
# These pipes are used in a thread unsafe manner. If you have traced a
|
839
|
+
# problem to this block of code, you might consider what is happening
|
840
|
+
# with other threads that access these.
|
841
|
+
|
842
|
+
# incoming_q is not closed here because it is used by the results queue
|
843
|
+
# worker which is not shut down at this point.
|
844
|
+
|
845
|
+
if hasattr(self, 'outgoing_q'):
|
846
|
+
logger.info("Closing outgoing_q")
|
847
|
+
self.outgoing_q.close()
|
848
|
+
|
849
|
+
if hasattr(self, 'command_client'):
|
850
|
+
logger.info("Closing command client")
|
851
|
+
self.command_client.close()
|
852
|
+
|
853
|
+
logger.info("Waiting for result queue thread exit")
|
854
|
+
if self._result_queue_thread:
|
855
|
+
self._result_queue_thread.join()
|
856
|
+
|
826
857
|
logger.info("Finished HighThroughputExecutor shutdown attempt")
|
858
|
+
|
859
|
+
def get_usage_information(self):
|
860
|
+
return {"mpi": self.enable_mpi_mode}
|