parsl-2024.3.18-py3-none-any.whl → parsl-2025.1.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/__init__.py +9 -10
- parsl/addresses.py +26 -6
- parsl/app/app.py +7 -8
- parsl/app/bash.py +15 -8
- parsl/app/errors.py +10 -13
- parsl/app/futures.py +8 -10
- parsl/app/python.py +2 -1
- parsl/benchmark/perf.py +2 -1
- parsl/concurrent/__init__.py +2 -2
- parsl/config.py +53 -10
- parsl/configs/ASPIRE1.py +6 -5
- parsl/configs/Azure.py +9 -8
- parsl/configs/bridges.py +6 -4
- parsl/configs/cc_in2p3.py +3 -3
- parsl/configs/ec2.py +3 -1
- parsl/configs/expanse.py +4 -3
- parsl/configs/frontera.py +3 -4
- parsl/configs/htex_local.py +3 -4
- parsl/configs/illinoiscluster.py +3 -1
- parsl/configs/improv.py +34 -0
- parsl/configs/kubernetes.py +4 -3
- parsl/configs/local_threads.py +5 -1
- parsl/configs/midway.py +5 -3
- parsl/configs/osg.py +4 -2
- parsl/configs/polaris.py +4 -2
- parsl/configs/stampede2.py +6 -5
- parsl/configs/summit.py +3 -3
- parsl/configs/toss3_llnl.py +4 -3
- parsl/configs/vineex_local.py +6 -4
- parsl/configs/wqex_local.py +5 -3
- parsl/curvezmq.py +4 -0
- parsl/data_provider/data_manager.py +4 -3
- parsl/data_provider/file_noop.py +1 -2
- parsl/data_provider/files.py +3 -3
- parsl/data_provider/ftp.py +1 -3
- parsl/data_provider/globus.py +7 -6
- parsl/data_provider/http.py +2 -2
- parsl/data_provider/rsync.py +1 -1
- parsl/data_provider/staging.py +2 -2
- parsl/data_provider/zip.py +135 -0
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +259 -223
- parsl/dataflow/errors.py +3 -5
- parsl/dataflow/futures.py +27 -14
- parsl/dataflow/memoization.py +5 -5
- parsl/dataflow/rundirs.py +5 -6
- parsl/dataflow/taskrecord.py +4 -5
- parsl/executors/__init__.py +4 -2
- parsl/executors/base.py +45 -15
- parsl/executors/errors.py +13 -0
- parsl/executors/execute_task.py +37 -0
- parsl/executors/flux/execute_parsl_task.py +3 -3
- parsl/executors/flux/executor.py +18 -19
- parsl/executors/flux/flux_instance_manager.py +26 -27
- parsl/executors/high_throughput/errors.py +43 -3
- parsl/executors/high_throughput/executor.py +307 -285
- parsl/executors/high_throughput/interchange.py +137 -168
- parsl/executors/high_throughput/manager_record.py +4 -0
- parsl/executors/high_throughput/manager_selector.py +55 -0
- parsl/executors/high_throughput/monitoring_info.py +2 -1
- parsl/executors/high_throughput/mpi_executor.py +113 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
- parsl/executors/high_throughput/mpi_resource_management.py +6 -17
- parsl/executors/high_throughput/probe.py +9 -7
- parsl/executors/high_throughput/process_worker_pool.py +77 -75
- parsl/executors/high_throughput/zmq_pipes.py +81 -23
- parsl/executors/radical/executor.py +130 -79
- parsl/executors/radical/rpex_resources.py +17 -15
- parsl/executors/radical/rpex_worker.py +4 -3
- parsl/executors/status_handling.py +157 -51
- parsl/executors/taskvine/__init__.py +1 -1
- parsl/executors/taskvine/errors.py +1 -1
- parsl/executors/taskvine/exec_parsl_function.py +2 -2
- parsl/executors/taskvine/executor.py +38 -55
- parsl/executors/taskvine/factory.py +1 -1
- parsl/executors/taskvine/factory_config.py +1 -1
- parsl/executors/taskvine/manager.py +17 -13
- parsl/executors/taskvine/manager_config.py +7 -2
- parsl/executors/threads.py +6 -6
- parsl/executors/workqueue/errors.py +1 -1
- parsl/executors/workqueue/exec_parsl_function.py +6 -5
- parsl/executors/workqueue/executor.py +64 -63
- parsl/executors/workqueue/parsl_coprocess.py +1 -1
- parsl/jobs/error_handlers.py +2 -2
- parsl/jobs/job_status_poller.py +28 -112
- parsl/jobs/states.py +7 -2
- parsl/jobs/strategy.py +43 -31
- parsl/launchers/__init__.py +12 -3
- parsl/launchers/errors.py +1 -1
- parsl/launchers/launchers.py +0 -6
- parsl/log_utils.py +1 -2
- parsl/monitoring/db_manager.py +55 -93
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +85 -311
- parsl/monitoring/queries/pandas.py +1 -2
- parsl/monitoring/radios/base.py +13 -0
- parsl/monitoring/radios/filesystem.py +52 -0
- parsl/monitoring/radios/htex.py +57 -0
- parsl/monitoring/radios/multiprocessing.py +17 -0
- parsl/monitoring/radios/udp.py +56 -0
- parsl/monitoring/radios/zmq.py +17 -0
- parsl/monitoring/remote.py +33 -37
- parsl/monitoring/router.py +212 -0
- parsl/monitoring/types.py +5 -6
- parsl/monitoring/visualization/app.py +4 -2
- parsl/monitoring/visualization/models.py +0 -1
- parsl/monitoring/visualization/plots/default/workflow_plots.py +8 -4
- parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
- parsl/monitoring/visualization/utils.py +0 -1
- parsl/monitoring/visualization/views.py +16 -9
- parsl/multiprocessing.py +0 -1
- parsl/process_loggers.py +1 -2
- parsl/providers/__init__.py +8 -17
- parsl/providers/aws/aws.py +2 -3
- parsl/providers/azure/azure.py +4 -5
- parsl/providers/base.py +2 -18
- parsl/providers/cluster_provider.py +3 -9
- parsl/providers/condor/condor.py +7 -17
- parsl/providers/errors.py +2 -2
- parsl/providers/googlecloud/googlecloud.py +2 -1
- parsl/providers/grid_engine/grid_engine.py +5 -14
- parsl/providers/kubernetes/kube.py +80 -40
- parsl/providers/local/local.py +13 -26
- parsl/providers/lsf/lsf.py +5 -23
- parsl/providers/pbspro/pbspro.py +5 -17
- parsl/providers/slurm/slurm.py +81 -39
- parsl/providers/torque/torque.py +3 -14
- parsl/serialize/__init__.py +8 -3
- parsl/serialize/base.py +1 -2
- parsl/serialize/concretes.py +5 -4
- parsl/serialize/facade.py +3 -3
- parsl/serialize/proxystore.py +3 -2
- parsl/tests/__init__.py +1 -1
- parsl/tests/configs/azure_single_node.py +4 -5
- parsl/tests/configs/bridges.py +3 -2
- parsl/tests/configs/cc_in2p3.py +1 -3
- parsl/tests/configs/comet.py +2 -1
- parsl/tests/configs/ec2_single_node.py +1 -2
- parsl/tests/configs/ec2_spot.py +1 -2
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/configs/frontera.py +2 -3
- parsl/tests/configs/htex_local.py +3 -5
- parsl/tests/configs/htex_local_alternate.py +11 -15
- parsl/tests/configs/htex_local_intask_staging.py +5 -9
- parsl/tests/configs/htex_local_rsync_staging.py +4 -8
- parsl/tests/configs/local_radical.py +1 -3
- parsl/tests/configs/local_radical_mpi.py +2 -2
- parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/configs/midway.py +2 -2
- parsl/tests/configs/nscc_singapore.py +3 -3
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +3 -2
- parsl/tests/configs/slurm_local.py +24 -0
- parsl/tests/configs/summit.py +1 -0
- parsl/tests/configs/taskvine_ex.py +4 -7
- parsl/tests/configs/user_opts.py +0 -7
- parsl/tests/configs/workqueue_ex.py +4 -6
- parsl/tests/conftest.py +27 -13
- parsl/tests/integration/test_stress/test_python_simple.py +3 -4
- parsl/tests/integration/test_stress/test_python_threads.py +3 -5
- parsl/tests/manual_tests/htex_local.py +4 -6
- parsl/tests/manual_tests/test_basic.py +1 -0
- parsl/tests/manual_tests/test_log_filter.py +3 -1
- parsl/tests/manual_tests/test_memory_limits.py +6 -8
- parsl/tests/manual_tests/test_regression_220.py +2 -1
- parsl/tests/manual_tests/test_udp_simple.py +4 -4
- parsl/tests/manual_tests/test_worker_count.py +3 -2
- parsl/tests/scaling_tests/htex_local.py +2 -4
- parsl/tests/scaling_tests/test_scale.py +0 -9
- parsl/tests/scaling_tests/vineex_condor.py +1 -2
- parsl/tests/scaling_tests/vineex_local.py +1 -2
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/site_tests/test_provider.py +4 -2
- parsl/tests/site_tests/test_site.py +2 -0
- parsl/tests/sites/test_affinity.py +7 -7
- parsl/tests/sites/test_dynamic_executor.py +3 -4
- parsl/tests/sites/test_ec2.py +3 -2
- parsl/tests/sites/test_worker_info.py +4 -5
- parsl/tests/test_aalst_patterns.py +0 -1
- parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
- parsl/tests/test_bash_apps/test_basic.py +10 -4
- parsl/tests/test_bash_apps/test_error_codes.py +5 -7
- parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
- parsl/tests/test_bash_apps/test_memoize.py +2 -8
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
- parsl/tests/test_bash_apps/test_multiline.py +1 -1
- parsl/tests/test_bash_apps/test_pipeline.py +1 -1
- parsl/tests/test_bash_apps/test_std_uri.py +123 -0
- parsl/tests/test_bash_apps/test_stdout.py +33 -8
- parsl/tests/test_callables.py +2 -2
- parsl/tests/test_checkpointing/test_periodic.py +21 -39
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_regression_239.py +1 -1
- parsl/tests/test_checkpointing/test_task_exit.py +2 -3
- parsl/tests/test_docs/test_from_slides.py +5 -2
- parsl/tests/test_docs/test_kwargs.py +4 -1
- parsl/tests/test_docs/test_tutorial_1.py +1 -2
- parsl/tests/test_docs/test_workflow1.py +2 -2
- parsl/tests/test_docs/test_workflow2.py +0 -1
- parsl/tests/test_error_handling/test_rand_fail.py +2 -2
- parsl/tests/test_error_handling/test_resource_spec.py +10 -12
- parsl/tests/test_error_handling/test_retries.py +6 -16
- parsl/tests/test_error_handling/test_retry_handler.py +1 -0
- parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
- parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
- parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
- parsl/tests/test_execute_task.py +29 -0
- parsl/tests/test_flux.py +1 -1
- parsl/tests/test_htex/test_basic.py +2 -3
- parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
- parsl/tests/test_htex/test_command_client_timeout.py +66 -0
- parsl/tests/test_htex/test_connected_blocks.py +3 -2
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
- parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_drain.py +11 -10
- parsl/tests/test_htex/test_htex.py +51 -25
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
- parsl/tests/test_htex/test_managers_command.py +36 -0
- parsl/tests/test_htex/test_missing_worker.py +2 -12
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
- parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
- parsl/tests/test_htex/test_zmq_binding.py +29 -8
- parsl/tests/test_monitoring/test_app_names.py +5 -5
- parsl/tests/test_monitoring/test_basic.py +73 -25
- parsl/tests/test_monitoring/test_db_locks.py +6 -4
- parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
- parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
- parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
- parsl/tests/test_monitoring/test_stdouterr.py +134 -0
- parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
- parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
- parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
- parsl/tests/test_providers/test_local_provider.py +3 -132
- parsl/tests/test_providers/test_pbspro_template.py +2 -3
- parsl/tests/test_providers/test_slurm_template.py +2 -3
- parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
- parsl/tests/test_python_apps/test_context_manager.py +128 -0
- parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_python_apps/test_fail.py +0 -25
- parsl/tests/test_python_apps/test_futures.py +2 -1
- parsl/tests/test_python_apps/test_inputs_default.py +22 -0
- parsl/tests/test_python_apps/test_join.py +0 -1
- parsl/tests/test_python_apps/test_lifted.py +11 -7
- parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
- parsl/tests/test_python_apps/test_outputs.py +1 -1
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_radical/test_mpi_funcs.py +1 -2
- parsl/tests/test_regression/test_1480.py +2 -1
- parsl/tests/test_regression/test_1653.py +2 -1
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_regression/test_2652.py +1 -0
- parsl/tests/test_regression/test_69a.py +0 -1
- parsl/tests/test_regression/test_854.py +4 -2
- parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
- parsl/tests/test_regression/test_98.py +0 -1
- parsl/tests/test_scaling/test_block_error_handler.py +9 -4
- parsl/tests/test_scaling/test_regression_1621.py +11 -15
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
- parsl/tests/test_scaling/test_scale_down.py +2 -5
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +5 -8
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
- parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
- parsl/tests/test_serialization/test_basic.py +2 -1
- parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
- parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
- parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
- parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
- parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
- parsl/tests/test_staging/staging_provider.py +2 -2
- parsl/tests/test_staging/test_1316.py +3 -4
- parsl/tests/test_staging/test_docs_1.py +2 -1
- parsl/tests/test_staging/test_docs_2.py +2 -1
- parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
- parsl/tests/{test_data → test_staging}/test_file.py +6 -6
- parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +5 -2
- parsl/tests/test_staging/test_staging_stdout.py +64 -0
- parsl/tests/test_staging/test_zip_in.py +39 -0
- parsl/tests/test_staging/test_zip_out.py +110 -0
- parsl/tests/test_staging/test_zip_to_zip.py +41 -0
- parsl/tests/test_summary.py +2 -2
- parsl/tests/test_thread_parallelism.py +0 -1
- parsl/tests/test_threads/test_configs.py +1 -2
- parsl/tests/test_threads/test_lazy_errors.py +2 -2
- parsl/tests/test_utils/test_execute_wait.py +35 -0
- parsl/tests/test_utils/test_sanitize_dns.py +76 -0
- parsl/tests/unit/test_address.py +20 -0
- parsl/tests/unit/test_file.py +99 -0
- parsl/tests/unit/test_usage_tracking.py +66 -0
- parsl/usage_tracking/api.py +65 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +104 -62
- parsl/utils.py +137 -4
- parsl/version.py +1 -1
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
- parsl-2025.1.13.data/scripts/interchange.py +649 -0
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +77 -75
- parsl-2025.1.13.dist-info/METADATA +96 -0
- parsl-2025.1.13.dist-info/RECORD +462 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
- parsl/channels/__init__.py +0 -7
- parsl/channels/base.py +0 -141
- parsl/channels/errors.py +0 -113
- parsl/channels/local/local.py +0 -164
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
- parsl/channels/ssh/ssh.py +0 -276
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -74
- parsl/configs/ad_hoc.py +0 -35
- parsl/executors/radical/rpex_master.py +0 -42
- parsl/monitoring/radios.py +0 -175
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -248
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/integration/test_channels/__init__.py +0 -0
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- parsl/tests/integration/test_channels/test_local_channel.py +0 -42
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl/tests/sites/test_local_adhoc.py +0 -61
- parsl/tests/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/test_large_output.py +0 -22
- parsl/tests/test_data/__init__.py +0 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
- parsl-2024.3.18.dist-info/METADATA +0 -98
- parsl-2024.3.18.dist-info/RECORD +0 -449
- parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
- parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
parsl/executors/high_throughput/manager_selector.py (new file):

```diff
@@ -0,0 +1,55 @@
+import random
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List, Set
+
+from parsl.executors.high_throughput.manager_record import ManagerRecord
+
+
+class ManagerSelector(metaclass=ABCMeta):
+
+    @abstractmethod
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        """ Sort a given list of managers.
+
+        Any operations pertaining to the sorting and rearrangement of the
+        interesting_managers Set should be performed here.
+        """
+        pass
+
+
+class RandomManagerSelector(ManagerSelector):
+
+    """Returns a shuffled list of interesting_managers
+
+    By default this strategy is used by the interchange. Works well
+    in distributing workloads equally across all availble compute
+    resources. The random workload strategy is not effective in
+    conjunction with elastic scaling behavior as the even task
+    distribution does not allow the scaling down of blocks, leading
+    to wasted resource consumption.
+    """
+
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        c_manager_list = list(manager_list)
+        random.shuffle(c_manager_list)
+        return c_manager_list
+
+
+class BlockIdManagerSelector(ManagerSelector):
+
+    """Returns an interesting_managers list sorted by block ID
+
+    Observations:
+    1. BlockID manager selector helps with workloads that see a varying
+       amount of tasks over time. New blocks are prioritized with the
+       blockID manager selector, when used with 'htex_auto_scaling', results
+       in compute cost savings.
+
+    2. Doesn't really work with bag-of-tasks workloads. When all the tasks
+       are put into the queue upfront, all blocks operate at near full
+       utilization for the majority of the workload, which task goes where
+       doesn't really matter.
+    """
+
+    def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+        return sorted(manager_list, key=lambda x: (ready_managers[x]['block_id'] is not None, ready_managers[x]['block_id']))
```
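The selector plugs into the executor through the new `manager_selector` parameter on `HighThroughputExecutor` (exercised by `parsl/tests/test_htex/test_manager_selector_by_block.py` in the file list above). A minimal configuration sketch, assuming that parameter name:

```python
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.executors.high_throughput.manager_selector import BlockIdManagerSelector

# Assumption: HighThroughputExecutor accepts a manager_selector argument.
# BlockIdManagerSelector concentrates tasks by block ID so that, under
# htex_auto_scale, lightly used blocks can drain and be scaled in.
config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex",
            manager_selector=BlockIdManagerSelector(),
        )
    ]
)
```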
parsl/executors/high_throughput/mpi_executor.py (new file):

```diff
@@ -0,0 +1,113 @@
+"""A simplified interface for HTEx when running in MPI mode"""
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import typeguard
+
+from parsl.data_provider.staging import Staging
+from parsl.executors.high_throughput.executor import (
+    GENERAL_HTEX_PARAM_DOCS,
+    HighThroughputExecutor,
+)
+from parsl.executors.high_throughput.mpi_prefix_composer import (
+    VALID_LAUNCHERS,
+    validate_resource_spec,
+)
+from parsl.executors.status_handling import BlockProviderExecutor
+from parsl.jobs.states import JobStatus
+from parsl.launchers import SimpleLauncher
+from parsl.providers import LocalProvider
+from parsl.providers.base import ExecutionProvider
+
+
+class MPIExecutor(HighThroughputExecutor):
+    __doc__ = f"""A version of :class:`~parsl.HighThroughputExecutor` tuned for executing multi-node (e.g., MPI) tasks.
+
+    The Provider _must_ use the :class:`~parsl.launchers.SimpleLauncher`,
+    which places a single pool of workers on the first node of a block.
+    Each worker can then make system calls which use an MPI launcher (e.g., ``mpirun``, ``srun``)
+    to spawn multi-node tasks.
+
+    Specify the maximum number of multi-node tasks to run at once using ``max_workers_per_block``.
+    The value should be less than or equal to the ``nodes_per_block`` in the Provider.
+
+    Parameters
+    ----------
+    max_workers_per_block: int
+        Maximum number of MPI applications to run at once per block
+
+    mpi_launcher: str
+        Select one from the list of supported MPI launchers:
+        ("srun", "aprun", "mpiexec").
+        default: "mpiexec"
+
+    {GENERAL_HTEX_PARAM_DOCS}
+    """
+
+    @typeguard.typechecked
+    def __init__(self,
+                 label: str = 'MPIExecutor',
+                 provider: ExecutionProvider = LocalProvider(),
+                 launch_cmd: Optional[str] = None,
+                 interchange_launch_cmd: Optional[str] = None,
+                 address: Optional[str] = None,
+                 loopback_address: str = "127.0.0.1",
+                 worker_ports: Optional[Tuple[int, int]] = None,
+                 worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
+                 interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
+                 storage_access: Optional[List[Staging]] = None,
+                 working_dir: Optional[str] = None,
+                 worker_debug: bool = False,
+                 max_workers_per_block: int = 1,
+                 prefetch_capacity: int = 0,
+                 heartbeat_threshold: int = 120,
+                 heartbeat_period: int = 30,
+                 drain_period: Optional[int] = None,
+                 poll_period: int = 10,
+                 address_probe_timeout: Optional[int] = None,
+                 worker_logdir_root: Optional[str] = None,
+                 mpi_launcher: str = "mpiexec",
+                 block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
+                 encrypted: bool = False):
+        super().__init__(
+            # Hard-coded settings
+            cores_per_worker=1e-9,  # Ensures there will be at least an absurd number of workers
+            max_workers_per_node=max_workers_per_block,
+
+            # Everything else
+            label=label,
+            provider=provider,
+            launch_cmd=launch_cmd,
+            interchange_launch_cmd=interchange_launch_cmd,
+            address=address,
+            loopback_address=loopback_address,
+            worker_ports=worker_ports,
+            worker_port_range=worker_port_range,
+            interchange_port_range=interchange_port_range,
+            storage_access=storage_access,
+            working_dir=working_dir,
+            worker_debug=worker_debug,
+            prefetch_capacity=prefetch_capacity,
+            heartbeat_threshold=heartbeat_threshold,
+            heartbeat_period=heartbeat_period,
+            drain_period=drain_period,
+            poll_period=poll_period,
+            address_probe_timeout=address_probe_timeout,
+            worker_logdir_root=worker_logdir_root,
+            block_error_handler=block_error_handler,
+            encrypted=encrypted
+        )
+        self.enable_mpi_mode = True
+        self.mpi_launcher = mpi_launcher
+
+        self.max_workers_per_block = max_workers_per_block
+
+        if not isinstance(self.provider.launcher, SimpleLauncher):
+            raise TypeError("mpi_mode requires the provider to be configured to use a SimpleLauncher")
+
+        if mpi_launcher not in VALID_LAUNCHERS:
+            raise ValueError(f"mpi_launcher set to:{mpi_launcher} must be set to one of {VALID_LAUNCHERS}")
+
+        self.mpi_launcher = mpi_launcher
+
+    def validate_resource_spec(self, resource_specification: dict):
+        return validate_resource_spec(resource_specification)
```
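A configuration sketch for the new executor; the `SlurmProvider` values below are illustrative assumptions, not taken from this diff:

```python
from parsl.config import Config
from parsl.executors.high_throughput.mpi_executor import MPIExecutor
from parsl.launchers import SimpleLauncher
from parsl.providers import SlurmProvider

config = Config(
    executors=[
        MPIExecutor(
            label="mpi",
            max_workers_per_block=2,  # at most two concurrent MPI apps per block
            mpi_launcher="srun",      # must be one of VALID_LAUNCHERS
            provider=SlurmProvider(
                nodes_per_block=4,    # >= nodes requested by any single task
                launcher=SimpleLauncher(),  # required, per the TypeError check above
            ),
        )
    ]
)
```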
parsl/executors/high_throughput/mpi_prefix_composer.py:

```diff
@@ -1,5 +1,7 @@
 import logging
-from typing import Dict, List, Set, Tuple
+from typing import Dict, List, Tuple
+
+from parsl.executors.errors import InvalidResourceSpecification
 
 logger = logging.getLogger(__name__)
 
@@ -8,16 +10,6 @@ VALID_LAUNCHERS = ('srun',
                    'mpiexec')
 
 
-class InvalidResourceSpecification(Exception):
-    """Exception raised when Invalid keys are supplied via resource specification"""
-
-    def __init__(self, invalid_keys: Set[str]):
-        self.invalid_keys = invalid_keys
-
-    def __str__(self):
-        return f"Invalid resource specification options supplied: {self.invalid_keys}"
-
-
 def validate_resource_spec(resource_spec: Dict[str, str]):
     """Basic validation of keys in the resource_spec
 
@@ -25,6 +17,13 @@ def validate_resource_spec(resource_spec: Dict[str, str]):
     is invalid (e.g, contains invalid keys)
     """
     user_keys = set(resource_spec.keys())
+
+    # empty resource_spec when mpi_mode is set causes parsl to hang
+    # ref issue #3427
+    if len(user_keys) == 0:
+        raise InvalidResourceSpecification(user_keys,
+                                           'MPI mode requires optional parsl_resource_specification keyword argument to be configured')
+
     legal_keys = set(("ranks_per_node",
                       "num_nodes",
                       "num_ranks",
```
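The effect of the tightened validation, assuming the relocated `InvalidResourceSpecification` keeps the two-argument form used in the `raise` above:

```python
from parsl.executors.errors import InvalidResourceSpecification
from parsl.executors.high_throughput.mpi_prefix_composer import validate_resource_spec

# An empty spec now fails fast instead of hanging (issue #3427)...
try:
    validate_resource_spec({})
except InvalidResourceSpecification as e:
    print(e)

# ...while a spec built from the legal keys passes silently.
validate_resource_spec({"num_nodes": "2", "ranks_per_node": "4"})
```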
parsl/executors/high_throughput/mpi_resource_management.py:

```diff
@@ -8,8 +8,7 @@ from enum import Enum
 from typing import Dict, List
 
 from parsl.multiprocessing import SpawnContext
-from parsl.serialize import (pack_res_spec_apply_message,
-                             unpack_res_spec_apply_message)
+from parsl.serialize import pack_res_spec_apply_message, unpack_res_spec_apply_message
 
 logger = logging.getLogger(__name__)
 
@@ -18,7 +17,6 @@ class Scheduler(Enum):
     Unknown = 0
     Slurm = 1
     PBS = 2
-    Cobalt = 3
 
 
 def get_slurm_hosts_list() -> List[str]:
@@ -38,13 +36,6 @@ def get_pbs_hosts_list() -> List[str]:
         return [line.strip() for line in f.readlines()]
 
 
-def get_cobalt_hosts_list() -> List[str]:
-    """Get list of COBALT hosts from envvar: COBALT_NODEFILE"""
-    nodefile_name = os.environ["COBALT_NODEFILE"]
-    with open(nodefile_name) as f:
-        return [line.strip() for line in f.readlines()]
-
-
 def get_nodes_in_batchjob(scheduler: Scheduler) -> List[str]:
     """Get nodelist from all supported schedulers"""
     nodelist = []
@@ -52,8 +43,6 @@ def get_nodes_in_batchjob(scheduler: Scheduler) -> List[str]:
         nodelist = get_slurm_hosts_list()
     elif scheduler == Scheduler.PBS:
         nodelist = get_pbs_hosts_list()
-    elif scheduler == Scheduler.Cobalt:
-        nodelist = get_cobalt_hosts_list()
     else:
         raise RuntimeError(f"mpi_mode does not support scheduler:{scheduler}")
     return nodelist
@@ -65,8 +54,6 @@ def identify_scheduler() -> Scheduler:
         return Scheduler.Slurm
     elif os.environ.get("PBS_NODEFILE"):
         return Scheduler.PBS
-    elif os.environ.get("COBALT_NODEFILE"):
-        return Scheduler.Cobalt
     else:
         return Scheduler.Unknown
 
@@ -173,9 +160,7 @@ class MPITaskScheduler(TaskScheduler):
         """Schedule task if resources are available otherwise backlog the task"""
        user_ns = locals()
        user_ns.update({"__builtins__": __builtins__})
-        _f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(
-            task_package["buffer"], user_ns, copy=False
-        )
+        _f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(task_package["buffer"])
 
         nodes_needed = resource_spec.get("num_nodes")
         if nodes_needed:
@@ -190,6 +175,7 @@ class MPITaskScheduler(TaskScheduler):
             self._map_tasks_to_nodes[task_package["task_id"]] = allocated_nodes
             buffer = pack_res_spec_apply_message(_f, _args, _kwargs, resource_spec)
             task_package["buffer"] = buffer
+            task_package["resource_spec"] = resource_spec
 
         self.pending_task_q.put(task_package)
 
@@ -208,8 +194,11 @@ class MPITaskScheduler(TaskScheduler):
         """Return result and relinquish provisioned nodes"""
         result_pkl = self.pending_result_q.get(block, timeout=timeout)
         result_dict = pickle.loads(result_pkl)
+        # TODO (wardlt): If the task did not request nodes, it won't be in `self._map_tasks_to_nodes`.
+        # Causes Parsl to hang. See Issue #3427
         if result_dict["type"] == "result":
             task_id = result_dict["task_id"]
+            assert task_id in self._map_tasks_to_nodes, "You are about to experience issue #3427"
             nodes_to_reallocate = self._map_tasks_to_nodes[task_id]
             self._return_nodes(nodes_to_reallocate)
             self._schedule_backlog_tasks()
```
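The resource specification that `MPITaskScheduler` unpacks here travels with each task. A sketch of an MPI app under the usual parsl convention; `$PARSL_MPI_PREFIX` is an assumption based on the `PARSL_`-prefixed environment variables set elsewhere in this diff:

```python
from parsl import bash_app

@bash_app
def mpi_hello(parsl_resource_specification={"num_nodes": 2, "ranks_per_node": 4}):
    # The worker composes an MPI launch prefix from the spec and exports
    # it into the task environment along with the allocated node list.
    return '$PARSL_MPI_PREFIX hostname'
```

Note the new assert above: a task whose specification omits `num_nodes` never enters `_map_tasks_to_nodes`, which is exactly the hang described in issue #3427.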
parsl/executors/high_throughput/probe.py:

```diff
@@ -1,11 +1,13 @@
-import zmq
 import argparse
-import uuid
-import time
 import logging
-
+import time
+import uuid
+
+import zmq
 from zmq.utils.monitor import recv_monitor_message
 
+from parsl.addresses import get_all_addresses, tcp_url
+
 logger = logging.getLogger(__name__)
 
 
@@ -30,7 +32,8 @@ def probe_addresses(addresses, task_port, timeout=120):
     for addr in addresses:
         socket = context.socket(zmq.DEALER)
         socket.setsockopt(zmq.LINGER, 0)
-        url = "tcp://{}:{}".format(addr, task_port)
+        socket.setsockopt(zmq.IPV6, True)
+        url = tcp_url(addr, task_port)
         logger.debug("Trying to connect back on {}".format(url))
         socket.connect(url)
         addr_map[addr] = {'sock': socket,
@@ -69,8 +72,7 @@ class TestWorker:
 
         address = probe_addresses(addresses, port)
         print("Viable address :", address)
-        self.task_incoming.connect("tcp://{}:{}".format(address, port))
-        print("Here")
+        self.task_incoming.connect(tcp_url(address, port))
 
     def heartbeat(self):
         """ Send heartbeat to the incoming task queue
```
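Both probe.py hunks replace hand-built `"tcp://{}:{}"` strings with `parsl.addresses.tcp_url` and enable `zmq.IPV6` on the socket. A sketch of what such a helper must handle — an illustration, not the shipped implementation in `parsl/addresses.py`:

```python
import ipaddress

def tcp_url_sketch(address: str, port) -> str:
    """Build a ZMQ tcp:// URL that is also safe for IPv6 literals."""
    try:
        if ipaddress.ip_address(address).version == 6:
            # IPv6 literals must be bracketed: tcp://[::1]:9000
            return f"tcp://[{address}]:{port}"
    except ValueError:
        pass  # hostname or wildcard, not an IP literal
    return f"tcp://{address}:{port}"

assert tcp_url_sketch("127.0.0.1", 9000) == "tcp://127.0.0.1:9000"
assert tcp_url_sketch("::1", 9000) == "tcp://[::1]:9000"
```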
parsl/executors/high_throughput/process_worker_pool.py:

```diff
@@ -1,39 +1,44 @@
 #!/usr/bin/env python3
 
 import argparse
+import json
 import logging
+import math
+import multiprocessing
 import os
-import sys
+import pickle
 import platform
+import queue
+import subprocess
+import sys
 import threading
-import pickle
 import time
-import queue
 import uuid
-from typing import Sequence, Optional, Dict, List
-
-import zmq
-import math
-import json
-import psutil
-import multiprocessing
 from multiprocessing.managers import DictProxy
 from multiprocessing.sharedctypes import Synchronized
+from typing import Dict, List, Optional, Sequence
+
+import psutil
+import zmq
 
 from parsl import curvezmq
-from parsl.
-from parsl.version import VERSION as PARSL_VERSION
+from parsl.addresses import tcp_url
 from parsl.app.errors import RemoteExceptionWrapper
+from parsl.executors.execute_task import execute_task
 from parsl.executors.high_throughput.errors import WorkerLost
-from parsl.executors.high_throughput.
-
-
+from parsl.executors.high_throughput.mpi_prefix_composer import (
+    VALID_LAUNCHERS,
+    compose_all,
+)
 from parsl.executors.high_throughput.mpi_resource_management import (
+    MPITaskScheduler,
     TaskScheduler,
-    MPITaskScheduler
 )
-
-from parsl.
+from parsl.executors.high_throughput.probe import probe_addresses
+from parsl.multiprocessing import SpawnContext
+from parsl.process_loggers import wrap_with_logs
+from parsl.serialize import serialize
+from parsl.version import VERSION as PARSL_VERSION
 
 HEARTBEAT_CODE = (2 ** 32) - 1
 DRAINED_CODE = (2 ** 32) - 2
@@ -155,8 +160,8 @@ class Manager:
                 raise Exception("No viable address found")
             else:
                 logger.info("Connection to Interchange successful on {}".format(ix_address))
-                task_q_url = "tcp://{}:{}".format(ix_address, task_port)
-                result_q_url = "tcp://{}:{}".format(ix_address, result_port)
+                task_q_url = tcp_url(ix_address, task_port)
+                result_q_url = tcp_url(ix_address, result_port)
                 logger.info("Task url : {}".format(task_q_url))
                 logger.info("Result url : {}".format(result_q_url))
         except Exception:
@@ -181,6 +186,7 @@ class Manager:
 
         self.uid = uid
         self.block_id = block_id
+        self.start_time = time.time()
 
         self.enable_mpi_mode = enable_mpi_mode
         self.mpi_launcher = mpi_launcher
@@ -260,6 +266,7 @@ class Manager:
                'worker_count': self.worker_count,
                'uid': self.uid,
                'block_id': self.block_id,
+               'start_time': self.start_time,
                'prefetch_capacity': self.prefetch_capacity,
                'max_capacity': self.worker_count + self.prefetch_capacity,
                'os': platform.system(),
@@ -335,14 +342,17 @@ class Manager:
                 self.heartbeat_to_incoming()
                 last_beat = time.time()
 
-            if self.drain_time and time.time() > self.drain_time:
+            if time.time() > self.drain_time:
                 logger.info("Requesting drain")
                 self.drain_to_incoming()
-                self.drain_time = None
                 # This will start the pool draining...
                 # Drained exit behaviour does not happen here. It will be
                 # driven by the interchange sending a DRAINED_CODE message.
 
+                # now set drain time to the far future so we don't send a drain
+                # message every iteration.
+                self.drain_time = float('inf')
+
             poll_duration_s = max(0, next_interesting_event_time - time.time())
             socks = dict(poller.poll(timeout=poll_duration_s * 1000))
 
```
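The `float('inf')` sentinel replaces the old reset-to-`None`, so the comparison stays valid on every pass through the poll loop without a separate null check. The pattern in isolation:

```python
import time

drain_time = float('inf')        # no drain scheduled
# drain_time = time.time() + 60  # or: request a drain one minute from now

def drain_requested() -> bool:
    global drain_time
    if time.time() > drain_time:   # always comparable, no None guard
        drain_time = float('inf')  # fire once, then push to the far future
        return True
    return False
```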
parsl/executors/high_throughput/process_worker_pool.py (continued):

```diff
@@ -354,11 +364,13 @@ class Manager:
                 if tasks == HEARTBEAT_CODE:
                     logger.debug("Got heartbeat from interchange")
                 elif tasks == DRAINED_CODE:
-                    logger.info("Got
+                    logger.info("Got fully drained message from interchange - setting kill flag")
                     kill_event.set()
                 else:
                     task_recv_counter += len(tasks)
-                    logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format(
+                    logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format(
+                        [t['task_id'] for t in tasks], task_recv_counter
+                    ))
 
                 for task in tasks:
                     self.task_scheduler.put_task(task)
```
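The two codes matched here are out-of-band markers on the task channel (defined near the top of the file, visible at the end of the imports hunk); an ordinary message is a list of tasks:

```python
HEARTBEAT_CODE = (2 ** 32) - 1
DRAINED_CODE = (2 ** 32) - 2

def classify(msg):
    # Sketch of the dispatch in the hunk above.
    if msg == HEARTBEAT_CODE:
        return "heartbeat"
    elif msg == DRAINED_CODE:
        return "drained - pool should exit"
    else:
        return f"batch of {len(msg)} task(s)"
```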
parsl/executors/high_throughput/process_worker_pool.py (continued):

```diff
@@ -580,45 +592,13 @@ def update_resource_spec_env_vars(mpi_launcher: str, resource_spec: Dict, node_info
         os.environ[key] = prefix_table[key]
 
 
-def execute_task(bufs, mpi_launcher: Optional[str] = None):
-    """Deserialize the buffer and execute the task.
-
-    Returns the result or throws exception.
-    """
-    user_ns = locals()
-    user_ns.update({'__builtins__': __builtins__})
-
-    f, args, kwargs, resource_spec = unpack_res_spec_apply_message(bufs, user_ns, copy=False)
-
-    for varname in resource_spec:
-        envname = "PARSL_" + str(varname).upper()
-        os.environ[envname] = str(resource_spec[varname])
-
-    if resource_spec.get("MPI_NODELIST"):
-        worker_id = os.environ['PARSL_WORKER_RANK']
-        nodes_for_task = resource_spec["MPI_NODELIST"].split(',')
-        logger.info(f"Launching task on provisioned nodes: {nodes_for_task}")
-        assert mpi_launcher
-        update_resource_spec_env_vars(mpi_launcher,
-                                      resource_spec=resource_spec,
-                                      node_info=nodes_for_task)
-    # We might need to look into callability of the function from itself
-    # since we change it's name in the new namespace
-    prefix = "parsl_"
-    fname = prefix + "f"
-    argname = prefix + "args"
-    kwargname = prefix + "kwargs"
-    resultname = prefix + "result"
-
-    user_ns.update({fname: f,
-                    argname: args,
-                    kwargname: kwargs,
-                    resultname: resultname})
-
-    code = "{0} = {1}(*{2}, **{3})".format(resultname, fname,
-                                           argname, kwargname)
-    exec(code, user_ns, user_ns)
-    return user_ns.get(resultname)
+def _init_mpi_env(mpi_launcher: str, resource_spec: Dict):
+    node_list = resource_spec.get("MPI_NODELIST")
+    if node_list is None:
+        return
+    nodes_for_task = node_list.split(',')
+    logger.info(f"Launching task on provisioned nodes: {nodes_for_task}")
+    update_resource_spec_env_vars(mpi_launcher=mpi_launcher, resource_spec=resource_spec, node_info=nodes_for_task)
 
 
 @wrap_with_logs(target="worker_log")
```
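The loop removed here implemented a simple contract that the new shared `execute_task` module presumably carries forward: each resource-spec entry surfaces to the task as a `PARSL_`-prefixed environment variable. In isolation:

```python
resource_spec = {"num_nodes": 2, "ranks_per_node": 4,
                 "MPI_NODELIST": "node1,node2"}

env = {"PARSL_" + str(k).upper(): str(v) for k, v in resource_spec.items()}
assert env["PARSL_NUM_NODES"] == "2"
assert env["PARSL_MPI_NODELIST"] == "node1,node2"
```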
parsl/executors/high_throughput/process_worker_pool.py (continued):

```diff
@@ -640,14 +620,6 @@ def worker(
         debug: bool,
         mpi_launcher: str,
 ):
-    """
-
-    Put request token into queue
-    Get task from task_queue
-    Pop request from queue
-    Put result into result_queue
-    """
-
     # override the global logger inherited from the __main__ process (which
     # usually logs to manager.log) with one specific to this worker.
     global logger
@@ -672,7 +644,8 @@ def worker(
     # If desired, set process affinity
     if cpu_affinity != "none":
         # Count the number of cores per worker
-        avail_cores = sorted(os.sched_getaffinity(0))
+        # OSX does not implement os.sched_getaffinity
+        avail_cores = sorted(os.sched_getaffinity(0))  # type: ignore[attr-defined, unused-ignore]
         cores_per_worker = len(avail_cores) // pool_size
         assert cores_per_worker > 0, "Affinity does not work if there are more workers than cores"
 
@@ -712,12 +685,39 @@ def worker(
             os.environ["KMP_AFFINITY"] = f"explicit,proclist=[{proc_list}]"  # For Intel OpenMP
 
         # Set the affinity for this worker
-        os.sched_setaffinity(0, my_cores)
+        # OSX does not implement os.sched_setaffinity so type checking
+        # is ignored here in two ways:
+        # On a platform without sched_setaffinity, that attribute will not
+        # be defined, so ignore[attr-defined] will tell mypy to ignore this
+        # incorrect-for-OS X attribute access.
+        # On a platform with sched_setaffinity, that type: ignore message
+        # will be redundant, and ignore[unused-ignore] tells mypy to ignore
+        # that this ignore is unneeded.
+        os.sched_setaffinity(0, my_cores)  # type: ignore[attr-defined, unused-ignore]
         logger.info("Set worker CPU affinity to {}".format(my_cores))
 
     # If desired, pin to accelerator
     if accelerator is not None:
-        os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
+
+        # If CUDA devices, find total number of devices to allow for MPS
+        # See: https://developer.nvidia.com/system-management-interface
+        nvidia_smi_cmd = "nvidia-smi -L > /dev/null && nvidia-smi -L | wc -l"
+        nvidia_smi_ret = subprocess.run(nvidia_smi_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        if nvidia_smi_ret.returncode == 0:
+            num_cuda_devices = int(nvidia_smi_ret.stdout.split()[0])
+        else:
+            num_cuda_devices = None
+
+        try:
+            if num_cuda_devices is not None:
+                procs_per_cuda_device = pool_size // num_cuda_devices
+                partitioned_accelerator = str(int(accelerator) // procs_per_cuda_device)  # multiple workers will share a GPU
+                os.environ["CUDA_VISIBLE_DEVICES"] = partitioned_accelerator
+                logger.info(f'Pinned worker to partitioned cuda device: {partitioned_accelerator}')
+            else:
+                os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
+        except (TypeError, ValueError, ZeroDivisionError):
+            os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
         os.environ["ROCR_VISIBLE_DEVICES"] = accelerator
         os.environ["ZE_AFFINITY_MASK"] = accelerator
         os.environ["ZE_ENABLE_PCI_ID_DEVICE_ORDER"] = '1'
```
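A worked example of the GPU-partitioning arithmetic above, assuming a pool of 8 workers and 4 CUDA devices reported by `nvidia-smi -L`:

```python
pool_size = 8          # workers in this pool
num_cuda_devices = 4   # from: nvidia-smi -L | wc -l
accelerator = "5"      # this worker's assigned accelerator index

procs_per_cuda_device = pool_size // num_cuda_devices         # 2 workers per GPU
partitioned = str(int(accelerator) // procs_per_cuda_device)  # "2"
# Workers 0-1 share device 0, 2-3 device 1, 4-5 device 2, 6-7 device 3,
# e.g. via CUDA MPS; the except clause falls back to one worker per device.
```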
parsl/executors/high_throughput/process_worker_pool.py (continued):

```diff
@@ -756,8 +756,10 @@ def worker(
             ready_worker_count.value -= 1
             worker_enqueued = False
 
+            _init_mpi_env(mpi_launcher=mpi_launcher, resource_spec=req["resource_spec"])
+
             try:
-                result = execute_task(req['buffer'], mpi_launcher=mpi_launcher)
+                result = execute_task(req['buffer'])
                 serialized_result = serialize(result, buffer_threshold=1000000)
             except Exception as e:
                 logger.info('Caught an exception: {}'.format(e))
```