parsl 2024.3.11__py3-none-any.whl → 2025.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/__init__.py +9 -10
- parsl/addresses.py +29 -7
- parsl/app/app.py +7 -8
- parsl/app/bash.py +15 -8
- parsl/app/errors.py +10 -13
- parsl/app/futures.py +8 -10
- parsl/app/python.py +2 -1
- parsl/benchmark/perf.py +2 -1
- parsl/concurrent/__init__.py +2 -2
- parsl/config.py +57 -10
- parsl/configs/ASPIRE1.py +6 -5
- parsl/configs/Azure.py +9 -8
- parsl/configs/bridges.py +6 -4
- parsl/configs/cc_in2p3.py +3 -3
- parsl/configs/ec2.py +3 -1
- parsl/configs/expanse.py +4 -3
- parsl/configs/frontera.py +3 -4
- parsl/configs/htex_local.py +3 -4
- parsl/configs/illinoiscluster.py +3 -1
- parsl/configs/improv.py +34 -0
- parsl/configs/kubernetes.py +4 -3
- parsl/configs/local_threads.py +5 -1
- parsl/configs/midway.py +5 -3
- parsl/configs/osg.py +4 -2
- parsl/configs/polaris.py +4 -2
- parsl/configs/stampede2.py +6 -5
- parsl/configs/summit.py +3 -3
- parsl/configs/toss3_llnl.py +4 -3
- parsl/configs/vineex_local.py +6 -4
- parsl/configs/wqex_local.py +5 -3
- parsl/curvezmq.py +4 -0
- parsl/data_provider/data_manager.py +4 -3
- parsl/data_provider/file_noop.py +1 -2
- parsl/data_provider/files.py +3 -3
- parsl/data_provider/ftp.py +1 -3
- parsl/data_provider/globus.py +7 -6
- parsl/data_provider/http.py +2 -2
- parsl/data_provider/rsync.py +1 -1
- parsl/data_provider/staging.py +2 -2
- parsl/data_provider/zip.py +135 -0
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +262 -224
- parsl/dataflow/errors.py +3 -5
- parsl/dataflow/futures.py +27 -14
- parsl/dataflow/memoization.py +5 -5
- parsl/dataflow/rundirs.py +5 -6
- parsl/dataflow/taskrecord.py +4 -5
- parsl/executors/__init__.py +4 -2
- parsl/executors/base.py +45 -15
- parsl/executors/errors.py +13 -0
- parsl/executors/execute_task.py +37 -0
- parsl/executors/flux/execute_parsl_task.py +3 -3
- parsl/executors/flux/executor.py +18 -19
- parsl/executors/flux/flux_instance_manager.py +26 -27
- parsl/executors/high_throughput/errors.py +43 -3
- parsl/executors/high_throughput/executor.py +316 -282
- parsl/executors/high_throughput/interchange.py +158 -167
- parsl/executors/high_throughput/manager_record.py +5 -0
- parsl/executors/high_throughput/manager_selector.py +55 -0
- parsl/executors/high_throughput/monitoring_info.py +2 -1
- parsl/executors/high_throughput/mpi_executor.py +113 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
- parsl/executors/high_throughput/mpi_resource_management.py +6 -17
- parsl/executors/high_throughput/probe.py +9 -7
- parsl/executors/high_throughput/process_worker_pool.py +115 -77
- parsl/executors/high_throughput/zmq_pipes.py +81 -23
- parsl/executors/radical/executor.py +130 -79
- parsl/executors/radical/rpex_resources.py +17 -15
- parsl/executors/radical/rpex_worker.py +4 -3
- parsl/executors/status_handling.py +157 -51
- parsl/executors/taskvine/__init__.py +1 -1
- parsl/executors/taskvine/errors.py +1 -1
- parsl/executors/taskvine/exec_parsl_function.py +2 -2
- parsl/executors/taskvine/executor.py +41 -57
- parsl/executors/taskvine/factory.py +1 -1
- parsl/executors/taskvine/factory_config.py +1 -1
- parsl/executors/taskvine/manager.py +18 -13
- parsl/executors/taskvine/manager_config.py +9 -5
- parsl/executors/threads.py +6 -6
- parsl/executors/workqueue/errors.py +1 -1
- parsl/executors/workqueue/exec_parsl_function.py +6 -5
- parsl/executors/workqueue/executor.py +64 -63
- parsl/executors/workqueue/parsl_coprocess.py +1 -1
- parsl/jobs/error_handlers.py +2 -2
- parsl/jobs/job_status_poller.py +30 -113
- parsl/jobs/states.py +7 -2
- parsl/jobs/strategy.py +43 -31
- parsl/launchers/__init__.py +12 -3
- parsl/launchers/errors.py +1 -1
- parsl/launchers/launchers.py +6 -12
- parsl/log_utils.py +9 -6
- parsl/monitoring/db_manager.py +59 -95
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +87 -356
- parsl/monitoring/queries/pandas.py +1 -2
- parsl/monitoring/radios/base.py +13 -0
- parsl/monitoring/radios/filesystem.py +52 -0
- parsl/monitoring/radios/htex.py +57 -0
- parsl/monitoring/radios/multiprocessing.py +17 -0
- parsl/monitoring/radios/udp.py +56 -0
- parsl/monitoring/radios/zmq.py +17 -0
- parsl/monitoring/remote.py +33 -37
- parsl/monitoring/router.py +212 -0
- parsl/monitoring/types.py +5 -6
- parsl/monitoring/visualization/app.py +4 -2
- parsl/monitoring/visualization/models.py +0 -1
- parsl/monitoring/visualization/plots/default/workflow_plots.py +11 -4
- parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
- parsl/monitoring/visualization/utils.py +0 -1
- parsl/monitoring/visualization/views.py +16 -8
- parsl/multiprocessing.py +0 -1
- parsl/process_loggers.py +1 -2
- parsl/providers/__init__.py +8 -17
- parsl/providers/aws/aws.py +2 -3
- parsl/providers/azure/azure.py +4 -5
- parsl/providers/base.py +2 -18
- parsl/providers/cluster_provider.py +4 -12
- parsl/providers/condor/condor.py +7 -17
- parsl/providers/errors.py +2 -2
- parsl/providers/googlecloud/googlecloud.py +2 -1
- parsl/providers/grid_engine/grid_engine.py +5 -14
- parsl/providers/kubernetes/kube.py +80 -40
- parsl/providers/local/local.py +13 -26
- parsl/providers/lsf/lsf.py +5 -23
- parsl/providers/pbspro/pbspro.py +5 -17
- parsl/providers/slurm/slurm.py +81 -39
- parsl/providers/torque/torque.py +3 -14
- parsl/serialize/__init__.py +8 -3
- parsl/serialize/base.py +1 -2
- parsl/serialize/concretes.py +5 -4
- parsl/serialize/facade.py +3 -3
- parsl/serialize/proxystore.py +3 -2
- parsl/tests/__init__.py +1 -1
- parsl/tests/configs/azure_single_node.py +4 -5
- parsl/tests/configs/bridges.py +3 -2
- parsl/tests/configs/cc_in2p3.py +1 -3
- parsl/tests/configs/comet.py +2 -1
- parsl/tests/configs/ec2_single_node.py +1 -2
- parsl/tests/configs/ec2_spot.py +1 -2
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/configs/frontera.py +2 -3
- parsl/tests/configs/htex_local.py +3 -5
- parsl/tests/configs/htex_local_alternate.py +11 -15
- parsl/tests/configs/htex_local_intask_staging.py +5 -9
- parsl/tests/configs/htex_local_rsync_staging.py +4 -8
- parsl/tests/configs/local_radical.py +1 -3
- parsl/tests/configs/local_radical_mpi.py +2 -2
- parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/configs/midway.py +2 -2
- parsl/tests/configs/nscc_singapore.py +3 -3
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +3 -2
- parsl/tests/configs/slurm_local.py +24 -0
- parsl/tests/configs/summit.py +1 -0
- parsl/tests/configs/taskvine_ex.py +4 -7
- parsl/tests/configs/user_opts.py +2 -8
- parsl/tests/configs/workqueue_ex.py +4 -6
- parsl/tests/conftest.py +27 -13
- parsl/tests/integration/test_stress/test_python_simple.py +3 -4
- parsl/tests/integration/test_stress/test_python_threads.py +3 -5
- parsl/tests/manual_tests/htex_local.py +4 -6
- parsl/tests/manual_tests/test_basic.py +1 -0
- parsl/tests/manual_tests/test_log_filter.py +3 -1
- parsl/tests/manual_tests/test_memory_limits.py +6 -8
- parsl/tests/manual_tests/test_regression_220.py +2 -1
- parsl/tests/manual_tests/test_udp_simple.py +4 -4
- parsl/tests/manual_tests/test_worker_count.py +3 -2
- parsl/tests/scaling_tests/htex_local.py +2 -4
- parsl/tests/scaling_tests/test_scale.py +0 -9
- parsl/tests/scaling_tests/vineex_condor.py +1 -2
- parsl/tests/scaling_tests/vineex_local.py +1 -2
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/site_tests/test_provider.py +4 -2
- parsl/tests/site_tests/test_site.py +2 -0
- parsl/tests/sites/test_affinity.py +7 -7
- parsl/tests/sites/test_dynamic_executor.py +3 -4
- parsl/tests/sites/test_ec2.py +3 -2
- parsl/tests/sites/test_worker_info.py +4 -5
- parsl/tests/test_aalst_patterns.py +0 -1
- parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
- parsl/tests/test_bash_apps/test_basic.py +10 -4
- parsl/tests/test_bash_apps/test_error_codes.py +5 -7
- parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
- parsl/tests/test_bash_apps/test_memoize.py +2 -8
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
- parsl/tests/test_bash_apps/test_multiline.py +1 -1
- parsl/tests/test_bash_apps/test_pipeline.py +1 -1
- parsl/tests/test_bash_apps/test_std_uri.py +123 -0
- parsl/tests/test_bash_apps/test_stdout.py +33 -8
- parsl/tests/test_callables.py +2 -2
- parsl/tests/test_checkpointing/test_periodic.py +21 -39
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_regression_239.py +1 -1
- parsl/tests/test_checkpointing/test_task_exit.py +2 -3
- parsl/tests/test_docs/test_from_slides.py +5 -2
- parsl/tests/test_docs/test_kwargs.py +4 -1
- parsl/tests/test_docs/test_tutorial_1.py +1 -2
- parsl/tests/test_docs/test_workflow1.py +2 -2
- parsl/tests/test_docs/test_workflow2.py +0 -1
- parsl/tests/test_error_handling/test_rand_fail.py +2 -2
- parsl/tests/test_error_handling/test_resource_spec.py +10 -12
- parsl/tests/test_error_handling/test_retries.py +6 -16
- parsl/tests/test_error_handling/test_retry_handler.py +1 -0
- parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
- parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
- parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
- parsl/tests/test_execute_task.py +29 -0
- parsl/tests/test_flux.py +1 -1
- parsl/tests/test_htex/test_basic.py +2 -3
- parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
- parsl/tests/test_htex/test_command_client_timeout.py +66 -0
- parsl/tests/test_htex/test_connected_blocks.py +3 -2
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
- parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_drain.py +79 -0
- parsl/tests/test_htex/test_htex.py +51 -25
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
- parsl/tests/test_htex/test_managers_command.py +36 -0
- parsl/tests/test_htex/test_missing_worker.py +2 -12
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
- parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
- parsl/tests/test_htex/test_zmq_binding.py +29 -8
- parsl/tests/test_monitoring/test_app_names.py +86 -0
- parsl/tests/test_monitoring/test_basic.py +73 -25
- parsl/tests/test_monitoring/test_db_locks.py +6 -4
- parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
- parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
- parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
- parsl/tests/test_monitoring/test_stdouterr.py +134 -0
- parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
- parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
- parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
- parsl/tests/test_providers/test_local_provider.py +3 -132
- parsl/tests/test_providers/test_pbspro_template.py +2 -3
- parsl/tests/test_providers/test_slurm_template.py +2 -3
- parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
- parsl/tests/test_python_apps/test_context_manager.py +128 -0
- parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_python_apps/test_fail.py +0 -25
- parsl/tests/test_python_apps/test_futures.py +2 -1
- parsl/tests/test_python_apps/test_inputs_default.py +22 -0
- parsl/tests/test_python_apps/test_join.py +0 -1
- parsl/tests/test_python_apps/test_lifted.py +11 -7
- parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
- parsl/tests/test_python_apps/test_outputs.py +1 -1
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_radical/test_mpi_funcs.py +1 -2
- parsl/tests/test_regression/test_1480.py +2 -1
- parsl/tests/test_regression/test_1653.py +2 -1
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_regression/test_2652.py +1 -0
- parsl/tests/test_regression/test_69a.py +0 -1
- parsl/tests/test_regression/test_854.py +4 -2
- parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
- parsl/tests/test_regression/test_98.py +0 -1
- parsl/tests/test_scaling/test_block_error_handler.py +9 -4
- parsl/tests/test_scaling/test_regression_1621.py +11 -15
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
- parsl/tests/test_scaling/test_scale_down.py +2 -5
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +6 -18
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
- parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
- parsl/tests/test_serialization/test_basic.py +2 -1
- parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
- parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
- parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
- parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
- parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
- parsl/tests/test_staging/staging_provider.py +2 -2
- parsl/tests/test_staging/test_1316.py +3 -4
- parsl/tests/test_staging/test_docs_1.py +2 -1
- parsl/tests/test_staging/test_docs_2.py +2 -1
- parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
- parsl/tests/{test_data → test_staging}/test_file.py +6 -6
- parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +5 -2
- parsl/tests/test_staging/test_staging_stdout.py +64 -0
- parsl/tests/test_staging/test_zip_in.py +39 -0
- parsl/tests/test_staging/test_zip_out.py +110 -0
- parsl/tests/test_staging/test_zip_to_zip.py +41 -0
- parsl/tests/test_summary.py +2 -2
- parsl/tests/test_thread_parallelism.py +0 -1
- parsl/tests/test_threads/test_configs.py +1 -2
- parsl/tests/test_threads/test_lazy_errors.py +2 -2
- parsl/tests/test_utils/test_execute_wait.py +35 -0
- parsl/tests/test_utils/test_sanitize_dns.py +76 -0
- parsl/tests/unit/test_address.py +20 -0
- parsl/tests/unit/test_file.py +99 -0
- parsl/tests/unit/test_usage_tracking.py +66 -0
- parsl/usage_tracking/api.py +65 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +104 -62
- parsl/utils.py +139 -6
- parsl/version.py +1 -1
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
- parsl-2025.1.13.data/scripts/interchange.py +649 -0
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +115 -77
- parsl-2025.1.13.dist-info/METADATA +96 -0
- parsl-2025.1.13.dist-info/RECORD +462 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
- parsl/channels/__init__.py +0 -7
- parsl/channels/base.py +0 -141
- parsl/channels/errors.py +0 -113
- parsl/channels/local/local.py +0 -164
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
- parsl/channels/ssh/ssh.py +0 -276
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -74
- parsl/configs/ad_hoc.py +0 -35
- parsl/executors/radical/rpex_master.py +0 -42
- parsl/monitoring/radios.py +0 -175
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -248
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/integration/test_channels/__init__.py +0 -0
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- parsl/tests/integration/test_channels/test_local_channel.py +0 -42
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl/tests/sites/test_local_adhoc.py +0 -61
- parsl/tests/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/test_large_output.py +0 -22
- parsl/tests/test_data/__init__.py +0 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
- parsl-2024.3.11.dist-info/METADATA +0 -98
- parsl-2024.3.11.dist-info/RECORD +0 -447
- parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
- parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,11 @@
|
|
1
|
+
import pickle
|
2
|
+
import sys
|
3
|
+
import traceback
|
4
|
+
|
1
5
|
from parsl.app.errors import RemoteExceptionWrapper
|
2
6
|
from parsl.data_provider.files import File
|
3
|
-
from parsl.utils import get_std_fname_mode
|
4
|
-
import traceback
|
5
|
-
import sys
|
6
|
-
import pickle
|
7
7
|
from parsl.serialize import serialize
|
8
|
+
from parsl.utils import get_std_fname_mode
|
8
9
|
|
9
10
|
# This script executes a parsl function which is pickled in a file:
|
10
11
|
#
|
@@ -93,7 +94,7 @@ def unpack_source_code_function(function_info, user_namespace):
|
|
93
94
|
|
94
95
|
def unpack_byte_code_function(function_info, user_namespace):
|
95
96
|
from parsl.serialize import unpack_apply_message
|
96
|
-
func, args, kwargs = unpack_apply_message(function_info["byte code"]
|
97
|
+
func, args, kwargs = unpack_apply_message(function_info["byte code"])
|
97
98
|
return (func, 'parsl_function_name', args, kwargs)
|
98
99
|
|
99
100
|
|
@@ -3,50 +3,49 @@ Cooperative Computing Lab (CCL) at Notre Dame to provide a fault-tolerant,
|
|
3
3
|
high-throughput system for delegating Parsl tasks to thousands of remote machines
|
4
4
|
"""
|
5
5
|
|
6
|
-
import threading
|
7
|
-
import multiprocessing
|
8
|
-
import logging
|
9
|
-
from concurrent.futures import Future
|
10
|
-
from ctypes import c_bool
|
11
|
-
|
12
|
-
import tempfile
|
13
6
|
import hashlib
|
14
|
-
import
|
7
|
+
import inspect
|
8
|
+
import itertools
|
9
|
+
import logging
|
10
|
+
import multiprocessing
|
15
11
|
import os
|
16
|
-
import socket
|
17
|
-
import time
|
18
12
|
import pickle
|
19
13
|
import queue
|
20
|
-
import inspect
|
21
14
|
import shutil
|
22
|
-
import
|
15
|
+
import socket
|
16
|
+
import subprocess
|
17
|
+
import tempfile
|
18
|
+
import threading
|
19
|
+
import time
|
20
|
+
from collections import namedtuple
|
21
|
+
from concurrent.futures import Future
|
22
|
+
from ctypes import c_bool
|
23
|
+
from typing import Dict, List, Optional, Set, Union
|
24
|
+
|
25
|
+
import typeguard
|
23
26
|
|
24
|
-
from parsl.serialize import pack_apply_message, deserialize
|
25
27
|
import parsl.utils as putils
|
26
|
-
from parsl.executors.errors import ExecutorError
|
27
28
|
from parsl.data_provider.files import File
|
29
|
+
from parsl.data_provider.staging import Staging
|
28
30
|
from parsl.errors import OptionalModuleMissing
|
31
|
+
from parsl.executors.errors import ExecutorError, InvalidResourceSpecification
|
29
32
|
from parsl.executors.status_handling import BlockProviderExecutor
|
30
|
-
from parsl.providers.base import ExecutionProvider
|
31
|
-
from parsl.providers import LocalProvider, CondorProvider
|
32
33
|
from parsl.executors.workqueue import exec_parsl_function
|
33
34
|
from parsl.process_loggers import wrap_with_logs
|
35
|
+
from parsl.providers import CondorProvider, LocalProvider
|
36
|
+
from parsl.providers.base import ExecutionProvider
|
37
|
+
from parsl.serialize import deserialize, pack_apply_message
|
34
38
|
from parsl.utils import setproctitle
|
35
39
|
|
36
|
-
import
|
37
|
-
from typing import Dict, List, Optional, Set, Union
|
38
|
-
from parsl.data_provider.staging import Staging
|
39
|
-
|
40
|
-
from .errors import WorkQueueTaskFailure
|
41
|
-
from .errors import WorkQueueFailure
|
42
|
-
|
43
|
-
from collections import namedtuple
|
40
|
+
from .errors import WorkQueueFailure, WorkQueueTaskFailure
|
44
41
|
|
45
42
|
try:
|
46
43
|
import work_queue as wq
|
47
|
-
from work_queue import
|
48
|
-
|
49
|
-
|
44
|
+
from work_queue import (
|
45
|
+
WORK_QUEUE_ALLOCATION_MODE_MAX_THROUGHPUT,
|
46
|
+
WORK_QUEUE_DEFAULT_PORT,
|
47
|
+
WorkQueue,
|
48
|
+
)
|
50
49
|
except ImportError:
|
51
50
|
_work_queue_enabled = False
|
52
51
|
WORK_QUEUE_DEFAULT_PORT = 0
|
@@ -216,6 +215,13 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
216
215
|
This requires a version of Work Queue / cctools after commit
|
217
216
|
874df524516441da531b694afc9d591e8b134b73 (release 7.5.0 is too early).
|
218
217
|
Default is False.
|
218
|
+
|
219
|
+
scaling_cores_per_worker: int
|
220
|
+
When using Parsl scaling, this specifies the number of cores that a
|
221
|
+
worker is expected to have available for computation. Default 1. This
|
222
|
+
parameter can be ignored when using a fixed number of blocks, or when
|
223
|
+
using one task per worker (by omitting a ``cores`` resource
|
224
|
+
specification for each task).
|
219
225
|
"""
|
220
226
|
|
221
227
|
radio_mode = "filesystem"
|
@@ -245,16 +251,17 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
245
251
|
full_debug: bool = True,
|
246
252
|
worker_executable: str = 'work_queue_worker',
|
247
253
|
function_dir: Optional[str] = None,
|
248
|
-
coprocess: bool = False
|
254
|
+
coprocess: bool = False,
|
255
|
+
scaling_cores_per_worker: int = 1):
|
249
256
|
BlockProviderExecutor.__init__(self, provider=provider,
|
250
257
|
block_error_handler=True)
|
251
258
|
if not _work_queue_enabled:
|
252
259
|
raise OptionalModuleMissing(['work_queue'], "WorkQueueExecutor requires the work_queue module.")
|
253
260
|
|
261
|
+
self.scaling_cores_per_worker = scaling_cores_per_worker
|
254
262
|
self.label = label
|
255
263
|
self.task_queue = multiprocessing.Queue() # type: multiprocessing.Queue
|
256
264
|
self.collector_queue = multiprocessing.Queue() # type: multiprocessing.Queue
|
257
|
-
self.blocks = {} # type: Dict[str, str]
|
258
265
|
self.address = address
|
259
266
|
self.port = port
|
260
267
|
self.executor_task_counter = -1
|
@@ -412,7 +419,7 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
412
419
|
message = "Task resource specification only accepts these types of resources: {}".format(
|
413
420
|
', '.join(acceptable_fields))
|
414
421
|
logger.error(message)
|
415
|
-
raise
|
422
|
+
raise InvalidResourceSpecification(keys, message)
|
416
423
|
|
417
424
|
# this checks that either all of the required resource types are specified, or
|
418
425
|
# that none of them are: the `required_resource_types` are not actually required,
|
@@ -423,9 +430,10 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
423
430
|
logger.error("Running with `autolabel=False`. In this mode, "
|
424
431
|
"task resource specification requires "
|
425
432
|
"three resources to be specified simultaneously: cores, memory, and disk")
|
426
|
-
raise
|
427
|
-
|
428
|
-
|
433
|
+
raise InvalidResourceSpecification(keys,
|
434
|
+
"Task resource specification requires "
|
435
|
+
"three resources to be specified simultaneously: cores, memory, and disk. "
|
436
|
+
"Try setting autolabel=True if you are unsure of the resource usage")
|
429
437
|
|
430
438
|
for k in keys:
|
431
439
|
if k == 'cores':
|
@@ -471,6 +479,8 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
471
479
|
# Create a Future object and have it be mapped from the task ID in the tasks dictionary
|
472
480
|
fu = Future()
|
473
481
|
fu.parsl_executor_task_id = executor_task_id
|
482
|
+
assert isinstance(resource_specification, dict)
|
483
|
+
fu.resource_specification = resource_specification
|
474
484
|
logger.debug("Getting tasks_lock to set WQ-level task entry")
|
475
485
|
with self.tasks_lock:
|
476
486
|
logger.debug("Got tasks_lock to set WQ-level task entry")
|
@@ -654,42 +664,31 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
654
664
|
self.worker_command = self._construct_worker_command()
|
655
665
|
self._patch_providers()
|
656
666
|
|
657
|
-
if hasattr(self.provider, 'init_blocks'):
|
658
|
-
try:
|
659
|
-
self.scale_out(blocks=self.provider.init_blocks)
|
660
|
-
except Exception as e:
|
661
|
-
logger.error("Initial block scaling out failed: {}".format(e))
|
662
|
-
raise e
|
663
|
-
|
664
667
|
@property
|
665
668
|
def outstanding(self) -> int:
|
666
|
-
"""Count the number of outstanding
|
669
|
+
"""Count the number of outstanding slots required. This is inefficiently
|
667
670
|
implemented and probably could be replaced with a counter.
|
668
671
|
"""
|
672
|
+
logger.debug("Calculating outstanding task slot load")
|
669
673
|
outstanding = 0
|
674
|
+
tasks = 0 # only for log message...
|
670
675
|
with self.tasks_lock:
|
671
676
|
for fut in self.tasks.values():
|
672
677
|
if not fut.done():
|
673
|
-
|
674
|
-
|
678
|
+
# if a task does not specify a core count, Work Queue will allocate an entire
|
679
|
+
# worker node to that task. That's approximated here by saying that it uses
|
680
|
+
# scaling_cores_per_worker.
|
681
|
+
resource_spec = getattr(fut, 'resource_specification', {})
|
682
|
+
cores = resource_spec.get('cores', self.scaling_cores_per_worker)
|
683
|
+
|
684
|
+
outstanding += cores
|
685
|
+
tasks += 1
|
686
|
+
logger.debug(f"Counted {tasks} outstanding tasks with {outstanding} outstanding slots")
|
675
687
|
return outstanding
|
676
688
|
|
677
689
|
@property
|
678
690
|
def workers_per_node(self) -> Union[int, float]:
|
679
|
-
return
|
680
|
-
|
681
|
-
def scale_in(self, count):
|
682
|
-
"""Scale in method.
|
683
|
-
"""
|
684
|
-
# Obtain list of blocks to kill
|
685
|
-
to_kill = list(self.blocks.keys())[:count]
|
686
|
-
kill_ids = [self.blocks[block] for block in to_kill]
|
687
|
-
|
688
|
-
# Cancel the blocks provisioned
|
689
|
-
if self.provider:
|
690
|
-
self.provider.cancel(kill_ids)
|
691
|
-
else:
|
692
|
-
logger.error("No execution provider available to scale")
|
691
|
+
return self.scaling_cores_per_worker
|
693
692
|
|
694
693
|
def shutdown(self, *args, **kwargs):
|
695
694
|
"""Shutdown the executor. Sets flag to cancel the submit process and
|
@@ -698,17 +697,19 @@ class WorkQueueExecutor(BlockProviderExecutor, putils.RepresentationMixin):
|
|
698
697
|
logger.debug("Work Queue shutdown started")
|
699
698
|
self.should_stop.value = True
|
700
699
|
|
701
|
-
# Remove the workers that are still going
|
702
|
-
kill_ids = [self.blocks[block] for block in self.blocks.keys()]
|
703
|
-
if self.provider:
|
704
|
-
logger.debug("Cancelling blocks")
|
705
|
-
self.provider.cancel(kill_ids)
|
706
|
-
|
707
700
|
logger.debug("Joining on submit process")
|
708
701
|
self.submit_process.join()
|
702
|
+
self.submit_process.close()
|
703
|
+
|
709
704
|
logger.debug("Joining on collector thread")
|
710
705
|
self.collector_thread.join()
|
711
706
|
|
707
|
+
logger.debug("Closing multiprocessing queues")
|
708
|
+
self.task_queue.close()
|
709
|
+
self.task_queue.join_thread()
|
710
|
+
self.collector_queue.close()
|
711
|
+
self.collector_queue.join_thread()
|
712
|
+
|
712
713
|
logger.debug("Work Queue shutdown completed")
|
713
714
|
|
714
715
|
@wrap_with_logs
|
parsl/jobs/error_handlers.py
CHANGED
@@ -3,8 +3,8 @@ from __future__ import annotations
|
|
3
3
|
from typing import Dict, Tuple
|
4
4
|
|
5
5
|
import parsl.executors.status_handling as status_handling
|
6
|
-
from parsl.jobs.states import JobStatus, JobState
|
7
6
|
from parsl.jobs.errors import TooManyJobFailuresError
|
7
|
+
from parsl.jobs.states import JobState, JobStatus
|
8
8
|
|
9
9
|
|
10
10
|
def noop_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3) -> None:
|
@@ -20,7 +20,7 @@ def simple_error_handler(executor: status_handling.BlockProviderExecutor, status
|
|
20
20
|
executor.set_bad_state_and_fail_all(_get_error(status))
|
21
21
|
|
22
22
|
|
23
|
-
def windowed_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3):
|
23
|
+
def windowed_error_handler(executor: status_handling.BlockProviderExecutor, status: Dict[str, JobStatus], threshold: int = 3) -> None:
|
24
24
|
sorted_status = [(key, status[key]) for key in sorted(status, key=lambda x: int(x))]
|
25
25
|
current_window = dict(sorted_status[-threshold:])
|
26
26
|
total, failed = _count_jobs(current_window)
|
parsl/jobs/job_status_poller.py
CHANGED
@@ -1,136 +1,53 @@
|
|
1
1
|
import logging
|
2
|
-
import
|
3
|
-
import time
|
4
|
-
import zmq
|
5
|
-
from typing import Dict, List, Sequence, Optional
|
2
|
+
from typing import List, Optional, Sequence, Union
|
6
3
|
|
7
|
-
from parsl.jobs.states import JobStatus, JobState
|
8
|
-
from parsl.jobs.strategy import Strategy
|
9
4
|
from parsl.executors.status_handling import BlockProviderExecutor
|
10
|
-
from parsl.
|
11
|
-
|
12
|
-
|
5
|
+
from parsl.jobs.strategy import Strategy
|
13
6
|
from parsl.utils import Timer
|
14
7
|
|
15
|
-
|
16
8
|
logger = logging.getLogger(__name__)
|
17
9
|
|
18
10
|
|
19
|
-
class PollItem:
|
20
|
-
def __init__(self, executor: BlockProviderExecutor, dfk: Optional["parsl.dataflow.dflow.DataFlowKernel"] = None):
|
21
|
-
self._executor = executor
|
22
|
-
self._dfk = dfk
|
23
|
-
self._interval = executor.status_polling_interval
|
24
|
-
self._last_poll_time = 0.0
|
25
|
-
self._status = {} # type: Dict[str, JobStatus]
|
26
|
-
|
27
|
-
# Create a ZMQ channel to send poll status to monitoring
|
28
|
-
self.monitoring_enabled = False
|
29
|
-
if self._dfk and self._dfk.monitoring is not None:
|
30
|
-
self.monitoring_enabled = True
|
31
|
-
hub_address = self._dfk.hub_address
|
32
|
-
hub_port = self._dfk.hub_interchange_port
|
33
|
-
context = zmq.Context()
|
34
|
-
self.hub_channel = context.socket(zmq.DEALER)
|
35
|
-
self.hub_channel.set_hwm(0)
|
36
|
-
self.hub_channel.connect("tcp://{}:{}".format(hub_address, hub_port))
|
37
|
-
logger.info("Monitoring enabled on job status poller")
|
38
|
-
|
39
|
-
def _should_poll(self, now: float) -> bool:
|
40
|
-
return now >= self._last_poll_time + self._interval
|
41
|
-
|
42
|
-
def poll(self, now: float) -> None:
|
43
|
-
if self._should_poll(now):
|
44
|
-
previous_status = self._status
|
45
|
-
self._status = self._executor.status()
|
46
|
-
self._last_poll_time = now
|
47
|
-
delta_status = {}
|
48
|
-
for block_id in self._status:
|
49
|
-
if block_id not in previous_status \
|
50
|
-
or previous_status[block_id].state != self._status[block_id].state:
|
51
|
-
delta_status[block_id] = self._status[block_id]
|
52
|
-
|
53
|
-
if delta_status:
|
54
|
-
self.send_monitoring_info(delta_status)
|
55
|
-
|
56
|
-
def send_monitoring_info(self, status: Dict) -> None:
|
57
|
-
# Send monitoring info for HTEX when monitoring enabled
|
58
|
-
if self.monitoring_enabled:
|
59
|
-
msg = self._executor.create_monitoring_info(status)
|
60
|
-
logger.debug("Sending message {} to hub from job status poller".format(msg))
|
61
|
-
self.hub_channel.send_pyobj((MessageType.BLOCK_INFO, msg))
|
62
|
-
|
63
|
-
@property
|
64
|
-
def status(self) -> Dict[str, JobStatus]:
|
65
|
-
"""Return the status of all jobs/blocks of the executor of this poller.
|
66
|
-
|
67
|
-
:return: a dictionary mapping block ids (in string) to job status
|
68
|
-
"""
|
69
|
-
return self._status
|
70
|
-
|
71
|
-
@property
|
72
|
-
def executor(self) -> BlockProviderExecutor:
|
73
|
-
return self._executor
|
74
|
-
|
75
|
-
def scale_in(self, n, max_idletime=None):
|
76
|
-
|
77
|
-
if max_idletime is None:
|
78
|
-
block_ids = self._executor.scale_in(n)
|
79
|
-
else:
|
80
|
-
# This is a HighThroughputExecutor-specific interface violation.
|
81
|
-
# This code hopes, through pan-codebase reasoning, that this
|
82
|
-
# scale_in method really does come from HighThroughputExecutor,
|
83
|
-
# and so does have an extra max_idletime parameter not present
|
84
|
-
# in the executor interface.
|
85
|
-
block_ids = self._executor.scale_in(n, max_idletime=max_idletime)
|
86
|
-
if block_ids is not None:
|
87
|
-
new_status = {}
|
88
|
-
for block_id in block_ids:
|
89
|
-
new_status[block_id] = JobStatus(JobState.CANCELLED)
|
90
|
-
del self._status[block_id]
|
91
|
-
self.send_monitoring_info(new_status)
|
92
|
-
return block_ids
|
93
|
-
|
94
|
-
def scale_out(self, n):
|
95
|
-
block_ids = self._executor.scale_out(n)
|
96
|
-
if block_ids is not None:
|
97
|
-
new_status = {}
|
98
|
-
for block_id in block_ids:
|
99
|
-
new_status[block_id] = JobStatus(JobState.PENDING)
|
100
|
-
self.send_monitoring_info(new_status)
|
101
|
-
self._status.update(new_status)
|
102
|
-
return block_ids
|
103
|
-
|
104
|
-
def __repr__(self) -> str:
|
105
|
-
return self._status.__repr__()
|
106
|
-
|
107
|
-
|
108
11
|
class JobStatusPoller(Timer):
|
109
|
-
def __init__(self, strategy: Optional[str]
|
110
|
-
|
111
|
-
self.
|
112
|
-
self.dfk = dfk
|
12
|
+
def __init__(self, *, strategy: Optional[str], max_idletime: float,
|
13
|
+
strategy_period: Union[float, int]) -> None:
|
14
|
+
self._executors = [] # type: List[BlockProviderExecutor]
|
113
15
|
self._strategy = Strategy(strategy=strategy,
|
114
16
|
max_idletime=max_idletime)
|
115
|
-
super().__init__(self.poll, interval=
|
17
|
+
super().__init__(self.poll, interval=strategy_period, name="JobStatusPoller")
|
116
18
|
|
117
19
|
def poll(self) -> None:
|
118
20
|
self._update_state()
|
119
|
-
self._run_error_handlers(self.
|
120
|
-
self._strategy.strategize(self.
|
21
|
+
self._run_error_handlers(self._executors)
|
22
|
+
self._strategy.strategize(self._executors)
|
121
23
|
|
122
|
-
def _run_error_handlers(self,
|
123
|
-
for
|
124
|
-
|
24
|
+
def _run_error_handlers(self, executors: List[BlockProviderExecutor]) -> None:
|
25
|
+
for e in executors:
|
26
|
+
e.handle_errors(e.status_facade)
|
125
27
|
|
126
28
|
def _update_state(self) -> None:
|
127
|
-
|
128
|
-
|
129
|
-
item.poll(now)
|
29
|
+
for item in self._executors:
|
30
|
+
item.poll_facade()
|
130
31
|
|
131
32
|
def add_executors(self, executors: Sequence[BlockProviderExecutor]) -> None:
|
132
33
|
for executor in executors:
|
133
34
|
if executor.status_polling_interval > 0:
|
134
35
|
logger.debug("Adding executor {}".format(executor.label))
|
135
|
-
self.
|
36
|
+
self._executors.append(executor)
|
136
37
|
self._strategy.add_executors(executors)
|
38
|
+
|
39
|
+
def close(self, timeout: Optional[float] = None) -> None:
|
40
|
+
super().close(timeout)
|
41
|
+
for executor in self._executors:
|
42
|
+
if not executor.bad_state_is_set:
|
43
|
+
logger.info(f"Scaling in executor {executor.label}")
|
44
|
+
|
45
|
+
# this code needs to be at least as many blocks as need
|
46
|
+
# cancelling, but it is safe to be more, as the scaling
|
47
|
+
# code will cope with being asked to cancel more blocks
|
48
|
+
# than exist.
|
49
|
+
block_count = len(executor.status_facade)
|
50
|
+
executor.scale_in_facade(block_count)
|
51
|
+
|
52
|
+
else: # and bad_state_is_set
|
53
|
+
logger.warning(f"Not scaling in executor {executor.label} because it is in bad state")
|
parsl/jobs/states.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
+
import logging
|
1
2
|
import os
|
2
3
|
from enum import IntEnum
|
3
|
-
import logging
|
4
4
|
from typing import Optional
|
5
5
|
|
6
6
|
logger = logging.getLogger(__name__)
|
@@ -46,12 +46,17 @@ class JobState(IntEnum):
|
|
46
46
|
bad worker environment or network connectivity issues.
|
47
47
|
"""
|
48
48
|
|
49
|
+
SCALED_IN = 9
|
50
|
+
"""This job has been deliberately scaled in. Scaling code should not be concerned
|
51
|
+
that the job never ran (for example for error handling purposes).
|
52
|
+
"""
|
53
|
+
|
49
54
|
def __str__(self) -> str:
|
50
55
|
return f"{self.__class__.__name__}.{self.name}"
|
51
56
|
|
52
57
|
|
53
58
|
TERMINAL_STATES = [JobState.CANCELLED, JobState.COMPLETED, JobState.FAILED,
|
54
|
-
JobState.TIMEOUT, JobState.MISSING]
|
59
|
+
JobState.TIMEOUT, JobState.MISSING, JobState.SCALED_IN]
|
55
60
|
|
56
61
|
|
57
62
|
class JobStatus:
|
parsl/jobs/strategy.py
CHANGED
@@ -1,19 +1,17 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
+
|
2
3
|
import logging
|
3
|
-
import time
|
4
4
|
import math
|
5
|
+
import time
|
5
6
|
import warnings
|
6
7
|
from typing import Dict, List, Optional, Sequence, TypedDict
|
7
8
|
|
8
|
-
import parsl.jobs.job_status_poller as jsp
|
9
|
-
|
10
9
|
from parsl.executors import HighThroughputExecutor
|
11
10
|
from parsl.executors.base import ParslExecutor
|
12
11
|
from parsl.executors.status_handling import BlockProviderExecutor
|
13
12
|
from parsl.jobs.states import JobState
|
14
13
|
from parsl.process_loggers import wrap_with_logs
|
15
14
|
|
16
|
-
|
17
15
|
logger = logging.getLogger(__name__)
|
18
16
|
|
19
17
|
|
@@ -26,6 +24,10 @@ class ExecutorState(TypedDict):
|
|
26
24
|
If the executor is not idle, then None.
|
27
25
|
"""
|
28
26
|
|
27
|
+
first: bool
|
28
|
+
"""True if this executor has not yet had a strategy poll.
|
29
|
+
"""
|
30
|
+
|
29
31
|
|
30
32
|
class Strategy:
|
31
33
|
"""Scaling strategy.
|
@@ -129,8 +131,8 @@ class Strategy:
|
|
129
131
|
self.executors = {}
|
130
132
|
self.max_idletime = max_idletime
|
131
133
|
|
132
|
-
self.strategies = {None: self.
|
133
|
-
'none': self.
|
134
|
+
self.strategies = {None: self._strategy_init_only,
|
135
|
+
'none': self._strategy_init_only,
|
134
136
|
'simple': self._strategy_simple,
|
135
137
|
'htex_auto_scale': self._strategy_htex_auto_scale}
|
136
138
|
|
@@ -144,17 +146,23 @@ class Strategy:
|
|
144
146
|
|
145
147
|
def add_executors(self, executors: Sequence[ParslExecutor]) -> None:
|
146
148
|
for executor in executors:
|
147
|
-
self.executors[executor.label] = {'idle_since': None}
|
149
|
+
self.executors[executor.label] = {'idle_since': None, 'first': True}
|
148
150
|
|
149
|
-
def
|
150
|
-
"""
|
151
|
+
def _strategy_init_only(self, executors: List[BlockProviderExecutor]) -> None:
|
152
|
+
"""Scale up to init_blocks at the start, then nothing more.
|
151
153
|
"""
|
152
|
-
|
154
|
+
for executor in executors:
|
155
|
+
if self.executors[executor.label]['first']:
|
156
|
+
logger.debug(f"strategy_init_only: scaling out {executor.provider.init_blocks} initial blocks for {executor.label}")
|
157
|
+
executor.scale_out_facade(executor.provider.init_blocks)
|
158
|
+
self.executors[executor.label]['first'] = False
|
159
|
+
else:
|
160
|
+
logger.debug("strategy_init_only: doing nothing")
|
153
161
|
|
154
|
-
def _strategy_simple(self,
|
155
|
-
self._general_strategy(
|
162
|
+
def _strategy_simple(self, executors: List[BlockProviderExecutor]) -> None:
|
163
|
+
self._general_strategy(executors, strategy_type='simple')
|
156
164
|
|
157
|
-
def _strategy_htex_auto_scale(self,
|
165
|
+
def _strategy_htex_auto_scale(self, executors: List[BlockProviderExecutor]) -> None:
|
158
166
|
"""HTEX specific auto scaling strategy
|
159
167
|
|
160
168
|
This strategy works only for HTEX. This strategy will scale out by
|
@@ -169,24 +177,25 @@ class Strategy:
|
|
169
177
|
expected to scale in effectively only when # of workers, or tasks executing
|
170
178
|
per block is close to 1.
|
171
179
|
"""
|
172
|
-
self._general_strategy(
|
180
|
+
self._general_strategy(executors, strategy_type='htex')
|
173
181
|
|
174
182
|
@wrap_with_logs
|
175
|
-
def _general_strategy(self,
|
176
|
-
logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(
|
183
|
+
def _general_strategy(self, executors: List[BlockProviderExecutor], *, strategy_type: str) -> None:
|
184
|
+
logger.debug(f"general strategy starting with strategy_type {strategy_type} for {len(executors)} executors")
|
177
185
|
|
178
|
-
for
|
179
|
-
executor = exec_status.executor
|
186
|
+
for executor in executors:
|
180
187
|
label = executor.label
|
181
|
-
if not isinstance(executor, BlockProviderExecutor):
|
182
|
-
logger.debug(f"Not strategizing for executor {label} because scaling not enabled")
|
183
|
-
continue
|
184
188
|
logger.debug(f"Strategizing for executor {label}")
|
185
189
|
|
190
|
+
if self.executors[label]['first']:
|
191
|
+
logger.debug(f"Scaling out {executor.provider.init_blocks} initial blocks for {label}")
|
192
|
+
executor.scale_out_facade(executor.provider.init_blocks)
|
193
|
+
self.executors[label]['first'] = False
|
194
|
+
|
186
195
|
# Tasks that are either pending completion
|
187
196
|
active_tasks = executor.outstanding
|
188
197
|
|
189
|
-
status =
|
198
|
+
status = executor.status_facade
|
190
199
|
|
191
200
|
# FIXME we need to handle case where provider does not define these
|
192
201
|
# FIXME probably more of this logic should be moved to the provider
|
@@ -230,23 +239,26 @@ class Strategy:
|
|
230
239
|
else:
|
231
240
|
# We want to make sure that max_idletime is reached
|
232
241
|
# before killing off resources
|
233
|
-
logger.debug(f"Strategy case 1b: Executor has no active tasks, and more ({active_blocks})
|
242
|
+
logger.debug(f"Strategy case 1b: Executor has no active tasks, and more ({active_blocks})"
|
243
|
+
f" than minimum blocks ({min_blocks})")
|
234
244
|
|
235
245
|
if not self.executors[executor.label]['idle_since']:
|
236
246
|
logger.debug(f"Starting idle timer for executor. If idle time exceeds {self.max_idletime}s, blocks will be scaled in")
|
237
247
|
self.executors[executor.label]['idle_since'] = time.time()
|
238
|
-
|
239
248
|
idle_since = self.executors[executor.label]['idle_since']
|
249
|
+
assert idle_since is not None, "The `if` statement above this assert should have forced idle time to be not-None"
|
250
|
+
|
240
251
|
idle_duration = time.time() - idle_since
|
241
252
|
if idle_duration > self.max_idletime:
|
242
253
|
# We have resources idle for the max duration,
|
243
254
|
# we have to scale_in now.
|
244
255
|
logger.debug(f"Idle time has reached {self.max_idletime}s for executor {label}; scaling in")
|
245
|
-
|
256
|
+
executor.scale_in_facade(active_blocks - min_blocks)
|
246
257
|
|
247
258
|
else:
|
248
259
|
logger.debug(
|
249
|
-
f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s
|
260
|
+
f"Idle time {idle_duration}s is less than max_idletime {self.max_idletime}s"
|
261
|
+
f" for executor {label}; not scaling in")
|
250
262
|
|
251
263
|
# Case 2
|
252
264
|
# More tasks than the available slots.
|
@@ -265,7 +277,7 @@ class Strategy:
|
|
265
277
|
excess_blocks = math.ceil(float(excess_slots) / (tasks_per_node * nodes_per_block))
|
266
278
|
excess_blocks = min(excess_blocks, max_blocks - active_blocks)
|
267
279
|
logger.debug(f"Requesting {excess_blocks} more blocks")
|
268
|
-
|
280
|
+
executor.scale_out_facade(excess_blocks)
|
269
281
|
|
270
282
|
elif active_slots == 0 and active_tasks > 0:
|
271
283
|
logger.debug("Strategy case 4a: No active slots but some active tasks - could scale out by a single block")
|
@@ -274,7 +286,7 @@ class Strategy:
|
|
274
286
|
if active_blocks < max_blocks:
|
275
287
|
logger.debug("Requesting single block")
|
276
288
|
|
277
|
-
|
289
|
+
executor.scale_out_facade(1)
|
278
290
|
else:
|
279
291
|
logger.debug("Not requesting single block, because at maxblocks already")
|
280
292
|
|
@@ -286,11 +298,11 @@ class Strategy:
|
|
286
298
|
# Scale in for htex
|
287
299
|
if isinstance(executor, HighThroughputExecutor):
|
288
300
|
if active_blocks > min_blocks:
|
289
|
-
excess_slots = math.
|
290
|
-
excess_blocks = math.
|
301
|
+
excess_slots = math.floor(active_slots - (active_tasks * parallelism))
|
302
|
+
excess_blocks = math.floor(float(excess_slots) / (tasks_per_node * nodes_per_block))
|
291
303
|
excess_blocks = min(excess_blocks, active_blocks - min_blocks)
|
292
304
|
logger.debug(f"Requesting scaling in by {excess_blocks} blocks with idle time {self.max_idletime}s")
|
293
|
-
|
305
|
+
executor.scale_in_facade(excess_blocks, max_idletime=self.max_idletime)
|
294
306
|
else:
|
295
307
|
logger.error("This strategy does not support scaling in except for HighThroughputExecutor - taking no action")
|
296
308
|
else:
|