parsl 2024.3.11__py3-none-any.whl → 2025.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/__init__.py +9 -10
- parsl/addresses.py +29 -7
- parsl/app/app.py +7 -8
- parsl/app/bash.py +15 -8
- parsl/app/errors.py +10 -13
- parsl/app/futures.py +8 -10
- parsl/app/python.py +2 -1
- parsl/benchmark/perf.py +2 -1
- parsl/concurrent/__init__.py +2 -2
- parsl/config.py +57 -10
- parsl/configs/ASPIRE1.py +6 -5
- parsl/configs/Azure.py +9 -8
- parsl/configs/bridges.py +6 -4
- parsl/configs/cc_in2p3.py +3 -3
- parsl/configs/ec2.py +3 -1
- parsl/configs/expanse.py +4 -3
- parsl/configs/frontera.py +3 -4
- parsl/configs/htex_local.py +3 -4
- parsl/configs/illinoiscluster.py +3 -1
- parsl/configs/improv.py +34 -0
- parsl/configs/kubernetes.py +4 -3
- parsl/configs/local_threads.py +5 -1
- parsl/configs/midway.py +5 -3
- parsl/configs/osg.py +4 -2
- parsl/configs/polaris.py +4 -2
- parsl/configs/stampede2.py +6 -5
- parsl/configs/summit.py +3 -3
- parsl/configs/toss3_llnl.py +4 -3
- parsl/configs/vineex_local.py +6 -4
- parsl/configs/wqex_local.py +5 -3
- parsl/curvezmq.py +4 -0
- parsl/data_provider/data_manager.py +4 -3
- parsl/data_provider/file_noop.py +1 -2
- parsl/data_provider/files.py +3 -3
- parsl/data_provider/ftp.py +1 -3
- parsl/data_provider/globus.py +7 -6
- parsl/data_provider/http.py +2 -2
- parsl/data_provider/rsync.py +1 -1
- parsl/data_provider/staging.py +2 -2
- parsl/data_provider/zip.py +135 -0
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +262 -224
- parsl/dataflow/errors.py +3 -5
- parsl/dataflow/futures.py +27 -14
- parsl/dataflow/memoization.py +5 -5
- parsl/dataflow/rundirs.py +5 -6
- parsl/dataflow/taskrecord.py +4 -5
- parsl/executors/__init__.py +4 -2
- parsl/executors/base.py +45 -15
- parsl/executors/errors.py +13 -0
- parsl/executors/execute_task.py +37 -0
- parsl/executors/flux/execute_parsl_task.py +3 -3
- parsl/executors/flux/executor.py +18 -19
- parsl/executors/flux/flux_instance_manager.py +26 -27
- parsl/executors/high_throughput/errors.py +43 -3
- parsl/executors/high_throughput/executor.py +316 -282
- parsl/executors/high_throughput/interchange.py +158 -167
- parsl/executors/high_throughput/manager_record.py +5 -0
- parsl/executors/high_throughput/manager_selector.py +55 -0
- parsl/executors/high_throughput/monitoring_info.py +2 -1
- parsl/executors/high_throughput/mpi_executor.py +113 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
- parsl/executors/high_throughput/mpi_resource_management.py +6 -17
- parsl/executors/high_throughput/probe.py +9 -7
- parsl/executors/high_throughput/process_worker_pool.py +115 -77
- parsl/executors/high_throughput/zmq_pipes.py +81 -23
- parsl/executors/radical/executor.py +130 -79
- parsl/executors/radical/rpex_resources.py +17 -15
- parsl/executors/radical/rpex_worker.py +4 -3
- parsl/executors/status_handling.py +157 -51
- parsl/executors/taskvine/__init__.py +1 -1
- parsl/executors/taskvine/errors.py +1 -1
- parsl/executors/taskvine/exec_parsl_function.py +2 -2
- parsl/executors/taskvine/executor.py +41 -57
- parsl/executors/taskvine/factory.py +1 -1
- parsl/executors/taskvine/factory_config.py +1 -1
- parsl/executors/taskvine/manager.py +18 -13
- parsl/executors/taskvine/manager_config.py +9 -5
- parsl/executors/threads.py +6 -6
- parsl/executors/workqueue/errors.py +1 -1
- parsl/executors/workqueue/exec_parsl_function.py +6 -5
- parsl/executors/workqueue/executor.py +64 -63
- parsl/executors/workqueue/parsl_coprocess.py +1 -1
- parsl/jobs/error_handlers.py +2 -2
- parsl/jobs/job_status_poller.py +30 -113
- parsl/jobs/states.py +7 -2
- parsl/jobs/strategy.py +43 -31
- parsl/launchers/__init__.py +12 -3
- parsl/launchers/errors.py +1 -1
- parsl/launchers/launchers.py +6 -12
- parsl/log_utils.py +9 -6
- parsl/monitoring/db_manager.py +59 -95
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +87 -356
- parsl/monitoring/queries/pandas.py +1 -2
- parsl/monitoring/radios/base.py +13 -0
- parsl/monitoring/radios/filesystem.py +52 -0
- parsl/monitoring/radios/htex.py +57 -0
- parsl/monitoring/radios/multiprocessing.py +17 -0
- parsl/monitoring/radios/udp.py +56 -0
- parsl/monitoring/radios/zmq.py +17 -0
- parsl/monitoring/remote.py +33 -37
- parsl/monitoring/router.py +212 -0
- parsl/monitoring/types.py +5 -6
- parsl/monitoring/visualization/app.py +4 -2
- parsl/monitoring/visualization/models.py +0 -1
- parsl/monitoring/visualization/plots/default/workflow_plots.py +11 -4
- parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
- parsl/monitoring/visualization/utils.py +0 -1
- parsl/monitoring/visualization/views.py +16 -8
- parsl/multiprocessing.py +0 -1
- parsl/process_loggers.py +1 -2
- parsl/providers/__init__.py +8 -17
- parsl/providers/aws/aws.py +2 -3
- parsl/providers/azure/azure.py +4 -5
- parsl/providers/base.py +2 -18
- parsl/providers/cluster_provider.py +4 -12
- parsl/providers/condor/condor.py +7 -17
- parsl/providers/errors.py +2 -2
- parsl/providers/googlecloud/googlecloud.py +2 -1
- parsl/providers/grid_engine/grid_engine.py +5 -14
- parsl/providers/kubernetes/kube.py +80 -40
- parsl/providers/local/local.py +13 -26
- parsl/providers/lsf/lsf.py +5 -23
- parsl/providers/pbspro/pbspro.py +5 -17
- parsl/providers/slurm/slurm.py +81 -39
- parsl/providers/torque/torque.py +3 -14
- parsl/serialize/__init__.py +8 -3
- parsl/serialize/base.py +1 -2
- parsl/serialize/concretes.py +5 -4
- parsl/serialize/facade.py +3 -3
- parsl/serialize/proxystore.py +3 -2
- parsl/tests/__init__.py +1 -1
- parsl/tests/configs/azure_single_node.py +4 -5
- parsl/tests/configs/bridges.py +3 -2
- parsl/tests/configs/cc_in2p3.py +1 -3
- parsl/tests/configs/comet.py +2 -1
- parsl/tests/configs/ec2_single_node.py +1 -2
- parsl/tests/configs/ec2_spot.py +1 -2
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/configs/frontera.py +2 -3
- parsl/tests/configs/htex_local.py +3 -5
- parsl/tests/configs/htex_local_alternate.py +11 -15
- parsl/tests/configs/htex_local_intask_staging.py +5 -9
- parsl/tests/configs/htex_local_rsync_staging.py +4 -8
- parsl/tests/configs/local_radical.py +1 -3
- parsl/tests/configs/local_radical_mpi.py +2 -2
- parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/configs/midway.py +2 -2
- parsl/tests/configs/nscc_singapore.py +3 -3
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +3 -2
- parsl/tests/configs/slurm_local.py +24 -0
- parsl/tests/configs/summit.py +1 -0
- parsl/tests/configs/taskvine_ex.py +4 -7
- parsl/tests/configs/user_opts.py +2 -8
- parsl/tests/configs/workqueue_ex.py +4 -6
- parsl/tests/conftest.py +27 -13
- parsl/tests/integration/test_stress/test_python_simple.py +3 -4
- parsl/tests/integration/test_stress/test_python_threads.py +3 -5
- parsl/tests/manual_tests/htex_local.py +4 -6
- parsl/tests/manual_tests/test_basic.py +1 -0
- parsl/tests/manual_tests/test_log_filter.py +3 -1
- parsl/tests/manual_tests/test_memory_limits.py +6 -8
- parsl/tests/manual_tests/test_regression_220.py +2 -1
- parsl/tests/manual_tests/test_udp_simple.py +4 -4
- parsl/tests/manual_tests/test_worker_count.py +3 -2
- parsl/tests/scaling_tests/htex_local.py +2 -4
- parsl/tests/scaling_tests/test_scale.py +0 -9
- parsl/tests/scaling_tests/vineex_condor.py +1 -2
- parsl/tests/scaling_tests/vineex_local.py +1 -2
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/site_tests/test_provider.py +4 -2
- parsl/tests/site_tests/test_site.py +2 -0
- parsl/tests/sites/test_affinity.py +7 -7
- parsl/tests/sites/test_dynamic_executor.py +3 -4
- parsl/tests/sites/test_ec2.py +3 -2
- parsl/tests/sites/test_worker_info.py +4 -5
- parsl/tests/test_aalst_patterns.py +0 -1
- parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
- parsl/tests/test_bash_apps/test_basic.py +10 -4
- parsl/tests/test_bash_apps/test_error_codes.py +5 -7
- parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
- parsl/tests/test_bash_apps/test_memoize.py +2 -8
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
- parsl/tests/test_bash_apps/test_multiline.py +1 -1
- parsl/tests/test_bash_apps/test_pipeline.py +1 -1
- parsl/tests/test_bash_apps/test_std_uri.py +123 -0
- parsl/tests/test_bash_apps/test_stdout.py +33 -8
- parsl/tests/test_callables.py +2 -2
- parsl/tests/test_checkpointing/test_periodic.py +21 -39
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_regression_239.py +1 -1
- parsl/tests/test_checkpointing/test_task_exit.py +2 -3
- parsl/tests/test_docs/test_from_slides.py +5 -2
- parsl/tests/test_docs/test_kwargs.py +4 -1
- parsl/tests/test_docs/test_tutorial_1.py +1 -2
- parsl/tests/test_docs/test_workflow1.py +2 -2
- parsl/tests/test_docs/test_workflow2.py +0 -1
- parsl/tests/test_error_handling/test_rand_fail.py +2 -2
- parsl/tests/test_error_handling/test_resource_spec.py +10 -12
- parsl/tests/test_error_handling/test_retries.py +6 -16
- parsl/tests/test_error_handling/test_retry_handler.py +1 -0
- parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
- parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
- parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
- parsl/tests/test_execute_task.py +29 -0
- parsl/tests/test_flux.py +1 -1
- parsl/tests/test_htex/test_basic.py +2 -3
- parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
- parsl/tests/test_htex/test_command_client_timeout.py +66 -0
- parsl/tests/test_htex/test_connected_blocks.py +3 -2
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
- parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_drain.py +79 -0
- parsl/tests/test_htex/test_htex.py +51 -25
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
- parsl/tests/test_htex/test_managers_command.py +36 -0
- parsl/tests/test_htex/test_missing_worker.py +2 -12
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
- parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
- parsl/tests/test_htex/test_zmq_binding.py +29 -8
- parsl/tests/test_monitoring/test_app_names.py +86 -0
- parsl/tests/test_monitoring/test_basic.py +73 -25
- parsl/tests/test_monitoring/test_db_locks.py +6 -4
- parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
- parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
- parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
- parsl/tests/test_monitoring/test_stdouterr.py +134 -0
- parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
- parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
- parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
- parsl/tests/test_providers/test_local_provider.py +3 -132
- parsl/tests/test_providers/test_pbspro_template.py +2 -3
- parsl/tests/test_providers/test_slurm_template.py +2 -3
- parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
- parsl/tests/test_python_apps/test_context_manager.py +128 -0
- parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_python_apps/test_fail.py +0 -25
- parsl/tests/test_python_apps/test_futures.py +2 -1
- parsl/tests/test_python_apps/test_inputs_default.py +22 -0
- parsl/tests/test_python_apps/test_join.py +0 -1
- parsl/tests/test_python_apps/test_lifted.py +11 -7
- parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
- parsl/tests/test_python_apps/test_outputs.py +1 -1
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_radical/test_mpi_funcs.py +1 -2
- parsl/tests/test_regression/test_1480.py +2 -1
- parsl/tests/test_regression/test_1653.py +2 -1
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_regression/test_2652.py +1 -0
- parsl/tests/test_regression/test_69a.py +0 -1
- parsl/tests/test_regression/test_854.py +4 -2
- parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
- parsl/tests/test_regression/test_98.py +0 -1
- parsl/tests/test_scaling/test_block_error_handler.py +9 -4
- parsl/tests/test_scaling/test_regression_1621.py +11 -15
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
- parsl/tests/test_scaling/test_scale_down.py +2 -5
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +6 -18
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
- parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
- parsl/tests/test_serialization/test_basic.py +2 -1
- parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
- parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
- parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
- parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
- parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
- parsl/tests/test_staging/staging_provider.py +2 -2
- parsl/tests/test_staging/test_1316.py +3 -4
- parsl/tests/test_staging/test_docs_1.py +2 -1
- parsl/tests/test_staging/test_docs_2.py +2 -1
- parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
- parsl/tests/{test_data → test_staging}/test_file.py +6 -6
- parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +5 -2
- parsl/tests/test_staging/test_staging_stdout.py +64 -0
- parsl/tests/test_staging/test_zip_in.py +39 -0
- parsl/tests/test_staging/test_zip_out.py +110 -0
- parsl/tests/test_staging/test_zip_to_zip.py +41 -0
- parsl/tests/test_summary.py +2 -2
- parsl/tests/test_thread_parallelism.py +0 -1
- parsl/tests/test_threads/test_configs.py +1 -2
- parsl/tests/test_threads/test_lazy_errors.py +2 -2
- parsl/tests/test_utils/test_execute_wait.py +35 -0
- parsl/tests/test_utils/test_sanitize_dns.py +76 -0
- parsl/tests/unit/test_address.py +20 -0
- parsl/tests/unit/test_file.py +99 -0
- parsl/tests/unit/test_usage_tracking.py +66 -0
- parsl/usage_tracking/api.py +65 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +104 -62
- parsl/utils.py +139 -6
- parsl/version.py +1 -1
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
- parsl-2025.1.13.data/scripts/interchange.py +649 -0
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +115 -77
- parsl-2025.1.13.dist-info/METADATA +96 -0
- parsl-2025.1.13.dist-info/RECORD +462 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
- parsl/channels/__init__.py +0 -7
- parsl/channels/base.py +0 -141
- parsl/channels/errors.py +0 -113
- parsl/channels/local/local.py +0 -164
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
- parsl/channels/ssh/ssh.py +0 -276
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -74
- parsl/configs/ad_hoc.py +0 -35
- parsl/executors/radical/rpex_master.py +0 -42
- parsl/monitoring/radios.py +0 -175
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -248
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/integration/test_channels/__init__.py +0 -0
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- parsl/tests/integration/test_channels/test_local_channel.py +0 -42
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl/tests/sites/test_local_adhoc.py +0 -61
- parsl/tests/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/test_large_output.py +0 -22
- parsl/tests/test_data/__init__.py +0 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
- parsl-2024.3.11.dist-info/METADATA +0 -98
- parsl-2024.3.11.dist-info/RECORD +0 -447
- parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
- parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
@@ -1,64 +1,39 @@
|
|
1
1
|
#!/usr/bin/env python
|
2
|
-
import multiprocessing
|
3
|
-
import zmq
|
4
|
-
import os
|
5
|
-
import sys
|
6
|
-
import platform
|
7
|
-
import random
|
8
|
-
import time
|
9
2
|
import datetime
|
10
|
-
import
|
11
|
-
import signal
|
3
|
+
import json
|
12
4
|
import logging
|
5
|
+
import os
|
6
|
+
import pickle
|
7
|
+
import platform
|
13
8
|
import queue
|
9
|
+
import sys
|
14
10
|
import threading
|
15
|
-
import
|
11
|
+
import time
|
12
|
+
from typing import Any, Dict, List, NoReturn, Optional, Sequence, Set, Tuple, cast
|
16
13
|
|
17
|
-
|
14
|
+
import zmq
|
18
15
|
|
19
16
|
from parsl import curvezmq
|
20
|
-
from parsl.
|
21
|
-
from parsl.version import VERSION as PARSL_VERSION
|
22
|
-
from parsl.serialize import serialize as serialize_object
|
23
|
-
|
17
|
+
from parsl.addresses import tcp_url
|
24
18
|
from parsl.app.errors import RemoteExceptionWrapper
|
19
|
+
from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch
|
25
20
|
from parsl.executors.high_throughput.manager_record import ManagerRecord
|
21
|
+
from parsl.executors.high_throughput.manager_selector import ManagerSelector
|
26
22
|
from parsl.monitoring.message_type import MessageType
|
23
|
+
from parsl.monitoring.radios.base import MonitoringRadioSender
|
24
|
+
from parsl.monitoring.radios.zmq import ZMQRadioSender
|
27
25
|
from parsl.process_loggers import wrap_with_logs
|
28
|
-
|
26
|
+
from parsl.serialize import serialize as serialize_object
|
27
|
+
from parsl.utils import setproctitle
|
28
|
+
from parsl.version import VERSION as PARSL_VERSION
|
29
29
|
|
30
30
|
PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)
|
31
|
+
PKL_DRAINED_CODE = pickle.dumps((2 ** 32) - 2)
|
31
32
|
|
32
33
|
LOGGER_NAME = "interchange"
|
33
34
|
logger = logging.getLogger(LOGGER_NAME)
|
34
35
|
|
35
36
|
|
36
|
-
class ManagerLost(Exception):
|
37
|
-
''' Task lost due to manager loss. Manager is considered lost when multiple heartbeats
|
38
|
-
have been missed.
|
39
|
-
'''
|
40
|
-
def __init__(self, manager_id: bytes, hostname: str) -> None:
|
41
|
-
self.manager_id = manager_id
|
42
|
-
self.tstamp = time.time()
|
43
|
-
self.hostname = hostname
|
44
|
-
|
45
|
-
def __str__(self) -> str:
|
46
|
-
return "Task failure due to loss of manager {} on host {}".format(self.manager_id.decode(), self.hostname)
|
47
|
-
|
48
|
-
|
49
|
-
class VersionMismatch(Exception):
|
50
|
-
''' Manager and Interchange versions do not match
|
51
|
-
'''
|
52
|
-
def __init__(self, interchange_version: str, manager_version: str):
|
53
|
-
self.interchange_version = interchange_version
|
54
|
-
self.manager_version = manager_version
|
55
|
-
|
56
|
-
def __str__(self) -> str:
|
57
|
-
return "Manager version info {} does not match interchange version info {}, causing a critical failure".format(
|
58
|
-
self.manager_version,
|
59
|
-
self.interchange_version)
|
60
|
-
|
61
|
-
|
62
37
|
class Interchange:
|
63
38
|
""" Interchange is a task orchestrator for distributed systems.
|
64
39
|
|
@@ -67,18 +42,21 @@ class Interchange:
|
|
67
42
|
3. Detect workers that have failed using heartbeats
|
68
43
|
"""
|
69
44
|
def __init__(self,
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
45
|
+
*,
|
46
|
+
client_address: str,
|
47
|
+
interchange_address: Optional[str],
|
48
|
+
client_ports: Tuple[int, int, int],
|
49
|
+
worker_ports: Optional[Tuple[int, int]],
|
50
|
+
worker_port_range: Tuple[int, int],
|
51
|
+
hub_address: Optional[str],
|
52
|
+
hub_zmq_port: Optional[int],
|
53
|
+
heartbeat_threshold: int,
|
54
|
+
logdir: str,
|
55
|
+
logging_level: int,
|
56
|
+
poll_period: int,
|
57
|
+
cert_dir: Optional[str],
|
58
|
+
manager_selector: ManagerSelector,
|
59
|
+
run_id: str,
|
82
60
|
) -> None:
|
83
61
|
"""
|
84
62
|
Parameters
|
@@ -90,45 +68,44 @@ class Interchange:
|
|
90
68
|
If specified the interchange will only listen on this address for connections from workers
|
91
69
|
else, it binds to all addresses.
|
92
70
|
|
93
|
-
client_ports :
|
71
|
+
client_ports : tuple(int, int, int)
|
94
72
|
The ports at which the client can be reached
|
95
73
|
|
96
74
|
worker_ports : tuple(int, int)
|
97
|
-
The specific two ports at which workers will connect to the Interchange.
|
75
|
+
The specific two ports at which workers will connect to the Interchange.
|
98
76
|
|
99
77
|
worker_port_range : tuple(int, int)
|
100
78
|
The interchange picks ports at random from the range which will be used by workers.
|
101
|
-
This is overridden when the worker_ports option is set.
|
79
|
+
This is overridden when the worker_ports option is set.
|
102
80
|
|
103
81
|
hub_address : str
|
104
|
-
The
|
105
|
-
|
82
|
+
The IP address at which the interchange can send info about managers to when monitoring is enabled.
|
83
|
+
When None, monitoring is disabled.
|
106
84
|
|
107
|
-
|
85
|
+
hub_zmq_port : str
|
108
86
|
The port at which the interchange can send info about managers to when monitoring is enabled.
|
109
|
-
|
87
|
+
When None, monitoring is disabled.
|
110
88
|
|
111
89
|
heartbeat_threshold : int
|
112
90
|
Number of seconds since the last heartbeat after which worker is considered lost.
|
113
91
|
|
114
92
|
logdir : str
|
115
|
-
Parsl log directory paths. Logs and temp files go here.
|
93
|
+
Parsl log directory paths. Logs and temp files go here.
|
116
94
|
|
117
95
|
logging_level : int
|
118
|
-
Logging level as defined in the logging module.
|
96
|
+
Logging level as defined in the logging module.
|
119
97
|
|
120
98
|
poll_period : int
|
121
|
-
The main thread polling period, in milliseconds.
|
99
|
+
The main thread polling period, in milliseconds.
|
122
100
|
|
123
101
|
cert_dir : str | None
|
124
|
-
Path to the certificate directory.
|
102
|
+
Path to the certificate directory.
|
125
103
|
"""
|
126
104
|
self.cert_dir = cert_dir
|
127
105
|
self.logdir = logdir
|
128
106
|
os.makedirs(self.logdir, exist_ok=True)
|
129
107
|
|
130
108
|
start_file_logger("{}/interchange.log".format(self.logdir), level=logging_level)
|
131
|
-
logger.propagate = False
|
132
109
|
logger.debug("Initializing Interchange process")
|
133
110
|
|
134
111
|
self.client_address = client_address
|
@@ -140,17 +117,19 @@ class Interchange:
|
|
140
117
|
self.zmq_context = curvezmq.ServerContext(self.cert_dir)
|
141
118
|
self.task_incoming = self.zmq_context.socket(zmq.DEALER)
|
142
119
|
self.task_incoming.set_hwm(0)
|
143
|
-
self.task_incoming.connect(
|
120
|
+
self.task_incoming.connect(tcp_url(client_address, client_ports[0]))
|
144
121
|
self.results_outgoing = self.zmq_context.socket(zmq.DEALER)
|
145
122
|
self.results_outgoing.set_hwm(0)
|
146
|
-
self.results_outgoing.connect(
|
123
|
+
self.results_outgoing.connect(tcp_url(client_address, client_ports[1]))
|
147
124
|
|
148
125
|
self.command_channel = self.zmq_context.socket(zmq.REP)
|
149
|
-
self.command_channel.connect(
|
126
|
+
self.command_channel.connect(tcp_url(client_address, client_ports[2]))
|
150
127
|
logger.info("Connected to client")
|
151
128
|
|
129
|
+
self.run_id = run_id
|
130
|
+
|
152
131
|
self.hub_address = hub_address
|
153
|
-
self.
|
132
|
+
self.hub_zmq_port = hub_zmq_port
|
154
133
|
|
155
134
|
self.pending_task_queue: queue.Queue[Any] = queue.Queue(maxsize=10 ** 6)
|
156
135
|
self.count = 0
|
@@ -167,14 +146,14 @@ class Interchange:
|
|
167
146
|
self.worker_task_port = self.worker_ports[0]
|
168
147
|
self.worker_result_port = self.worker_ports[1]
|
169
148
|
|
170
|
-
self.task_outgoing.bind(
|
171
|
-
self.results_incoming.bind(
|
149
|
+
self.task_outgoing.bind(tcp_url(self.interchange_address, self.worker_task_port))
|
150
|
+
self.results_incoming.bind(tcp_url(self.interchange_address, self.worker_result_port))
|
172
151
|
|
173
152
|
else:
|
174
|
-
self.worker_task_port = self.task_outgoing.bind_to_random_port(
|
153
|
+
self.worker_task_port = self.task_outgoing.bind_to_random_port(tcp_url(self.interchange_address),
|
175
154
|
min_port=worker_port_range[0],
|
176
155
|
max_port=worker_port_range[1], max_tries=100)
|
177
|
-
self.worker_result_port = self.results_incoming.bind_to_random_port(
|
156
|
+
self.worker_result_port = self.results_incoming.bind_to_random_port(tcp_url(self.interchange_address),
|
178
157
|
min_port=worker_port_range[0],
|
179
158
|
max_port=worker_port_range[1], max_tries=100)
|
180
159
|
|
@@ -186,6 +165,8 @@ class Interchange:
|
|
186
165
|
|
187
166
|
self.heartbeat_threshold = heartbeat_threshold
|
188
167
|
|
168
|
+
self.manager_selector = manager_selector
|
169
|
+
|
189
170
|
self.current_platform = {'parsl_v': PARSL_VERSION,
|
190
171
|
'python_v': "{}.{}.{}".format(sys.version_info.major,
|
191
172
|
sys.version_info.minor,
|
@@ -242,27 +223,16 @@ class Interchange:
|
|
242
223
|
task_counter += 1
|
243
224
|
logger.debug(f"Fetched {task_counter} tasks so far")
|
244
225
|
|
245
|
-
def
|
246
|
-
if
|
247
|
-
logger.info("
|
248
|
-
# This is a one-off because monitoring is unencrypted
|
249
|
-
hub_channel = zmq.Context().socket(zmq.DEALER)
|
250
|
-
hub_channel.set_hwm(0)
|
251
|
-
hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_port))
|
252
|
-
logger.info("Monitoring enabled and connected to hub")
|
253
|
-
return hub_channel
|
254
|
-
else:
|
255
|
-
return None
|
256
|
-
|
257
|
-
def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
|
258
|
-
if hub_channel:
|
259
|
-
logger.info("Sending message {} to hub".format(manager))
|
226
|
+
def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender], manager: ManagerRecord) -> None:
|
227
|
+
if monitoring_radio:
|
228
|
+
logger.info("Sending message {} to MonitoringHub".format(manager))
|
260
229
|
|
261
230
|
d: Dict = cast(Dict, manager.copy())
|
262
231
|
d['timestamp'] = datetime.datetime.now()
|
263
232
|
d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])
|
233
|
+
d['run_id'] = self.run_id
|
264
234
|
|
265
|
-
|
235
|
+
monitoring_radio.send((MessageType.NODE_INFO, d))
|
266
236
|
|
267
237
|
@wrap_with_logs(target="interchange")
|
268
238
|
def _command_server(self) -> NoReturn:
|
@@ -270,8 +240,11 @@ class Interchange:
|
|
270
240
|
"""
|
271
241
|
logger.debug("Command Server Starting")
|
272
242
|
|
273
|
-
|
274
|
-
|
243
|
+
if self.hub_address is not None and self.hub_zmq_port is not None:
|
244
|
+
logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
|
245
|
+
monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
|
246
|
+
else:
|
247
|
+
monitoring_radio = None
|
275
248
|
|
276
249
|
reply: Any # the type of reply depends on the command_req received (aka this needs dependent types...)
|
277
250
|
|
@@ -279,13 +252,7 @@ class Interchange:
|
|
279
252
|
try:
|
280
253
|
command_req = self.command_channel.recv_pyobj()
|
281
254
|
logger.debug("Received command request: {}".format(command_req))
|
282
|
-
if command_req == "
|
283
|
-
outstanding = self.pending_task_queue.qsize()
|
284
|
-
for manager in self._ready_managers.values():
|
285
|
-
outstanding += len(manager['tasks'])
|
286
|
-
reply = outstanding
|
287
|
-
|
288
|
-
elif command_req == "CONNECTED_BLOCKS":
|
255
|
+
if command_req == "CONNECTED_BLOCKS":
|
289
256
|
reply = self.connected_block_history
|
290
257
|
|
291
258
|
elif command_req == "WORKERS":
|
@@ -308,7 +275,10 @@ class Interchange:
|
|
308
275
|
'worker_count': m['worker_count'],
|
309
276
|
'tasks': len(m['tasks']),
|
310
277
|
'idle_duration': idle_duration,
|
311
|
-
'active': m['active']
|
278
|
+
'active': m['active'],
|
279
|
+
'parsl_version': m['parsl_version'],
|
280
|
+
'python_version': m['python_version'],
|
281
|
+
'draining': m['draining']}
|
312
282
|
reply.append(resp)
|
313
283
|
|
314
284
|
elif command_req.startswith("HOLD_WORKER"):
|
@@ -318,13 +288,17 @@ class Interchange:
|
|
318
288
|
if manager_id in self._ready_managers:
|
319
289
|
m = self._ready_managers[manager_id]
|
320
290
|
m['active'] = False
|
321
|
-
self._send_monitoring_info(
|
291
|
+
self._send_monitoring_info(monitoring_radio, m)
|
322
292
|
else:
|
323
293
|
logger.warning("Worker to hold was not in ready managers list")
|
324
294
|
|
325
295
|
reply = None
|
326
296
|
|
297
|
+
elif command_req == "WORKER_PORTS":
|
298
|
+
reply = (self.worker_task_port, self.worker_result_port)
|
299
|
+
|
327
300
|
else:
|
301
|
+
logger.error(f"Received unknown command: {command_req}")
|
328
302
|
reply = None
|
329
303
|
|
330
304
|
logger.debug("Reply: {}".format(reply))
|
@@ -339,19 +313,14 @@ class Interchange:
|
|
339
313
|
""" Start the interchange
|
340
314
|
"""
|
341
315
|
|
342
|
-
|
343
|
-
# handler will be inherited by the interchange process because it is
|
344
|
-
# launched as a multiprocessing fork process.
|
345
|
-
# That can interfere with the interchange shutdown mechanism, which is
|
346
|
-
# to receive a SIGTERM and exit immediately.
|
347
|
-
# See Parsl issue #2343 (Threads and multiprocessing cannot be
|
348
|
-
# intermingled without deadlocks) which talks about other fork-related
|
349
|
-
# parent-process-inheritance problems.
|
350
|
-
signal.signal(signal.SIGTERM, signal.SIG_DFL)
|
351
|
-
|
352
|
-
logger.info("Incoming ports bound")
|
316
|
+
logger.info("Starting main interchange method")
|
353
317
|
|
354
|
-
|
318
|
+
if self.hub_address is not None and self.hub_zmq_port is not None:
|
319
|
+
logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
|
320
|
+
monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
|
321
|
+
logger.debug("Created monitoring radio")
|
322
|
+
else:
|
323
|
+
monitoring_radio = None
|
355
324
|
|
356
325
|
poll_period = self.poll_period
|
357
326
|
|
@@ -382,20 +351,21 @@ class Interchange:
|
|
382
351
|
while not kill_event.is_set():
|
383
352
|
self.socks = dict(poller.poll(timeout=poll_period))
|
384
353
|
|
385
|
-
self.process_task_outgoing_incoming(interesting_managers,
|
386
|
-
self.process_results_incoming(interesting_managers,
|
387
|
-
self.expire_bad_managers(interesting_managers,
|
354
|
+
self.process_task_outgoing_incoming(interesting_managers, monitoring_radio, kill_event)
|
355
|
+
self.process_results_incoming(interesting_managers, monitoring_radio)
|
356
|
+
self.expire_bad_managers(interesting_managers, monitoring_radio)
|
357
|
+
self.expire_drained_managers(interesting_managers, monitoring_radio)
|
388
358
|
self.process_tasks_to_send(interesting_managers)
|
389
359
|
|
390
360
|
self.zmq_context.destroy()
|
391
361
|
delta = time.time() - start
|
392
|
-
logger.info("Processed {} tasks in {} seconds"
|
362
|
+
logger.info(f"Processed {self.count} tasks in {delta} seconds")
|
393
363
|
logger.warning("Exiting")
|
394
364
|
|
395
365
|
def process_task_outgoing_incoming(
|
396
366
|
self,
|
397
367
|
interesting_managers: Set[bytes],
|
398
|
-
|
368
|
+
monitoring_radio: Optional[MonitoringRadioSender],
|
399
369
|
kill_event: threading.Event
|
400
370
|
) -> None:
|
401
371
|
"""Process one message from manager on the task_outgoing channel.
|
@@ -410,9 +380,8 @@ class Interchange:
|
|
410
380
|
try:
|
411
381
|
msg = json.loads(message[1].decode('utf-8'))
|
412
382
|
except Exception:
|
413
|
-
logger.warning("Got Exception reading message from manager: {!r}"
|
414
|
-
|
415
|
-
logger.debug("Message: \n{!r}\n".format(message[1]))
|
383
|
+
logger.warning(f"Got Exception reading message from manager: {manager_id!r}", exc_info=True)
|
384
|
+
logger.debug("Message:\n %r\n", message[1])
|
416
385
|
return
|
417
386
|
|
418
387
|
# perform a bit of validation on the structure of the deserialized
|
@@ -420,7 +389,7 @@ class Interchange:
|
|
420
389
|
# in obviously malformed cases
|
421
390
|
if not isinstance(msg, dict) or 'type' not in msg:
|
422
391
|
logger.error(f"JSON message was not correctly formatted from manager: {manager_id!r}")
|
423
|
-
logger.debug("Message
|
392
|
+
logger.debug("Message:\n %r\n", message[1])
|
424
393
|
return
|
425
394
|
|
426
395
|
if msg['type'] == 'registration':
|
@@ -428,14 +397,18 @@ class Interchange:
|
|
428
397
|
self._ready_managers[manager_id] = {'last_heartbeat': time.time(),
|
429
398
|
'idle_since': time.time(),
|
430
399
|
'block_id': None,
|
400
|
+
'start_time': msg['start_time'],
|
431
401
|
'max_capacity': 0,
|
432
402
|
'worker_count': 0,
|
433
403
|
'active': True,
|
404
|
+
'draining': False,
|
405
|
+
'parsl_version': msg['parsl_v'],
|
406
|
+
'python_version': msg['python_v'],
|
434
407
|
'tasks': []}
|
435
408
|
self.connected_block_history.append(msg['block_id'])
|
436
409
|
|
437
410
|
interesting_managers.add(manager_id)
|
438
|
-
logger.info("Adding manager: {!r} to ready queue"
|
411
|
+
logger.info(f"Adding manager: {manager_id!r} to ready queue")
|
439
412
|
m = self._ready_managers[manager_id]
|
440
413
|
|
441
414
|
# m is a ManagerRecord, but msg is a dict[Any,Any] and so can
|
@@ -444,12 +417,12 @@ class Interchange:
|
|
444
417
|
# later.
|
445
418
|
m.update(msg) # type: ignore[typeddict-item]
|
446
419
|
|
447
|
-
logger.info("Registration info for manager {!r}: {}"
|
448
|
-
self._send_monitoring_info(
|
420
|
+
logger.info(f"Registration info for manager {manager_id!r}: {msg}")
|
421
|
+
self._send_monitoring_info(monitoring_radio, m)
|
449
422
|
|
450
423
|
if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
|
451
424
|
msg['parsl_v'] != self.current_platform['parsl_v']):
|
452
|
-
logger.error("Manager {!r} has incompatible version info with the interchange"
|
425
|
+
logger.error(f"Manager {manager_id!r} has incompatible version info with the interchange")
|
453
426
|
logger.debug("Setting kill event")
|
454
427
|
kill_event.set()
|
455
428
|
e = VersionMismatch("py.v={} parsl.v={}".format(self.current_platform['python_v'].rsplit(".", 1)[0],
|
@@ -462,27 +435,49 @@ class Interchange:
|
|
462
435
|
self.results_outgoing.send(pkl_package)
|
463
436
|
logger.error("Sent failure reports, shutting down interchange")
|
464
437
|
else:
|
465
|
-
logger.info("Manager {!r} has compatible Parsl version {
|
466
|
-
logger.info("Manager {!r} has compatible Python version {
|
467
|
-
msg['python_v'].rsplit(".", 1)[0]))
|
438
|
+
logger.info(f"Manager {manager_id!r} has compatible Parsl version {msg['parsl_v']}")
|
439
|
+
logger.info(f"Manager {manager_id!r} has compatible Python version {msg['python_v'].rsplit('.', 1)[0]}")
|
468
440
|
elif msg['type'] == 'heartbeat':
|
469
|
-
|
470
|
-
|
471
|
-
|
441
|
+
manager = self._ready_managers.get(manager_id)
|
442
|
+
if manager:
|
443
|
+
manager['last_heartbeat'] = time.time()
|
444
|
+
logger.debug("Manager %r sent heartbeat via tasks connection", manager_id)
|
445
|
+
self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
|
446
|
+
else:
|
447
|
+
logger.warning("Received heartbeat via tasks connection for not-registered manager %r", manager_id)
|
448
|
+
elif msg['type'] == 'drain':
|
449
|
+
self._ready_managers[manager_id]['draining'] = True
|
450
|
+
logger.debug("Manager %r requested drain", manager_id)
|
472
451
|
else:
|
473
452
|
logger.error(f"Unexpected message type received from manager: {msg['type']}")
|
474
453
|
logger.debug("leaving task_outgoing section")
|
475
454
|
|
455
|
+
def expire_drained_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
|
456
|
+
|
457
|
+
for manager_id in list(interesting_managers):
|
458
|
+
# is it always true that a draining manager will be in interesting managers?
|
459
|
+
# i think so because it will have outstanding capacity?
|
460
|
+
m = self._ready_managers[manager_id]
|
461
|
+
if m['draining'] and len(m['tasks']) == 0:
|
462
|
+
logger.info(f"Manager {manager_id!r} is drained - sending drained message to manager")
|
463
|
+
self.task_outgoing.send_multipart([manager_id, b'', PKL_DRAINED_CODE])
|
464
|
+
interesting_managers.remove(manager_id)
|
465
|
+
self._ready_managers.pop(manager_id)
|
466
|
+
|
467
|
+
m['active'] = False
|
468
|
+
self._send_monitoring_info(monitoring_radio, m)
|
469
|
+
|
476
470
|
def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
|
477
471
|
# Check if there are tasks that could be sent to managers
|
478
472
|
|
479
|
-
logger.debug(
|
480
|
-
total
|
481
|
-
|
473
|
+
logger.debug(
|
474
|
+
"Managers count (interesting/total): %d/%d",
|
475
|
+
len(interesting_managers),
|
476
|
+
len(self._ready_managers)
|
477
|
+
)
|
482
478
|
|
483
479
|
if interesting_managers and not self.pending_task_queue.empty():
|
484
|
-
shuffled_managers =
|
485
|
-
random.shuffle(shuffled_managers)
|
480
|
+
shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)
|
486
481
|
|
487
482
|
while shuffled_managers and not self.pending_task_queue.empty(): # cf. the if statement above...
|
488
483
|
manager_id = shuffled_managers.pop()
|
@@ -490,7 +485,7 @@ class Interchange:
|
|
490
485
|
tasks_inflight = len(m['tasks'])
|
491
486
|
real_capacity = m['max_capacity'] - tasks_inflight
|
492
487
|
|
493
|
-
if
|
488
|
+
if real_capacity and m["active"] and not m["draining"]:
|
494
489
|
tasks = self.get_tasks(real_capacity)
|
495
490
|
if tasks:
|
496
491
|
self.task_outgoing.send_multipart([manager_id, b'', pickle.dumps(tasks)])
|
@@ -499,31 +494,31 @@ class Interchange:
|
|
499
494
|
tids = [t['task_id'] for t in tasks]
|
500
495
|
m['tasks'].extend(tids)
|
501
496
|
m['idle_since'] = None
|
502
|
-
logger.debug("Sent tasks:
|
497
|
+
logger.debug("Sent tasks: %s to manager %r", tids, manager_id)
|
503
498
|
# recompute real_capacity after sending tasks
|
504
499
|
real_capacity = m['max_capacity'] - tasks_inflight
|
505
500
|
if real_capacity > 0:
|
506
|
-
logger.debug("Manager
|
501
|
+
logger.debug("Manager %r has free capacity %s", manager_id, real_capacity)
|
507
502
|
# ... so keep it in the interesting_managers list
|
508
503
|
else:
|
509
|
-
logger.debug("Manager
|
504
|
+
logger.debug("Manager %r is now saturated", manager_id)
|
510
505
|
interesting_managers.remove(manager_id)
|
511
506
|
else:
|
512
507
|
interesting_managers.remove(manager_id)
|
513
508
|
# logger.debug("Nothing to send to manager {}".format(manager_id))
|
514
|
-
logger.debug("leaving _ready_managers section, with
|
509
|
+
logger.debug("leaving _ready_managers section, with %s managers still interesting", len(interesting_managers))
|
515
510
|
else:
|
516
511
|
logger.debug("either no interesting managers or no tasks, so skipping manager pass")
|
517
512
|
|
518
|
-
def process_results_incoming(self, interesting_managers: Set[bytes],
|
513
|
+
def process_results_incoming(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
|
519
514
|
# Receive any results and forward to client
|
520
515
|
if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
|
521
516
|
logger.debug("entering results_incoming section")
|
522
517
|
manager_id, *all_messages = self.results_incoming.recv_multipart()
|
523
518
|
if manager_id not in self._ready_managers:
|
524
|
-
logger.warning("Received a result from a un-registered manager: {!r}"
|
519
|
+
logger.warning(f"Received a result from a un-registered manager: {manager_id!r}")
|
525
520
|
else:
|
526
|
-
logger.debug(
|
521
|
+
logger.debug("Got %s result items in batch from manager %r", len(all_messages), manager_id)
|
527
522
|
|
528
523
|
b_messages = []
|
529
524
|
|
@@ -535,16 +530,15 @@ class Interchange:
|
|
535
530
|
elif r['type'] == 'monitoring':
|
536
531
|
# the monitoring code makes the assumption that no
|
537
532
|
# monitoring messages will be received if monitoring
|
538
|
-
# is not configured, and that
|
533
|
+
# is not configured, and that monitoring_radio will only
|
539
534
|
# be None when monitoring is not configurated.
|
540
|
-
assert
|
535
|
+
assert monitoring_radio is not None
|
541
536
|
|
542
|
-
|
537
|
+
monitoring_radio.send(r['payload'])
|
543
538
|
elif r['type'] == 'heartbeat':
|
544
|
-
logger.debug(
|
545
|
-
b_messages.append((p_message, r))
|
539
|
+
logger.debug("Manager %r sent heartbeat via results connection", manager_id)
|
546
540
|
else:
|
547
|
-
logger.error("Interchange discarding result_queue message of unknown type:
|
541
|
+
logger.error("Interchange discarding result_queue message of unknown type: %s", r["type"])
|
548
542
|
|
549
543
|
got_result = False
|
550
544
|
m = self._ready_managers[manager_id]
|
@@ -553,14 +547,16 @@ class Interchange:
|
|
553
547
|
if r['type'] == 'result':
|
554
548
|
got_result = True
|
555
549
|
try:
|
556
|
-
logger.debug(
|
550
|
+
logger.debug("Removing task %s from manager record %r", r["task_id"], manager_id)
|
557
551
|
m['tasks'].remove(r['task_id'])
|
558
552
|
except Exception:
|
559
553
|
# If we reach here, there's something very wrong.
|
560
|
-
logger.exception(
|
554
|
+
logger.exception(
|
555
|
+
"Ignoring exception removing task_id %s for manager %r with task list %s",
|
561
556
|
r['task_id'],
|
562
557
|
manager_id,
|
563
|
-
m[
|
558
|
+
m["tasks"]
|
559
|
+
)
|
564
560
|
|
565
561
|
b_messages_to_send = []
|
566
562
|
for (b_message, _) in b_messages:
|
@@ -571,7 +567,7 @@ class Interchange:
|
|
571
567
|
self.results_outgoing.send_multipart(b_messages_to_send)
|
572
568
|
logger.debug("Sent messages on results_outgoing")
|
573
569
|
|
574
|
-
logger.debug(
|
570
|
+
logger.debug("Current tasks on manager %r: %s", manager_id, m["tasks"])
|
575
571
|
if len(m['tasks']) == 0 and m['idle_since'] is None:
|
576
572
|
m['idle_since'] = time.time()
|
577
573
|
|
@@ -583,7 +579,7 @@ class Interchange:
|
|
583
579
|
interesting_managers.add(manager_id)
|
584
580
|
logger.debug("leaving results_incoming section")
|
585
581
|
|
586
|
-
def expire_bad_managers(self, interesting_managers: Set[bytes],
|
582
|
+
def expire_bad_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
|
587
583
|
bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if
|
588
584
|
time.time() - m['last_heartbeat'] > self.heartbeat_threshold]
|
589
585
|
for (manager_id, m) in bad_managers:
|
@@ -591,7 +587,7 @@ class Interchange:
|
|
591
587
|
logger.warning(f"Too many heartbeats missed for manager {manager_id!r} - removing manager")
|
592
588
|
if m['active']:
|
593
589
|
m['active'] = False
|
594
|
-
self._send_monitoring_info(
|
590
|
+
self._send_monitoring_info(monitoring_radio, m)
|
595
591
|
|
596
592
|
logger.warning(f"Cancelling htex tasks {m['tasks']} on removed manager")
|
597
593
|
for tid in m['tasks']:
|
@@ -644,15 +640,10 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:
|
|
644
640
|
logger.addHandler(handler)
|
645
641
|
|
646
642
|
|
647
|
-
|
648
|
-
def starter(comm_q: multiprocessing.Queue, *args: Any, **kwargs: Any) -> None:
|
649
|
-
"""Start the interchange process
|
650
|
-
|
651
|
-
The executor is expected to call this function. The args, kwargs match that of the Interchange.__init__
|
652
|
-
"""
|
643
|
+
if __name__ == "__main__":
|
653
644
|
setproctitle("parsl: HTEX interchange")
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
645
|
+
|
646
|
+
config = pickle.load(sys.stdin.buffer)
|
647
|
+
|
648
|
+
ic = Interchange(**config)
|
658
649
|
ic.start()
|
@@ -1,15 +1,20 @@
|
|
1
1
|
from datetime import datetime
|
2
2
|
from typing import Any, List, Optional
|
3
|
+
|
3
4
|
from typing_extensions import TypedDict
|
4
5
|
|
5
6
|
|
6
7
|
class ManagerRecord(TypedDict, total=False):
|
7
8
|
block_id: Optional[str]
|
9
|
+
start_time: float
|
8
10
|
tasks: List[Any]
|
9
11
|
worker_count: int
|
10
12
|
max_capacity: int
|
11
13
|
active: bool
|
14
|
+
draining: bool
|
12
15
|
hostname: str
|
13
16
|
last_heartbeat: float
|
14
17
|
idle_since: Optional[float]
|
15
18
|
timestamp: datetime
|
19
|
+
parsl_version: str
|
20
|
+
python_version: str
|