parsl 2024.3.18__py3-none-any.whl → 2025.1.13__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- parsl/__init__.py +9 -10
- parsl/addresses.py +26 -6
- parsl/app/app.py +7 -8
- parsl/app/bash.py +15 -8
- parsl/app/errors.py +10 -13
- parsl/app/futures.py +8 -10
- parsl/app/python.py +2 -1
- parsl/benchmark/perf.py +2 -1
- parsl/concurrent/__init__.py +2 -2
- parsl/config.py +53 -10
- parsl/configs/ASPIRE1.py +6 -5
- parsl/configs/Azure.py +9 -8
- parsl/configs/bridges.py +6 -4
- parsl/configs/cc_in2p3.py +3 -3
- parsl/configs/ec2.py +3 -1
- parsl/configs/expanse.py +4 -3
- parsl/configs/frontera.py +3 -4
- parsl/configs/htex_local.py +3 -4
- parsl/configs/illinoiscluster.py +3 -1
- parsl/configs/improv.py +34 -0
- parsl/configs/kubernetes.py +4 -3
- parsl/configs/local_threads.py +5 -1
- parsl/configs/midway.py +5 -3
- parsl/configs/osg.py +4 -2
- parsl/configs/polaris.py +4 -2
- parsl/configs/stampede2.py +6 -5
- parsl/configs/summit.py +3 -3
- parsl/configs/toss3_llnl.py +4 -3
- parsl/configs/vineex_local.py +6 -4
- parsl/configs/wqex_local.py +5 -3
- parsl/curvezmq.py +4 -0
- parsl/data_provider/data_manager.py +4 -3
- parsl/data_provider/file_noop.py +1 -2
- parsl/data_provider/files.py +3 -3
- parsl/data_provider/ftp.py +1 -3
- parsl/data_provider/globus.py +7 -6
- parsl/data_provider/http.py +2 -2
- parsl/data_provider/rsync.py +1 -1
- parsl/data_provider/staging.py +2 -2
- parsl/data_provider/zip.py +135 -0
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +259 -223
- parsl/dataflow/errors.py +3 -5
- parsl/dataflow/futures.py +27 -14
- parsl/dataflow/memoization.py +5 -5
- parsl/dataflow/rundirs.py +5 -6
- parsl/dataflow/taskrecord.py +4 -5
- parsl/executors/__init__.py +4 -2
- parsl/executors/base.py +45 -15
- parsl/executors/errors.py +13 -0
- parsl/executors/execute_task.py +37 -0
- parsl/executors/flux/execute_parsl_task.py +3 -3
- parsl/executors/flux/executor.py +18 -19
- parsl/executors/flux/flux_instance_manager.py +26 -27
- parsl/executors/high_throughput/errors.py +43 -3
- parsl/executors/high_throughput/executor.py +307 -285
- parsl/executors/high_throughput/interchange.py +137 -168
- parsl/executors/high_throughput/manager_record.py +4 -0
- parsl/executors/high_throughput/manager_selector.py +55 -0
- parsl/executors/high_throughput/monitoring_info.py +2 -1
- parsl/executors/high_throughput/mpi_executor.py +113 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
- parsl/executors/high_throughput/mpi_resource_management.py +6 -17
- parsl/executors/high_throughput/probe.py +9 -7
- parsl/executors/high_throughput/process_worker_pool.py +77 -75
- parsl/executors/high_throughput/zmq_pipes.py +81 -23
- parsl/executors/radical/executor.py +130 -79
- parsl/executors/radical/rpex_resources.py +17 -15
- parsl/executors/radical/rpex_worker.py +4 -3
- parsl/executors/status_handling.py +157 -51
- parsl/executors/taskvine/__init__.py +1 -1
- parsl/executors/taskvine/errors.py +1 -1
- parsl/executors/taskvine/exec_parsl_function.py +2 -2
- parsl/executors/taskvine/executor.py +38 -55
- parsl/executors/taskvine/factory.py +1 -1
- parsl/executors/taskvine/factory_config.py +1 -1
- parsl/executors/taskvine/manager.py +17 -13
- parsl/executors/taskvine/manager_config.py +7 -2
- parsl/executors/threads.py +6 -6
- parsl/executors/workqueue/errors.py +1 -1
- parsl/executors/workqueue/exec_parsl_function.py +6 -5
- parsl/executors/workqueue/executor.py +64 -63
- parsl/executors/workqueue/parsl_coprocess.py +1 -1
- parsl/jobs/error_handlers.py +2 -2
- parsl/jobs/job_status_poller.py +28 -112
- parsl/jobs/states.py +7 -2
- parsl/jobs/strategy.py +43 -31
- parsl/launchers/__init__.py +12 -3
- parsl/launchers/errors.py +1 -1
- parsl/launchers/launchers.py +0 -6
- parsl/log_utils.py +1 -2
- parsl/monitoring/db_manager.py +55 -93
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +85 -311
- parsl/monitoring/queries/pandas.py +1 -2
- parsl/monitoring/radios/base.py +13 -0
- parsl/monitoring/radios/filesystem.py +52 -0
- parsl/monitoring/radios/htex.py +57 -0
- parsl/monitoring/radios/multiprocessing.py +17 -0
- parsl/monitoring/radios/udp.py +56 -0
- parsl/monitoring/radios/zmq.py +17 -0
- parsl/monitoring/remote.py +33 -37
- parsl/monitoring/router.py +212 -0
- parsl/monitoring/types.py +5 -6
- parsl/monitoring/visualization/app.py +4 -2
- parsl/monitoring/visualization/models.py +0 -1
- parsl/monitoring/visualization/plots/default/workflow_plots.py +8 -4
- parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
- parsl/monitoring/visualization/utils.py +0 -1
- parsl/monitoring/visualization/views.py +16 -9
- parsl/multiprocessing.py +0 -1
- parsl/process_loggers.py +1 -2
- parsl/providers/__init__.py +8 -17
- parsl/providers/aws/aws.py +2 -3
- parsl/providers/azure/azure.py +4 -5
- parsl/providers/base.py +2 -18
- parsl/providers/cluster_provider.py +3 -9
- parsl/providers/condor/condor.py +7 -17
- parsl/providers/errors.py +2 -2
- parsl/providers/googlecloud/googlecloud.py +2 -1
- parsl/providers/grid_engine/grid_engine.py +5 -14
- parsl/providers/kubernetes/kube.py +80 -40
- parsl/providers/local/local.py +13 -26
- parsl/providers/lsf/lsf.py +5 -23
- parsl/providers/pbspro/pbspro.py +5 -17
- parsl/providers/slurm/slurm.py +81 -39
- parsl/providers/torque/torque.py +3 -14
- parsl/serialize/__init__.py +8 -3
- parsl/serialize/base.py +1 -2
- parsl/serialize/concretes.py +5 -4
- parsl/serialize/facade.py +3 -3
- parsl/serialize/proxystore.py +3 -2
- parsl/tests/__init__.py +1 -1
- parsl/tests/configs/azure_single_node.py +4 -5
- parsl/tests/configs/bridges.py +3 -2
- parsl/tests/configs/cc_in2p3.py +1 -3
- parsl/tests/configs/comet.py +2 -1
- parsl/tests/configs/ec2_single_node.py +1 -2
- parsl/tests/configs/ec2_spot.py +1 -2
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/configs/frontera.py +2 -3
- parsl/tests/configs/htex_local.py +3 -5
- parsl/tests/configs/htex_local_alternate.py +11 -15
- parsl/tests/configs/htex_local_intask_staging.py +5 -9
- parsl/tests/configs/htex_local_rsync_staging.py +4 -8
- parsl/tests/configs/local_radical.py +1 -3
- parsl/tests/configs/local_radical_mpi.py +2 -2
- parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/configs/midway.py +2 -2
- parsl/tests/configs/nscc_singapore.py +3 -3
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +3 -2
- parsl/tests/configs/slurm_local.py +24 -0
- parsl/tests/configs/summit.py +1 -0
- parsl/tests/configs/taskvine_ex.py +4 -7
- parsl/tests/configs/user_opts.py +0 -7
- parsl/tests/configs/workqueue_ex.py +4 -6
- parsl/tests/conftest.py +27 -13
- parsl/tests/integration/test_stress/test_python_simple.py +3 -4
- parsl/tests/integration/test_stress/test_python_threads.py +3 -5
- parsl/tests/manual_tests/htex_local.py +4 -6
- parsl/tests/manual_tests/test_basic.py +1 -0
- parsl/tests/manual_tests/test_log_filter.py +3 -1
- parsl/tests/manual_tests/test_memory_limits.py +6 -8
- parsl/tests/manual_tests/test_regression_220.py +2 -1
- parsl/tests/manual_tests/test_udp_simple.py +4 -4
- parsl/tests/manual_tests/test_worker_count.py +3 -2
- parsl/tests/scaling_tests/htex_local.py +2 -4
- parsl/tests/scaling_tests/test_scale.py +0 -9
- parsl/tests/scaling_tests/vineex_condor.py +1 -2
- parsl/tests/scaling_tests/vineex_local.py +1 -2
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/site_tests/test_provider.py +4 -2
- parsl/tests/site_tests/test_site.py +2 -0
- parsl/tests/sites/test_affinity.py +7 -7
- parsl/tests/sites/test_dynamic_executor.py +3 -4
- parsl/tests/sites/test_ec2.py +3 -2
- parsl/tests/sites/test_worker_info.py +4 -5
- parsl/tests/test_aalst_patterns.py +0 -1
- parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
- parsl/tests/test_bash_apps/test_basic.py +10 -4
- parsl/tests/test_bash_apps/test_error_codes.py +5 -7
- parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
- parsl/tests/test_bash_apps/test_memoize.py +2 -8
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
- parsl/tests/test_bash_apps/test_multiline.py +1 -1
- parsl/tests/test_bash_apps/test_pipeline.py +1 -1
- parsl/tests/test_bash_apps/test_std_uri.py +123 -0
- parsl/tests/test_bash_apps/test_stdout.py +33 -8
- parsl/tests/test_callables.py +2 -2
- parsl/tests/test_checkpointing/test_periodic.py +21 -39
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_regression_239.py +1 -1
- parsl/tests/test_checkpointing/test_task_exit.py +2 -3
- parsl/tests/test_docs/test_from_slides.py +5 -2
- parsl/tests/test_docs/test_kwargs.py +4 -1
- parsl/tests/test_docs/test_tutorial_1.py +1 -2
- parsl/tests/test_docs/test_workflow1.py +2 -2
- parsl/tests/test_docs/test_workflow2.py +0 -1
- parsl/tests/test_error_handling/test_rand_fail.py +2 -2
- parsl/tests/test_error_handling/test_resource_spec.py +10 -12
- parsl/tests/test_error_handling/test_retries.py +6 -16
- parsl/tests/test_error_handling/test_retry_handler.py +1 -0
- parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
- parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
- parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
- parsl/tests/test_execute_task.py +29 -0
- parsl/tests/test_flux.py +1 -1
- parsl/tests/test_htex/test_basic.py +2 -3
- parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
- parsl/tests/test_htex/test_command_client_timeout.py +66 -0
- parsl/tests/test_htex/test_connected_blocks.py +3 -2
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
- parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_drain.py +11 -10
- parsl/tests/test_htex/test_htex.py +51 -25
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
- parsl/tests/test_htex/test_managers_command.py +36 -0
- parsl/tests/test_htex/test_missing_worker.py +2 -12
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
- parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
- parsl/tests/test_htex/test_zmq_binding.py +29 -8
- parsl/tests/test_monitoring/test_app_names.py +5 -5
- parsl/tests/test_monitoring/test_basic.py +73 -25
- parsl/tests/test_monitoring/test_db_locks.py +6 -4
- parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
- parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
- parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
- parsl/tests/test_monitoring/test_stdouterr.py +134 -0
- parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
- parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
- parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
- parsl/tests/test_providers/test_local_provider.py +3 -132
- parsl/tests/test_providers/test_pbspro_template.py +2 -3
- parsl/tests/test_providers/test_slurm_template.py +2 -3
- parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
- parsl/tests/test_python_apps/test_context_manager.py +128 -0
- parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_python_apps/test_fail.py +0 -25
- parsl/tests/test_python_apps/test_futures.py +2 -1
- parsl/tests/test_python_apps/test_inputs_default.py +22 -0
- parsl/tests/test_python_apps/test_join.py +0 -1
- parsl/tests/test_python_apps/test_lifted.py +11 -7
- parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
- parsl/tests/test_python_apps/test_outputs.py +1 -1
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_radical/test_mpi_funcs.py +1 -2
- parsl/tests/test_regression/test_1480.py +2 -1
- parsl/tests/test_regression/test_1653.py +2 -1
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_regression/test_2652.py +1 -0
- parsl/tests/test_regression/test_69a.py +0 -1
- parsl/tests/test_regression/test_854.py +4 -2
- parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
- parsl/tests/test_regression/test_98.py +0 -1
- parsl/tests/test_scaling/test_block_error_handler.py +9 -4
- parsl/tests/test_scaling/test_regression_1621.py +11 -15
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
- parsl/tests/test_scaling/test_scale_down.py +2 -5
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +5 -8
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
- parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
- parsl/tests/test_serialization/test_basic.py +2 -1
- parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
- parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
- parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
- parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
- parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
- parsl/tests/test_staging/staging_provider.py +2 -2
- parsl/tests/test_staging/test_1316.py +3 -4
- parsl/tests/test_staging/test_docs_1.py +2 -1
- parsl/tests/test_staging/test_docs_2.py +2 -1
- parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
- parsl/tests/{test_data → test_staging}/test_file.py +6 -6
- parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +5 -2
- parsl/tests/test_staging/test_staging_stdout.py +64 -0
- parsl/tests/test_staging/test_zip_in.py +39 -0
- parsl/tests/test_staging/test_zip_out.py +110 -0
- parsl/tests/test_staging/test_zip_to_zip.py +41 -0
- parsl/tests/test_summary.py +2 -2
- parsl/tests/test_thread_parallelism.py +0 -1
- parsl/tests/test_threads/test_configs.py +1 -2
- parsl/tests/test_threads/test_lazy_errors.py +2 -2
- parsl/tests/test_utils/test_execute_wait.py +35 -0
- parsl/tests/test_utils/test_sanitize_dns.py +76 -0
- parsl/tests/unit/test_address.py +20 -0
- parsl/tests/unit/test_file.py +99 -0
- parsl/tests/unit/test_usage_tracking.py +66 -0
- parsl/usage_tracking/api.py +65 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +104 -62
- parsl/utils.py +137 -4
- parsl/version.py +1 -1
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
- parsl-2025.1.13.data/scripts/interchange.py +649 -0
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +77 -75
- parsl-2025.1.13.dist-info/METADATA +96 -0
- parsl-2025.1.13.dist-info/RECORD +462 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
- parsl/channels/__init__.py +0 -7
- parsl/channels/base.py +0 -141
- parsl/channels/errors.py +0 -113
- parsl/channels/local/local.py +0 -164
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
- parsl/channels/ssh/ssh.py +0 -276
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -74
- parsl/configs/ad_hoc.py +0 -35
- parsl/executors/radical/rpex_master.py +0 -42
- parsl/monitoring/radios.py +0 -175
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -248
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/integration/test_channels/__init__.py +0 -0
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- parsl/tests/integration/test_channels/test_local_channel.py +0 -42
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl/tests/sites/test_local_adhoc.py +0 -61
- parsl/tests/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/test_large_output.py +0 -22
- parsl/tests/test_data/__init__.py +0 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
- parsl-2024.3.18.dist-info/METADATA +0 -98
- parsl-2024.3.18.dist-info/RECORD +0 -449
- parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
- parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
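Of the files listed above, two diffs are rendered in full below: parsl/executors/high_throughput/interchange.py and parsl/executors/high_throughput/manager_record.py. A recurring theme in the interchange diff is the replacement of a hand-built ZMQ "hub channel" with the new parsl/monitoring/radios/ package (note the removal of the old single-module parsl/monitoring/radios.py above). Judging only by how the diff uses MonitoringRadioSender and ZMQRadioSender, the sender interface amounts to a single send method; a minimal sketch follows, inferred from that usage rather than copied from parsl/monitoring/radios/base.py:

    # Sketch of the monitoring radio sender interface, inferred from its
    # usage in the interchange diff below; not a verbatim copy of
    # parsl/monitoring/radios/base.py.
    from abc import ABCMeta, abstractmethod
    from typing import Any


    class MonitoringRadioSender(metaclass=ABCMeta):
        @abstractmethod
        def send(self, message: Any) -> None:
            """Deliver one monitoring message, e.g. a (MessageType, dict) pair."""
            ...

The interchange constructs a concrete ZMQRadioSender only when both hub_address and hub_zmq_port are configured, and threads the sender (possibly None) through each processing method in place of a raw socket.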
parsl/executors/high_throughput/interchange.py

@@ -1,31 +1,31 @@
 #!/usr/bin/env python
-import multiprocessing
-import zmq
-import os
-import sys
-import platform
-import random
-import time
 import datetime
-import pickle
-import signal
+import json
 import logging
+import os
+import pickle
+import platform
 import queue
+import sys
 import threading
-import json
+import time
+from typing import Any, Dict, List, NoReturn, Optional, Sequence, Set, Tuple, cast
 
-from typing import cast, Any, Dict, NoReturn, Sequence, Set, Optional, Tuple
+import zmq
 
 from parsl import curvezmq
-from parsl.utils import setproctitle
-from parsl.version import VERSION as PARSL_VERSION
-from parsl.serialize import serialize as serialize_object
-
+from parsl.addresses import tcp_url
 from parsl.app.errors import RemoteExceptionWrapper
+from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch
 from parsl.executors.high_throughput.manager_record import ManagerRecord
+from parsl.executors.high_throughput.manager_selector import ManagerSelector
 from parsl.monitoring.message_type import MessageType
+from parsl.monitoring.radios.base import MonitoringRadioSender
+from parsl.monitoring.radios.zmq import ZMQRadioSender
 from parsl.process_loggers import wrap_with_logs
-
+from parsl.serialize import serialize as serialize_object
+from parsl.utils import setproctitle
+from parsl.version import VERSION as PARSL_VERSION
 
 PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)
 PKL_DRAINED_CODE = pickle.dumps((2 ** 32) - 2)
@@ -34,32 +34,6 @@ LOGGER_NAME = "interchange"
 logger = logging.getLogger(LOGGER_NAME)
 
 
-class ManagerLost(Exception):
-    ''' Task lost due to manager loss. Manager is considered lost when multiple heartbeats
-    have been missed.
-    '''
-    def __init__(self, manager_id: bytes, hostname: str) -> None:
-        self.manager_id = manager_id
-        self.tstamp = time.time()
-        self.hostname = hostname
-
-    def __str__(self) -> str:
-        return "Task failure due to loss of manager {} on host {}".format(self.manager_id.decode(), self.hostname)
-
-
-class VersionMismatch(Exception):
-    ''' Manager and Interchange versions do not match
-    '''
-    def __init__(self, interchange_version: str, manager_version: str):
-        self.interchange_version = interchange_version
-        self.manager_version = manager_version
-
-    def __str__(self) -> str:
-        return "Manager version info {} does not match interchange version info {}, causing a critical failure".format(
-            self.manager_version,
-            self.interchange_version)
-
-
 class Interchange:
     """ Interchange is a task orchestrator for distributed systems.
 
@@ -68,18 +42,21 @@
     3. Detect workers that have failed using heartbeats
     """
     def __init__(self,
-                 client_address: str = "127.0.0.1",
-                 interchange_address: Optional[str] = None,
-                 client_ports: Tuple[int, int, int] = (50055, 50056, 50057),
-                 worker_ports: Optional[Tuple[int, int]] = None,
-                 worker_port_range: Tuple[int, int] = (54000, 55000),
-                 hub_address: Optional[str] = None,
-                 hub_port: Optional[int] = None,
-                 heartbeat_threshold: int = 60,
-                 logdir: str = ".",
-                 logging_level: int = logging.INFO,
-                 poll_period: int = 10,
-                 cert_dir: Optional[str] = None,
+                 *,
+                 client_address: str,
+                 interchange_address: Optional[str],
+                 client_ports: Tuple[int, int, int],
+                 worker_ports: Optional[Tuple[int, int]],
+                 worker_port_range: Tuple[int, int],
+                 hub_address: Optional[str],
+                 hub_zmq_port: Optional[int],
+                 heartbeat_threshold: int,
+                 logdir: str,
+                 logging_level: int,
+                 poll_period: int,
+                 cert_dir: Optional[str],
+                 manager_selector: ManagerSelector,
+                 run_id: str,
                  ) -> None:
         """
         Parameters
@@ -91,45 +68,44 @@
             If specified the interchange will only listen on this address for connections from workers
             else, it binds to all addresses.
 
-        client_ports : triple(int, int, int)
+        client_ports : tuple(int, int, int)
            The ports at which the client can be reached
 
         worker_ports : tuple(int, int)
-            The specific two ports at which workers will connect to the Interchange. Default: None
+            The specific two ports at which workers will connect to the Interchange.
 
         worker_port_range : tuple(int, int)
            The interchange picks ports at random from the range which will be used by workers.
-            This is overridden when the worker_ports option is set. Default: (54000, 55000)
+            This is overridden when the worker_ports option is set.
 
         hub_address : str
            The IP address at which the interchange can send info about managers to when monitoring is enabled.
-            Default: None (meaning monitoring disabled)
+            When None, monitoring is disabled.
 
-        hub_port : str
+        hub_zmq_port : str
            The port at which the interchange can send info about managers to when monitoring is enabled.
-            Default: None (meaning monitoring disabled)
+            When None, monitoring is disabled.
 
         heartbeat_threshold : int
            Number of seconds since the last heartbeat after which worker is considered lost.
 
         logdir : str
-            Parsl log directory paths. Logs and temp files go here. Default: '.'
+            Parsl log directory paths. Logs and temp files go here.
 
         logging_level : int
-            Logging level as defined in the logging module. Default: logging.INFO
+            Logging level as defined in the logging module.
 
         poll_period : int
-            The main thread polling period, in milliseconds. Default: 10ms
+            The main thread polling period, in milliseconds.
 
         cert_dir : str | None
-            Path to the certificate directory. Default: None
+            Path to the certificate directory.
         """
         self.cert_dir = cert_dir
         self.logdir = logdir
         os.makedirs(self.logdir, exist_ok=True)
 
         start_file_logger("{}/interchange.log".format(self.logdir), level=logging_level)
-        logger.propagate = False
         logger.debug("Initializing Interchange process")
 
         self.client_address = client_address
@@ -141,17 +117,19 @@
         self.zmq_context = curvezmq.ServerContext(self.cert_dir)
         self.task_incoming = self.zmq_context.socket(zmq.DEALER)
         self.task_incoming.set_hwm(0)
-        self.task_incoming.connect("tcp://{}:{}".format(client_address, client_ports[0]))
+        self.task_incoming.connect(tcp_url(client_address, client_ports[0]))
         self.results_outgoing = self.zmq_context.socket(zmq.DEALER)
         self.results_outgoing.set_hwm(0)
-        self.results_outgoing.connect("tcp://{}:{}".format(client_address, client_ports[1]))
+        self.results_outgoing.connect(tcp_url(client_address, client_ports[1]))
 
         self.command_channel = self.zmq_context.socket(zmq.REP)
-        self.command_channel.connect("tcp://{}:{}".format(client_address, client_ports[2]))
+        self.command_channel.connect(tcp_url(client_address, client_ports[2]))
         logger.info("Connected to client")
 
+        self.run_id = run_id
+
         self.hub_address = hub_address
-        self.hub_port = hub_port
+        self.hub_zmq_port = hub_zmq_port
 
         self.pending_task_queue: queue.Queue[Any] = queue.Queue(maxsize=10 ** 6)
         self.count = 0
@@ -168,14 +146,14 @@
             self.worker_task_port = self.worker_ports[0]
             self.worker_result_port = self.worker_ports[1]
 
-            self.task_outgoing.bind("tcp://{}:{}".format(self.interchange_address, self.worker_task_port))
-            self.results_incoming.bind("tcp://{}:{}".format(self.interchange_address, self.worker_result_port))
+            self.task_outgoing.bind(tcp_url(self.interchange_address, self.worker_task_port))
+            self.results_incoming.bind(tcp_url(self.interchange_address, self.worker_result_port))
 
         else:
-            self.worker_task_port = self.task_outgoing.bind_to_random_port("tcp://{}".format(self.interchange_address),
+            self.worker_task_port = self.task_outgoing.bind_to_random_port(tcp_url(self.interchange_address),
                                                                            min_port=worker_port_range[0],
                                                                            max_port=worker_port_range[1], max_tries=100)
-            self.worker_result_port = self.results_incoming.bind_to_random_port("tcp://{}".format(self.interchange_address),
+            self.worker_result_port = self.results_incoming.bind_to_random_port(tcp_url(self.interchange_address),
                                                                                 min_port=worker_port_range[0],
                                                                                 max_port=worker_port_range[1], max_tries=100)
 
@@ -187,6 +165,8 @@
 
         self.heartbeat_threshold = heartbeat_threshold
 
+        self.manager_selector = manager_selector
+
         self.current_platform = {'parsl_v': PARSL_VERSION,
                                  'python_v': "{}.{}.{}".format(sys.version_info.major,
                                                                sys.version_info.minor,
@@ -243,27 +223,16 @@
             task_counter += 1
             logger.debug(f"Fetched {task_counter} tasks so far")
 
-    def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
-        if self.hub_address and self.hub_port:
-            logger.info("Connecting to MonitoringHub")
-            # This is a one-off because monitoring is unencrypted
-            hub_channel = zmq.Context().socket(zmq.DEALER)
-            hub_channel.set_hwm(0)
-            hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_port))
-            logger.info("Connected to MonitoringHub")
-            return hub_channel
-        else:
-            return None
-
-    def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
-        if hub_channel:
+    def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender], manager: ManagerRecord) -> None:
+        if monitoring_radio:
             logger.info("Sending message {} to MonitoringHub".format(manager))
 
             d: Dict = cast(Dict, manager.copy())
             d['timestamp'] = datetime.datetime.now()
             d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])
+            d['run_id'] = self.run_id
 
-            hub_channel.send_pyobj((MessageType.NODE_INFO, d))
+            monitoring_radio.send((MessageType.NODE_INFO, d))
 
     @wrap_with_logs(target="interchange")
     def _command_server(self) -> NoReturn:
@@ -271,8 +240,11 @@
         """
         logger.debug("Command Server Starting")
 
-        # Need to create a new ZMQ socket for command server thread
-        hub_channel = self._create_monitoring_channel()
+        if self.hub_address is not None and self.hub_zmq_port is not None:
+            logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
+            monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
+        else:
+            monitoring_radio = None
 
         reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)
 
@@ -280,13 +252,7 @@
             try:
                 command_req = self.command_channel.recv_pyobj()
                 logger.debug("Received command request: {}".format(command_req))
-                if command_req == "OUTSTANDING_C":
-                    outstanding = self.pending_task_queue.qsize()
-                    for manager in self._ready_managers.values():
-                        outstanding += len(manager['tasks'])
-                    reply = outstanding
-
-                elif command_req == "CONNECTED_BLOCKS":
+                if command_req == "CONNECTED_BLOCKS":
                     reply = self.connected_block_history
 
                 elif command_req == "WORKERS":
@@ -310,6 +276,8 @@
                                 'tasks': len(m['tasks']),
                                 'idle_duration': idle_duration,
                                 'active': m['active'],
+                                'parsl_version': m['parsl_version'],
+                                'python_version': m['python_version'],
                                 'draining': m['draining']}
                         reply.append(resp)
 
@@ -320,13 +288,17 @@
                     if manager_id in self._ready_managers:
                         m = self._ready_managers[manager_id]
                         m['active'] = False
-                        self._send_monitoring_info(hub_channel, m)
+                        self._send_monitoring_info(monitoring_radio, m)
                     else:
                         logger.warning("Worker to hold was not in ready managers list")
 
                     reply = None
 
+                elif command_req == "WORKER_PORTS":
+                    reply = (self.worker_task_port, self.worker_result_port)
+
                 else:
+                    logger.error(f"Received unknown command: {command_req}")
                     reply = None
 
                 logger.debug("Reply: {}".format(reply))
@@ -341,19 +313,14 @@
         """ Start the interchange
         """
 
-        # If a user workflow has set its own signal handler for sigterm, that
-        # handler will be inherited by the interchange process because it is
-        # launched as a multiprocessing fork process.
-        # That can interfere with the interchange shutdown mechanism, which is
-        # to receive a SIGTERM and exit immediately.
-        # See Parsl issue #2343 (Threads and multiprocessing cannot be
-        # intermingled without deadlocks) which talks about other fork-related
-        # parent-process-inheritance problems.
-        signal.signal(signal.SIGTERM, signal.SIG_DFL)
-
-        logger.info("Incoming ports bound")
+        logger.info("Starting main interchange method")
 
-        hub_channel = self._create_monitoring_channel()
+        if self.hub_address is not None and self.hub_zmq_port is not None:
+            logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
+            monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
+            logger.debug("Created monitoring radio")
+        else:
+            monitoring_radio = None
 
         poll_period = self.poll_period
 
@@ -384,21 +351,21 @@
         while not kill_event.is_set():
             self.socks = dict(poller.poll(timeout=poll_period))
 
-            self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event)
-            self.process_results_incoming(interesting_managers, hub_channel)
-            self.expire_bad_managers(interesting_managers, hub_channel)
-            self.expire_drained_managers(interesting_managers, hub_channel)
+            self.process_task_outgoing_incoming(interesting_managers, monitoring_radio, kill_event)
+            self.process_results_incoming(interesting_managers, monitoring_radio)
+            self.expire_bad_managers(interesting_managers, monitoring_radio)
+            self.expire_drained_managers(interesting_managers, monitoring_radio)
             self.process_tasks_to_send(interesting_managers)
 
         self.zmq_context.destroy()
         delta = time.time() - start
-        logger.info("Processed {} tasks in {} seconds".format(self.count, delta))
+        logger.info(f"Processed {self.count} tasks in {delta} seconds")
         logger.warning("Exiting")
 
     def process_task_outgoing_incoming(
             self,
             interesting_managers: Set[bytes],
-            hub_channel: Optional[zmq.Socket],
+            monitoring_radio: Optional[MonitoringRadioSender],
             kill_event: threading.Event
     ) -> None:
         """Process one message from manager on the task_outgoing channel.
@@ -413,9 +380,8 @@
             try:
                 msg = json.loads(message[1].decode('utf-8'))
             except Exception:
-                logger.warning("Got Exception reading message from manager: {!r}".format(
-                    manager_id), exc_info=True)
-                logger.debug("Message: \n{!r}\n".format(message[1]))
+                logger.warning(f"Got Exception reading message from manager: {manager_id!r}", exc_info=True)
+                logger.debug("Message:\n %r\n", message[1])
                 return
 
             # perform a bit of validation on the structure of the deserialized
@@ -423,7 +389,7 @@
             # in obviously malformed cases
             if not isinstance(msg, dict) or 'type' not in msg:
                 logger.error(f"JSON message was not correctly formatted from manager: {manager_id!r}")
-                logger.debug("Message: \n{!r}\n".format(message[1]))
+                logger.debug("Message:\n %r\n", message[1])
                 return
 
             if msg['type'] == 'registration':
@@ -431,15 +397,18 @@
                 self._ready_managers[manager_id] = {'last_heartbeat': time.time(),
                                                     'idle_since': time.time(),
                                                     'block_id': None,
+                                                    'start_time': msg['start_time'],
                                                     'max_capacity': 0,
                                                     'worker_count': 0,
                                                     'active': True,
                                                     'draining': False,
+                                                    'parsl_version': msg['parsl_v'],
+                                                    'python_version': msg['python_v'],
                                                     'tasks': []}
                 self.connected_block_history.append(msg['block_id'])
 
                 interesting_managers.add(manager_id)
-                logger.info("Adding manager: {!r} to ready queue".format(manager_id))
+                logger.info(f"Adding manager: {manager_id!r} to ready queue")
                 m = self._ready_managers[manager_id]
 
                 # m is a ManagerRecord, but msg is a dict[Any,Any] and so can
@@ -448,12 +417,12 @@
                 # later.
                 m.update(msg)  # type: ignore[typeddict-item]
 
-                logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
-                self._send_monitoring_info(hub_channel, m)
+                logger.info(f"Registration info for manager {manager_id!r}: {msg}")
+                self._send_monitoring_info(monitoring_radio, m)
 
                 if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
                     msg['parsl_v'] != self.current_platform['parsl_v']):
-                    logger.error("Manager {!r} has incompatible version info with the interchange".format(manager_id))
+                    logger.error(f"Manager {manager_id!r} has incompatible version info with the interchange")
                     logger.debug("Setting kill event")
                     kill_event.set()
                     e = VersionMismatch("py.v={} parsl.v={}".format(self.current_platform['python_v'].rsplit(".", 1)[0],
@@ -466,21 +435,24 @@
                     self.results_outgoing.send(pkl_package)
                     logger.error("Sent failure reports, shutting down interchange")
                 else:
-                    logger.info("Manager {!r} has compatible Parsl version {}".format(manager_id, msg['parsl_v']))
-                    logger.info("Manager {!r} has compatible Python version {}".format(manager_id,
-                                                                                       msg['python_v'].rsplit(".", 1)[0]))
+                    logger.info(f"Manager {manager_id!r} has compatible Parsl version {msg['parsl_v']}")
+                    logger.info(f"Manager {manager_id!r} has compatible Python version {msg['python_v'].rsplit('.', 1)[0]}")
             elif msg['type'] == 'heartbeat':
-                self._ready_managers[manager_id]['last_heartbeat'] = time.time()
-                logger.debug("Manager {!r} sent heartbeat via tasks connection".format(manager_id))
-                self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+                manager = self._ready_managers.get(manager_id)
+                if manager:
+                    manager['last_heartbeat'] = time.time()
+                    logger.debug("Manager %r sent heartbeat via tasks connection", manager_id)
+                    self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
+                else:
+                    logger.warning("Received heartbeat via tasks connection for not-registered manager %r", manager_id)
             elif msg['type'] == 'drain':
                 self._ready_managers[manager_id]['draining'] = True
-                logger.debug(f"Manager {manager_id!r} requested drain")
+                logger.debug("Manager %r requested drain", manager_id)
             else:
                 logger.error(f"Unexpected message type received from manager: {msg['type']}")
             logger.debug("leaving task_outgoing section")
 
-    def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+    def expire_drained_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
 
         for manager_id in list(interesting_managers):
             # is it always true that a draining manager will be in interesting managers?
@@ -493,18 +465,19 @@
                 self._ready_managers.pop(manager_id)
 
                 m['active'] = False
-                self._send_monitoring_info(hub_channel, m)
+                self._send_monitoring_info(monitoring_radio, m)
 
     def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
         # Check if there are tasks that could be sent to managers
 
-        logger.debug("Managers count (interesting/total): {interesting}/{total}".format(
-            total=len(self._ready_managers),
-            interesting=len(interesting_managers)))
+        logger.debug(
+            "Managers count (interesting/total): %d/%d",
+            len(interesting_managers),
+            len(self._ready_managers)
+        )
 
         if interesting_managers and not self.pending_task_queue.empty():
-            shuffled_managers = list(interesting_managers)
-            random.shuffle(shuffled_managers)
+            shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)
 
             while shuffled_managers and not self.pending_task_queue.empty():  # cf. the if statement above...
                 manager_id = shuffled_managers.pop()
@@ -512,7 +485,7 @@
                 tasks_inflight = len(m['tasks'])
                 real_capacity = m['max_capacity'] - tasks_inflight
 
-                if (real_capacity and m['active'] and not m['draining']):
+                if real_capacity and m["active"] and not m["draining"]:
                     tasks = self.get_tasks(real_capacity)
                     if tasks:
                         self.task_outgoing.send_multipart([manager_id, b'', pickle.dumps(tasks)])
@@ -521,31 +494,31 @@
                         tids = [t['task_id'] for t in tasks]
                         m['tasks'].extend(tids)
                        m['idle_since'] = None
-                        logger.debug("Sent tasks: {} to manager {!r}".format(tids, manager_id))
+                        logger.debug("Sent tasks: %s to manager %r", tids, manager_id)
                         # recompute real_capacity after sending tasks
                         real_capacity = m['max_capacity'] - tasks_inflight
                         if real_capacity > 0:
-                            logger.debug("Manager {!r} has free capacity {}".format(manager_id, real_capacity))
+                            logger.debug("Manager %r has free capacity %s", manager_id, real_capacity)
                             # ... so keep it in the interesting_managers list
                         else:
-                            logger.debug("Manager {!r} is now saturated".format(manager_id))
+                            logger.debug("Manager %r is now saturated", manager_id)
                             interesting_managers.remove(manager_id)
                 else:
                     interesting_managers.remove(manager_id)
                     # logger.debug("Nothing to send to manager {}".format(manager_id))
-            logger.debug("leaving _ready_managers section, with {} managers still interesting".format(len(interesting_managers)))
+            logger.debug("leaving _ready_managers section, with %s managers still interesting", len(interesting_managers))
         else:
             logger.debug("either no interesting managers or no tasks, so skipping manager pass")
 
-    def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+    def process_results_incoming(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
        # Receive any results and forward to client
        if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
            logger.debug("entering results_incoming section")
            manager_id, *all_messages = self.results_incoming.recv_multipart()
            if manager_id not in self._ready_managers:
-                logger.warning("Received a result from a un-registered manager: {!r}".format(manager_id))
+                logger.warning(f"Received a result from a un-registered manager: {manager_id!r}")
            else:
-                logger.debug("Got {} result items in batch".format(len(all_messages)))
+                logger.debug("Got %s result items in batch from manager %r", len(all_messages), manager_id)
 
                 b_messages = []
 
@@ -557,16 +530,15 @@
                     elif r['type'] == 'monitoring':
                         # the monitoring code makes the assumption that no
                         # monitoring messages will be received if monitoring
-                        # is not configured, and that hub_channel will only
+                        # is not configured, and that monitoring_radio will only
                         # be None when monitoring is not configurated.
-                        assert hub_channel is not None
+                        assert monitoring_radio is not None
 
-                        hub_channel.send_pyobj(r['payload'])
+                        monitoring_radio.send(r['payload'])
                     elif r['type'] == 'heartbeat':
-                        logger.debug("Manager {!r} sent heartbeat via results connection".format(manager_id))
-                        b_messages.append((p_message, r))
+                        logger.debug("Manager %r sent heartbeat via results connection", manager_id)
                     else:
-                        logger.error("Interchange discarding result_queue message of unknown type: {}".format(r['type']))
+                        logger.error("Interchange discarding result_queue message of unknown type: %s", r["type"])
 
                 got_result = False
                 m = self._ready_managers[manager_id]
@@ -575,14 +547,16 @@
                     if r['type'] == 'result':
                         got_result = True
                         try:
-                            logger.debug(f"Removing task {r['task_id']} from manager record {manager_id!r}")
+                            logger.debug("Removing task %s from manager record %r", r["task_id"], manager_id)
                             m['tasks'].remove(r['task_id'])
                         except Exception:
                             # If we reach here, there's something very wrong.
-                            logger.exception("Ignoring exception removing task_id {} for manager {!r} with task list {}".format(
+                            logger.exception(
+                                "Ignoring exception removing task_id %s for manager %r with task list %s",
                                 r['task_id'],
                                 manager_id,
-                                m['tasks']))
+                                m["tasks"]
+                            )
 
                 b_messages_to_send = []
                 for (b_message, _) in b_messages:
@@ -593,7 +567,7 @@
                     self.results_outgoing.send_multipart(b_messages_to_send)
                     logger.debug("Sent messages on results_outgoing")
 
-                logger.debug(f"Current tasks on manager {manager_id!r}: {m['tasks']}")
+                logger.debug("Current tasks on manager %r: %s", manager_id, m["tasks"])
                 if len(m['tasks']) == 0 and m['idle_since'] is None:
                     m['idle_since'] = time.time()
 
@@ -605,7 +579,7 @@
                 interesting_managers.add(manager_id)
         logger.debug("leaving results_incoming section")
 
-    def expire_bad_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
+    def expire_bad_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
         bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if
                         time.time() - m['last_heartbeat'] > self.heartbeat_threshold]
         for (manager_id, m) in bad_managers:
@@ -613,7 +587,7 @@
             logger.warning(f"Too many heartbeats missed for manager {manager_id!r} - removing manager")
             if m['active']:
                 m['active'] = False
-                self._send_monitoring_info(hub_channel, m)
+                self._send_monitoring_info(monitoring_radio, m)
 
             logger.warning(f"Cancelling htex tasks {m['tasks']} on removed manager")
             for tid in m['tasks']:
@@ -666,15 +640,10 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:
     logger.addHandler(handler)
 
 
-@wrap_with_logs(target="interchange")
-def starter(comm_q: multiprocessing.Queue, *args: Any, **kwargs: Any) -> None:
-    """Start the interchange process
-
-    The executor is expected to call this function. The args, kwargs match that of the Interchange.__init__
-    """
+if __name__ == "__main__":
     setproctitle("parsl: HTEX interchange")
-    ic = Interchange(*args, **kwargs)
-    comm_q.put((ic.worker_task_port,
-                ic.worker_result_port))
-
+
+    config = pickle.load(sys.stdin.buffer)
+
+    ic = Interchange(**config)
     ic.start()
parsl/executors/high_throughput/manager_record.py

@@ -1,10 +1,12 @@
 from datetime import datetime
 from typing import Any, List, Optional
+
 from typing_extensions import TypedDict
 
 
 class ManagerRecord(TypedDict, total=False):
     block_id: Optional[str]
+    start_time: float
     tasks: List[Any]
     worker_count: int
     max_capacity: int
@@ -14,3 +16,5 @@ class ManagerRecord(TypedDict, total=False):
     last_heartbeat: float
     idle_since: Optional[float]
     timestamp: datetime
+    parsl_version: str
+    python_version: str
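ManagerRecord is declared with total=False, so the three new fields are optional at the type level; the interchange diff above fills them in from each manager's registration message. A small illustration of that relationship (not code from the package; the msg keys follow the registration handling shown above):

    import time
    from typing import Any, Dict

    from parsl.executors.high_throughput.manager_record import ManagerRecord

    # Shape of the relevant registration fields as handled in the
    # interchange diff above; values are illustrative.
    msg: Dict[str, Any] = {
        "start_time": time.time(),
        "parsl_v": "2025.01.13",
        "python_v": "3.12.1",
    }

    record: ManagerRecord = {
        "block_id": None,
        "start_time": msg["start_time"],
        "parsl_version": msg["parsl_v"],
        "python_version": msg["python_v"],
        "active": True,
        "draining": False,
        "tasks": [],
    }

Because the TypedDict is non-total, this partial record type-checks, which matches how the interchange builds the record incrementally and only later exposes parsl_version and python_version through the WORKERS command reply.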
|