parsl 2024.3.18__py3-none-any.whl → 2025.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/__init__.py +9 -10
- parsl/addresses.py +26 -6
- parsl/app/app.py +7 -8
- parsl/app/bash.py +15 -8
- parsl/app/errors.py +10 -13
- parsl/app/futures.py +8 -10
- parsl/app/python.py +2 -1
- parsl/benchmark/perf.py +2 -1
- parsl/concurrent/__init__.py +2 -2
- parsl/config.py +53 -10
- parsl/configs/ASPIRE1.py +6 -5
- parsl/configs/Azure.py +9 -8
- parsl/configs/bridges.py +6 -4
- parsl/configs/cc_in2p3.py +3 -3
- parsl/configs/ec2.py +3 -1
- parsl/configs/expanse.py +4 -3
- parsl/configs/frontera.py +3 -4
- parsl/configs/htex_local.py +3 -4
- parsl/configs/illinoiscluster.py +3 -1
- parsl/configs/improv.py +34 -0
- parsl/configs/kubernetes.py +4 -3
- parsl/configs/local_threads.py +5 -1
- parsl/configs/midway.py +5 -3
- parsl/configs/osg.py +4 -2
- parsl/configs/polaris.py +4 -2
- parsl/configs/stampede2.py +6 -5
- parsl/configs/summit.py +3 -3
- parsl/configs/toss3_llnl.py +4 -3
- parsl/configs/vineex_local.py +6 -4
- parsl/configs/wqex_local.py +5 -3
- parsl/curvezmq.py +4 -0
- parsl/data_provider/data_manager.py +4 -3
- parsl/data_provider/file_noop.py +1 -2
- parsl/data_provider/files.py +3 -3
- parsl/data_provider/ftp.py +1 -3
- parsl/data_provider/globus.py +7 -6
- parsl/data_provider/http.py +2 -2
- parsl/data_provider/rsync.py +1 -1
- parsl/data_provider/staging.py +2 -2
- parsl/data_provider/zip.py +135 -0
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +259 -223
- parsl/dataflow/errors.py +3 -5
- parsl/dataflow/futures.py +27 -14
- parsl/dataflow/memoization.py +5 -5
- parsl/dataflow/rundirs.py +5 -6
- parsl/dataflow/taskrecord.py +4 -5
- parsl/executors/__init__.py +4 -2
- parsl/executors/base.py +45 -15
- parsl/executors/errors.py +13 -0
- parsl/executors/execute_task.py +37 -0
- parsl/executors/flux/execute_parsl_task.py +3 -3
- parsl/executors/flux/executor.py +18 -19
- parsl/executors/flux/flux_instance_manager.py +26 -27
- parsl/executors/high_throughput/errors.py +43 -3
- parsl/executors/high_throughput/executor.py +307 -285
- parsl/executors/high_throughput/interchange.py +137 -168
- parsl/executors/high_throughput/manager_record.py +4 -0
- parsl/executors/high_throughput/manager_selector.py +55 -0
- parsl/executors/high_throughput/monitoring_info.py +2 -1
- parsl/executors/high_throughput/mpi_executor.py +113 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
- parsl/executors/high_throughput/mpi_resource_management.py +6 -17
- parsl/executors/high_throughput/probe.py +9 -7
- parsl/executors/high_throughput/process_worker_pool.py +77 -75
- parsl/executors/high_throughput/zmq_pipes.py +81 -23
- parsl/executors/radical/executor.py +130 -79
- parsl/executors/radical/rpex_resources.py +17 -15
- parsl/executors/radical/rpex_worker.py +4 -3
- parsl/executors/status_handling.py +157 -51
- parsl/executors/taskvine/__init__.py +1 -1
- parsl/executors/taskvine/errors.py +1 -1
- parsl/executors/taskvine/exec_parsl_function.py +2 -2
- parsl/executors/taskvine/executor.py +38 -55
- parsl/executors/taskvine/factory.py +1 -1
- parsl/executors/taskvine/factory_config.py +1 -1
- parsl/executors/taskvine/manager.py +17 -13
- parsl/executors/taskvine/manager_config.py +7 -2
- parsl/executors/threads.py +6 -6
- parsl/executors/workqueue/errors.py +1 -1
- parsl/executors/workqueue/exec_parsl_function.py +6 -5
- parsl/executors/workqueue/executor.py +64 -63
- parsl/executors/workqueue/parsl_coprocess.py +1 -1
- parsl/jobs/error_handlers.py +2 -2
- parsl/jobs/job_status_poller.py +28 -112
- parsl/jobs/states.py +7 -2
- parsl/jobs/strategy.py +43 -31
- parsl/launchers/__init__.py +12 -3
- parsl/launchers/errors.py +1 -1
- parsl/launchers/launchers.py +0 -6
- parsl/log_utils.py +1 -2
- parsl/monitoring/db_manager.py +55 -93
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +85 -311
- parsl/monitoring/queries/pandas.py +1 -2
- parsl/monitoring/radios/base.py +13 -0
- parsl/monitoring/radios/filesystem.py +52 -0
- parsl/monitoring/radios/htex.py +57 -0
- parsl/monitoring/radios/multiprocessing.py +17 -0
- parsl/monitoring/radios/udp.py +56 -0
- parsl/monitoring/radios/zmq.py +17 -0
- parsl/monitoring/remote.py +33 -37
- parsl/monitoring/router.py +212 -0
- parsl/monitoring/types.py +5 -6
- parsl/monitoring/visualization/app.py +4 -2
- parsl/monitoring/visualization/models.py +0 -1
- parsl/monitoring/visualization/plots/default/workflow_plots.py +8 -4
- parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
- parsl/monitoring/visualization/utils.py +0 -1
- parsl/monitoring/visualization/views.py +16 -9
- parsl/multiprocessing.py +0 -1
- parsl/process_loggers.py +1 -2
- parsl/providers/__init__.py +8 -17
- parsl/providers/aws/aws.py +2 -3
- parsl/providers/azure/azure.py +4 -5
- parsl/providers/base.py +2 -18
- parsl/providers/cluster_provider.py +3 -9
- parsl/providers/condor/condor.py +7 -17
- parsl/providers/errors.py +2 -2
- parsl/providers/googlecloud/googlecloud.py +2 -1
- parsl/providers/grid_engine/grid_engine.py +5 -14
- parsl/providers/kubernetes/kube.py +80 -40
- parsl/providers/local/local.py +13 -26
- parsl/providers/lsf/lsf.py +5 -23
- parsl/providers/pbspro/pbspro.py +5 -17
- parsl/providers/slurm/slurm.py +81 -39
- parsl/providers/torque/torque.py +3 -14
- parsl/serialize/__init__.py +8 -3
- parsl/serialize/base.py +1 -2
- parsl/serialize/concretes.py +5 -4
- parsl/serialize/facade.py +3 -3
- parsl/serialize/proxystore.py +3 -2
- parsl/tests/__init__.py +1 -1
- parsl/tests/configs/azure_single_node.py +4 -5
- parsl/tests/configs/bridges.py +3 -2
- parsl/tests/configs/cc_in2p3.py +1 -3
- parsl/tests/configs/comet.py +2 -1
- parsl/tests/configs/ec2_single_node.py +1 -2
- parsl/tests/configs/ec2_spot.py +1 -2
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/configs/frontera.py +2 -3
- parsl/tests/configs/htex_local.py +3 -5
- parsl/tests/configs/htex_local_alternate.py +11 -15
- parsl/tests/configs/htex_local_intask_staging.py +5 -9
- parsl/tests/configs/htex_local_rsync_staging.py +4 -8
- parsl/tests/configs/local_radical.py +1 -3
- parsl/tests/configs/local_radical_mpi.py +2 -2
- parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/configs/midway.py +2 -2
- parsl/tests/configs/nscc_singapore.py +3 -3
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +3 -2
- parsl/tests/configs/slurm_local.py +24 -0
- parsl/tests/configs/summit.py +1 -0
- parsl/tests/configs/taskvine_ex.py +4 -7
- parsl/tests/configs/user_opts.py +0 -7
- parsl/tests/configs/workqueue_ex.py +4 -6
- parsl/tests/conftest.py +27 -13
- parsl/tests/integration/test_stress/test_python_simple.py +3 -4
- parsl/tests/integration/test_stress/test_python_threads.py +3 -5
- parsl/tests/manual_tests/htex_local.py +4 -6
- parsl/tests/manual_tests/test_basic.py +1 -0
- parsl/tests/manual_tests/test_log_filter.py +3 -1
- parsl/tests/manual_tests/test_memory_limits.py +6 -8
- parsl/tests/manual_tests/test_regression_220.py +2 -1
- parsl/tests/manual_tests/test_udp_simple.py +4 -4
- parsl/tests/manual_tests/test_worker_count.py +3 -2
- parsl/tests/scaling_tests/htex_local.py +2 -4
- parsl/tests/scaling_tests/test_scale.py +0 -9
- parsl/tests/scaling_tests/vineex_condor.py +1 -2
- parsl/tests/scaling_tests/vineex_local.py +1 -2
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/site_tests/test_provider.py +4 -2
- parsl/tests/site_tests/test_site.py +2 -0
- parsl/tests/sites/test_affinity.py +7 -7
- parsl/tests/sites/test_dynamic_executor.py +3 -4
- parsl/tests/sites/test_ec2.py +3 -2
- parsl/tests/sites/test_worker_info.py +4 -5
- parsl/tests/test_aalst_patterns.py +0 -1
- parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
- parsl/tests/test_bash_apps/test_basic.py +10 -4
- parsl/tests/test_bash_apps/test_error_codes.py +5 -7
- parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
- parsl/tests/test_bash_apps/test_memoize.py +2 -8
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
- parsl/tests/test_bash_apps/test_multiline.py +1 -1
- parsl/tests/test_bash_apps/test_pipeline.py +1 -1
- parsl/tests/test_bash_apps/test_std_uri.py +123 -0
- parsl/tests/test_bash_apps/test_stdout.py +33 -8
- parsl/tests/test_callables.py +2 -2
- parsl/tests/test_checkpointing/test_periodic.py +21 -39
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_regression_239.py +1 -1
- parsl/tests/test_checkpointing/test_task_exit.py +2 -3
- parsl/tests/test_docs/test_from_slides.py +5 -2
- parsl/tests/test_docs/test_kwargs.py +4 -1
- parsl/tests/test_docs/test_tutorial_1.py +1 -2
- parsl/tests/test_docs/test_workflow1.py +2 -2
- parsl/tests/test_docs/test_workflow2.py +0 -1
- parsl/tests/test_error_handling/test_rand_fail.py +2 -2
- parsl/tests/test_error_handling/test_resource_spec.py +10 -12
- parsl/tests/test_error_handling/test_retries.py +6 -16
- parsl/tests/test_error_handling/test_retry_handler.py +1 -0
- parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
- parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
- parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
- parsl/tests/test_execute_task.py +29 -0
- parsl/tests/test_flux.py +1 -1
- parsl/tests/test_htex/test_basic.py +2 -3
- parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
- parsl/tests/test_htex/test_command_client_timeout.py +66 -0
- parsl/tests/test_htex/test_connected_blocks.py +3 -2
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
- parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_drain.py +11 -10
- parsl/tests/test_htex/test_htex.py +51 -25
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
- parsl/tests/test_htex/test_managers_command.py +36 -0
- parsl/tests/test_htex/test_missing_worker.py +2 -12
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
- parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
- parsl/tests/test_htex/test_zmq_binding.py +29 -8
- parsl/tests/test_monitoring/test_app_names.py +5 -5
- parsl/tests/test_monitoring/test_basic.py +73 -25
- parsl/tests/test_monitoring/test_db_locks.py +6 -4
- parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
- parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
- parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
- parsl/tests/test_monitoring/test_stdouterr.py +134 -0
- parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
- parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
- parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
- parsl/tests/test_providers/test_local_provider.py +3 -132
- parsl/tests/test_providers/test_pbspro_template.py +2 -3
- parsl/tests/test_providers/test_slurm_template.py +2 -3
- parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
- parsl/tests/test_python_apps/test_context_manager.py +128 -0
- parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_python_apps/test_fail.py +0 -25
- parsl/tests/test_python_apps/test_futures.py +2 -1
- parsl/tests/test_python_apps/test_inputs_default.py +22 -0
- parsl/tests/test_python_apps/test_join.py +0 -1
- parsl/tests/test_python_apps/test_lifted.py +11 -7
- parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
- parsl/tests/test_python_apps/test_outputs.py +1 -1
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_radical/test_mpi_funcs.py +1 -2
- parsl/tests/test_regression/test_1480.py +2 -1
- parsl/tests/test_regression/test_1653.py +2 -1
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_regression/test_2652.py +1 -0
- parsl/tests/test_regression/test_69a.py +0 -1
- parsl/tests/test_regression/test_854.py +4 -2
- parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
- parsl/tests/test_regression/test_98.py +0 -1
- parsl/tests/test_scaling/test_block_error_handler.py +9 -4
- parsl/tests/test_scaling/test_regression_1621.py +11 -15
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
- parsl/tests/test_scaling/test_scale_down.py +2 -5
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +5 -8
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
- parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
- parsl/tests/test_serialization/test_basic.py +2 -1
- parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
- parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
- parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
- parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
- parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
- parsl/tests/test_staging/staging_provider.py +2 -2
- parsl/tests/test_staging/test_1316.py +3 -4
- parsl/tests/test_staging/test_docs_1.py +2 -1
- parsl/tests/test_staging/test_docs_2.py +2 -1
- parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
- parsl/tests/{test_data → test_staging}/test_file.py +6 -6
- parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +5 -2
- parsl/tests/test_staging/test_staging_stdout.py +64 -0
- parsl/tests/test_staging/test_zip_in.py +39 -0
- parsl/tests/test_staging/test_zip_out.py +110 -0
- parsl/tests/test_staging/test_zip_to_zip.py +41 -0
- parsl/tests/test_summary.py +2 -2
- parsl/tests/test_thread_parallelism.py +0 -1
- parsl/tests/test_threads/test_configs.py +1 -2
- parsl/tests/test_threads/test_lazy_errors.py +2 -2
- parsl/tests/test_utils/test_execute_wait.py +35 -0
- parsl/tests/test_utils/test_sanitize_dns.py +76 -0
- parsl/tests/unit/test_address.py +20 -0
- parsl/tests/unit/test_file.py +99 -0
- parsl/tests/unit/test_usage_tracking.py +66 -0
- parsl/usage_tracking/api.py +65 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +104 -62
- parsl/utils.py +137 -4
- parsl/version.py +1 -1
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
- parsl-2025.1.13.data/scripts/interchange.py +649 -0
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +77 -75
- parsl-2025.1.13.dist-info/METADATA +96 -0
- parsl-2025.1.13.dist-info/RECORD +462 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
- parsl/channels/__init__.py +0 -7
- parsl/channels/base.py +0 -141
- parsl/channels/errors.py +0 -113
- parsl/channels/local/local.py +0 -164
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
- parsl/channels/ssh/ssh.py +0 -276
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -74
- parsl/configs/ad_hoc.py +0 -35
- parsl/executors/radical/rpex_master.py +0 -42
- parsl/monitoring/radios.py +0 -175
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -248
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/integration/test_channels/__init__.py +0 -0
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- parsl/tests/integration/test_channels/test_local_channel.py +0 -42
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl/tests/sites/test_local_adhoc.py +0 -61
- parsl/tests/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/test_large_output.py +0 -22
- parsl/tests/test_data/__init__.py +0 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
- parsl-2024.3.18.dist-info/METADATA +0 -98
- parsl-2024.3.18.dist-info/RECORD +0 -449
- parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
- parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
parsl/monitoring/monitoring.py
CHANGED
@@ -1,30 +1,25 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import logging
|
4
|
+
import multiprocessing.synchronize as ms
|
3
5
|
import os
|
4
|
-
import socket
|
5
|
-
import time
|
6
6
|
import pickle
|
7
|
-
import logging
|
8
|
-
import typeguard
|
9
|
-
import zmq
|
10
|
-
|
11
7
|
import queue
|
8
|
+
import time
|
9
|
+
from multiprocessing import Event
|
10
|
+
from multiprocessing.queues import Queue
|
11
|
+
from typing import TYPE_CHECKING, Literal, Optional, Tuple, Union, cast
|
12
12
|
|
13
|
-
import
|
13
|
+
import typeguard
|
14
14
|
|
15
|
-
from parsl.multiprocessing import ForkProcess, SizedQueue
|
16
|
-
from multiprocessing import Process
|
17
|
-
from multiprocessing.queues import Queue
|
18
15
|
from parsl.log_utils import set_file_logger
|
19
|
-
from parsl.
|
16
|
+
from parsl.monitoring.errors import MonitoringHubStartError
|
17
|
+
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
18
|
+
from parsl.monitoring.router import router_starter
|
19
|
+
from parsl.monitoring.types import TaggedMonitoringMessage
|
20
|
+
from parsl.multiprocessing import ForkProcess, SizedQueue
|
20
21
|
from parsl.process_loggers import wrap_with_logs
|
21
|
-
from parsl.utils import setproctitle
|
22
|
-
|
23
|
-
from parsl.serialize import deserialize
|
24
|
-
|
25
|
-
from parsl.monitoring.message_type import MessageType
|
26
|
-
from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
|
27
|
-
from typing import cast, Any, Callable, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING
|
22
|
+
from parsl.utils import RepresentationMixin, setproctitle
|
28
23
|
|
29
24
|
_db_manager_excepts: Optional[Exception]
|
30
25
|
|
@@ -49,7 +44,6 @@ class MonitoringHub(RepresentationMixin):
|
|
49
44
|
workflow_name: Optional[str] = None,
|
50
45
|
workflow_version: Optional[str] = None,
|
51
46
|
logging_endpoint: Optional[str] = None,
|
52
|
-
logdir: Optional[str] = None,
|
53
47
|
monitoring_debug: bool = False,
|
54
48
|
resource_monitoring_enabled: bool = True,
|
55
49
|
resource_monitoring_interval: float = 30): # in seconds
|
@@ -78,8 +72,6 @@ class MonitoringHub(RepresentationMixin):
|
|
78
72
|
The database connection url for monitoring to log the information.
|
79
73
|
These URLs follow RFC-1738, and can include username, password, hostname, database name.
|
80
74
|
Default: sqlite, in the configured run_dir.
|
81
|
-
logdir : str
|
82
|
-
Parsl log directory paths. Logs and temp files go here. Default: '.'
|
83
75
|
monitoring_debug : Bool
|
84
76
|
Enable monitoring debug logging. Default: False
|
85
77
|
resource_monitoring_enabled : boolean
|
@@ -93,14 +85,6 @@ class MonitoringHub(RepresentationMixin):
|
|
93
85
|
Default: 30 seconds
|
94
86
|
"""
|
95
87
|
|
96
|
-
self.logger = logger
|
97
|
-
|
98
|
-
# Any is used to disable typechecking on uses of _dfk_channel,
|
99
|
-
# because it is used in the code as if it points to a channel, but
|
100
|
-
# the static type is that it can also be None. The code relies on
|
101
|
-
# .start() being called and initialising this to a real channel.
|
102
|
-
self._dfk_channel = None # type: Any
|
103
|
-
|
104
88
|
if _db_manager_excepts:
|
105
89
|
raise _db_manager_excepts
|
106
90
|
|
@@ -109,7 +93,6 @@ class MonitoringHub(RepresentationMixin):
|
|
109
93
|
self.hub_port_range = hub_port_range
|
110
94
|
|
111
95
|
self.logging_endpoint = logging_endpoint
|
112
|
-
self.logdir = logdir
|
113
96
|
self.monitoring_debug = monitoring_debug
|
114
97
|
|
115
98
|
self.workflow_name = workflow_name
|
@@ -118,19 +101,15 @@ class MonitoringHub(RepresentationMixin):
|
|
118
101
|
self.resource_monitoring_enabled = resource_monitoring_enabled
|
119
102
|
self.resource_monitoring_interval = resource_monitoring_interval
|
120
103
|
|
121
|
-
def start(self,
|
104
|
+
def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:
|
122
105
|
|
123
|
-
|
124
|
-
self.logdir = "."
|
106
|
+
logger.debug("Starting MonitoringHub")
|
125
107
|
|
126
108
|
if self.logging_endpoint is None:
|
127
109
|
self.logging_endpoint = f"sqlite:///{os.fspath(config_run_dir)}/monitoring.db"
|
128
110
|
|
129
|
-
os.makedirs(
|
111
|
+
os.makedirs(dfk_run_dir, exist_ok=True)
|
130
112
|
|
131
|
-
# Initialize the ZMQ pipe to the Parsl Client
|
132
|
-
|
133
|
-
self.logger.debug("Initializing ZMQ Pipes to client")
|
134
113
|
self.monitoring_hub_active = True
|
135
114
|
|
136
115
|
# This annotation is incompatible with typeguard 4.x instrumentation
|
@@ -151,26 +130,22 @@ class MonitoringHub(RepresentationMixin):
|
|
151
130
|
self.exception_q: Queue[Tuple[str, str]]
|
152
131
|
self.exception_q = SizedQueue(maxsize=10)
|
153
132
|
|
154
|
-
self.
|
155
|
-
self.priority_msgs = SizedQueue()
|
156
|
-
|
157
|
-
self.resource_msgs: Queue[AddressedMonitoringMessage]
|
133
|
+
self.resource_msgs: Queue[Union[TaggedMonitoringMessage, Literal["STOP"]]]
|
158
134
|
self.resource_msgs = SizedQueue()
|
159
135
|
|
160
|
-
self.
|
161
|
-
self.
|
162
|
-
|
163
|
-
self.block_msgs: Queue[AddressedMonitoringMessage]
|
164
|
-
self.block_msgs = SizedQueue()
|
136
|
+
self.router_exit_event: ms.Event
|
137
|
+
self.router_exit_event = Event()
|
165
138
|
|
166
139
|
self.router_proc = ForkProcess(target=router_starter,
|
167
|
-
|
168
|
-
|
169
|
-
"
|
170
|
-
"
|
171
|
-
"
|
140
|
+
kwargs={"comm_q": comm_q,
|
141
|
+
"exception_q": self.exception_q,
|
142
|
+
"resource_msgs": self.resource_msgs,
|
143
|
+
"exit_event": self.router_exit_event,
|
144
|
+
"hub_address": self.hub_address,
|
145
|
+
"udp_port": self.hub_port,
|
146
|
+
"zmq_port_range": self.hub_port_range,
|
147
|
+
"run_dir": dfk_run_dir,
|
172
148
|
"logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
|
173
|
-
"run_id": run_id
|
174
149
|
},
|
175
150
|
name="Monitoring-Router-Process",
|
176
151
|
daemon=True,
|
@@ -178,8 +153,8 @@ class MonitoringHub(RepresentationMixin):
|
|
178
153
|
self.router_proc.start()
|
179
154
|
|
180
155
|
self.dbm_proc = ForkProcess(target=dbm_starter,
|
181
|
-
args=(self.exception_q, self.
|
182
|
-
kwargs={"
|
156
|
+
args=(self.exception_q, self.resource_msgs,),
|
157
|
+
kwargs={"run_dir": dfk_run_dir,
|
183
158
|
"logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
|
184
159
|
"db_url": self.logging_endpoint,
|
185
160
|
},
|
@@ -187,113 +162,97 @@ class MonitoringHub(RepresentationMixin):
|
|
187
162
|
daemon=True,
|
188
163
|
)
|
189
164
|
self.dbm_proc.start()
|
190
|
-
|
165
|
+
logger.info("Started the router process %s and DBM process %s", self.router_proc.pid, self.dbm_proc.pid)
|
191
166
|
|
192
|
-
self.filesystem_proc =
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
167
|
+
self.filesystem_proc = ForkProcess(target=filesystem_receiver,
|
168
|
+
args=(self.resource_msgs, dfk_run_dir),
|
169
|
+
name="Monitoring-Filesystem-Process",
|
170
|
+
daemon=True
|
171
|
+
)
|
197
172
|
self.filesystem_proc.start()
|
198
|
-
|
173
|
+
logger.info("Started filesystem radio receiver process %s", self.filesystem_proc.pid)
|
174
|
+
|
175
|
+
self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)
|
199
176
|
|
200
177
|
try:
|
201
178
|
comm_q_result = comm_q.get(block=True, timeout=120)
|
179
|
+
comm_q.close()
|
180
|
+
comm_q.join_thread()
|
202
181
|
except queue.Empty:
|
203
|
-
|
204
|
-
raise
|
182
|
+
logger.error("Hub has not completed initialization in 120s. Aborting")
|
183
|
+
raise MonitoringHubStartError()
|
205
184
|
|
206
185
|
if isinstance(comm_q_result, str):
|
207
|
-
|
186
|
+
logger.error("MonitoringRouter sent an error message: %s", comm_q_result)
|
208
187
|
raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")
|
209
188
|
|
210
|
-
udp_port,
|
189
|
+
udp_port, zmq_port = comm_q_result
|
211
190
|
|
212
191
|
self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)
|
213
192
|
|
214
|
-
|
215
|
-
self.dfk_channel_timeout = 10000 # in milliseconds
|
216
|
-
self._dfk_channel = context.socket(zmq.DEALER)
|
217
|
-
self._dfk_channel.setsockopt(zmq.LINGER, 0)
|
218
|
-
self._dfk_channel.set_hwm(0)
|
219
|
-
self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
|
220
|
-
self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, ic_port))
|
193
|
+
logger.info("Monitoring Hub initialized")
|
221
194
|
|
222
|
-
self.
|
195
|
+
self.hub_zmq_port = zmq_port
|
223
196
|
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
def send(self, mtype: MessageType, message: Any) -> None:
|
228
|
-
self.logger.debug("Sending message type {}".format(mtype))
|
229
|
-
try:
|
230
|
-
self._dfk_channel.send_pyobj((mtype, message))
|
231
|
-
except zmq.Again:
|
232
|
-
self.logger.exception(
|
233
|
-
"The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))
|
197
|
+
def send(self, message: TaggedMonitoringMessage) -> None:
|
198
|
+
logger.debug("Sending message type %s", message[0])
|
199
|
+
self.radio.send(message)
|
234
200
|
|
235
201
|
def close(self) -> None:
|
236
|
-
|
202
|
+
logger.info("Terminating Monitoring Hub")
|
237
203
|
exception_msgs = []
|
238
204
|
while True:
|
239
205
|
try:
|
240
206
|
exception_msgs.append(self.exception_q.get(block=False))
|
241
|
-
|
207
|
+
logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
|
242
208
|
except queue.Empty:
|
243
209
|
break
|
244
|
-
if self.
|
210
|
+
if self.monitoring_hub_active:
|
245
211
|
self.monitoring_hub_active = False
|
246
|
-
self._dfk_channel.close()
|
247
212
|
if exception_msgs:
|
248
213
|
for exception_msg in exception_msgs:
|
249
|
-
|
250
|
-
"
|
251
|
-
|
252
|
-
|
253
|
-
)
|
214
|
+
logger.error(
|
215
|
+
"%s process delivered an exception: %s. Terminating all monitoring processes immediately.",
|
216
|
+
exception_msg[0],
|
217
|
+
exception_msg[1]
|
254
218
|
)
|
255
219
|
self.router_proc.terminate()
|
256
220
|
self.dbm_proc.terminate()
|
257
221
|
self.filesystem_proc.terminate()
|
258
|
-
|
222
|
+
logger.info("Setting router termination event")
|
223
|
+
self.router_exit_event.set()
|
224
|
+
logger.info("Waiting for router to terminate")
|
259
225
|
self.router_proc.join()
|
260
|
-
self.
|
226
|
+
self.router_proc.close()
|
227
|
+
logger.debug("Finished waiting for router termination")
|
261
228
|
if len(exception_msgs) == 0:
|
262
|
-
|
263
|
-
self.
|
229
|
+
logger.debug("Sending STOP to DBM")
|
230
|
+
self.resource_msgs.put("STOP")
|
264
231
|
else:
|
265
|
-
|
266
|
-
|
232
|
+
logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
|
233
|
+
logger.debug("Waiting for DB termination")
|
267
234
|
self.dbm_proc.join()
|
268
|
-
self.
|
235
|
+
self.dbm_proc.close()
|
236
|
+
logger.debug("Finished waiting for DBM termination")
|
269
237
|
|
270
238
|
# should this be message based? it probably doesn't need to be if
|
271
239
|
# we believe we've received all messages
|
272
|
-
|
240
|
+
logger.info("Terminating filesystem radio receiver process")
|
273
241
|
self.filesystem_proc.terminate()
|
274
242
|
self.filesystem_proc.join()
|
243
|
+
self.filesystem_proc.close()
|
275
244
|
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
monitoring_hub_url: str,
|
283
|
-
run_id: str,
|
284
|
-
logging_level: int,
|
285
|
-
sleep_dur: float,
|
286
|
-
radio_mode: str,
|
287
|
-
monitor_resources: bool,
|
288
|
-
run_dir: str) -> Tuple[Callable, Sequence, Dict]:
|
289
|
-
return parsl.monitoring.remote.monitor_wrapper(f, args, kwargs, try_id, task_id, monitoring_hub_url,
|
290
|
-
run_id, logging_level, sleep_dur, radio_mode,
|
291
|
-
monitor_resources, run_dir)
|
245
|
+
logger.info("Closing monitoring multiprocessing queues")
|
246
|
+
self.exception_q.close()
|
247
|
+
self.exception_q.join_thread()
|
248
|
+
self.resource_msgs.close()
|
249
|
+
self.resource_msgs.join_thread()
|
250
|
+
logger.info("Closed monitoring multiprocessing queues")
|
292
251
|
|
293
252
|
|
294
253
|
@wrap_with_logs
|
295
|
-
def filesystem_receiver(
|
296
|
-
logger = set_file_logger("{}/monitoring_filesystem_radio.log"
|
254
|
+
def filesystem_receiver(q: Queue[TaggedMonitoringMessage], run_dir: str) -> None:
|
255
|
+
logger = set_file_logger(f"{run_dir}/monitoring_filesystem_radio.log",
|
297
256
|
name="monitoring_filesystem_radio",
|
298
257
|
level=logging.INFO)
|
299
258
|
|
@@ -302,7 +261,9 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
|
|
302
261
|
base_path = f"{run_dir}/monitor-fs-radio/"
|
303
262
|
tmp_dir = f"{base_path}/tmp/"
|
304
263
|
new_dir = f"{base_path}/new/"
|
305
|
-
logger.debug(
|
264
|
+
logger.debug("Creating new and tmp paths under %s", base_path)
|
265
|
+
|
266
|
+
target_radio = MultiprocessingQueueRadioSender(q)
|
306
267
|
|
307
268
|
os.makedirs(tmp_dir, exist_ok=True)
|
308
269
|
os.makedirs(new_dir, exist_ok=True)
|
@@ -313,202 +274,15 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
|
|
313
274
|
# iterate over files in new_dir
|
314
275
|
for filename in os.listdir(new_dir):
|
315
276
|
try:
|
316
|
-
logger.info(
|
277
|
+
logger.info("Processing filesystem radio file %s", filename)
|
317
278
|
full_path_filename = f"{new_dir}/{filename}"
|
318
279
|
with open(full_path_filename, "rb") as f:
|
319
|
-
message =
|
320
|
-
logger.debug(
|
280
|
+
message = pickle.load(f)
|
281
|
+
logger.debug("Message received is: %s", message)
|
321
282
|
assert isinstance(message, tuple)
|
322
|
-
|
283
|
+
target_radio.send(cast(TaggedMonitoringMessage, message))
|
323
284
|
os.remove(full_path_filename)
|
324
285
|
except Exception:
|
325
|
-
logger.exception(
|
286
|
+
logger.exception("Exception processing %s - probably will be retried next iteration", filename)
|
326
287
|
|
327
288
|
time.sleep(1) # whats a good time for this poll?
|
328
|
-
|
329
|
-
|
330
|
-
class MonitoringRouter:
|
331
|
-
|
332
|
-
def __init__(self,
|
333
|
-
*,
|
334
|
-
hub_address: str,
|
335
|
-
hub_port: Optional[int] = None,
|
336
|
-
hub_port_range: Tuple[int, int] = (55050, 56000),
|
337
|
-
|
338
|
-
monitoring_hub_address: str = "127.0.0.1",
|
339
|
-
logdir: str = ".",
|
340
|
-
run_id: str,
|
341
|
-
logging_level: int = logging.INFO,
|
342
|
-
atexit_timeout: int = 3 # in seconds
|
343
|
-
):
|
344
|
-
""" Initializes a monitoring configuration class.
|
345
|
-
|
346
|
-
Parameters
|
347
|
-
----------
|
348
|
-
hub_address : str
|
349
|
-
The ip address at which the workers will be able to reach the Hub.
|
350
|
-
hub_port : int
|
351
|
-
The specific port at which workers will be able to reach the Hub via UDP. Default: None
|
352
|
-
hub_port_range : tuple(int, int)
|
353
|
-
The MonitoringHub picks ports at random from the range which will be used by Hub.
|
354
|
-
This is overridden when the hub_port option is set. Default: (55050, 56000)
|
355
|
-
logdir : str
|
356
|
-
Parsl log directory paths. Logs and temp files go here. Default: '.'
|
357
|
-
logging_level : int
|
358
|
-
Logging level as defined in the logging module. Default: logging.INFO
|
359
|
-
atexit_timeout : float, optional
|
360
|
-
The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
|
361
|
-
|
362
|
-
"""
|
363
|
-
os.makedirs(logdir, exist_ok=True)
|
364
|
-
self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
|
365
|
-
name="monitoring_router",
|
366
|
-
level=logging_level)
|
367
|
-
self.logger.debug("Monitoring router starting")
|
368
|
-
|
369
|
-
self.hub_address = hub_address
|
370
|
-
self.atexit_timeout = atexit_timeout
|
371
|
-
self.run_id = run_id
|
372
|
-
|
373
|
-
self.loop_freq = 10.0 # milliseconds
|
374
|
-
|
375
|
-
# Initialize the UDP socket
|
376
|
-
self.sock = socket.socket(socket.AF_INET,
|
377
|
-
socket.SOCK_DGRAM,
|
378
|
-
socket.IPPROTO_UDP)
|
379
|
-
|
380
|
-
# We are trying to bind to all interfaces with 0.0.0.0
|
381
|
-
if not hub_port:
|
382
|
-
self.sock.bind(('0.0.0.0', 0))
|
383
|
-
self.hub_port = self.sock.getsockname()[1]
|
384
|
-
else:
|
385
|
-
self.hub_port = hub_port
|
386
|
-
try:
|
387
|
-
self.sock.bind(('0.0.0.0', self.hub_port))
|
388
|
-
except Exception as e:
|
389
|
-
raise RuntimeError(f"Could not bind to hub_port {hub_port} because: {e}")
|
390
|
-
self.sock.settimeout(self.loop_freq / 1000)
|
391
|
-
self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.hub_port))
|
392
|
-
|
393
|
-
self._context = zmq.Context()
|
394
|
-
self.ic_channel = self._context.socket(zmq.DEALER)
|
395
|
-
self.ic_channel.setsockopt(zmq.LINGER, 0)
|
396
|
-
self.ic_channel.set_hwm(0)
|
397
|
-
self.ic_channel.RCVTIMEO = int(self.loop_freq) # in milliseconds
|
398
|
-
self.logger.debug("hub_address: {}. hub_port_range {}".format(hub_address, hub_port_range))
|
399
|
-
self.ic_port = self.ic_channel.bind_to_random_port("tcp://*",
|
400
|
-
min_port=hub_port_range[0],
|
401
|
-
max_port=hub_port_range[1])
|
402
|
-
|
403
|
-
def start(self,
|
404
|
-
priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
405
|
-
node_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
406
|
-
block_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
407
|
-
resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
|
408
|
-
try:
|
409
|
-
router_keep_going = True
|
410
|
-
while router_keep_going:
|
411
|
-
try:
|
412
|
-
data, addr = self.sock.recvfrom(2048)
|
413
|
-
resource_msg = pickle.loads(data)
|
414
|
-
self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
|
415
|
-
resource_msgs.put((resource_msg, addr))
|
416
|
-
except socket.timeout:
|
417
|
-
pass
|
418
|
-
|
419
|
-
try:
|
420
|
-
dfk_loop_start = time.time()
|
421
|
-
while time.time() - dfk_loop_start < 1.0: # TODO make configurable
|
422
|
-
# note that nothing checks that msg really is of the annotated type
|
423
|
-
msg: TaggedMonitoringMessage
|
424
|
-
msg = self.ic_channel.recv_pyobj()
|
425
|
-
|
426
|
-
assert isinstance(msg, tuple), "IC Channel expects only tuples, got {}".format(msg)
|
427
|
-
assert len(msg) >= 1, "IC Channel expects tuples of length at least 1, got {}".format(msg)
|
428
|
-
assert len(msg) == 2, "IC Channel expects message tuples of exactly length 2, got {}".format(msg)
|
429
|
-
|
430
|
-
msg_0: AddressedMonitoringMessage
|
431
|
-
msg_0 = (msg, 0)
|
432
|
-
|
433
|
-
if msg[0] == MessageType.NODE_INFO:
|
434
|
-
msg[1]['run_id'] = self.run_id
|
435
|
-
node_msgs.put(msg_0)
|
436
|
-
elif msg[0] == MessageType.RESOURCE_INFO:
|
437
|
-
resource_msgs.put(msg_0)
|
438
|
-
elif msg[0] == MessageType.BLOCK_INFO:
|
439
|
-
block_msgs.put(msg_0)
|
440
|
-
elif msg[0] == MessageType.TASK_INFO:
|
441
|
-
priority_msgs.put(msg_0)
|
442
|
-
elif msg[0] == MessageType.WORKFLOW_INFO:
|
443
|
-
priority_msgs.put(msg_0)
|
444
|
-
if 'exit_now' in msg[1] and msg[1]['exit_now']:
|
445
|
-
router_keep_going = False
|
446
|
-
else:
|
447
|
-
# There is a type: ignore here because if msg[0]
|
448
|
-
# is of the correct type, this code is unreachable,
|
449
|
-
# but there is no verification that the message
|
450
|
-
# received from ic_channel.recv_pyobj() is actually
|
451
|
-
# of that type.
|
452
|
-
self.logger.error("Discarding message " # type: ignore[unreachable]
|
453
|
-
f"from interchange with unknown type {msg[0].value}")
|
454
|
-
except zmq.Again:
|
455
|
-
pass
|
456
|
-
except Exception:
|
457
|
-
# This will catch malformed messages. What happens if the
|
458
|
-
# channel is broken in such a way that it always raises
|
459
|
-
# an exception? Looping on this would maybe be the wrong
|
460
|
-
# thing to do.
|
461
|
-
self.logger.warning("Failure processing a ZMQ message", exc_info=True)
|
462
|
-
|
463
|
-
self.logger.info("Monitoring router draining")
|
464
|
-
last_msg_received_time = time.time()
|
465
|
-
while time.time() - last_msg_received_time < self.atexit_timeout:
|
466
|
-
try:
|
467
|
-
data, addr = self.sock.recvfrom(2048)
|
468
|
-
msg = pickle.loads(data)
|
469
|
-
self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
|
470
|
-
resource_msgs.put((msg, addr))
|
471
|
-
last_msg_received_time = time.time()
|
472
|
-
except socket.timeout:
|
473
|
-
pass
|
474
|
-
|
475
|
-
self.logger.info("Monitoring router finishing normally")
|
476
|
-
finally:
|
477
|
-
self.logger.info("Monitoring router finished")
|
478
|
-
|
479
|
-
|
480
|
-
@wrap_with_logs
|
481
|
-
def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
|
482
|
-
exception_q: "queue.Queue[Tuple[str, str]]",
|
483
|
-
priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
484
|
-
node_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
485
|
-
block_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
486
|
-
resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
|
487
|
-
|
488
|
-
hub_address: str,
|
489
|
-
hub_port: Optional[int],
|
490
|
-
hub_port_range: Tuple[int, int],
|
491
|
-
|
492
|
-
logdir: str,
|
493
|
-
logging_level: int,
|
494
|
-
run_id: str) -> None:
|
495
|
-
setproctitle("parsl: monitoring router")
|
496
|
-
try:
|
497
|
-
router = MonitoringRouter(hub_address=hub_address,
|
498
|
-
hub_port=hub_port,
|
499
|
-
hub_port_range=hub_port_range,
|
500
|
-
logdir=logdir,
|
501
|
-
logging_level=logging_level,
|
502
|
-
run_id=run_id)
|
503
|
-
except Exception as e:
|
504
|
-
logger.error("MonitoringRouter construction failed.", exc_info=True)
|
505
|
-
comm_q.put(f"Monitoring router construction failed: {e}")
|
506
|
-
else:
|
507
|
-
comm_q.put((router.hub_port, router.ic_port))
|
508
|
-
|
509
|
-
router.logger.info("Starting MonitoringRouter in router_starter")
|
510
|
-
try:
|
511
|
-
router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
|
512
|
-
except Exception as e:
|
513
|
-
router.logger.exception("router.start exception")
|
514
|
-
exception_q.put(('Hub', str(e)))
|
@@ -0,0 +1,13 @@
|
|
1
|
+
import logging
|
2
|
+
from abc import ABCMeta, abstractmethod
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
_db_manager_excepts: Optional[Exception]
|
6
|
+
|
7
|
+
logger = logging.getLogger(__name__)
|
8
|
+
|
9
|
+
|
10
|
+
class MonitoringRadioSender(metaclass=ABCMeta):
    """Abstract interface for an object that can send one monitoring message.

    Subclasses must override :meth:`send`; the class cannot be instantiated
    until they do.
    """

    @abstractmethod
    def send(self, message: object) -> None:
        """Send ``message``. This abstract version does nothing and returns None."""
|
@@ -0,0 +1,52 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
import pickle
|
4
|
+
import uuid
|
5
|
+
|
6
|
+
from parsl.monitoring.radios.base import MonitoringRadioSender
|
7
|
+
|
8
|
+
logger = logging.getLogger(__name__)
|
9
|
+
|
10
|
+
|
11
|
+
class FilesystemRadioSender(MonitoringRadioSender):
    """A MonitoringRadioSender that sends messages over a shared filesystem.

    The message directory structure is based on maildir,
    https://en.wikipedia.org/wiki/Maildir

    The writer creates a message in tmp/ and then when it is fully
    written, moves it atomically into new/

    The reader ignores tmp/ and only reads and deletes messages from
    new/

    This avoids a race condition of reading partially written messages.

    This radio is likely to give higher shared filesystem load compared to
    the UDP radio, but should be much more reliable.
    """

    def __init__(self, *, monitoring_url: str, timeout: int = 10, run_dir: str):
        """Create the tmp/ and new/ message directories under ``run_dir``.

        Parameters
        ----------
        monitoring_url : str
            Accepted but not used by this class.
        timeout : int
            Accepted but not used by this class.
        run_dir : str
            Directory under which ``monitor-fs-radio/tmp`` and
            ``monitor-fs-radio/new`` are created if they do not exist.
        """
        logger.info("filesystem based monitoring channel initializing")
        self.base_path = f"{run_dir}/monitor-fs-radio/"
        self.tmp_path = f"{self.base_path}/tmp"
        self.new_path = f"{self.base_path}/new"

        os.makedirs(self.tmp_path, exist_ok=True)
        os.makedirs(self.new_path, exist_ok=True)

    def send(self, message: object) -> None:
        """Pickle ``message`` into tmp/ and then rename it into new/.

        Each message gets a fresh UUID4 file name. If pickling or the
        rename fails, the partially written file in tmp/ is removed and the
        original exception is re-raised, so failed sends do not accumulate
        orphan files.
        """
        logger.info("Sending a monitoring message via filesystem")

        unique_id = str(uuid.uuid4())

        tmp_filename = f"{self.tmp_path}/{unique_id}"
        new_filename = f"{self.new_path}/{unique_id}"

        # this will write the message out then atomically
        # move it into new/, so that a partially written
        # file will never be observed in new/
        try:
            with open(tmp_filename, "wb") as f:
                pickle.dump(message, f)
            os.rename(tmp_filename, new_filename)
        except BaseException:
            # Nothing reads tmp/, so a leftover file here would never be
            # cleaned up. Removal is best-effort; the original error is
            # what the caller needs to see.
            try:
                os.remove(tmp_filename)
            except OSError:
                pass
            raise
|
@@ -0,0 +1,57 @@
|
|
1
|
+
import logging
|
2
|
+
import pickle
|
3
|
+
|
4
|
+
from parsl.monitoring.radios.base import MonitoringRadioSender
|
5
|
+
|
6
|
+
logger = logging.getLogger(__name__)
|
7
|
+
|
8
|
+
|
9
|
+
class HTEXRadioSender(MonitoringRadioSender):
    """A MonitoringRadioSender that puts messages on the HTEX result queue."""

    def __init__(self, monitoring_url: str, timeout: int = 10):
        """
        Parameters
        ----------

        monitoring_url : str
            URL of the form <scheme>://<IP>:<PORT>
            (accepted but not used by this class)
        timeout : int
            timeout, default=10s
            (accepted but not used by this class)
        """
        logger.info("htex-based monitoring channel initialising")

    def send(self, message: object) -> None:
        """ Sends a message via the high throughput executor result queue

        The message is wrapped in a dict tagged ``'type': 'monitoring'``,
        pickled, and put on
        ``parsl.executors.high_throughput.monitoring_info.result_queue``.
        If that queue has not been set, an error is logged and the message
        is dropped.

        Parameter
        ---------

        message: object
            Arbitrary pickle-able object that is to be sent

        Returns:
            None
        """

        # Imported at call time so that the current value of result_queue is
        # looked up on every send.
        # NOTE(review): presumably the queue is assigned by the worker
        # process after this module is imported - confirm against the
        # htex worker code.
        import parsl.executors.high_throughput.monitoring_info

        result_queue = parsl.executors.high_throughput.monitoring_info.result_queue

        # Compare against None explicitly: the truthiness of a queue object
        # is not a reliable "is it initialised" signal.
        if result_queue is None:
            logger.error("result_queue is uninitialized - cannot put monitoring message")
            return

        # this message needs to go in the result queue tagged so that it is treated
        # i) as a monitoring message by the interchange, and then further more treated
        # as a RESOURCE_INFO message when received by monitoring (rather than a NODE_INFO
        # which is the implicit default for messages from the interchange)

        # for the interchange, the outer wrapper, this needs to be a dict:
        interchange_msg = {
            'type': 'monitoring',
            'payload': message
        }

        result_queue.put(pickle.dumps(interchange_msg))
|