parsl 2024.3.11__py3-none-any.whl → 2025.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/__init__.py +9 -10
- parsl/addresses.py +29 -7
- parsl/app/app.py +7 -8
- parsl/app/bash.py +15 -8
- parsl/app/errors.py +10 -13
- parsl/app/futures.py +8 -10
- parsl/app/python.py +2 -1
- parsl/benchmark/perf.py +2 -1
- parsl/concurrent/__init__.py +2 -2
- parsl/config.py +57 -10
- parsl/configs/ASPIRE1.py +6 -5
- parsl/configs/Azure.py +9 -8
- parsl/configs/bridges.py +6 -4
- parsl/configs/cc_in2p3.py +3 -3
- parsl/configs/ec2.py +3 -1
- parsl/configs/expanse.py +4 -3
- parsl/configs/frontera.py +3 -4
- parsl/configs/htex_local.py +3 -4
- parsl/configs/illinoiscluster.py +3 -1
- parsl/configs/improv.py +34 -0
- parsl/configs/kubernetes.py +4 -3
- parsl/configs/local_threads.py +5 -1
- parsl/configs/midway.py +5 -3
- parsl/configs/osg.py +4 -2
- parsl/configs/polaris.py +4 -2
- parsl/configs/stampede2.py +6 -5
- parsl/configs/summit.py +3 -3
- parsl/configs/toss3_llnl.py +4 -3
- parsl/configs/vineex_local.py +6 -4
- parsl/configs/wqex_local.py +5 -3
- parsl/curvezmq.py +4 -0
- parsl/data_provider/data_manager.py +4 -3
- parsl/data_provider/file_noop.py +1 -2
- parsl/data_provider/files.py +3 -3
- parsl/data_provider/ftp.py +1 -3
- parsl/data_provider/globus.py +7 -6
- parsl/data_provider/http.py +2 -2
- parsl/data_provider/rsync.py +1 -1
- parsl/data_provider/staging.py +2 -2
- parsl/data_provider/zip.py +135 -0
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +262 -224
- parsl/dataflow/errors.py +3 -5
- parsl/dataflow/futures.py +27 -14
- parsl/dataflow/memoization.py +5 -5
- parsl/dataflow/rundirs.py +5 -6
- parsl/dataflow/taskrecord.py +4 -5
- parsl/executors/__init__.py +4 -2
- parsl/executors/base.py +45 -15
- parsl/executors/errors.py +13 -0
- parsl/executors/execute_task.py +37 -0
- parsl/executors/flux/execute_parsl_task.py +3 -3
- parsl/executors/flux/executor.py +18 -19
- parsl/executors/flux/flux_instance_manager.py +26 -27
- parsl/executors/high_throughput/errors.py +43 -3
- parsl/executors/high_throughput/executor.py +316 -282
- parsl/executors/high_throughput/interchange.py +158 -167
- parsl/executors/high_throughput/manager_record.py +5 -0
- parsl/executors/high_throughput/manager_selector.py +55 -0
- parsl/executors/high_throughput/monitoring_info.py +2 -1
- parsl/executors/high_throughput/mpi_executor.py +113 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
- parsl/executors/high_throughput/mpi_resource_management.py +6 -17
- parsl/executors/high_throughput/probe.py +9 -7
- parsl/executors/high_throughput/process_worker_pool.py +115 -77
- parsl/executors/high_throughput/zmq_pipes.py +81 -23
- parsl/executors/radical/executor.py +130 -79
- parsl/executors/radical/rpex_resources.py +17 -15
- parsl/executors/radical/rpex_worker.py +4 -3
- parsl/executors/status_handling.py +157 -51
- parsl/executors/taskvine/__init__.py +1 -1
- parsl/executors/taskvine/errors.py +1 -1
- parsl/executors/taskvine/exec_parsl_function.py +2 -2
- parsl/executors/taskvine/executor.py +41 -57
- parsl/executors/taskvine/factory.py +1 -1
- parsl/executors/taskvine/factory_config.py +1 -1
- parsl/executors/taskvine/manager.py +18 -13
- parsl/executors/taskvine/manager_config.py +9 -5
- parsl/executors/threads.py +6 -6
- parsl/executors/workqueue/errors.py +1 -1
- parsl/executors/workqueue/exec_parsl_function.py +6 -5
- parsl/executors/workqueue/executor.py +64 -63
- parsl/executors/workqueue/parsl_coprocess.py +1 -1
- parsl/jobs/error_handlers.py +2 -2
- parsl/jobs/job_status_poller.py +30 -113
- parsl/jobs/states.py +7 -2
- parsl/jobs/strategy.py +43 -31
- parsl/launchers/__init__.py +12 -3
- parsl/launchers/errors.py +1 -1
- parsl/launchers/launchers.py +6 -12
- parsl/log_utils.py +9 -6
- parsl/monitoring/db_manager.py +59 -95
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +87 -356
- parsl/monitoring/queries/pandas.py +1 -2
- parsl/monitoring/radios/base.py +13 -0
- parsl/monitoring/radios/filesystem.py +52 -0
- parsl/monitoring/radios/htex.py +57 -0
- parsl/monitoring/radios/multiprocessing.py +17 -0
- parsl/monitoring/radios/udp.py +56 -0
- parsl/monitoring/radios/zmq.py +17 -0
- parsl/monitoring/remote.py +33 -37
- parsl/monitoring/router.py +212 -0
- parsl/monitoring/types.py +5 -6
- parsl/monitoring/visualization/app.py +4 -2
- parsl/monitoring/visualization/models.py +0 -1
- parsl/monitoring/visualization/plots/default/workflow_plots.py +11 -4
- parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
- parsl/monitoring/visualization/utils.py +0 -1
- parsl/monitoring/visualization/views.py +16 -8
- parsl/multiprocessing.py +0 -1
- parsl/process_loggers.py +1 -2
- parsl/providers/__init__.py +8 -17
- parsl/providers/aws/aws.py +2 -3
- parsl/providers/azure/azure.py +4 -5
- parsl/providers/base.py +2 -18
- parsl/providers/cluster_provider.py +4 -12
- parsl/providers/condor/condor.py +7 -17
- parsl/providers/errors.py +2 -2
- parsl/providers/googlecloud/googlecloud.py +2 -1
- parsl/providers/grid_engine/grid_engine.py +5 -14
- parsl/providers/kubernetes/kube.py +80 -40
- parsl/providers/local/local.py +13 -26
- parsl/providers/lsf/lsf.py +5 -23
- parsl/providers/pbspro/pbspro.py +5 -17
- parsl/providers/slurm/slurm.py +81 -39
- parsl/providers/torque/torque.py +3 -14
- parsl/serialize/__init__.py +8 -3
- parsl/serialize/base.py +1 -2
- parsl/serialize/concretes.py +5 -4
- parsl/serialize/facade.py +3 -3
- parsl/serialize/proxystore.py +3 -2
- parsl/tests/__init__.py +1 -1
- parsl/tests/configs/azure_single_node.py +4 -5
- parsl/tests/configs/bridges.py +3 -2
- parsl/tests/configs/cc_in2p3.py +1 -3
- parsl/tests/configs/comet.py +2 -1
- parsl/tests/configs/ec2_single_node.py +1 -2
- parsl/tests/configs/ec2_spot.py +1 -2
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/configs/frontera.py +2 -3
- parsl/tests/configs/htex_local.py +3 -5
- parsl/tests/configs/htex_local_alternate.py +11 -15
- parsl/tests/configs/htex_local_intask_staging.py +5 -9
- parsl/tests/configs/htex_local_rsync_staging.py +4 -8
- parsl/tests/configs/local_radical.py +1 -3
- parsl/tests/configs/local_radical_mpi.py +2 -2
- parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/configs/midway.py +2 -2
- parsl/tests/configs/nscc_singapore.py +3 -3
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +3 -2
- parsl/tests/configs/slurm_local.py +24 -0
- parsl/tests/configs/summit.py +1 -0
- parsl/tests/configs/taskvine_ex.py +4 -7
- parsl/tests/configs/user_opts.py +2 -8
- parsl/tests/configs/workqueue_ex.py +4 -6
- parsl/tests/conftest.py +27 -13
- parsl/tests/integration/test_stress/test_python_simple.py +3 -4
- parsl/tests/integration/test_stress/test_python_threads.py +3 -5
- parsl/tests/manual_tests/htex_local.py +4 -6
- parsl/tests/manual_tests/test_basic.py +1 -0
- parsl/tests/manual_tests/test_log_filter.py +3 -1
- parsl/tests/manual_tests/test_memory_limits.py +6 -8
- parsl/tests/manual_tests/test_regression_220.py +2 -1
- parsl/tests/manual_tests/test_udp_simple.py +4 -4
- parsl/tests/manual_tests/test_worker_count.py +3 -2
- parsl/tests/scaling_tests/htex_local.py +2 -4
- parsl/tests/scaling_tests/test_scale.py +0 -9
- parsl/tests/scaling_tests/vineex_condor.py +1 -2
- parsl/tests/scaling_tests/vineex_local.py +1 -2
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/site_tests/test_provider.py +4 -2
- parsl/tests/site_tests/test_site.py +2 -0
- parsl/tests/sites/test_affinity.py +7 -7
- parsl/tests/sites/test_dynamic_executor.py +3 -4
- parsl/tests/sites/test_ec2.py +3 -2
- parsl/tests/sites/test_worker_info.py +4 -5
- parsl/tests/test_aalst_patterns.py +0 -1
- parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
- parsl/tests/test_bash_apps/test_basic.py +10 -4
- parsl/tests/test_bash_apps/test_error_codes.py +5 -7
- parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
- parsl/tests/test_bash_apps/test_memoize.py +2 -8
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
- parsl/tests/test_bash_apps/test_multiline.py +1 -1
- parsl/tests/test_bash_apps/test_pipeline.py +1 -1
- parsl/tests/test_bash_apps/test_std_uri.py +123 -0
- parsl/tests/test_bash_apps/test_stdout.py +33 -8
- parsl/tests/test_callables.py +2 -2
- parsl/tests/test_checkpointing/test_periodic.py +21 -39
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_regression_239.py +1 -1
- parsl/tests/test_checkpointing/test_task_exit.py +2 -3
- parsl/tests/test_docs/test_from_slides.py +5 -2
- parsl/tests/test_docs/test_kwargs.py +4 -1
- parsl/tests/test_docs/test_tutorial_1.py +1 -2
- parsl/tests/test_docs/test_workflow1.py +2 -2
- parsl/tests/test_docs/test_workflow2.py +0 -1
- parsl/tests/test_error_handling/test_rand_fail.py +2 -2
- parsl/tests/test_error_handling/test_resource_spec.py +10 -12
- parsl/tests/test_error_handling/test_retries.py +6 -16
- parsl/tests/test_error_handling/test_retry_handler.py +1 -0
- parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
- parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
- parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
- parsl/tests/test_execute_task.py +29 -0
- parsl/tests/test_flux.py +1 -1
- parsl/tests/test_htex/test_basic.py +2 -3
- parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
- parsl/tests/test_htex/test_command_client_timeout.py +66 -0
- parsl/tests/test_htex/test_connected_blocks.py +3 -2
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
- parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_drain.py +79 -0
- parsl/tests/test_htex/test_htex.py +51 -25
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
- parsl/tests/test_htex/test_managers_command.py +36 -0
- parsl/tests/test_htex/test_missing_worker.py +2 -12
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
- parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
- parsl/tests/test_htex/test_zmq_binding.py +29 -8
- parsl/tests/test_monitoring/test_app_names.py +86 -0
- parsl/tests/test_monitoring/test_basic.py +73 -25
- parsl/tests/test_monitoring/test_db_locks.py +6 -4
- parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
- parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
- parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
- parsl/tests/test_monitoring/test_stdouterr.py +134 -0
- parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
- parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
- parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
- parsl/tests/test_providers/test_local_provider.py +3 -132
- parsl/tests/test_providers/test_pbspro_template.py +2 -3
- parsl/tests/test_providers/test_slurm_template.py +2 -3
- parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
- parsl/tests/test_python_apps/test_context_manager.py +128 -0
- parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_python_apps/test_fail.py +0 -25
- parsl/tests/test_python_apps/test_futures.py +2 -1
- parsl/tests/test_python_apps/test_inputs_default.py +22 -0
- parsl/tests/test_python_apps/test_join.py +0 -1
- parsl/tests/test_python_apps/test_lifted.py +11 -7
- parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
- parsl/tests/test_python_apps/test_outputs.py +1 -1
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_radical/test_mpi_funcs.py +1 -2
- parsl/tests/test_regression/test_1480.py +2 -1
- parsl/tests/test_regression/test_1653.py +2 -1
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_regression/test_2652.py +1 -0
- parsl/tests/test_regression/test_69a.py +0 -1
- parsl/tests/test_regression/test_854.py +4 -2
- parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
- parsl/tests/test_regression/test_98.py +0 -1
- parsl/tests/test_scaling/test_block_error_handler.py +9 -4
- parsl/tests/test_scaling/test_regression_1621.py +11 -15
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
- parsl/tests/test_scaling/test_scale_down.py +2 -5
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +6 -18
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
- parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
- parsl/tests/test_serialization/test_basic.py +2 -1
- parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
- parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
- parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
- parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
- parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
- parsl/tests/test_staging/staging_provider.py +2 -2
- parsl/tests/test_staging/test_1316.py +3 -4
- parsl/tests/test_staging/test_docs_1.py +2 -1
- parsl/tests/test_staging/test_docs_2.py +2 -1
- parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
- parsl/tests/{test_data → test_staging}/test_file.py +6 -6
- parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +5 -2
- parsl/tests/test_staging/test_staging_stdout.py +64 -0
- parsl/tests/test_staging/test_zip_in.py +39 -0
- parsl/tests/test_staging/test_zip_out.py +110 -0
- parsl/tests/test_staging/test_zip_to_zip.py +41 -0
- parsl/tests/test_summary.py +2 -2
- parsl/tests/test_thread_parallelism.py +0 -1
- parsl/tests/test_threads/test_configs.py +1 -2
- parsl/tests/test_threads/test_lazy_errors.py +2 -2
- parsl/tests/test_utils/test_execute_wait.py +35 -0
- parsl/tests/test_utils/test_sanitize_dns.py +76 -0
- parsl/tests/unit/test_address.py +20 -0
- parsl/tests/unit/test_file.py +99 -0
- parsl/tests/unit/test_usage_tracking.py +66 -0
- parsl/usage_tracking/api.py +65 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +104 -62
- parsl/utils.py +139 -6
- parsl/version.py +1 -1
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
- parsl-2025.1.13.data/scripts/interchange.py +649 -0
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +115 -77
- parsl-2025.1.13.dist-info/METADATA +96 -0
- parsl-2025.1.13.dist-info/RECORD +462 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
- parsl/channels/__init__.py +0 -7
- parsl/channels/base.py +0 -141
- parsl/channels/errors.py +0 -113
- parsl/channels/local/local.py +0 -164
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
- parsl/channels/ssh/ssh.py +0 -276
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -74
- parsl/configs/ad_hoc.py +0 -35
- parsl/executors/radical/rpex_master.py +0 -42
- parsl/monitoring/radios.py +0 -175
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -248
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/integration/test_channels/__init__.py +0 -0
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- parsl/tests/integration/test_channels/test_local_channel.py +0 -42
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl/tests/sites/test_local_adhoc.py +0 -61
- parsl/tests/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/test_large_output.py +0 -22
- parsl/tests/test_data/__init__.py +0 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
- parsl-2024.3.11.dist-info/METADATA +0 -98
- parsl-2024.3.11.dist-info/RECORD +0 -447
- parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
- parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
parsl/monitoring/monitoring.py
CHANGED
@@ -1,29 +1,25 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import logging
|
4
|
+
import multiprocessing.synchronize as ms
|
3
5
|
import os
|
4
|
-
import socket
|
5
|
-
import time
|
6
6
|
import pickle
|
7
|
-
import logging
|
8
|
-
import typeguard
|
9
|
-
import zmq
|
10
|
-
|
11
7
|
import queue
|
8
|
+
import time
|
9
|
+
from multiprocessing import Event
|
10
|
+
from multiprocessing.queues import Queue
|
11
|
+
from typing import TYPE_CHECKING, Literal, Optional, Tuple, Union, cast
|
12
12
|
|
13
|
-
import
|
13
|
+
import typeguard
|
14
14
|
|
15
|
+
from parsl.log_utils import set_file_logger
|
16
|
+
from parsl.monitoring.errors import MonitoringHubStartError
|
17
|
+
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
18
|
+
from parsl.monitoring.router import router_starter
|
19
|
+
from parsl.monitoring.types import TaggedMonitoringMessage
|
15
20
|
from parsl.multiprocessing import ForkProcess, SizedQueue
|
16
|
-
from multiprocessing import Process
|
17
|
-
from multiprocessing.queues import Queue
|
18
|
-
from parsl.utils import RepresentationMixin
|
19
21
|
from parsl.process_loggers import wrap_with_logs
|
20
|
-
from parsl.utils import setproctitle
|
21
|
-
|
22
|
-
from parsl.serialize import deserialize
|
23
|
-
|
24
|
-
from parsl.monitoring.message_type import MessageType
|
25
|
-
from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
|
26
|
-
from typing import cast, Any, Callable, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING
|
22
|
+
from parsl.utils import RepresentationMixin, setproctitle
|
27
23
|
|
28
24
|
_db_manager_excepts: Optional[Exception]
|
29
25
|
|
@@ -38,40 +34,6 @@ else:
|
|
38
34
|
logger = logging.getLogger(__name__)
|
39
35
|
|
40
36
|
|
41
|
-
def start_file_logger(filename: str, name: str = 'monitoring', level: int = logging.DEBUG, format_string: Optional[str] = None) -> logging.Logger:
|
42
|
-
"""Add a stream log handler.
|
43
|
-
|
44
|
-
Parameters
|
45
|
-
---------
|
46
|
-
|
47
|
-
filename: string
|
48
|
-
Name of the file to write logs to. Required.
|
49
|
-
name: string
|
50
|
-
Logger name.
|
51
|
-
level: logging.LEVEL
|
52
|
-
Set the logging level. Default=logging.DEBUG
|
53
|
-
- format_string (string): Set the format string
|
54
|
-
format_string: string
|
55
|
-
Format string to use.
|
56
|
-
|
57
|
-
Returns
|
58
|
-
-------
|
59
|
-
None.
|
60
|
-
"""
|
61
|
-
if format_string is None:
|
62
|
-
format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s] %(message)s"
|
63
|
-
|
64
|
-
logger = logging.getLogger(name)
|
65
|
-
logger.setLevel(level)
|
66
|
-
logger.propagate = False
|
67
|
-
handler = logging.FileHandler(filename)
|
68
|
-
handler.setLevel(level)
|
69
|
-
formatter = logging.Formatter(format_string, datefmt='%Y-%m-%d %H:%M:%S')
|
70
|
-
handler.setFormatter(formatter)
|
71
|
-
logger.addHandler(handler)
|
72
|
-
return logger
|
73
|
-
|
74
|
-
|
75
37
|
@typeguard.typechecked
|
76
38
|
class MonitoringHub(RepresentationMixin):
|
77
39
|
def __init__(self,
|
@@ -79,13 +41,9 @@ class MonitoringHub(RepresentationMixin):
|
|
79
41
|
hub_port: Optional[int] = None,
|
80
42
|
hub_port_range: Tuple[int, int] = (55050, 56000),
|
81
43
|
|
82
|
-
client_address: str = "127.0.0.1",
|
83
|
-
client_port_range: Tuple[int, int] = (55000, 56000),
|
84
|
-
|
85
44
|
workflow_name: Optional[str] = None,
|
86
45
|
workflow_version: Optional[str] = None,
|
87
46
|
logging_endpoint: Optional[str] = None,
|
88
|
-
logdir: Optional[str] = None,
|
89
47
|
monitoring_debug: bool = False,
|
90
48
|
resource_monitoring_enabled: bool = True,
|
91
49
|
resource_monitoring_interval: float = 30): # in seconds
|
@@ -106,11 +64,6 @@ class MonitoringHub(RepresentationMixin):
|
|
106
64
|
to deliver monitoring messages to the monitoring router.
|
107
65
|
Note that despite the similar name, this is not related to hub_port.
|
108
66
|
Default: (55050, 56000)
|
109
|
-
client_address : str
|
110
|
-
The ip address at which the dfk will be able to reach Hub. Default: "127.0.0.1"
|
111
|
-
client_port_range : tuple(int, int)
|
112
|
-
The MonitoringHub picks ports at random from the range which will be used by Hub.
|
113
|
-
Default: (55000, 56000)
|
114
67
|
workflow_name : str
|
115
68
|
The name for the workflow. Default to the name of the parsl script
|
116
69
|
workflow_version : str
|
@@ -119,8 +72,6 @@ class MonitoringHub(RepresentationMixin):
|
|
119
72
|
The database connection url for monitoring to log the information.
|
120
73
|
These URLs follow RFC-1738, and can include username, password, hostname, database name.
|
121
74
|
Default: sqlite, in the configured run_dir.
|
122
|
-
logdir : str
|
123
|
-
Parsl log directory paths. Logs and temp files go here. Default: '.'
|
124
75
|
monitoring_debug : Bool
|
125
76
|
Enable monitoring debug logging. Default: False
|
126
77
|
resource_monitoring_enabled : boolean
|
@@ -134,26 +85,14 @@ class MonitoringHub(RepresentationMixin):
|
|
134
85
|
Default: 30 seconds
|
135
86
|
"""
|
136
87
|
|
137
|
-
self.logger = logger
|
138
|
-
|
139
|
-
# Any is used to disable typechecking on uses of _dfk_channel,
|
140
|
-
# because it is used in the code as if it points to a channel, but
|
141
|
-
# the static type is that it can also be None. The code relies on
|
142
|
-
# .start() being called and initialising this to a real channel.
|
143
|
-
self._dfk_channel = None # type: Any
|
144
|
-
|
145
88
|
if _db_manager_excepts:
|
146
89
|
raise _db_manager_excepts
|
147
90
|
|
148
|
-
self.client_address = client_address
|
149
|
-
self.client_port_range = client_port_range
|
150
|
-
|
151
91
|
self.hub_address = hub_address
|
152
92
|
self.hub_port = hub_port
|
153
93
|
self.hub_port_range = hub_port_range
|
154
94
|
|
155
95
|
self.logging_endpoint = logging_endpoint
|
156
|
-
self.logdir = logdir
|
157
96
|
self.monitoring_debug = monitoring_debug
|
158
97
|
|
159
98
|
self.workflow_name = workflow_name
|
@@ -162,19 +101,15 @@ class MonitoringHub(RepresentationMixin):
|
|
162
101
|
self.resource_monitoring_enabled = resource_monitoring_enabled
|
163
102
|
self.resource_monitoring_interval = resource_monitoring_interval
|
164
103
|
|
165
|
-
def start(self,
|
104
|
+
def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:
|
166
105
|
|
167
|
-
|
168
|
-
self.logdir = "."
|
106
|
+
logger.debug("Starting MonitoringHub")
|
169
107
|
|
170
108
|
if self.logging_endpoint is None:
|
171
109
|
self.logging_endpoint = f"sqlite:///{os.fspath(config_run_dir)}/monitoring.db"
|
172
110
|
|
173
|
-
os.makedirs(
|
174
|
-
|
175
|
-
# Initialize the ZMQ pipe to the Parsl Client
|
111
|
+
os.makedirs(dfk_run_dir, exist_ok=True)
|
176
112
|
|
177
|
-
self.logger.debug("Initializing ZMQ Pipes to client")
|
178
113
|
self.monitoring_hub_active = True
|
179
114
|
|
180
115
|
# This annotation is incompatible with typeguard 4.x instrumentation
|
@@ -195,26 +130,22 @@ class MonitoringHub(RepresentationMixin):
|
|
195
130
|
self.exception_q: Queue[Tuple[str, str]]
|
196
131
|
self.exception_q = SizedQueue(maxsize=10)
|
197
132
|
|
198
|
-
self.
|
199
|
-
self.priority_msgs = SizedQueue()
|
200
|
-
|
201
|
-
self.resource_msgs: Queue[AddressedMonitoringMessage]
|
133
|
+
self.resource_msgs: Queue[Union[TaggedMonitoringMessage, Literal["STOP"]]]
|
202
134
|
self.resource_msgs = SizedQueue()
|
203
135
|
|
204
|
-
self.
|
205
|
-
self.
|
206
|
-
|
207
|
-
self.block_msgs: Queue[AddressedMonitoringMessage]
|
208
|
-
self.block_msgs = SizedQueue()
|
136
|
+
self.router_exit_event: ms.Event
|
137
|
+
self.router_exit_event = Event()
|
209
138
|
|
210
139
|
self.router_proc = ForkProcess(target=router_starter,
|
211
|
-
|
212
|
-
|
213
|
-
"
|
214
|
-
"
|
215
|
-
"
|
140
|
+
kwargs={"comm_q": comm_q,
|
141
|
+
"exception_q": self.exception_q,
|
142
|
+
"resource_msgs": self.resource_msgs,
|
143
|
+
"exit_event": self.router_exit_event,
|
144
|
+
"hub_address": self.hub_address,
|
145
|
+
"udp_port": self.hub_port,
|
146
|
+
"zmq_port_range": self.hub_port_range,
|
147
|
+
"run_dir": dfk_run_dir,
|
216
148
|
"logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
|
217
|
-
"run_id": run_id
|
218
149
|
},
|
219
150
|
name="Monitoring-Router-Process",
|
220
151
|
daemon=True,
|
@@ -222,8 +153,8 @@ class MonitoringHub(RepresentationMixin):
|
|
222
153
|
self.router_proc.start()
|
223
154
|
|
224
155
|
self.dbm_proc = ForkProcess(target=dbm_starter,
|
225
|
-
args=(self.exception_q, self.
|
226
|
-
kwargs={"
|
156
|
+
args=(self.exception_q, self.resource_msgs,),
|
157
|
+
kwargs={"run_dir": dfk_run_dir,
|
227
158
|
"logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
|
228
159
|
"db_url": self.logging_endpoint,
|
229
160
|
},
|
@@ -231,122 +162,108 @@ class MonitoringHub(RepresentationMixin):
|
|
231
162
|
daemon=True,
|
232
163
|
)
|
233
164
|
self.dbm_proc.start()
|
234
|
-
|
165
|
+
logger.info("Started the router process %s and DBM process %s", self.router_proc.pid, self.dbm_proc.pid)
|
235
166
|
|
236
|
-
self.filesystem_proc =
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
167
|
+
self.filesystem_proc = ForkProcess(target=filesystem_receiver,
|
168
|
+
args=(self.resource_msgs, dfk_run_dir),
|
169
|
+
name="Monitoring-Filesystem-Process",
|
170
|
+
daemon=True
|
171
|
+
)
|
241
172
|
self.filesystem_proc.start()
|
242
|
-
|
173
|
+
logger.info("Started filesystem radio receiver process %s", self.filesystem_proc.pid)
|
174
|
+
|
175
|
+
self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)
|
243
176
|
|
244
177
|
try:
|
245
178
|
comm_q_result = comm_q.get(block=True, timeout=120)
|
179
|
+
comm_q.close()
|
180
|
+
comm_q.join_thread()
|
246
181
|
except queue.Empty:
|
247
|
-
|
248
|
-
raise
|
182
|
+
logger.error("Hub has not completed initialization in 120s. Aborting")
|
183
|
+
raise MonitoringHubStartError()
|
249
184
|
|
250
185
|
if isinstance(comm_q_result, str):
|
251
|
-
|
186
|
+
logger.error("MonitoringRouter sent an error message: %s", comm_q_result)
|
252
187
|
raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")
|
253
188
|
|
254
|
-
udp_port,
|
189
|
+
udp_port, zmq_port = comm_q_result
|
255
190
|
|
256
191
|
self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)
|
257
192
|
|
258
|
-
|
259
|
-
self.dfk_channel_timeout = 10000 # in milliseconds
|
260
|
-
self._dfk_channel = context.socket(zmq.DEALER)
|
261
|
-
self._dfk_channel.setsockopt(zmq.LINGER, 0)
|
262
|
-
self._dfk_channel.set_hwm(0)
|
263
|
-
self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
|
264
|
-
self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, ic_port))
|
265
|
-
|
266
|
-
self.logger.info("Monitoring Hub initialized")
|
193
|
+
logger.info("Monitoring Hub initialized")
|
267
194
|
|
268
|
-
|
195
|
+
self.hub_zmq_port = zmq_port
|
269
196
|
|
270
|
-
|
271
|
-
|
272
|
-
self.
|
273
|
-
try:
|
274
|
-
self._dfk_channel.send_pyobj((mtype, message))
|
275
|
-
except zmq.Again:
|
276
|
-
self.logger.exception(
|
277
|
-
"The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))
|
197
|
+
def send(self, message: TaggedMonitoringMessage) -> None:
|
198
|
+
logger.debug("Sending message type %s", message[0])
|
199
|
+
self.radio.send(message)
|
278
200
|
|
279
201
|
def close(self) -> None:
|
280
|
-
|
202
|
+
logger.info("Terminating Monitoring Hub")
|
281
203
|
exception_msgs = []
|
282
204
|
while True:
|
283
205
|
try:
|
284
206
|
exception_msgs.append(self.exception_q.get(block=False))
|
285
|
-
|
207
|
+
logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
|
286
208
|
except queue.Empty:
|
287
209
|
break
|
288
|
-
if self.
|
210
|
+
if self.monitoring_hub_active:
|
289
211
|
self.monitoring_hub_active = False
|
290
|
-
self._dfk_channel.close()
|
291
212
|
if exception_msgs:
|
292
213
|
for exception_msg in exception_msgs:
|
293
|
-
|
294
|
-
"
|
295
|
-
|
296
|
-
|
297
|
-
)
|
214
|
+
logger.error(
|
215
|
+
"%s process delivered an exception: %s. Terminating all monitoring processes immediately.",
|
216
|
+
exception_msg[0],
|
217
|
+
exception_msg[1]
|
298
218
|
)
|
299
219
|
self.router_proc.terminate()
|
300
220
|
self.dbm_proc.terminate()
|
301
221
|
self.filesystem_proc.terminate()
|
302
|
-
|
222
|
+
logger.info("Setting router termination event")
|
223
|
+
self.router_exit_event.set()
|
224
|
+
logger.info("Waiting for router to terminate")
|
303
225
|
self.router_proc.join()
|
304
|
-
self.
|
226
|
+
self.router_proc.close()
|
227
|
+
logger.debug("Finished waiting for router termination")
|
305
228
|
if len(exception_msgs) == 0:
|
306
|
-
|
307
|
-
self.
|
229
|
+
logger.debug("Sending STOP to DBM")
|
230
|
+
self.resource_msgs.put("STOP")
|
308
231
|
else:
|
309
|
-
|
310
|
-
|
232
|
+
logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
|
233
|
+
logger.debug("Waiting for DB termination")
|
311
234
|
self.dbm_proc.join()
|
312
|
-
self.
|
235
|
+
self.dbm_proc.close()
|
236
|
+
logger.debug("Finished waiting for DBM termination")
|
313
237
|
|
314
238
|
# should this be message based? it probably doesn't need to be if
|
315
239
|
# we believe we've received all messages
|
316
|
-
|
240
|
+
logger.info("Terminating filesystem radio receiver process")
|
317
241
|
self.filesystem_proc.terminate()
|
318
242
|
self.filesystem_proc.join()
|
243
|
+
self.filesystem_proc.close()
|
319
244
|
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
monitoring_hub_url: str,
|
327
|
-
run_id: str,
|
328
|
-
logging_level: int,
|
329
|
-
sleep_dur: float,
|
330
|
-
radio_mode: str,
|
331
|
-
monitor_resources: bool,
|
332
|
-
run_dir: str) -> Tuple[Callable, Sequence, Dict]:
|
333
|
-
return parsl.monitoring.remote.monitor_wrapper(f, args, kwargs, try_id, task_id, monitoring_hub_url,
|
334
|
-
run_id, logging_level, sleep_dur, radio_mode,
|
335
|
-
monitor_resources, run_dir)
|
245
|
+
logger.info("Closing monitoring multiprocessing queues")
|
246
|
+
self.exception_q.close()
|
247
|
+
self.exception_q.join_thread()
|
248
|
+
self.resource_msgs.close()
|
249
|
+
self.resource_msgs.join_thread()
|
250
|
+
logger.info("Closed monitoring multiprocessing queues")
|
336
251
|
|
337
252
|
|
338
253
|
@wrap_with_logs
|
339
|
-
def filesystem_receiver(
|
340
|
-
logger =
|
341
|
-
|
342
|
-
|
254
|
+
def filesystem_receiver(q: Queue[TaggedMonitoringMessage], run_dir: str) -> None:
|
255
|
+
logger = set_file_logger(f"{run_dir}/monitoring_filesystem_radio.log",
|
256
|
+
name="monitoring_filesystem_radio",
|
257
|
+
level=logging.INFO)
|
343
258
|
|
344
259
|
logger.info("Starting filesystem radio receiver")
|
345
260
|
setproctitle("parsl: monitoring filesystem receiver")
|
346
261
|
base_path = f"{run_dir}/monitor-fs-radio/"
|
347
262
|
tmp_dir = f"{base_path}/tmp/"
|
348
263
|
new_dir = f"{base_path}/new/"
|
349
|
-
logger.debug(
|
264
|
+
logger.debug("Creating new and tmp paths under %s", base_path)
|
265
|
+
|
266
|
+
target_radio = MultiprocessingQueueRadioSender(q)
|
350
267
|
|
351
268
|
os.makedirs(tmp_dir, exist_ok=True)
|
352
269
|
os.makedirs(new_dir, exist_ok=True)
|
@@ -357,201 +274,15 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
|
|
357
274
|
# iterate over files in new_dir
|
358
275
|
for filename in os.listdir(new_dir):
|
359
276
|
try:
|
360
|
-
logger.info(
|
277
|
+
logger.info("Processing filesystem radio file %s", filename)
|
361
278
|
full_path_filename = f"{new_dir}/{filename}"
|
362
279
|
with open(full_path_filename, "rb") as f:
|
363
|
-
message =
|
364
|
-
logger.debug(
|
280
|
+
message = pickle.load(f)
|
281
|
+
logger.debug("Message received is: %s", message)
|
365
282
|
assert isinstance(message, tuple)
|
366
|
-
|
283
|
+
target_radio.send(cast(TaggedMonitoringMessage, message))
|
367
284
|
os.remove(full_path_filename)
|
368
285
|
except Exception:
|
369
|
-
logger.exception(
|
286
|
+
logger.exception("Exception processing %s - probably will be retried next iteration", filename)
|
370
287
|
|
371
288
|
time.sleep(1) # whats a good time for this poll?
|
372
|
-
|
373
|
-
|
374
|
-
class MonitoringRouter:
    """Route monitoring messages from a UDP socket and a ZMQ channel onto queues.

    The router listens on a UDP socket and on a ZMQ DEALER socket (the
    "interchange channel"), and forwards whatever it receives onto the
    queues passed to :meth:`start`.
    """

    def __init__(self,
                 *,
                 hub_address: str,
                 hub_port: Optional[int] = None,
                 hub_port_range: Tuple[int, int] = (55050, 56000),

                 monitoring_hub_address: str = "127.0.0.1",
                 logdir: str = ".",
                 run_id: str,
                 logging_level: int = logging.INFO,
                 atexit_timeout: int = 3    # in seconds
                 ):
        """ Set up the router log file, the UDP socket and the ZMQ interchange channel.

        Parameters
        ----------
        hub_address : str
             The ip address at which the workers will be able to reach the Hub.
        hub_port : int
             The specific port at which workers will be able to reach the Hub via UDP.
             When unset (or 0) a free port is chosen by the operating system. Default: None
        hub_port_range : tuple(int, int)
             Range (min, max) from which the port of the ZMQ interchange channel is
             picked at random. It does not affect the UDP port. Default: (55050, 56000)
        monitoring_hub_address : str
             Accepted for compatibility; not used by this class. Default: "127.0.0.1"
        logdir : str
             Parsl log directory paths. Logs and temp files go here. Default: '.'
        run_id : str
             Run identifier that is stamped onto NODE_INFO messages before forwarding.
        logging_level : int
             Logging level as defined in the logging module. Default: logging.INFO
        atexit_timeout : float, optional
            The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.

        Raises
        ------
        RuntimeError
            If an explicit hub_port was given and the UDP socket cannot be bound to it.
        """
        os.makedirs(logdir, exist_ok=True)
        self.logger = start_file_logger("{}/monitoring_router.log".format(logdir),
                                        name="monitoring_router",
                                        level=logging_level)
        self.logger.debug("Monitoring router starting")

        self.hub_address = hub_address
        self.atexit_timeout = atexit_timeout
        self.run_id = run_id

        self.loop_freq = 10.0  # milliseconds

        # Initialize the UDP socket
        self.sock = socket.socket(socket.AF_INET,
                                  socket.SOCK_DGRAM,
                                  socket.IPPROTO_UDP)

        # We are trying to bind to all interfaces with 0.0.0.0
        if not hub_port:
            self.sock.bind(('0.0.0.0', 0))
            self.hub_port = self.sock.getsockname()[1]
        else:
            self.hub_port = hub_port
            try:
                self.sock.bind(('0.0.0.0', self.hub_port))
            except Exception as e:
                # Construction is about to fail, so nobody else will ever
                # close this socket: release it here instead of leaking it.
                self.sock.close()
                raise RuntimeError(f"Could not bind to hub_port {hub_port} because: {e}") from e
        self.sock.settimeout(self.loop_freq / 1000)
        self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.hub_port))

        self._context = zmq.Context()
        self.ic_channel = self._context.socket(zmq.DEALER)
        self.ic_channel.setsockopt(zmq.LINGER, 0)
        self.ic_channel.set_hwm(0)
        self.ic_channel.RCVTIMEO = int(self.loop_freq)  # in milliseconds
        self.logger.debug("hub_address: {}. hub_port_range {}".format(hub_address, hub_port_range))
        self.ic_port = self.ic_channel.bind_to_random_port("tcp://*",
                                                           min_port=hub_port_range[0],
                                                           max_port=hub_port_range[1])

    def _forward_one_udp_message(self, resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> bool:
        """Forward at most one UDP datagram to resource_msgs.

        Returns True when a datagram was received and queued, False when the
        socket timed out first. Any other exception propagates to the caller.
        """
        try:
            data, addr = self.sock.recvfrom(2048)
            # NOTE(security): this unpickles bytes read straight off a UDP
            # socket bound to all interfaces. pickle is not safe against
            # untrusted senders; left unchanged here, flagged for review.
            resource_msg = pickle.loads(data)
            self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
            resource_msgs.put((resource_msg, addr))
        except socket.timeout:
            return False
        return True

    def start(self,
              priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
              node_msgs: "queue.Queue[AddressedMonitoringMessage]",
              block_msgs: "queue.Queue[AddressedMonitoringMessage]",
              resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
        """Run the routing loop until a WORKFLOW_INFO message asks for exit.

        Each iteration forwards at most one UDP message to resource_msgs and
        then spends up to one second forwarding interchange-channel messages
        to the queue matching their message type. After an 'exit_now'
        WORKFLOW_INFO message, UDP messages keep being drained until none
        has arrived for atexit_timeout seconds.
        """
        try:
            router_keep_going = True
            while router_keep_going:
                self._forward_one_udp_message(resource_msgs)

                try:
                    dfk_loop_start = time.time()
                    while time.time() - dfk_loop_start < 1.0:  # TODO make configurable
                        # note that nothing checks that msg really is of the annotated type
                        msg: TaggedMonitoringMessage
                        msg = self.ic_channel.recv_pyobj()

                        assert isinstance(msg, tuple), "IC Channel expects only tuples, got {}".format(msg)
                        assert len(msg) >= 1, "IC Channel expects tuples of length at least 1, got {}".format(msg)
                        assert len(msg) == 2, "IC Channel expects message tuples of exactly length 2, got {}".format(msg)

                        # Interchange messages carry no sender address, so 0
                        # is used as the address part of the forwarded pair.
                        msg_0: AddressedMonitoringMessage
                        msg_0 = (msg, 0)

                        if msg[0] == MessageType.NODE_INFO:
                            msg[1]['run_id'] = self.run_id
                            node_msgs.put(msg_0)
                        elif msg[0] == MessageType.RESOURCE_INFO:
                            resource_msgs.put(msg_0)
                        elif msg[0] == MessageType.BLOCK_INFO:
                            block_msgs.put(msg_0)
                        elif msg[0] == MessageType.TASK_INFO:
                            priority_msgs.put(msg_0)
                        elif msg[0] == MessageType.WORKFLOW_INFO:
                            priority_msgs.put(msg_0)
                            if 'exit_now' in msg[1] and msg[1]['exit_now']:
                                # The outer loop stops once this inner
                                # receive loop ends (timeout or one second).
                                router_keep_going = False
                        else:
                            # There is a type: ignore here because if msg[0]
                            # is of the correct type, this code is unreachable,
                            # but there is no verification that the message
                            # received from ic_channel.recv_pyobj() is actually
                            # of that type.
                            self.logger.error(f"Discarding message from interchange with unknown type {msg[0].value}")  # type: ignore[unreachable]
                except zmq.Again:
                    # Receive timeout on the interchange channel: nothing to read.
                    pass
                except Exception:
                    # This will catch malformed messages. What happens if the
                    # channel is broken in such a way that it always raises
                    # an exception? Looping on this would maybe be the wrong
                    # thing to do.
                    self.logger.warning("Failure processing a ZMQ message", exc_info=True)

            self.logger.info("Monitoring router draining")
            last_msg_received_time = time.time()
            while time.time() - last_msg_received_time < self.atexit_timeout:
                # Every received message restarts the quiet-period countdown.
                if self._forward_one_udp_message(resource_msgs):
                    last_msg_received_time = time.time()

            self.logger.info("Monitoring router finishing normally")
        finally:
            self.logger.info("Monitoring router finished")
|
521
|
-
|
522
|
-
|
523
|
-
@wrap_with_logs
def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
                   exception_q: "queue.Queue[Tuple[str, str]]",
                   priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
                   node_msgs: "queue.Queue[AddressedMonitoringMessage]",
                   block_msgs: "queue.Queue[AddressedMonitoringMessage]",
                   resource_msgs: "queue.Queue[AddressedMonitoringMessage]",

                   hub_address: str,
                   hub_port: Optional[int],
                   hub_port_range: Tuple[int, int],

                   logdir: str,
                   logging_level: int,
                   run_id: str) -> None:
    """Build a MonitoringRouter, report its ports, then run it.

    On successful construction the pair (hub_port, ic_port) is put on
    comm_q; if construction fails, a string describing the failure is put
    there instead and the function returns. An exception escaping
    router.start() is logged and reported on exception_q as
    ('Hub', <message>).
    """
    setproctitle("parsl: monitoring router")

    try:
        router = MonitoringRouter(hub_address=hub_address,
                                  hub_port=hub_port,
                                  hub_port_range=hub_port_range,
                                  logdir=logdir,
                                  logging_level=logging_level,
                                  run_id=run_id)
    except Exception as construction_error:
        # Without a router there is nothing to run: report and stop here.
        logger.error("MonitoringRouter construction failed.", exc_info=True)
        comm_q.put(f"Monitoring router construction failed: {construction_error}")
        return

    ports = (router.hub_port, router.ic_port)
    comm_q.put(ports)

    router.logger.info("Starting MonitoringRouter in router_starter")
    try:
        router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
    except Exception as start_error:
        router.logger.exception("router.start exception")
        exception_q.put(('Hub', str(start_error)))
|
@@ -0,0 +1,13 @@
|
|
1
|
+
import logging
|
2
|
+
from abc import ABCMeta, abstractmethod
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
_db_manager_excepts: Optional[Exception]
|
6
|
+
|
7
|
+
logger = logging.getLogger(__name__)
|
8
|
+
|
9
|
+
|
10
|
+
class MonitoringRadioSender(metaclass=ABCMeta):
    """Abstract interface for objects that can send a monitoring message.

    Concrete senders must override :meth:`send`; the class itself cannot
    be instantiated.
    """

    @abstractmethod
    def send(self, message: object) -> None:
        """Send a single message. Must be implemented by subclasses."""
        ...
|
@@ -0,0 +1,52 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
import pickle
|
4
|
+
import uuid
|
5
|
+
|
6
|
+
from parsl.monitoring.radios.base import MonitoringRadioSender
|
7
|
+
|
8
|
+
logger = logging.getLogger(__name__)
|
9
|
+
|
10
|
+
|
11
|
+
class FilesystemRadioSender(MonitoringRadioSender):
    """A MonitoringRadioSender that sends messages over a shared filesystem.

    The message directory structure is based on maildir,
    https://en.wikipedia.org/wiki/Maildir

    The writer creates a message in tmp/ and then when it is fully
    written, moves it atomically into new/

    The reader ignores tmp/ and only reads and deletes messages from
    new/

    This avoids a race condition of reading partially written messages.

    This radio is likely to give higher shared filesystem load compared to
    the UDP radio, but should be much more reliable.
    """

    def __init__(self, *, monitoring_url: str, timeout: int = 10, run_dir: str):
        """Create the tmp/ and new/ directories under run_dir.

        Parameters
        ----------
        monitoring_url : str
            Accepted but not used by this radio.
        timeout : int
            Accepted but not used by this radio. Default: 10
        run_dir : str
            Directory under which the monitor-fs-radio/ message tree is created.
        """
        logger.info("filesystem based monitoring channel initializing")
        self.base_path = f"{run_dir}/monitor-fs-radio/"
        self.tmp_path = f"{self.base_path}/tmp"
        self.new_path = f"{self.base_path}/new"

        os.makedirs(self.tmp_path, exist_ok=True)
        os.makedirs(self.new_path, exist_ok=True)

    def send(self, message: object) -> None:
        """Pickle message into tmp/ and move the finished file into new/.

        Each message gets its own file named by a fresh UUID. If writing or
        moving the file fails, the leftover tmp/ file is removed and the
        original exception is re-raised.
        """
        logger.info("Sending a monitoring message via filesystem")

        unique_id = str(uuid.uuid4())

        tmp_filename = f"{self.tmp_path}/{unique_id}"
        new_filename = f"{self.new_path}/{unique_id}"

        # this will write the message out then atomically
        # move it into new/, so that a partially written
        # file will never be observed in new/
        try:
            with open(tmp_filename, "wb") as f:
                pickle.dump(message, f)
            os.rename(tmp_filename, new_filename)
        except Exception:
            # Nothing reads tmp/, so a file abandoned there would never be
            # cleaned up. Remove it on a best-effort basis and let the
            # original error propagate.
            try:
                os.remove(tmp_filename)
            except OSError:
                pass
            raise
|