parsl 2024.3.11__py3-none-any.whl → 2025.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- parsl/__init__.py +9 -10
- parsl/addresses.py +29 -7
- parsl/app/app.py +7 -8
- parsl/app/bash.py +15 -8
- parsl/app/errors.py +10 -13
- parsl/app/futures.py +8 -10
- parsl/app/python.py +2 -1
- parsl/benchmark/perf.py +2 -1
- parsl/concurrent/__init__.py +2 -2
- parsl/config.py +57 -10
- parsl/configs/ASPIRE1.py +6 -5
- parsl/configs/Azure.py +9 -8
- parsl/configs/bridges.py +6 -4
- parsl/configs/cc_in2p3.py +3 -3
- parsl/configs/ec2.py +3 -1
- parsl/configs/expanse.py +4 -3
- parsl/configs/frontera.py +3 -4
- parsl/configs/htex_local.py +3 -4
- parsl/configs/illinoiscluster.py +3 -1
- parsl/configs/improv.py +34 -0
- parsl/configs/kubernetes.py +4 -3
- parsl/configs/local_threads.py +5 -1
- parsl/configs/midway.py +5 -3
- parsl/configs/osg.py +4 -2
- parsl/configs/polaris.py +4 -2
- parsl/configs/stampede2.py +6 -5
- parsl/configs/summit.py +3 -3
- parsl/configs/toss3_llnl.py +4 -3
- parsl/configs/vineex_local.py +6 -4
- parsl/configs/wqex_local.py +5 -3
- parsl/curvezmq.py +4 -0
- parsl/data_provider/data_manager.py +4 -3
- parsl/data_provider/file_noop.py +1 -2
- parsl/data_provider/files.py +3 -3
- parsl/data_provider/ftp.py +1 -3
- parsl/data_provider/globus.py +7 -6
- parsl/data_provider/http.py +2 -2
- parsl/data_provider/rsync.py +1 -1
- parsl/data_provider/staging.py +2 -2
- parsl/data_provider/zip.py +135 -0
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +262 -224
- parsl/dataflow/errors.py +3 -5
- parsl/dataflow/futures.py +27 -14
- parsl/dataflow/memoization.py +5 -5
- parsl/dataflow/rundirs.py +5 -6
- parsl/dataflow/taskrecord.py +4 -5
- parsl/executors/__init__.py +4 -2
- parsl/executors/base.py +45 -15
- parsl/executors/errors.py +13 -0
- parsl/executors/execute_task.py +37 -0
- parsl/executors/flux/execute_parsl_task.py +3 -3
- parsl/executors/flux/executor.py +18 -19
- parsl/executors/flux/flux_instance_manager.py +26 -27
- parsl/executors/high_throughput/errors.py +43 -3
- parsl/executors/high_throughput/executor.py +316 -282
- parsl/executors/high_throughput/interchange.py +158 -167
- parsl/executors/high_throughput/manager_record.py +5 -0
- parsl/executors/high_throughput/manager_selector.py +55 -0
- parsl/executors/high_throughput/monitoring_info.py +2 -1
- parsl/executors/high_throughput/mpi_executor.py +113 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
- parsl/executors/high_throughput/mpi_resource_management.py +6 -17
- parsl/executors/high_throughput/probe.py +9 -7
- parsl/executors/high_throughput/process_worker_pool.py +115 -77
- parsl/executors/high_throughput/zmq_pipes.py +81 -23
- parsl/executors/radical/executor.py +130 -79
- parsl/executors/radical/rpex_resources.py +17 -15
- parsl/executors/radical/rpex_worker.py +4 -3
- parsl/executors/status_handling.py +157 -51
- parsl/executors/taskvine/__init__.py +1 -1
- parsl/executors/taskvine/errors.py +1 -1
- parsl/executors/taskvine/exec_parsl_function.py +2 -2
- parsl/executors/taskvine/executor.py +41 -57
- parsl/executors/taskvine/factory.py +1 -1
- parsl/executors/taskvine/factory_config.py +1 -1
- parsl/executors/taskvine/manager.py +18 -13
- parsl/executors/taskvine/manager_config.py +9 -5
- parsl/executors/threads.py +6 -6
- parsl/executors/workqueue/errors.py +1 -1
- parsl/executors/workqueue/exec_parsl_function.py +6 -5
- parsl/executors/workqueue/executor.py +64 -63
- parsl/executors/workqueue/parsl_coprocess.py +1 -1
- parsl/jobs/error_handlers.py +2 -2
- parsl/jobs/job_status_poller.py +30 -113
- parsl/jobs/states.py +7 -2
- parsl/jobs/strategy.py +43 -31
- parsl/launchers/__init__.py +12 -3
- parsl/launchers/errors.py +1 -1
- parsl/launchers/launchers.py +6 -12
- parsl/log_utils.py +9 -6
- parsl/monitoring/db_manager.py +59 -95
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +87 -356
- parsl/monitoring/queries/pandas.py +1 -2
- parsl/monitoring/radios/base.py +13 -0
- parsl/monitoring/radios/filesystem.py +52 -0
- parsl/monitoring/radios/htex.py +57 -0
- parsl/monitoring/radios/multiprocessing.py +17 -0
- parsl/monitoring/radios/udp.py +56 -0
- parsl/monitoring/radios/zmq.py +17 -0
- parsl/monitoring/remote.py +33 -37
- parsl/monitoring/router.py +212 -0
- parsl/monitoring/types.py +5 -6
- parsl/monitoring/visualization/app.py +4 -2
- parsl/monitoring/visualization/models.py +0 -1
- parsl/monitoring/visualization/plots/default/workflow_plots.py +11 -4
- parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
- parsl/monitoring/visualization/utils.py +0 -1
- parsl/monitoring/visualization/views.py +16 -8
- parsl/multiprocessing.py +0 -1
- parsl/process_loggers.py +1 -2
- parsl/providers/__init__.py +8 -17
- parsl/providers/aws/aws.py +2 -3
- parsl/providers/azure/azure.py +4 -5
- parsl/providers/base.py +2 -18
- parsl/providers/cluster_provider.py +4 -12
- parsl/providers/condor/condor.py +7 -17
- parsl/providers/errors.py +2 -2
- parsl/providers/googlecloud/googlecloud.py +2 -1
- parsl/providers/grid_engine/grid_engine.py +5 -14
- parsl/providers/kubernetes/kube.py +80 -40
- parsl/providers/local/local.py +13 -26
- parsl/providers/lsf/lsf.py +5 -23
- parsl/providers/pbspro/pbspro.py +5 -17
- parsl/providers/slurm/slurm.py +81 -39
- parsl/providers/torque/torque.py +3 -14
- parsl/serialize/__init__.py +8 -3
- parsl/serialize/base.py +1 -2
- parsl/serialize/concretes.py +5 -4
- parsl/serialize/facade.py +3 -3
- parsl/serialize/proxystore.py +3 -2
- parsl/tests/__init__.py +1 -1
- parsl/tests/configs/azure_single_node.py +4 -5
- parsl/tests/configs/bridges.py +3 -2
- parsl/tests/configs/cc_in2p3.py +1 -3
- parsl/tests/configs/comet.py +2 -1
- parsl/tests/configs/ec2_single_node.py +1 -2
- parsl/tests/configs/ec2_spot.py +1 -2
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/configs/frontera.py +2 -3
- parsl/tests/configs/htex_local.py +3 -5
- parsl/tests/configs/htex_local_alternate.py +11 -15
- parsl/tests/configs/htex_local_intask_staging.py +5 -9
- parsl/tests/configs/htex_local_rsync_staging.py +4 -8
- parsl/tests/configs/local_radical.py +1 -3
- parsl/tests/configs/local_radical_mpi.py +2 -2
- parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/configs/midway.py +2 -2
- parsl/tests/configs/nscc_singapore.py +3 -3
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +3 -2
- parsl/tests/configs/slurm_local.py +24 -0
- parsl/tests/configs/summit.py +1 -0
- parsl/tests/configs/taskvine_ex.py +4 -7
- parsl/tests/configs/user_opts.py +2 -8
- parsl/tests/configs/workqueue_ex.py +4 -6
- parsl/tests/conftest.py +27 -13
- parsl/tests/integration/test_stress/test_python_simple.py +3 -4
- parsl/tests/integration/test_stress/test_python_threads.py +3 -5
- parsl/tests/manual_tests/htex_local.py +4 -6
- parsl/tests/manual_tests/test_basic.py +1 -0
- parsl/tests/manual_tests/test_log_filter.py +3 -1
- parsl/tests/manual_tests/test_memory_limits.py +6 -8
- parsl/tests/manual_tests/test_regression_220.py +2 -1
- parsl/tests/manual_tests/test_udp_simple.py +4 -4
- parsl/tests/manual_tests/test_worker_count.py +3 -2
- parsl/tests/scaling_tests/htex_local.py +2 -4
- parsl/tests/scaling_tests/test_scale.py +0 -9
- parsl/tests/scaling_tests/vineex_condor.py +1 -2
- parsl/tests/scaling_tests/vineex_local.py +1 -2
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/site_tests/test_provider.py +4 -2
- parsl/tests/site_tests/test_site.py +2 -0
- parsl/tests/sites/test_affinity.py +7 -7
- parsl/tests/sites/test_dynamic_executor.py +3 -4
- parsl/tests/sites/test_ec2.py +3 -2
- parsl/tests/sites/test_worker_info.py +4 -5
- parsl/tests/test_aalst_patterns.py +0 -1
- parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
- parsl/tests/test_bash_apps/test_basic.py +10 -4
- parsl/tests/test_bash_apps/test_error_codes.py +5 -7
- parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
- parsl/tests/test_bash_apps/test_memoize.py +2 -8
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
- parsl/tests/test_bash_apps/test_multiline.py +1 -1
- parsl/tests/test_bash_apps/test_pipeline.py +1 -1
- parsl/tests/test_bash_apps/test_std_uri.py +123 -0
- parsl/tests/test_bash_apps/test_stdout.py +33 -8
- parsl/tests/test_callables.py +2 -2
- parsl/tests/test_checkpointing/test_periodic.py +21 -39
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_regression_239.py +1 -1
- parsl/tests/test_checkpointing/test_task_exit.py +2 -3
- parsl/tests/test_docs/test_from_slides.py +5 -2
- parsl/tests/test_docs/test_kwargs.py +4 -1
- parsl/tests/test_docs/test_tutorial_1.py +1 -2
- parsl/tests/test_docs/test_workflow1.py +2 -2
- parsl/tests/test_docs/test_workflow2.py +0 -1
- parsl/tests/test_error_handling/test_rand_fail.py +2 -2
- parsl/tests/test_error_handling/test_resource_spec.py +10 -12
- parsl/tests/test_error_handling/test_retries.py +6 -16
- parsl/tests/test_error_handling/test_retry_handler.py +1 -0
- parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
- parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
- parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
- parsl/tests/test_execute_task.py +29 -0
- parsl/tests/test_flux.py +1 -1
- parsl/tests/test_htex/test_basic.py +2 -3
- parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
- parsl/tests/test_htex/test_command_client_timeout.py +66 -0
- parsl/tests/test_htex/test_connected_blocks.py +3 -2
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
- parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_drain.py +79 -0
- parsl/tests/test_htex/test_htex.py +51 -25
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
- parsl/tests/test_htex/test_managers_command.py +36 -0
- parsl/tests/test_htex/test_missing_worker.py +2 -12
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
- parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
- parsl/tests/test_htex/test_zmq_binding.py +29 -8
- parsl/tests/test_monitoring/test_app_names.py +86 -0
- parsl/tests/test_monitoring/test_basic.py +73 -25
- parsl/tests/test_monitoring/test_db_locks.py +6 -4
- parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
- parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
- parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
- parsl/tests/test_monitoring/test_stdouterr.py +134 -0
- parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
- parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
- parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
- parsl/tests/test_providers/test_local_provider.py +3 -132
- parsl/tests/test_providers/test_pbspro_template.py +2 -3
- parsl/tests/test_providers/test_slurm_template.py +2 -3
- parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
- parsl/tests/test_python_apps/test_context_manager.py +128 -0
- parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_python_apps/test_fail.py +0 -25
- parsl/tests/test_python_apps/test_futures.py +2 -1
- parsl/tests/test_python_apps/test_inputs_default.py +22 -0
- parsl/tests/test_python_apps/test_join.py +0 -1
- parsl/tests/test_python_apps/test_lifted.py +11 -7
- parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
- parsl/tests/test_python_apps/test_outputs.py +1 -1
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_radical/test_mpi_funcs.py +1 -2
- parsl/tests/test_regression/test_1480.py +2 -1
- parsl/tests/test_regression/test_1653.py +2 -1
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_regression/test_2652.py +1 -0
- parsl/tests/test_regression/test_69a.py +0 -1
- parsl/tests/test_regression/test_854.py +4 -2
- parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
- parsl/tests/test_regression/test_98.py +0 -1
- parsl/tests/test_scaling/test_block_error_handler.py +9 -4
- parsl/tests/test_scaling/test_regression_1621.py +11 -15
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
- parsl/tests/test_scaling/test_scale_down.py +2 -5
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +6 -18
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
- parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
- parsl/tests/test_serialization/test_basic.py +2 -1
- parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
- parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
- parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
- parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
- parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
- parsl/tests/test_staging/staging_provider.py +2 -2
- parsl/tests/test_staging/test_1316.py +3 -4
- parsl/tests/test_staging/test_docs_1.py +2 -1
- parsl/tests/test_staging/test_docs_2.py +2 -1
- parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
- parsl/tests/{test_data → test_staging}/test_file.py +6 -6
- parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +5 -2
- parsl/tests/test_staging/test_staging_stdout.py +64 -0
- parsl/tests/test_staging/test_zip_in.py +39 -0
- parsl/tests/test_staging/test_zip_out.py +110 -0
- parsl/tests/test_staging/test_zip_to_zip.py +41 -0
- parsl/tests/test_summary.py +2 -2
- parsl/tests/test_thread_parallelism.py +0 -1
- parsl/tests/test_threads/test_configs.py +1 -2
- parsl/tests/test_threads/test_lazy_errors.py +2 -2
- parsl/tests/test_utils/test_execute_wait.py +35 -0
- parsl/tests/test_utils/test_sanitize_dns.py +76 -0
- parsl/tests/unit/test_address.py +20 -0
- parsl/tests/unit/test_file.py +99 -0
- parsl/tests/unit/test_usage_tracking.py +66 -0
- parsl/usage_tracking/api.py +65 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +104 -62
- parsl/utils.py +139 -6
- parsl/version.py +1 -1
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
- parsl-2025.1.13.data/scripts/interchange.py +649 -0
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +115 -77
- parsl-2025.1.13.dist-info/METADATA +96 -0
- parsl-2025.1.13.dist-info/RECORD +462 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
- parsl/channels/__init__.py +0 -7
- parsl/channels/base.py +0 -141
- parsl/channels/errors.py +0 -113
- parsl/channels/local/local.py +0 -164
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
- parsl/channels/ssh/ssh.py +0 -276
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -74
- parsl/configs/ad_hoc.py +0 -35
- parsl/executors/radical/rpex_master.py +0 -42
- parsl/monitoring/radios.py +0 -175
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -248
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/integration/test_channels/__init__.py +0 -0
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- parsl/tests/integration/test_channels/test_local_channel.py +0 -42
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl/tests/sites/test_local_adhoc.py +0 -61
- parsl/tests/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/test_large_output.py +0 -22
- parsl/tests/test_data/__init__.py +0 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
- parsl-2024.3.11.dist-info/METADATA +0 -98
- parsl-2024.3.11.dist-info/RECORD +0 -447
- parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
- parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
- {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
parsl/dataflow/dflow.py
CHANGED
@@ -1,50 +1,54 @@
 from __future__ import annotations
+
 import atexit
+import concurrent.futures as cf
+import datetime
+import inspect
 import logging
 import os
-import pathlib
 import pickle
 import random
-import time
-import typeguard
-import inspect
-import threading
 import sys
-import
+import threading
+import time
+from concurrent.futures import Future
+from functools import partial
 from getpass import getuser
-from
+from socket import gethostname
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 from uuid import uuid4
-
-
-from
+
+import typeguard
+from typeguard import typechecked
 
 import parsl
 from parsl.app.errors import RemoteExceptionWrapper
 from parsl.app.futures import DataFuture
-from parsl.channels import Channel
 from parsl.config import Config
 from parsl.data_provider.data_manager import DataManager
 from parsl.data_provider.files import File
+from parsl.dataflow.dependency_resolvers import SHALLOW_DEPENDENCY_RESOLVER
 from parsl.dataflow.errors import BadCheckpoint, DependencyError, JoinError
 from parsl.dataflow.futures import AppFuture
 from parsl.dataflow.memoization import Memoizer
 from parsl.dataflow.rundirs import make_rundir
-from parsl.dataflow.states import
+from parsl.dataflow.states import FINAL_FAILURE_STATES, FINAL_STATES, States
 from parsl.dataflow.taskrecord import TaskRecord
-from parsl.errors import
-
-
-
+from parsl.errors import (
+    ConfigurationError,
+    InternalConsistencyError,
+    NoDataFlowKernelError,
+)
 from parsl.executors.base import ParslExecutor
 from parsl.executors.status_handling import BlockProviderExecutor
 from parsl.executors.threads import ThreadPoolExecutor
+from parsl.jobs.job_status_poller import JobStatusPoller
 from parsl.monitoring import MonitoringHub
-from parsl.process_loggers import wrap_with_logs
-from parsl.providers.base import ExecutionProvider
-from parsl.utils import get_version, get_std_fname_mode, get_all_checkpoints, Timer
-
 from parsl.monitoring.message_type import MessageType
+from parsl.monitoring.remote import monitor_wrapper
+from parsl.process_loggers import wrap_with_logs
+from parsl.usage_tracking.usage import UsageTracker
+from parsl.utils import Timer, get_all_checkpoints, get_std_fname_mode, get_version
 
 logger = logging.getLogger(__name__)
 
@@ -106,14 +110,8 @@ class DataFlowKernel:
         self.monitoring: Optional[MonitoringHub]
         self.monitoring = config.monitoring
 
-        # hub address and port for interchange to connect
-        self.hub_address = None  # type: Optional[str]
-        self.hub_interchange_port = None  # type: Optional[int]
         if self.monitoring:
-
-            self.monitoring.logdir = self.run_dir
-            self.hub_address = self.monitoring.hub_address
-            self.hub_interchange_port = self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)
+            self.monitoring.start(self.run_dir, self.config.run_dir)
 
         self.time_began = datetime.datetime.now()
         self.time_completed: Optional[datetime.datetime] = None
@@ -159,8 +157,8 @@ class DataFlowKernel:
         }
 
         if self.monitoring:
-            self.monitoring.send(MessageType.WORKFLOW_INFO,
-                                 workflow_info)
+            self.monitoring.send((MessageType.WORKFLOW_INFO,
+                                  workflow_info))
 
         if config.checkpoint_files is not None:
             checkpoints = self.load_checkpoints(config.checkpoint_files)
@@ -178,8 +176,8 @@ class DataFlowKernel:
         # this must be set before executors are added since add_executors calls
         # job_status_poller.add_executors.
         self.job_status_poller = JobStatusPoller(strategy=self.config.strategy,
-
-
+                                                 strategy_period=self.config.strategy_period,
+                                                 max_idletime=self.config.max_idletime)
 
         self.executors: Dict[str, ParslExecutor] = {}
 
@@ -203,21 +201,52 @@ class DataFlowKernel:
         self.tasks: Dict[int, TaskRecord] = {}
         self.submitter_lock = threading.Lock()
 
+        self.dependency_launch_pool = cf.ThreadPoolExecutor(max_workers=1, thread_name_prefix="Dependency-Launch")
+
+        self.dependency_resolver = self.config.dependency_resolver if self.config.dependency_resolver is not None \
+            else SHALLOW_DEPENDENCY_RESOLVER
+
         atexit.register(self.atexit_cleanup)
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        mode = self.config.exit_mode
+        logger.debug("Exiting context manager, with exit mode '%s'", mode)
+        if mode == "cleanup":
+            logger.info("Calling cleanup for DFK")
+            self.cleanup()
+        elif mode == "skip":
+            logger.info("Skipping all cleanup handling")
+        elif mode == "wait":
+            if exc_type is None:
+                logger.info("Waiting for all tasks to complete")
+                self.wait_for_current_tasks()
+                self.cleanup()
+            else:
+                logger.info("There was an exception - cleaning up without waiting for task completion")
+                self.cleanup()
+        else:
+            raise InternalConsistencyError(f"Exit case for {mode} should be unreachable, validated by typeguard on Config()")
+
     def _send_task_log_info(self, task_record: TaskRecord) -> None:
         if self.monitoring:
             task_log_info = self._create_task_log_info(task_record)
-            self.monitoring.send(MessageType.TASK_INFO, task_log_info)
+            self.monitoring.send((MessageType.TASK_INFO, task_log_info))
 
-    def _create_task_log_info(self, task_record):
+    def _create_task_log_info(self, task_record: TaskRecord) -> Dict[str, Any]:
         """
         Create the dictionary that will be included in the log.
         """
         info_to_monitor = ['func_name', 'memoize', 'hashsum', 'fail_count', 'fail_cost', 'status',
                            'id', 'time_invoked', 'try_time_launched', 'time_returned', 'try_time_returned', 'executor']
 
-
+        # mypy cannot verify that these task_record[k] references are valid:
+        # They are valid if all entries in info_to_monitor are declared in the definition of TaskRecord
+        # This type: ignore[literal-required] asserts that fact.
+        task_log_info = {"task_" + k: task_record[k] for k in info_to_monitor}  # type: ignore[literal-required]
+
         task_log_info['run_id'] = self.run_id
         task_log_info['try_id'] = task_record['try_id']
         task_log_info['timestamp'] = datetime.datetime.now()
@@ -229,20 +258,28 @@ class DataFlowKernel:
         task_log_info['task_inputs'] = str(task_record['kwargs'].get('inputs', None))
         task_log_info['task_outputs'] = str(task_record['kwargs'].get('outputs', None))
         task_log_info['task_stdin'] = task_record['kwargs'].get('stdin', None)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        def std_spec_to_name(name, spec):
+            if spec is None:
+                name = ""
+            elif isinstance(spec, File):
+                name = spec.url
+            else:
+                # fallthrough case is various str, os.PathLike, tuple modes that
+                # can be interpreted by get_std_fname_mode.
+                try:
+                    name, _ = get_std_fname_mode(name, spec)
+                except Exception:
+                    logger.exception(f"Could not parse {name} specification {spec} for task {task_record['id']}")
+                    name = ""
+            return name
+
+        stdout_spec = task_record['kwargs'].get('stdout')
+        task_log_info['task_stdout'] = std_spec_to_name('stdout', stdout_spec)
+
+        stderr_spec = task_record['kwargs'].get('stderr')
+        task_log_info['task_stderr'] = std_spec_to_name('stderr', stderr_spec)
+
         task_log_info['task_fail_history'] = ",".join(task_record['fail_history'])
         task_log_info['task_depends'] = None
         if task_record['depends'] is not None:
@@ -583,9 +620,9 @@ class DataFlowKernel:
         return kwargs.get('_parsl_staging_inhibit', False)
 
     def launch_if_ready(self, task_record: TaskRecord) -> None:
-        """
-
-
+        """Schedules a task record for re-inspection to see if it is ready
+        for launch and for launch if it is ready. The call will return
+        immediately.
 
         This should be called by any piece of the DataFlowKernel that
         thinks a task may have become ready to run.
@@ -594,13 +631,17 @@ class DataFlowKernel:
         ready to run - launch_if_ready will not incorrectly launch that
         task.
 
-        It is also not an error to call launch_if_ready on a task that has
-        already been launched - launch_if_ready will not re-launch that
-        task.
-
         launch_if_ready is thread safe, so may be called from any thread
         or callback.
         """
+        self.dependency_launch_pool.submit(self._launch_if_ready_async, task_record)
+
+    @wrap_with_logs
+    def _launch_if_ready_async(self, task_record: TaskRecord) -> None:
+        """
+        _launch_if_ready will launch the specified task, if it is ready
+        to run (for example, without dependencies, and in pending state).
+        """
         exec_fu = None
 
         task_id = task_record['id']
@@ -666,14 +707,6 @@ class DataFlowKernel:
     def launch_task(self, task_record: TaskRecord) -> Future:
         """Handle the actual submission of the task to the executor layer.
 
-        If the app task has the executors attributes not set (default=='all')
-        the task is launched on a randomly selected executor from the
-        list of executors. This behavior could later be updated to support
-        binding to executors based on user specified criteria.
-
-        If the app task specifies a particular set of executors, it will be
-        targeted at those specific executors.
-
         Args:
             task_record : The task record
 
@@ -706,14 +739,18 @@ class DataFlowKernel:
 
         if self.monitoring is not None and self.monitoring.resource_monitoring_enabled:
             wrapper_logging_level = logging.DEBUG if self.monitoring.monitoring_debug else logging.INFO
-            (function, args, kwargs) =
-
-
-
-
-
-
-
+            (function, args, kwargs) = monitor_wrapper(f=function,
+                                                       args=args,
+                                                       kwargs=kwargs,
+                                                       x_try_id=try_id,
+                                                       x_task_id=task_id,
+                                                       monitoring_hub_url=self.monitoring.monitoring_hub_url,
+                                                       run_id=self.run_id,
+                                                       logging_level=wrapper_logging_level,
+                                                       sleep_dur=self.monitoring.resource_monitoring_interval,
+                                                       radio_mode=executor.radio_mode,
+                                                       monitor_resources=executor.monitor_resources(),
+                                                       run_dir=self.run_dir)
 
         with self.submitter_lock:
             exec_fu = executor.submit(function, task_record['resource_specification'], *args, **kwargs)
@@ -756,6 +793,10 @@ class DataFlowKernel:
             (inputs[idx], func) = self.data_manager.optionally_stage_in(f, func, executor)
 
         for kwarg, f in kwargs.items():
+            # stdout and stderr files should not be staging in (they will be staged *out*
+            # in _add_output_deps)
+            if kwarg in ['stdout', 'stderr']:
+                continue
             (kwargs[kwarg], func) = self.data_manager.optionally_stage_in(f, func, executor)
 
         newargs = list(args)
@@ -768,33 +809,55 @@ class DataFlowKernel:
         logger.debug("Adding output dependencies")
         outputs = kwargs.get('outputs', [])
         app_fut._outputs = []
-
-
+
+        # Pass over all possible outputs: the outputs kwarg, stdout and stderr
+        # and for each of those, perform possible stage-out. This can result in:
+        # a DataFuture to be exposed in app_fut to represent the completion of
+        # that stageout (sometimes backed by a new sub-workflow for separate-task
+        # stageout), a replacement for the function to be executed (intended to
+        # be the original function wrapped with an in-task stageout wrapper), a
+        # rewritten File object to be passed to task to be executed
+
+        def stageout_one_file(file: File, rewritable_func: Callable):
+            if not self.check_staging_inhibited(kwargs):
                 # replace a File with a DataFuture - either completing when the stageout
                 # future completes, or if no stage out future is returned, then when the
                 # app itself completes.
 
                 # The staging code will get a clean copy which it is allowed to mutate,
                 # while the DataFuture-contained original will not be modified by any staging.
-                f_copy =
-                outputs[idx] = f_copy
+                f_copy = file.cleancopy()
 
-                logger.debug("Submitting stage out for output file {}".format(repr(
+                logger.debug("Submitting stage out for output file {}".format(repr(file)))
                 stageout_fut = self.data_manager.stage_out(f_copy, executor, app_fut)
                 if stageout_fut:
-                    logger.debug("Adding a dependency on stageout future for {}".format(repr(
-
+                    logger.debug("Adding a dependency on stageout future for {}".format(repr(file)))
+                    df = DataFuture(stageout_fut, file, tid=app_fut.tid)
                 else:
-                    logger.debug("No stageout dependency for {}".format(repr(
-
+                    logger.debug("No stageout dependency for {}".format(repr(file)))
+                    df = DataFuture(app_fut, file, tid=app_fut.tid)
 
                 # this is a hook for post-task stageout
                 # note that nothing depends on the output - which is maybe a bug
                 # in the not-very-tested stageout system?
-
+                rewritable_func = self.data_manager.replace_task_stage_out(f_copy, rewritable_func, executor)
+                return rewritable_func, f_copy, df
             else:
-                logger.debug("Not performing output staging for: {}".format(repr(
-
+                logger.debug("Not performing output staging for: {}".format(repr(file)))
+                return rewritable_func, file, DataFuture(app_fut, file, tid=app_fut.tid)
+
+        for idx, file in enumerate(outputs):
+            func, outputs[idx], o = stageout_one_file(file, func)
+            app_fut._outputs.append(o)
+
+        file = kwargs.get('stdout')
+        if isinstance(file, File):
+            func, kwargs['stdout'], app_fut._stdout_future = stageout_one_file(file, func)
+
+        file = kwargs.get('stderr')
+        if isinstance(file, File):
+            func, kwargs['stderr'], app_fut._stderr_future = stageout_one_file(file, func)
+
         return func
 
     def _gather_all_deps(self, args: Sequence[Any], kwargs: Dict[str, Any]) -> List[Future]:
@@ -811,8 +874,11 @@ class DataFlowKernel:
         depends: List[Future] = []
 
         def check_dep(d: Any) -> None:
-
-            depends.extend(
+            try:
+                depends.extend(self.dependency_resolver.traverse_to_gather(d))
+            except Exception:
+                logger.exception("Exception in dependency_resolver.traverse_to_gather")
+                raise
 
         # Check the positional args
         for dep in args:
@@ -829,7 +895,8 @@ class DataFlowKernel:
 
         return depends
 
-    def _unwrap_futures(self, args, kwargs)
+    def _unwrap_futures(self, args: Sequence[Any], kwargs: Dict[str, Any]) \
+            -> Tuple[Sequence[Any], Dict[str, Any], Sequence[Tuple[Exception, str]]]:
         """This function should be called when all dependencies have completed.
 
         It will rewrite the arguments for that task, replacing each Future
@@ -850,53 +917,40 @@ class DataFlowKernel:
         """
         dep_failures = []
 
+        def append_failure(e: Exception, dep: Future) -> None:
+            # If this Future is associated with a task inside this DFK,
+            # then refer to the task ID.
+            # Otherwise make a repr of the Future object.
+            if hasattr(dep, 'task_record') and dep.task_record['dfk'] == self:
+                tid = "task " + repr(dep.task_record['id'])
+            else:
+                tid = repr(dep)
+            dep_failures.extend([(e, tid)])
+
         # Replace item in args
         new_args = []
         for dep in args:
-
-
-
-
-                    # If this Future is associated with a task inside this DFK,
-                    # then refer to the task ID.
-                    # Otherwise make a repr of the Future object.
-                    if hasattr(dep, 'task_record') and dep.task_record['dfk'] == self:
-                        tid = "task " + repr(dep.task_record['id'])
-                    else:
-                        tid = repr(dep)
-                    dep_failures.extend([(e, tid)])
-            else:
-                new_args.extend([dep])
+            try:
+                new_args.extend([self.dependency_resolver.traverse_to_unwrap(dep)])
+            except Exception as e:
+                append_failure(e, dep)
 
         # Check for explicit kwargs ex, fu_1=<fut>
         for key in kwargs:
             dep = kwargs[key]
-
-
-
-
-                if hasattr(dep, 'task_record'):
-                    tid = dep.task_record['id']
-                else:
-                    tid = None
-                dep_failures.extend([(e, tid)])
+            try:
+                kwargs[key] = self.dependency_resolver.traverse_to_unwrap(dep)
+            except Exception as e:
+                append_failure(e, dep)
 
         # Check for futures in inputs=[<fut>...]
         if 'inputs' in kwargs:
            new_inputs = []
            for dep in kwargs['inputs']:
-
-
-
-
-                    if hasattr(dep, 'task_record'):
-                        tid = dep.task_record['id']
-                    else:
-                        tid = None
-                    dep_failures.extend([(e, tid)])
-
-                else:
-                    new_inputs.extend([dep])
+                try:
+                    new_inputs.extend([self.dependency_resolver.traverse_to_unwrap(dep)])
+                except Exception as e:
+                    append_failure(e, dep)
            kwargs['inputs'] = new_inputs
 
         return new_args, kwargs, dep_failures
@@ -928,7 +982,7 @@ class DataFlowKernel:
             - app_kwargs (dict) : Rest of the kwargs to the fn passed as dict.
 
         Returns:
-
+               AppFuture
 
         """
 
@@ -952,32 +1006,16 @@ class DataFlowKernel:
         executor = random.choice(choices)
         logger.debug("Task {} will be sent to executor {}".format(task_id, executor))
 
-        # The below uses func.__name__ before it has been wrapped by any staging code.
-
-        label = app_kwargs.get('label')
-        for kw in ['stdout', 'stderr']:
-            if kw in app_kwargs:
-                if app_kwargs[kw] == parsl.AUTO_LOGNAME:
-                    if kw not in ignore_for_cache:
-                        ignore_for_cache += [kw]
-                    app_kwargs[kw] = os.path.join(
-                        self.run_dir,
-                        'task_logs',
-                        str(int(task_id / 10000)).zfill(4),  # limit logs to 10k entries per directory
-                        'task_{}_{}{}.{}'.format(
-                            str(task_id).zfill(4),
-                            func.__name__,
-                            '' if label is None else '_{}'.format(label),
-                            kw)
-                    )
-
         resource_specification = app_kwargs.get('parsl_resource_specification', {})
 
         task_record: TaskRecord
-        task_record = {'
+        task_record = {'args': app_args,
+                       'depends': [],
                        'dfk': self,
                        'executor': executor,
+                       'func': func,
                        'func_name': func.__name__,
+                       'kwargs': app_kwargs,
                        'memoize': cache,
                        'hashsum': None,
                        'exec_fu': None,
@@ -999,25 +1037,41 @@ class DataFlowKernel:
 
         self.update_task_state(task_record, States.unsched)
 
+        for kw in ['stdout', 'stderr']:
+            if kw in app_kwargs:
+                if app_kwargs[kw] == parsl.AUTO_LOGNAME:
+                    if kw not in ignore_for_cache:
+                        ignore_for_cache += [kw]
+                    if self.config.std_autopath is None:
+                        app_kwargs[kw] = self.default_std_autopath(task_record, kw)
+                    else:
+                        app_kwargs[kw] = self.config.std_autopath(task_record, kw)
+
         app_fu = AppFuture(task_record)
+        task_record['app_fu'] = app_fu
 
         # Transform remote input files to data futures
         app_args, app_kwargs, func = self._add_input_deps(executor, app_args, app_kwargs, func)
 
         func = self._add_output_deps(executor, app_args, app_kwargs, app_fu, func)
 
+        logger.debug("Added output dependencies")
+
+        # Replace the function invocation in the TaskRecord with whatever file-staging
+        # substitutions have been made.
         task_record.update({
            'args': app_args,
            'func': func,
-            'kwargs': app_kwargs
-            'app_fu': app_fu})
+            'kwargs': app_kwargs})
 
         assert task_id not in self.tasks
 
         self.tasks[task_id] = task_record
 
+        logger.debug("Gathering dependencies")
         # Get the list of dependencies for the task
         depends = self._gather_all_deps(app_args, app_kwargs)
+        logger.debug("Gathered dependencies")
         task_record['depends'] = depends
 
         depend_descs = []
@@ -1084,73 +1138,28 @@ class DataFlowKernel:
 
         logger.info("End of summary")
 
-    def
-        """Create script directories across a channel
-
-        Parameters
-        ----------
-        provider: Provider obj
-           Provider for which scripts dirs are being created
-        channel: Channel obj
-           Channel over which the remote dirs are to be created
-        """
-        run_dir = self.run_dir
-        if channel.script_dir is None:
-
-            # This case will be detected as unreachable by mypy, because of
-            # the type of script_dir, which is str, not Optional[str].
-            # The type system doesn't represent the initialized/uninitialized
-            # state of a channel so cannot represent that a channel needs
-            # its script directory set or not.
-
-            channel.script_dir = os.path.join(run_dir, 'submit_scripts')  # type: ignore[unreachable]
-
-            # Only create dirs if we aren't on a shared-fs
-            if not channel.isdir(run_dir):
-                parent, child = pathlib.Path(run_dir).parts[-2:]
-                remote_run_dir = os.path.join(parent, child)
-                channel.script_dir = os.path.join(remote_run_dir, 'remote_submit_scripts')
-                provider.script_dir = os.path.join(run_dir, 'local_submit_scripts')
-
-        channel.makedirs(channel.script_dir, exist_ok=True)
-
-    def add_executors(self, executors):
+    def add_executors(self, executors: Sequence[ParslExecutor]) -> None:
         for executor in executors:
             executor.run_id = self.run_id
             executor.run_dir = self.run_dir
-
-
+            if self.monitoring:
+                executor.hub_address = self.monitoring.hub_address
+                executor.hub_zmq_port = self.monitoring.hub_zmq_port
+                executor.submit_monitoring_radio = self.monitoring.radio
            if hasattr(executor, 'provider'):
                if hasattr(executor.provider, 'script_dir'):
                    executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
                    os.makedirs(executor.provider.script_dir, exist_ok=True)
 
-                if hasattr(executor.provider, 'channels'):
-                    logger.debug("Creating script_dir across multiple channels")
-                    for channel in executor.provider.channels:
-                        self._create_remote_dirs_over_channel(executor.provider, channel)
-                else:
-                    self._create_remote_dirs_over_channel(executor.provider, executor.provider.channel)
-
            self.executors[executor.label] = executor
-
-            if self.monitoring and block_ids:
-                new_status = {}
-                for bid in block_ids:
-                    new_status[bid] = JobStatus(JobState.PENDING)
-                msg = executor.create_monitoring_info(new_status)
-                logger.debug("Sending monitoring message {} to hub from DFK".format(msg))
-                self.monitoring.send(MessageType.BLOCK_INFO, msg)
+            executor.start()
         block_executors = [e for e in executors if isinstance(e, BlockProviderExecutor)]
         self.job_status_poller.add_executors(block_executors)
 
     def atexit_cleanup(self) -> None:
-
-
-
-                           "exiting to release any resources")
-        else:
-            logger.info("python process is exiting, but DFK has already been cleaned up")
+        logger.warning("Python is exiting with a DFK still running. "
+                       "You should call parsl.dfk().cleanup() before "
+                       "exiting to release any resources")
 
     def wait_for_current_tasks(self) -> None:
         """Waits for all tasks in the task list to be completed, by waiting for their
@@ -1170,7 +1179,8 @@ class DataFlowKernel:
             fut = task_record['app_fu']
             if not fut.done():
                 fut.exception()
-            # now app future is done, poll until DFK state is final: a
+            # now app future is done, poll until DFK state is final: a
+            # DFK state being final and the app future being done do not imply each other.
             while task_record['status'] not in FINAL_STATES:
                 time.sleep(0.1)
 
@@ -1205,31 +1215,18 @@ class DataFlowKernel:
             self._checkpoint_timer.close()
 
         # Send final stats
+        logger.info("Sending end message for usage tracking")
         self.usage_tracker.send_end_message()
         self.usage_tracker.close()
+        logger.info("Closed usage tracking")
 
         logger.info("Closing job status poller")
         self.job_status_poller.close()
         logger.info("Terminated job status poller")
 
-        logger.info("
+        logger.info("Shutting down executors")
 
         for executor in self.executors.values():
-            if isinstance(executor, BlockProviderExecutor):
-                if not executor.bad_state_is_set:
-                    logger.info(f"Scaling in executor {executor.label}")
-                    if executor.provider:
-                        job_ids = executor.provider.resources.keys()
-                        block_ids = executor.scale_in(len(job_ids))
-                        if self.monitoring and block_ids:
-                            new_status = {}
-                            for bid in block_ids:
-                                new_status[bid] = JobStatus(JobState.CANCELLED)
-                            msg = executor.create_monitoring_info(new_status)
-                            logger.debug("Sending message {} to hub from DFK".format(msg))
-                            self.monitoring.send(MessageType.BLOCK_INFO, msg)
-                else:  # and bad_state_is_set
-                    logger.warning(f"Not shutting down executor {executor.label} because it is in bad state")
             logger.info(f"Shutting down executor {executor.label}")
             executor.shutdown()
             logger.info(f"Shut down executor {executor.label}")
@@ -1239,18 +1236,32 @@ class DataFlowKernel:
 
         if self.monitoring:
             logger.info("Sending final monitoring message")
-            self.monitoring.send(MessageType.WORKFLOW_INFO,
+            self.monitoring.send((MessageType.WORKFLOW_INFO,
                                  {'tasks_failed_count': self.task_state_counts[States.failed],
                                   'tasks_completed_count': self.task_state_counts[States.exec_done],
                                   "time_began": self.time_began,
                                   'time_completed': self.time_completed,
-                                  'run_id': self.run_id, 'rundir': self.run_dir
-                                  'exit_now': True})
+                                   'run_id': self.run_id, 'rundir': self.run_dir}))
 
             logger.info("Terminating monitoring")
             self.monitoring.close()
             logger.info("Terminated monitoring")
 
+        logger.info("Terminating dependency launch pool")
+        self.dependency_launch_pool.shutdown()
+        logger.info("Terminated dependency launch pool")
+
+        logger.info("Unregistering atexit hook")
+        atexit.unregister(self.atexit_cleanup)
+        logger.info("Unregistered atexit hook")
+
+        if DataFlowKernelLoader._dfk is self:
+            logger.info("Unregistering default DFK")
+            parsl.clear()
+            logger.info("Unregistered default DFK")
+        else:
+            logger.debug("Cleaning up non-default DFK - not unregistering")
+
         logger.info("DFK cleanup complete")
 
     def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> str:
@@ -1386,8 +1397,6 @@ class DataFlowKernel:
         Returns:
             - dict containing, hashed -> future mappings
         """
-        self.memo_lookup_table = None
-
         if checkpointDirs:
             return self._load_checkpoints(checkpointDirs)
         else:
@@ -1395,10 +1404,39 @@ class DataFlowKernel:
 
     @staticmethod
     def _log_std_streams(task_record: TaskRecord) -> None:
-
-
-
-
+        tid = task_record['id']
+
+        def log_std_stream(name: str, target) -> None:
+            if target is None:
+                logger.info(f"{name} for task {tid} will not be redirected.")
+            elif isinstance(target, str):
+                logger.info(f"{name} for task {tid} will be redirected to {target}")
+            elif isinstance(target, os.PathLike):
+                logger.info(f"{name} for task {tid} will be redirected to {os.fspath(target)}")
+            elif isinstance(target, tuple) and len(target) == 2 and isinstance(target[0], str):
+                logger.info(f"{name} for task {tid} will be redirected to {target[0]} with mode {target[1]}")
+            elif isinstance(target, tuple) and len(target) == 2 and isinstance(target[0], os.PathLike):
+                logger.info(f"{name} for task {tid} will be redirected to {os.fspath(target[0])} with mode {target[1]}")
+            elif isinstance(target, DataFuture):
+                logger.info(f"{name} for task {tid} will staged to {target.file_obj.url}")
+            else:
+                logger.error(f"{name} for task {tid} has unknown specification: {target!r}")
+
+        log_std_stream("Standard out", task_record['app_fu'].stdout)
+        log_std_stream("Standard error", task_record['app_fu'].stderr)
+
+    def default_std_autopath(self, taskrecord, kw):
+        label = taskrecord['kwargs'].get('label')
+        task_id = taskrecord['id']
+        return os.path.join(
+            self.run_dir,
+            'task_logs',
+            str(int(task_id / 10000)).zfill(4),  # limit logs to 10k entries per directory
+            'task_{}_{}{}.{}'.format(
+                str(task_id).zfill(4),
+                taskrecord['func_name'],
+                '' if label is None else '_{}'.format(label),
+                kw))
 
 
 class DataFlowKernelLoader: