parsl 2024.3.18__py3-none-any.whl → 2025.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/__init__.py +9 -10
- parsl/addresses.py +26 -6
- parsl/app/app.py +7 -8
- parsl/app/bash.py +15 -8
- parsl/app/errors.py +10 -13
- parsl/app/futures.py +8 -10
- parsl/app/python.py +2 -1
- parsl/benchmark/perf.py +2 -1
- parsl/concurrent/__init__.py +2 -2
- parsl/config.py +53 -10
- parsl/configs/ASPIRE1.py +6 -5
- parsl/configs/Azure.py +9 -8
- parsl/configs/bridges.py +6 -4
- parsl/configs/cc_in2p3.py +3 -3
- parsl/configs/ec2.py +3 -1
- parsl/configs/expanse.py +4 -3
- parsl/configs/frontera.py +3 -4
- parsl/configs/htex_local.py +3 -4
- parsl/configs/illinoiscluster.py +3 -1
- parsl/configs/improv.py +34 -0
- parsl/configs/kubernetes.py +4 -3
- parsl/configs/local_threads.py +5 -1
- parsl/configs/midway.py +5 -3
- parsl/configs/osg.py +4 -2
- parsl/configs/polaris.py +4 -2
- parsl/configs/stampede2.py +6 -5
- parsl/configs/summit.py +3 -3
- parsl/configs/toss3_llnl.py +4 -3
- parsl/configs/vineex_local.py +6 -4
- parsl/configs/wqex_local.py +5 -3
- parsl/curvezmq.py +4 -0
- parsl/data_provider/data_manager.py +4 -3
- parsl/data_provider/file_noop.py +1 -2
- parsl/data_provider/files.py +3 -3
- parsl/data_provider/ftp.py +1 -3
- parsl/data_provider/globus.py +7 -6
- parsl/data_provider/http.py +2 -2
- parsl/data_provider/rsync.py +1 -1
- parsl/data_provider/staging.py +2 -2
- parsl/data_provider/zip.py +135 -0
- parsl/dataflow/dependency_resolvers.py +115 -0
- parsl/dataflow/dflow.py +259 -223
- parsl/dataflow/errors.py +3 -5
- parsl/dataflow/futures.py +27 -14
- parsl/dataflow/memoization.py +5 -5
- parsl/dataflow/rundirs.py +5 -6
- parsl/dataflow/taskrecord.py +4 -5
- parsl/executors/__init__.py +4 -2
- parsl/executors/base.py +45 -15
- parsl/executors/errors.py +13 -0
- parsl/executors/execute_task.py +37 -0
- parsl/executors/flux/execute_parsl_task.py +3 -3
- parsl/executors/flux/executor.py +18 -19
- parsl/executors/flux/flux_instance_manager.py +26 -27
- parsl/executors/high_throughput/errors.py +43 -3
- parsl/executors/high_throughput/executor.py +307 -285
- parsl/executors/high_throughput/interchange.py +137 -168
- parsl/executors/high_throughput/manager_record.py +4 -0
- parsl/executors/high_throughput/manager_selector.py +55 -0
- parsl/executors/high_throughput/monitoring_info.py +2 -1
- parsl/executors/high_throughput/mpi_executor.py +113 -0
- parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
- parsl/executors/high_throughput/mpi_resource_management.py +6 -17
- parsl/executors/high_throughput/probe.py +9 -7
- parsl/executors/high_throughput/process_worker_pool.py +77 -75
- parsl/executors/high_throughput/zmq_pipes.py +81 -23
- parsl/executors/radical/executor.py +130 -79
- parsl/executors/radical/rpex_resources.py +17 -15
- parsl/executors/radical/rpex_worker.py +4 -3
- parsl/executors/status_handling.py +157 -51
- parsl/executors/taskvine/__init__.py +1 -1
- parsl/executors/taskvine/errors.py +1 -1
- parsl/executors/taskvine/exec_parsl_function.py +2 -2
- parsl/executors/taskvine/executor.py +38 -55
- parsl/executors/taskvine/factory.py +1 -1
- parsl/executors/taskvine/factory_config.py +1 -1
- parsl/executors/taskvine/manager.py +17 -13
- parsl/executors/taskvine/manager_config.py +7 -2
- parsl/executors/threads.py +6 -6
- parsl/executors/workqueue/errors.py +1 -1
- parsl/executors/workqueue/exec_parsl_function.py +6 -5
- parsl/executors/workqueue/executor.py +64 -63
- parsl/executors/workqueue/parsl_coprocess.py +1 -1
- parsl/jobs/error_handlers.py +2 -2
- parsl/jobs/job_status_poller.py +28 -112
- parsl/jobs/states.py +7 -2
- parsl/jobs/strategy.py +43 -31
- parsl/launchers/__init__.py +12 -3
- parsl/launchers/errors.py +1 -1
- parsl/launchers/launchers.py +0 -6
- parsl/log_utils.py +1 -2
- parsl/monitoring/db_manager.py +55 -93
- parsl/monitoring/errors.py +6 -0
- parsl/monitoring/monitoring.py +85 -311
- parsl/monitoring/queries/pandas.py +1 -2
- parsl/monitoring/radios/base.py +13 -0
- parsl/monitoring/radios/filesystem.py +52 -0
- parsl/monitoring/radios/htex.py +57 -0
- parsl/monitoring/radios/multiprocessing.py +17 -0
- parsl/monitoring/radios/udp.py +56 -0
- parsl/monitoring/radios/zmq.py +17 -0
- parsl/monitoring/remote.py +33 -37
- parsl/monitoring/router.py +212 -0
- parsl/monitoring/types.py +5 -6
- parsl/monitoring/visualization/app.py +4 -2
- parsl/monitoring/visualization/models.py +0 -1
- parsl/monitoring/visualization/plots/default/workflow_plots.py +8 -4
- parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
- parsl/monitoring/visualization/utils.py +0 -1
- parsl/monitoring/visualization/views.py +16 -9
- parsl/multiprocessing.py +0 -1
- parsl/process_loggers.py +1 -2
- parsl/providers/__init__.py +8 -17
- parsl/providers/aws/aws.py +2 -3
- parsl/providers/azure/azure.py +4 -5
- parsl/providers/base.py +2 -18
- parsl/providers/cluster_provider.py +3 -9
- parsl/providers/condor/condor.py +7 -17
- parsl/providers/errors.py +2 -2
- parsl/providers/googlecloud/googlecloud.py +2 -1
- parsl/providers/grid_engine/grid_engine.py +5 -14
- parsl/providers/kubernetes/kube.py +80 -40
- parsl/providers/local/local.py +13 -26
- parsl/providers/lsf/lsf.py +5 -23
- parsl/providers/pbspro/pbspro.py +5 -17
- parsl/providers/slurm/slurm.py +81 -39
- parsl/providers/torque/torque.py +3 -14
- parsl/serialize/__init__.py +8 -3
- parsl/serialize/base.py +1 -2
- parsl/serialize/concretes.py +5 -4
- parsl/serialize/facade.py +3 -3
- parsl/serialize/proxystore.py +3 -2
- parsl/tests/__init__.py +1 -1
- parsl/tests/configs/azure_single_node.py +4 -5
- parsl/tests/configs/bridges.py +3 -2
- parsl/tests/configs/cc_in2p3.py +1 -3
- parsl/tests/configs/comet.py +2 -1
- parsl/tests/configs/ec2_single_node.py +1 -2
- parsl/tests/configs/ec2_spot.py +1 -2
- parsl/tests/configs/flux_local.py +11 -0
- parsl/tests/configs/frontera.py +2 -3
- parsl/tests/configs/htex_local.py +3 -5
- parsl/tests/configs/htex_local_alternate.py +11 -15
- parsl/tests/configs/htex_local_intask_staging.py +5 -9
- parsl/tests/configs/htex_local_rsync_staging.py +4 -8
- parsl/tests/configs/local_radical.py +1 -3
- parsl/tests/configs/local_radical_mpi.py +2 -2
- parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/configs/midway.py +2 -2
- parsl/tests/configs/nscc_singapore.py +3 -3
- parsl/tests/configs/osg_htex.py +1 -1
- parsl/tests/configs/petrelkube.py +3 -2
- parsl/tests/configs/slurm_local.py +24 -0
- parsl/tests/configs/summit.py +1 -0
- parsl/tests/configs/taskvine_ex.py +4 -7
- parsl/tests/configs/user_opts.py +0 -7
- parsl/tests/configs/workqueue_ex.py +4 -6
- parsl/tests/conftest.py +27 -13
- parsl/tests/integration/test_stress/test_python_simple.py +3 -4
- parsl/tests/integration/test_stress/test_python_threads.py +3 -5
- parsl/tests/manual_tests/htex_local.py +4 -6
- parsl/tests/manual_tests/test_basic.py +1 -0
- parsl/tests/manual_tests/test_log_filter.py +3 -1
- parsl/tests/manual_tests/test_memory_limits.py +6 -8
- parsl/tests/manual_tests/test_regression_220.py +2 -1
- parsl/tests/manual_tests/test_udp_simple.py +4 -4
- parsl/tests/manual_tests/test_worker_count.py +3 -2
- parsl/tests/scaling_tests/htex_local.py +2 -4
- parsl/tests/scaling_tests/test_scale.py +0 -9
- parsl/tests/scaling_tests/vineex_condor.py +1 -2
- parsl/tests/scaling_tests/vineex_local.py +1 -2
- parsl/tests/site_tests/site_config_selector.py +1 -6
- parsl/tests/site_tests/test_provider.py +4 -2
- parsl/tests/site_tests/test_site.py +2 -0
- parsl/tests/sites/test_affinity.py +7 -7
- parsl/tests/sites/test_dynamic_executor.py +3 -4
- parsl/tests/sites/test_ec2.py +3 -2
- parsl/tests/sites/test_worker_info.py +4 -5
- parsl/tests/test_aalst_patterns.py +0 -1
- parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
- parsl/tests/test_bash_apps/test_basic.py +10 -4
- parsl/tests/test_bash_apps/test_error_codes.py +5 -7
- parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
- parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
- parsl/tests/test_bash_apps/test_memoize.py +2 -8
- parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
- parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
- parsl/tests/test_bash_apps/test_multiline.py +1 -1
- parsl/tests/test_bash_apps/test_pipeline.py +1 -1
- parsl/tests/test_bash_apps/test_std_uri.py +123 -0
- parsl/tests/test_bash_apps/test_stdout.py +33 -8
- parsl/tests/test_callables.py +2 -2
- parsl/tests/test_checkpointing/test_periodic.py +21 -39
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
- parsl/tests/test_checkpointing/test_regression_239.py +1 -1
- parsl/tests/test_checkpointing/test_task_exit.py +2 -3
- parsl/tests/test_docs/test_from_slides.py +5 -2
- parsl/tests/test_docs/test_kwargs.py +4 -1
- parsl/tests/test_docs/test_tutorial_1.py +1 -2
- parsl/tests/test_docs/test_workflow1.py +2 -2
- parsl/tests/test_docs/test_workflow2.py +0 -1
- parsl/tests/test_error_handling/test_rand_fail.py +2 -2
- parsl/tests/test_error_handling/test_resource_spec.py +10 -12
- parsl/tests/test_error_handling/test_retries.py +6 -16
- parsl/tests/test_error_handling/test_retry_handler.py +1 -0
- parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
- parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
- parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
- parsl/tests/test_execute_task.py +29 -0
- parsl/tests/test_flux.py +1 -1
- parsl/tests/test_htex/test_basic.py +2 -3
- parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
- parsl/tests/test_htex/test_command_client_timeout.py +66 -0
- parsl/tests/test_htex/test_connected_blocks.py +3 -2
- parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
- parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
- parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
- parsl/tests/test_htex/test_drain.py +11 -10
- parsl/tests/test_htex/test_htex.py +51 -25
- parsl/tests/test_htex/test_manager_failure.py +0 -1
- parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
- parsl/tests/test_htex/test_managers_command.py +36 -0
- parsl/tests/test_htex/test_missing_worker.py +2 -12
- parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
- parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
- parsl/tests/test_htex/test_zmq_binding.py +29 -8
- parsl/tests/test_monitoring/test_app_names.py +5 -5
- parsl/tests/test_monitoring/test_basic.py +73 -25
- parsl/tests/test_monitoring/test_db_locks.py +6 -4
- parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
- parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
- parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
- parsl/tests/test_monitoring/test_stdouterr.py +134 -0
- parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
- parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
- parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
- parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
- parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
- parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
- parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
- parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
- parsl/tests/test_providers/test_local_provider.py +3 -132
- parsl/tests/test_providers/test_pbspro_template.py +2 -3
- parsl/tests/test_providers/test_slurm_template.py +2 -3
- parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
- parsl/tests/test_python_apps/test_context_manager.py +128 -0
- parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
- parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
- parsl/tests/test_python_apps/test_fail.py +0 -25
- parsl/tests/test_python_apps/test_futures.py +2 -1
- parsl/tests/test_python_apps/test_inputs_default.py +22 -0
- parsl/tests/test_python_apps/test_join.py +0 -1
- parsl/tests/test_python_apps/test_lifted.py +11 -7
- parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
- parsl/tests/test_python_apps/test_outputs.py +1 -1
- parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
- parsl/tests/test_radical/test_mpi_funcs.py +1 -2
- parsl/tests/test_regression/test_1480.py +2 -1
- parsl/tests/test_regression/test_1653.py +2 -1
- parsl/tests/test_regression/test_226.py +1 -0
- parsl/tests/test_regression/test_2652.py +1 -0
- parsl/tests/test_regression/test_69a.py +0 -1
- parsl/tests/test_regression/test_854.py +4 -2
- parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
- parsl/tests/test_regression/test_98.py +0 -1
- parsl/tests/test_scaling/test_block_error_handler.py +9 -4
- parsl/tests/test_scaling/test_regression_1621.py +11 -15
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
- parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
- parsl/tests/test_scaling/test_scale_down.py +2 -5
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +5 -8
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
- parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
- parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
- parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
- parsl/tests/test_serialization/test_basic.py +2 -1
- parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
- parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
- parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
- parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
- parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
- parsl/tests/test_staging/staging_provider.py +2 -2
- parsl/tests/test_staging/test_1316.py +3 -4
- parsl/tests/test_staging/test_docs_1.py +2 -1
- parsl/tests/test_staging/test_docs_2.py +2 -1
- parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
- parsl/tests/{test_data → test_staging}/test_file.py +6 -6
- parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
- parsl/tests/test_staging/test_staging_ftp.py +1 -0
- parsl/tests/test_staging/test_staging_https.py +5 -2
- parsl/tests/test_staging/test_staging_stdout.py +64 -0
- parsl/tests/test_staging/test_zip_in.py +39 -0
- parsl/tests/test_staging/test_zip_out.py +110 -0
- parsl/tests/test_staging/test_zip_to_zip.py +41 -0
- parsl/tests/test_summary.py +2 -2
- parsl/tests/test_thread_parallelism.py +0 -1
- parsl/tests/test_threads/test_configs.py +1 -2
- parsl/tests/test_threads/test_lazy_errors.py +2 -2
- parsl/tests/test_utils/test_execute_wait.py +35 -0
- parsl/tests/test_utils/test_sanitize_dns.py +76 -0
- parsl/tests/unit/test_address.py +20 -0
- parsl/tests/unit/test_file.py +99 -0
- parsl/tests/unit/test_usage_tracking.py +66 -0
- parsl/usage_tracking/api.py +65 -0
- parsl/usage_tracking/levels.py +6 -0
- parsl/usage_tracking/usage.py +104 -62
- parsl/utils.py +137 -4
- parsl/version.py +1 -1
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
- parsl-2025.1.13.data/scripts/interchange.py +649 -0
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +77 -75
- parsl-2025.1.13.dist-info/METADATA +96 -0
- parsl-2025.1.13.dist-info/RECORD +462 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
- parsl/channels/__init__.py +0 -7
- parsl/channels/base.py +0 -141
- parsl/channels/errors.py +0 -113
- parsl/channels/local/local.py +0 -164
- parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
- parsl/channels/ssh/ssh.py +0 -276
- parsl/channels/ssh_il/__init__.py +0 -0
- parsl/channels/ssh_il/ssh_il.py +0 -74
- parsl/configs/ad_hoc.py +0 -35
- parsl/executors/radical/rpex_master.py +0 -42
- parsl/monitoring/radios.py +0 -175
- parsl/providers/ad_hoc/__init__.py +0 -0
- parsl/providers/ad_hoc/ad_hoc.py +0 -248
- parsl/providers/cobalt/__init__.py +0 -0
- parsl/providers/cobalt/cobalt.py +0 -236
- parsl/providers/cobalt/template.py +0 -17
- parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
- parsl/tests/configs/cooley_htex.py +0 -37
- parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
- parsl/tests/configs/local_adhoc.py +0 -18
- parsl/tests/configs/swan_htex.py +0 -43
- parsl/tests/configs/theta.py +0 -37
- parsl/tests/integration/test_channels/__init__.py +0 -0
- parsl/tests/integration/test_channels/test_channels.py +0 -17
- parsl/tests/integration/test_channels/test_local_channel.py +0 -42
- parsl/tests/integration/test_channels/test_scp_1.py +0 -45
- parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
- parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
- parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
- parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
- parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
- parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
- parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
- parsl/tests/sites/test_local_adhoc.py +0 -61
- parsl/tests/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/test_large_output.py +0 -22
- parsl/tests/test_data/__init__.py +0 -0
- parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
- parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
- parsl-2024.3.18.dist-info/METADATA +0 -98
- parsl-2024.3.18.dist-info/RECORD +0 -449
- parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
- parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
- parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
- parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
- {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
- {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
parsl/dataflow/dflow.py
CHANGED
@@ -1,50 +1,54 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
+
|
2
3
|
import atexit
|
4
|
+
import concurrent.futures as cf
|
5
|
+
import datetime
|
6
|
+
import inspect
|
3
7
|
import logging
|
4
8
|
import os
|
5
|
-
import pathlib
|
6
9
|
import pickle
|
7
10
|
import random
|
8
|
-
import time
|
9
|
-
import typeguard
|
10
|
-
import inspect
|
11
|
-
import threading
|
12
11
|
import sys
|
13
|
-
import
|
12
|
+
import threading
|
13
|
+
import time
|
14
|
+
from concurrent.futures import Future
|
15
|
+
from functools import partial
|
14
16
|
from getpass import getuser
|
15
|
-
from
|
17
|
+
from socket import gethostname
|
16
18
|
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
|
17
19
|
from uuid import uuid4
|
18
|
-
|
19
|
-
|
20
|
-
from
|
20
|
+
|
21
|
+
import typeguard
|
22
|
+
from typeguard import typechecked
|
21
23
|
|
22
24
|
import parsl
|
23
25
|
from parsl.app.errors import RemoteExceptionWrapper
|
24
26
|
from parsl.app.futures import DataFuture
|
25
|
-
from parsl.channels import Channel
|
26
27
|
from parsl.config import Config
|
27
28
|
from parsl.data_provider.data_manager import DataManager
|
28
29
|
from parsl.data_provider.files import File
|
30
|
+
from parsl.dataflow.dependency_resolvers import SHALLOW_DEPENDENCY_RESOLVER
|
29
31
|
from parsl.dataflow.errors import BadCheckpoint, DependencyError, JoinError
|
30
32
|
from parsl.dataflow.futures import AppFuture
|
31
33
|
from parsl.dataflow.memoization import Memoizer
|
32
34
|
from parsl.dataflow.rundirs import make_rundir
|
33
|
-
from parsl.dataflow.states import
|
35
|
+
from parsl.dataflow.states import FINAL_FAILURE_STATES, FINAL_STATES, States
|
34
36
|
from parsl.dataflow.taskrecord import TaskRecord
|
35
|
-
from parsl.errors import
|
36
|
-
|
37
|
-
|
38
|
-
|
37
|
+
from parsl.errors import (
|
38
|
+
ConfigurationError,
|
39
|
+
InternalConsistencyError,
|
40
|
+
NoDataFlowKernelError,
|
41
|
+
)
|
39
42
|
from parsl.executors.base import ParslExecutor
|
40
43
|
from parsl.executors.status_handling import BlockProviderExecutor
|
41
44
|
from parsl.executors.threads import ThreadPoolExecutor
|
45
|
+
from parsl.jobs.job_status_poller import JobStatusPoller
|
42
46
|
from parsl.monitoring import MonitoringHub
|
43
|
-
from parsl.process_loggers import wrap_with_logs
|
44
|
-
from parsl.providers.base import ExecutionProvider
|
45
|
-
from parsl.utils import get_version, get_std_fname_mode, get_all_checkpoints, Timer
|
46
|
-
|
47
47
|
from parsl.monitoring.message_type import MessageType
|
48
|
+
from parsl.monitoring.remote import monitor_wrapper
|
49
|
+
from parsl.process_loggers import wrap_with_logs
|
50
|
+
from parsl.usage_tracking.usage import UsageTracker
|
51
|
+
from parsl.utils import Timer, get_all_checkpoints, get_std_fname_mode, get_version
|
48
52
|
|
49
53
|
logger = logging.getLogger(__name__)
|
50
54
|
|
@@ -106,14 +110,8 @@ class DataFlowKernel:
|
|
106
110
|
self.monitoring: Optional[MonitoringHub]
|
107
111
|
self.monitoring = config.monitoring
|
108
112
|
|
109
|
-
# hub address and port for interchange to connect
|
110
|
-
self.hub_address = None # type: Optional[str]
|
111
|
-
self.hub_interchange_port = None # type: Optional[int]
|
112
113
|
if self.monitoring:
|
113
|
-
|
114
|
-
self.monitoring.logdir = self.run_dir
|
115
|
-
self.hub_address = self.monitoring.hub_address
|
116
|
-
self.hub_interchange_port = self.monitoring.start(self.run_id, self.run_dir, self.config.run_dir)
|
114
|
+
self.monitoring.start(self.run_dir, self.config.run_dir)
|
117
115
|
|
118
116
|
self.time_began = datetime.datetime.now()
|
119
117
|
self.time_completed: Optional[datetime.datetime] = None
|
@@ -159,8 +157,8 @@ class DataFlowKernel:
|
|
159
157
|
}
|
160
158
|
|
161
159
|
if self.monitoring:
|
162
|
-
self.monitoring.send(MessageType.WORKFLOW_INFO,
|
163
|
-
workflow_info)
|
160
|
+
self.monitoring.send((MessageType.WORKFLOW_INFO,
|
161
|
+
workflow_info))
|
164
162
|
|
165
163
|
if config.checkpoint_files is not None:
|
166
164
|
checkpoints = self.load_checkpoints(config.checkpoint_files)
|
@@ -179,8 +177,7 @@ class DataFlowKernel:
|
|
179
177
|
# job_status_poller.add_executors.
|
180
178
|
self.job_status_poller = JobStatusPoller(strategy=self.config.strategy,
|
181
179
|
strategy_period=self.config.strategy_period,
|
182
|
-
max_idletime=self.config.max_idletime
|
183
|
-
dfk=self)
|
180
|
+
max_idletime=self.config.max_idletime)
|
184
181
|
|
185
182
|
self.executors: Dict[str, ParslExecutor] = {}
|
186
183
|
|
@@ -204,21 +201,52 @@ class DataFlowKernel:
|
|
204
201
|
self.tasks: Dict[int, TaskRecord] = {}
|
205
202
|
self.submitter_lock = threading.Lock()
|
206
203
|
|
204
|
+
self.dependency_launch_pool = cf.ThreadPoolExecutor(max_workers=1, thread_name_prefix="Dependency-Launch")
|
205
|
+
|
206
|
+
self.dependency_resolver = self.config.dependency_resolver if self.config.dependency_resolver is not None \
|
207
|
+
else SHALLOW_DEPENDENCY_RESOLVER
|
208
|
+
|
207
209
|
atexit.register(self.atexit_cleanup)
|
208
210
|
|
211
|
+
def __enter__(self):
|
212
|
+
return self
|
213
|
+
|
214
|
+
def __exit__(self, exc_type, exc_value, traceback) -> None:
|
215
|
+
mode = self.config.exit_mode
|
216
|
+
logger.debug("Exiting context manager, with exit mode '%s'", mode)
|
217
|
+
if mode == "cleanup":
|
218
|
+
logger.info("Calling cleanup for DFK")
|
219
|
+
self.cleanup()
|
220
|
+
elif mode == "skip":
|
221
|
+
logger.info("Skipping all cleanup handling")
|
222
|
+
elif mode == "wait":
|
223
|
+
if exc_type is None:
|
224
|
+
logger.info("Waiting for all tasks to complete")
|
225
|
+
self.wait_for_current_tasks()
|
226
|
+
self.cleanup()
|
227
|
+
else:
|
228
|
+
logger.info("There was an exception - cleaning up without waiting for task completion")
|
229
|
+
self.cleanup()
|
230
|
+
else:
|
231
|
+
raise InternalConsistencyError(f"Exit case for {mode} should be unreachable, validated by typeguard on Config()")
|
232
|
+
|
209
233
|
def _send_task_log_info(self, task_record: TaskRecord) -> None:
|
210
234
|
if self.monitoring:
|
211
235
|
task_log_info = self._create_task_log_info(task_record)
|
212
|
-
self.monitoring.send(MessageType.TASK_INFO, task_log_info)
|
236
|
+
self.monitoring.send((MessageType.TASK_INFO, task_log_info))
|
213
237
|
|
214
|
-
def _create_task_log_info(self, task_record):
|
238
|
+
def _create_task_log_info(self, task_record: TaskRecord) -> Dict[str, Any]:
|
215
239
|
"""
|
216
240
|
Create the dictionary that will be included in the log.
|
217
241
|
"""
|
218
242
|
info_to_monitor = ['func_name', 'memoize', 'hashsum', 'fail_count', 'fail_cost', 'status',
|
219
243
|
'id', 'time_invoked', 'try_time_launched', 'time_returned', 'try_time_returned', 'executor']
|
220
244
|
|
221
|
-
|
245
|
+
# mypy cannot verify that these task_record[k] references are valid:
|
246
|
+
# They are valid if all entries in info_to_monitor are declared in the definition of TaskRecord
|
247
|
+
# This type: ignore[literal-required] asserts that fact.
|
248
|
+
task_log_info = {"task_" + k: task_record[k] for k in info_to_monitor} # type: ignore[literal-required]
|
249
|
+
|
222
250
|
task_log_info['run_id'] = self.run_id
|
223
251
|
task_log_info['try_id'] = task_record['try_id']
|
224
252
|
task_log_info['timestamp'] = datetime.datetime.now()
|
@@ -230,20 +258,28 @@ class DataFlowKernel:
|
|
230
258
|
task_log_info['task_inputs'] = str(task_record['kwargs'].get('inputs', None))
|
231
259
|
task_log_info['task_outputs'] = str(task_record['kwargs'].get('outputs', None))
|
232
260
|
task_log_info['task_stdin'] = task_record['kwargs'].get('stdin', None)
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
261
|
+
|
262
|
+
def std_spec_to_name(name, spec):
|
263
|
+
if spec is None:
|
264
|
+
name = ""
|
265
|
+
elif isinstance(spec, File):
|
266
|
+
name = spec.url
|
267
|
+
else:
|
268
|
+
# fallthrough case is various str, os.PathLike, tuple modes that
|
269
|
+
# can be interpreted by get_std_fname_mode.
|
270
|
+
try:
|
271
|
+
name, _ = get_std_fname_mode(name, spec)
|
272
|
+
except Exception:
|
273
|
+
logger.exception(f"Could not parse {name} specification {spec} for task {task_record['id']}")
|
274
|
+
name = ""
|
275
|
+
return name
|
276
|
+
|
277
|
+
stdout_spec = task_record['kwargs'].get('stdout')
|
278
|
+
task_log_info['task_stdout'] = std_spec_to_name('stdout', stdout_spec)
|
279
|
+
|
280
|
+
stderr_spec = task_record['kwargs'].get('stderr')
|
281
|
+
task_log_info['task_stderr'] = std_spec_to_name('stderr', stderr_spec)
|
282
|
+
|
247
283
|
task_log_info['task_fail_history'] = ",".join(task_record['fail_history'])
|
248
284
|
task_log_info['task_depends'] = None
|
249
285
|
if task_record['depends'] is not None:
|
@@ -584,9 +620,9 @@ class DataFlowKernel:
|
|
584
620
|
return kwargs.get('_parsl_staging_inhibit', False)
|
585
621
|
|
586
622
|
def launch_if_ready(self, task_record: TaskRecord) -> None:
|
587
|
-
"""
|
588
|
-
|
589
|
-
|
623
|
+
"""Schedules a task record for re-inspection to see if it is ready
|
624
|
+
for launch and for launch if it is ready. The call will return
|
625
|
+
immediately.
|
590
626
|
|
591
627
|
This should be called by any piece of the DataFlowKernel that
|
592
628
|
thinks a task may have become ready to run.
|
@@ -595,13 +631,17 @@ class DataFlowKernel:
|
|
595
631
|
ready to run - launch_if_ready will not incorrectly launch that
|
596
632
|
task.
|
597
633
|
|
598
|
-
It is also not an error to call launch_if_ready on a task that has
|
599
|
-
already been launched - launch_if_ready will not re-launch that
|
600
|
-
task.
|
601
|
-
|
602
634
|
launch_if_ready is thread safe, so may be called from any thread
|
603
635
|
or callback.
|
604
636
|
"""
|
637
|
+
self.dependency_launch_pool.submit(self._launch_if_ready_async, task_record)
|
638
|
+
|
639
|
+
@wrap_with_logs
|
640
|
+
def _launch_if_ready_async(self, task_record: TaskRecord) -> None:
|
641
|
+
"""
|
642
|
+
_launch_if_ready will launch the specified task, if it is ready
|
643
|
+
to run (for example, without dependencies, and in pending state).
|
644
|
+
"""
|
605
645
|
exec_fu = None
|
606
646
|
|
607
647
|
task_id = task_record['id']
|
@@ -667,14 +707,6 @@ class DataFlowKernel:
|
|
667
707
|
def launch_task(self, task_record: TaskRecord) -> Future:
|
668
708
|
"""Handle the actual submission of the task to the executor layer.
|
669
709
|
|
670
|
-
If the app task has the executors attributes not set (default=='all')
|
671
|
-
the task is launched on a randomly selected executor from the
|
672
|
-
list of executors. This behavior could later be updated to support
|
673
|
-
binding to executors based on user specified criteria.
|
674
|
-
|
675
|
-
If the app task specifies a particular set of executors, it will be
|
676
|
-
targeted at those specific executors.
|
677
|
-
|
678
710
|
Args:
|
679
711
|
task_record : The task record
|
680
712
|
|
@@ -707,14 +739,18 @@ class DataFlowKernel:
|
|
707
739
|
|
708
740
|
if self.monitoring is not None and self.monitoring.resource_monitoring_enabled:
|
709
741
|
wrapper_logging_level = logging.DEBUG if self.monitoring.monitoring_debug else logging.INFO
|
710
|
-
(function, args, kwargs) =
|
711
|
-
|
712
|
-
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
742
|
+
(function, args, kwargs) = monitor_wrapper(f=function,
|
743
|
+
args=args,
|
744
|
+
kwargs=kwargs,
|
745
|
+
x_try_id=try_id,
|
746
|
+
x_task_id=task_id,
|
747
|
+
monitoring_hub_url=self.monitoring.monitoring_hub_url,
|
748
|
+
run_id=self.run_id,
|
749
|
+
logging_level=wrapper_logging_level,
|
750
|
+
sleep_dur=self.monitoring.resource_monitoring_interval,
|
751
|
+
radio_mode=executor.radio_mode,
|
752
|
+
monitor_resources=executor.monitor_resources(),
|
753
|
+
run_dir=self.run_dir)
|
718
754
|
|
719
755
|
with self.submitter_lock:
|
720
756
|
exec_fu = executor.submit(function, task_record['resource_specification'], *args, **kwargs)
|
@@ -757,6 +793,10 @@ class DataFlowKernel:
|
|
757
793
|
(inputs[idx], func) = self.data_manager.optionally_stage_in(f, func, executor)
|
758
794
|
|
759
795
|
for kwarg, f in kwargs.items():
|
796
|
+
# stdout and stderr files should not be staging in (they will be staged *out*
|
797
|
+
# in _add_output_deps)
|
798
|
+
if kwarg in ['stdout', 'stderr']:
|
799
|
+
continue
|
760
800
|
(kwargs[kwarg], func) = self.data_manager.optionally_stage_in(f, func, executor)
|
761
801
|
|
762
802
|
newargs = list(args)
|
@@ -769,33 +809,55 @@ class DataFlowKernel:
|
|
769
809
|
logger.debug("Adding output dependencies")
|
770
810
|
outputs = kwargs.get('outputs', [])
|
771
811
|
app_fut._outputs = []
|
772
|
-
|
773
|
-
|
812
|
+
|
813
|
+
# Pass over all possible outputs: the outputs kwarg, stdout and stderr
|
814
|
+
# and for each of those, perform possible stage-out. This can result in:
|
815
|
+
# a DataFuture to be exposed in app_fut to represent the completion of
|
816
|
+
# that stageout (sometimes backed by a new sub-workflow for separate-task
|
817
|
+
# stageout), a replacement for the function to be executed (intended to
|
818
|
+
# be the original function wrapped with an in-task stageout wrapper), a
|
819
|
+
# rewritten File object to be passed to task to be executed
|
820
|
+
|
821
|
+
def stageout_one_file(file: File, rewritable_func: Callable):
|
822
|
+
if not self.check_staging_inhibited(kwargs):
|
774
823
|
# replace a File with a DataFuture - either completing when the stageout
|
775
824
|
# future completes, or if no stage out future is returned, then when the
|
776
825
|
# app itself completes.
|
777
826
|
|
778
827
|
# The staging code will get a clean copy which it is allowed to mutate,
|
779
828
|
# while the DataFuture-contained original will not be modified by any staging.
|
780
|
-
f_copy =
|
781
|
-
outputs[idx] = f_copy
|
829
|
+
f_copy = file.cleancopy()
|
782
830
|
|
783
|
-
logger.debug("Submitting stage out for output file {}".format(repr(
|
831
|
+
logger.debug("Submitting stage out for output file {}".format(repr(file)))
|
784
832
|
stageout_fut = self.data_manager.stage_out(f_copy, executor, app_fut)
|
785
833
|
if stageout_fut:
|
786
|
-
logger.debug("Adding a dependency on stageout future for {}".format(repr(
|
787
|
-
|
834
|
+
logger.debug("Adding a dependency on stageout future for {}".format(repr(file)))
|
835
|
+
df = DataFuture(stageout_fut, file, tid=app_fut.tid)
|
788
836
|
else:
|
789
|
-
logger.debug("No stageout dependency for {}".format(repr(
|
790
|
-
|
837
|
+
logger.debug("No stageout dependency for {}".format(repr(file)))
|
838
|
+
df = DataFuture(app_fut, file, tid=app_fut.tid)
|
791
839
|
|
792
840
|
# this is a hook for post-task stageout
|
793
841
|
# note that nothing depends on the output - which is maybe a bug
|
794
842
|
# in the not-very-tested stageout system?
|
795
|
-
|
843
|
+
rewritable_func = self.data_manager.replace_task_stage_out(f_copy, rewritable_func, executor)
|
844
|
+
return rewritable_func, f_copy, df
|
796
845
|
else:
|
797
|
-
logger.debug("Not performing output staging for: {}".format(repr(
|
798
|
-
|
846
|
+
logger.debug("Not performing output staging for: {}".format(repr(file)))
|
847
|
+
return rewritable_func, file, DataFuture(app_fut, file, tid=app_fut.tid)
|
848
|
+
|
849
|
+
for idx, file in enumerate(outputs):
|
850
|
+
func, outputs[idx], o = stageout_one_file(file, func)
|
851
|
+
app_fut._outputs.append(o)
|
852
|
+
|
853
|
+
file = kwargs.get('stdout')
|
854
|
+
if isinstance(file, File):
|
855
|
+
func, kwargs['stdout'], app_fut._stdout_future = stageout_one_file(file, func)
|
856
|
+
|
857
|
+
file = kwargs.get('stderr')
|
858
|
+
if isinstance(file, File):
|
859
|
+
func, kwargs['stderr'], app_fut._stderr_future = stageout_one_file(file, func)
|
860
|
+
|
799
861
|
return func
|
800
862
|
|
801
863
|
def _gather_all_deps(self, args: Sequence[Any], kwargs: Dict[str, Any]) -> List[Future]:
|
@@ -812,8 +874,11 @@ class DataFlowKernel:
|
|
812
874
|
depends: List[Future] = []
|
813
875
|
|
814
876
|
def check_dep(d: Any) -> None:
|
815
|
-
|
816
|
-
depends.extend(
|
877
|
+
try:
|
878
|
+
depends.extend(self.dependency_resolver.traverse_to_gather(d))
|
879
|
+
except Exception:
|
880
|
+
logger.exception("Exception in dependency_resolver.traverse_to_gather")
|
881
|
+
raise
|
817
882
|
|
818
883
|
# Check the positional args
|
819
884
|
for dep in args:
|
@@ -830,7 +895,8 @@ class DataFlowKernel:
|
|
830
895
|
|
831
896
|
return depends
|
832
897
|
|
833
|
-
def _unwrap_futures(self, args, kwargs)
|
898
|
+
def _unwrap_futures(self, args: Sequence[Any], kwargs: Dict[str, Any]) \
|
899
|
+
-> Tuple[Sequence[Any], Dict[str, Any], Sequence[Tuple[Exception, str]]]:
|
834
900
|
"""This function should be called when all dependencies have completed.
|
835
901
|
|
836
902
|
It will rewrite the arguments for that task, replacing each Future
|
@@ -851,53 +917,40 @@ class DataFlowKernel:
|
|
851
917
|
"""
|
852
918
|
dep_failures = []
|
853
919
|
|
920
|
+
def append_failure(e: Exception, dep: Future) -> None:
|
921
|
+
# If this Future is associated with a task inside this DFK,
|
922
|
+
# then refer to the task ID.
|
923
|
+
# Otherwise make a repr of the Future object.
|
924
|
+
if hasattr(dep, 'task_record') and dep.task_record['dfk'] == self:
|
925
|
+
tid = "task " + repr(dep.task_record['id'])
|
926
|
+
else:
|
927
|
+
tid = repr(dep)
|
928
|
+
dep_failures.extend([(e, tid)])
|
929
|
+
|
854
930
|
# Replace item in args
|
855
931
|
new_args = []
|
856
932
|
for dep in args:
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
# If this Future is associated with a task inside this DFK,
|
862
|
-
# then refer to the task ID.
|
863
|
-
# Otherwise make a repr of the Future object.
|
864
|
-
if hasattr(dep, 'task_record') and dep.task_record['dfk'] == self:
|
865
|
-
tid = "task " + repr(dep.task_record['id'])
|
866
|
-
else:
|
867
|
-
tid = repr(dep)
|
868
|
-
dep_failures.extend([(e, tid)])
|
869
|
-
else:
|
870
|
-
new_args.extend([dep])
|
933
|
+
try:
|
934
|
+
new_args.extend([self.dependency_resolver.traverse_to_unwrap(dep)])
|
935
|
+
except Exception as e:
|
936
|
+
append_failure(e, dep)
|
871
937
|
|
872
938
|
# Check for explicit kwargs ex, fu_1=<fut>
|
873
939
|
for key in kwargs:
|
874
940
|
dep = kwargs[key]
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
if hasattr(dep, 'task_record'):
|
880
|
-
tid = dep.task_record['id']
|
881
|
-
else:
|
882
|
-
tid = None
|
883
|
-
dep_failures.extend([(e, tid)])
|
941
|
+
try:
|
942
|
+
kwargs[key] = self.dependency_resolver.traverse_to_unwrap(dep)
|
943
|
+
except Exception as e:
|
944
|
+
append_failure(e, dep)
|
884
945
|
|
885
946
|
# Check for futures in inputs=[<fut>...]
|
886
947
|
if 'inputs' in kwargs:
|
887
948
|
new_inputs = []
|
888
949
|
for dep in kwargs['inputs']:
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
if hasattr(dep, 'task_record'):
|
894
|
-
tid = dep.task_record['id']
|
895
|
-
else:
|
896
|
-
tid = None
|
897
|
-
dep_failures.extend([(e, tid)])
|
898
|
-
|
899
|
-
else:
|
900
|
-
new_inputs.extend([dep])
|
950
|
+
try:
|
951
|
+
new_inputs.extend([self.dependency_resolver.traverse_to_unwrap(dep)])
|
952
|
+
except Exception as e:
|
953
|
+
append_failure(e, dep)
|
901
954
|
kwargs['inputs'] = new_inputs
|
902
955
|
|
903
956
|
return new_args, kwargs, dep_failures
|
@@ -929,7 +982,7 @@ class DataFlowKernel:
|
|
929
982
|
- app_kwargs (dict) : Rest of the kwargs to the fn passed as dict.
|
930
983
|
|
931
984
|
Returns:
|
932
|
-
|
985
|
+
AppFuture
|
933
986
|
|
934
987
|
"""
|
935
988
|
|
@@ -953,32 +1006,16 @@ class DataFlowKernel:
|
|
953
1006
|
executor = random.choice(choices)
|
954
1007
|
logger.debug("Task {} will be sent to executor {}".format(task_id, executor))
|
955
1008
|
|
956
|
-
# The below uses func.__name__ before it has been wrapped by any staging code.
|
957
|
-
|
958
|
-
label = app_kwargs.get('label')
|
959
|
-
for kw in ['stdout', 'stderr']:
|
960
|
-
if kw in app_kwargs:
|
961
|
-
if app_kwargs[kw] == parsl.AUTO_LOGNAME:
|
962
|
-
if kw not in ignore_for_cache:
|
963
|
-
ignore_for_cache += [kw]
|
964
|
-
app_kwargs[kw] = os.path.join(
|
965
|
-
self.run_dir,
|
966
|
-
'task_logs',
|
967
|
-
str(int(task_id / 10000)).zfill(4), # limit logs to 10k entries per directory
|
968
|
-
'task_{}_{}{}.{}'.format(
|
969
|
-
str(task_id).zfill(4),
|
970
|
-
func.__name__,
|
971
|
-
'' if label is None else '_{}'.format(label),
|
972
|
-
kw)
|
973
|
-
)
|
974
|
-
|
975
1009
|
resource_specification = app_kwargs.get('parsl_resource_specification', {})
|
976
1010
|
|
977
1011
|
task_record: TaskRecord
|
978
|
-
task_record = {'
|
1012
|
+
task_record = {'args': app_args,
|
1013
|
+
'depends': [],
|
979
1014
|
'dfk': self,
|
980
1015
|
'executor': executor,
|
1016
|
+
'func': func,
|
981
1017
|
'func_name': func.__name__,
|
1018
|
+
'kwargs': app_kwargs,
|
982
1019
|
'memoize': cache,
|
983
1020
|
'hashsum': None,
|
984
1021
|
'exec_fu': None,
|
@@ -1000,25 +1037,41 @@ class DataFlowKernel:
|
|
1000
1037
|
|
1001
1038
|
self.update_task_state(task_record, States.unsched)
|
1002
1039
|
|
1040
|
+
for kw in ['stdout', 'stderr']:
|
1041
|
+
if kw in app_kwargs:
|
1042
|
+
if app_kwargs[kw] == parsl.AUTO_LOGNAME:
|
1043
|
+
if kw not in ignore_for_cache:
|
1044
|
+
ignore_for_cache += [kw]
|
1045
|
+
if self.config.std_autopath is None:
|
1046
|
+
app_kwargs[kw] = self.default_std_autopath(task_record, kw)
|
1047
|
+
else:
|
1048
|
+
app_kwargs[kw] = self.config.std_autopath(task_record, kw)
|
1049
|
+
|
1003
1050
|
app_fu = AppFuture(task_record)
|
1051
|
+
task_record['app_fu'] = app_fu
|
1004
1052
|
|
1005
1053
|
# Transform remote input files to data futures
|
1006
1054
|
app_args, app_kwargs, func = self._add_input_deps(executor, app_args, app_kwargs, func)
|
1007
1055
|
|
1008
1056
|
func = self._add_output_deps(executor, app_args, app_kwargs, app_fu, func)
|
1009
1057
|
|
1058
|
+
logger.debug("Added output dependencies")
|
1059
|
+
|
1060
|
+
# Replace the function invocation in the TaskRecord with whatever file-staging
|
1061
|
+
# substitutions have been made.
|
1010
1062
|
task_record.update({
|
1011
1063
|
'args': app_args,
|
1012
1064
|
'func': func,
|
1013
|
-
'kwargs': app_kwargs
|
1014
|
-
'app_fu': app_fu})
|
1065
|
+
'kwargs': app_kwargs})
|
1015
1066
|
|
1016
1067
|
assert task_id not in self.tasks
|
1017
1068
|
|
1018
1069
|
self.tasks[task_id] = task_record
|
1019
1070
|
|
1071
|
+
logger.debug("Gathering dependencies")
|
1020
1072
|
# Get the list of dependencies for the task
|
1021
1073
|
depends = self._gather_all_deps(app_args, app_kwargs)
|
1074
|
+
logger.debug("Gathered dependencies")
|
1022
1075
|
task_record['depends'] = depends
|
1023
1076
|
|
1024
1077
|
depend_descs = []
|
@@ -1085,73 +1138,28 @@ class DataFlowKernel:
|
|
1085
1138
|
|
1086
1139
|
logger.info("End of summary")
|
1087
1140
|
|
1088
|
-
def
|
1089
|
-
"""Create script directories across a channel
|
1090
|
-
|
1091
|
-
Parameters
|
1092
|
-
----------
|
1093
|
-
provider: Provider obj
|
1094
|
-
Provider for which scripts dirs are being created
|
1095
|
-
channel: Channel obj
|
1096
|
-
Channel over which the remote dirs are to be created
|
1097
|
-
"""
|
1098
|
-
run_dir = self.run_dir
|
1099
|
-
if channel.script_dir is None:
|
1100
|
-
|
1101
|
-
# This case will be detected as unreachable by mypy, because of
|
1102
|
-
# the type of script_dir, which is str, not Optional[str].
|
1103
|
-
# The type system doesn't represent the initialized/uninitialized
|
1104
|
-
# state of a channel so cannot represent that a channel needs
|
1105
|
-
# its script directory set or not.
|
1106
|
-
|
1107
|
-
channel.script_dir = os.path.join(run_dir, 'submit_scripts') # type: ignore[unreachable]
|
1108
|
-
|
1109
|
-
# Only create dirs if we aren't on a shared-fs
|
1110
|
-
if not channel.isdir(run_dir):
|
1111
|
-
parent, child = pathlib.Path(run_dir).parts[-2:]
|
1112
|
-
remote_run_dir = os.path.join(parent, child)
|
1113
|
-
channel.script_dir = os.path.join(remote_run_dir, 'remote_submit_scripts')
|
1114
|
-
provider.script_dir = os.path.join(run_dir, 'local_submit_scripts')
|
1115
|
-
|
1116
|
-
channel.makedirs(channel.script_dir, exist_ok=True)
|
1117
|
-
|
1118
|
-
def add_executors(self, executors):
|
1141
|
+
def add_executors(self, executors: Sequence[ParslExecutor]) -> None:
|
1119
1142
|
for executor in executors:
|
1120
1143
|
executor.run_id = self.run_id
|
1121
1144
|
executor.run_dir = self.run_dir
|
1122
|
-
|
1123
|
-
|
1145
|
+
if self.monitoring:
|
1146
|
+
executor.hub_address = self.monitoring.hub_address
|
1147
|
+
executor.hub_zmq_port = self.monitoring.hub_zmq_port
|
1148
|
+
executor.submit_monitoring_radio = self.monitoring.radio
|
1124
1149
|
if hasattr(executor, 'provider'):
|
1125
1150
|
if hasattr(executor.provider, 'script_dir'):
|
1126
1151
|
executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
|
1127
1152
|
os.makedirs(executor.provider.script_dir, exist_ok=True)
|
1128
1153
|
|
1129
|
-
if hasattr(executor.provider, 'channels'):
|
1130
|
-
logger.debug("Creating script_dir across multiple channels")
|
1131
|
-
for channel in executor.provider.channels:
|
1132
|
-
self._create_remote_dirs_over_channel(executor.provider, channel)
|
1133
|
-
else:
|
1134
|
-
self._create_remote_dirs_over_channel(executor.provider, executor.provider.channel)
|
1135
|
-
|
1136
1154
|
self.executors[executor.label] = executor
|
1137
|
-
|
1138
|
-
if self.monitoring and block_ids:
|
1139
|
-
new_status = {}
|
1140
|
-
for bid in block_ids:
|
1141
|
-
new_status[bid] = JobStatus(JobState.PENDING)
|
1142
|
-
msg = executor.create_monitoring_info(new_status)
|
1143
|
-
logger.debug("Sending monitoring message {} to hub from DFK".format(msg))
|
1144
|
-
self.monitoring.send(MessageType.BLOCK_INFO, msg)
|
1155
|
+
executor.start()
|
1145
1156
|
block_executors = [e for e in executors if isinstance(e, BlockProviderExecutor)]
|
1146
1157
|
self.job_status_poller.add_executors(block_executors)
|
1147
1158
|
|
1148
1159
|
def atexit_cleanup(self) -> None:
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
"exiting to release any resources")
|
1153
|
-
else:
|
1154
|
-
logger.info("python process is exiting, but DFK has already been cleaned up")
|
1160
|
+
logger.warning("Python is exiting with a DFK still running. "
|
1161
|
+
"You should call parsl.dfk().cleanup() before "
|
1162
|
+
"exiting to release any resources")
|
1155
1163
|
|
1156
1164
|
def wait_for_current_tasks(self) -> None:
|
1157
1165
|
"""Waits for all tasks in the task list to be completed, by waiting for their
|
@@ -1207,31 +1215,18 @@ class DataFlowKernel:
|
|
1207
1215
|
self._checkpoint_timer.close()
|
1208
1216
|
|
1209
1217
|
# Send final stats
|
1218
|
+
logger.info("Sending end message for usage tracking")
|
1210
1219
|
self.usage_tracker.send_end_message()
|
1211
1220
|
self.usage_tracker.close()
|
1221
|
+
logger.info("Closed usage tracking")
|
1212
1222
|
|
1213
1223
|
logger.info("Closing job status poller")
|
1214
1224
|
self.job_status_poller.close()
|
1215
1225
|
logger.info("Terminated job status poller")
|
1216
1226
|
|
1217
|
-
logger.info("
|
1227
|
+
logger.info("Shutting down executors")
|
1218
1228
|
|
1219
1229
|
for executor in self.executors.values():
|
1220
|
-
if isinstance(executor, BlockProviderExecutor):
|
1221
|
-
if not executor.bad_state_is_set:
|
1222
|
-
logger.info(f"Scaling in executor {executor.label}")
|
1223
|
-
if executor.provider:
|
1224
|
-
job_ids = executor.provider.resources.keys()
|
1225
|
-
block_ids = executor.scale_in(len(job_ids))
|
1226
|
-
if self.monitoring and block_ids:
|
1227
|
-
new_status = {}
|
1228
|
-
for bid in block_ids:
|
1229
|
-
new_status[bid] = JobStatus(JobState.CANCELLED)
|
1230
|
-
msg = executor.create_monitoring_info(new_status)
|
1231
|
-
logger.debug("Sending message {} to hub from DFK".format(msg))
|
1232
|
-
self.monitoring.send(MessageType.BLOCK_INFO, msg)
|
1233
|
-
else: # and bad_state_is_set
|
1234
|
-
logger.warning(f"Not shutting down executor {executor.label} because it is in bad state")
|
1235
1230
|
logger.info(f"Shutting down executor {executor.label}")
|
1236
1231
|
executor.shutdown()
|
1237
1232
|
logger.info(f"Shut down executor {executor.label}")
|
@@ -1241,18 +1236,32 @@ class DataFlowKernel:
|
|
1241
1236
|
|
1242
1237
|
if self.monitoring:
|
1243
1238
|
logger.info("Sending final monitoring message")
|
1244
|
-
self.monitoring.send(MessageType.WORKFLOW_INFO,
|
1239
|
+
self.monitoring.send((MessageType.WORKFLOW_INFO,
|
1245
1240
|
{'tasks_failed_count': self.task_state_counts[States.failed],
|
1246
1241
|
'tasks_completed_count': self.task_state_counts[States.exec_done],
|
1247
1242
|
"time_began": self.time_began,
|
1248
1243
|
'time_completed': self.time_completed,
|
1249
|
-
'run_id': self.run_id, 'rundir': self.run_dir
|
1250
|
-
'exit_now': True})
|
1244
|
+
'run_id': self.run_id, 'rundir': self.run_dir}))
|
1251
1245
|
|
1252
1246
|
logger.info("Terminating monitoring")
|
1253
1247
|
self.monitoring.close()
|
1254
1248
|
logger.info("Terminated monitoring")
|
1255
1249
|
|
1250
|
+
logger.info("Terminating dependency launch pool")
|
1251
|
+
self.dependency_launch_pool.shutdown()
|
1252
|
+
logger.info("Terminated dependency launch pool")
|
1253
|
+
|
1254
|
+
logger.info("Unregistering atexit hook")
|
1255
|
+
atexit.unregister(self.atexit_cleanup)
|
1256
|
+
logger.info("Unregistered atexit hook")
|
1257
|
+
|
1258
|
+
if DataFlowKernelLoader._dfk is self:
|
1259
|
+
logger.info("Unregistering default DFK")
|
1260
|
+
parsl.clear()
|
1261
|
+
logger.info("Unregistered default DFK")
|
1262
|
+
else:
|
1263
|
+
logger.debug("Cleaning up non-default DFK - not unregistering")
|
1264
|
+
|
1256
1265
|
logger.info("DFK cleanup complete")
|
1257
1266
|
|
1258
1267
|
def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> str:
|
@@ -1388,8 +1397,6 @@ class DataFlowKernel:
|
|
1388
1397
|
Returns:
|
1389
1398
|
- dict containing, hashed -> future mappings
|
1390
1399
|
"""
|
1391
|
-
self.memo_lookup_table = None
|
1392
|
-
|
1393
1400
|
if checkpointDirs:
|
1394
1401
|
return self._load_checkpoints(checkpointDirs)
|
1395
1402
|
else:
|
@@ -1397,10 +1404,39 @@ class DataFlowKernel:
|
|
1397
1404
|
|
1398
1405
|
@staticmethod
|
1399
1406
|
def _log_std_streams(task_record: TaskRecord) -> None:
|
1400
|
-
|
1401
|
-
|
1402
|
-
|
1403
|
-
|
1407
|
+
tid = task_record['id']
|
1408
|
+
|
1409
|
+
def log_std_stream(name: str, target) -> None:
|
1410
|
+
if target is None:
|
1411
|
+
logger.info(f"{name} for task {tid} will not be redirected.")
|
1412
|
+
elif isinstance(target, str):
|
1413
|
+
logger.info(f"{name} for task {tid} will be redirected to {target}")
|
1414
|
+
elif isinstance(target, os.PathLike):
|
1415
|
+
logger.info(f"{name} for task {tid} will be redirected to {os.fspath(target)}")
|
1416
|
+
elif isinstance(target, tuple) and len(target) == 2 and isinstance(target[0], str):
|
1417
|
+
logger.info(f"{name} for task {tid} will be redirected to {target[0]} with mode {target[1]}")
|
1418
|
+
elif isinstance(target, tuple) and len(target) == 2 and isinstance(target[0], os.PathLike):
|
1419
|
+
logger.info(f"{name} for task {tid} will be redirected to {os.fspath(target[0])} with mode {target[1]}")
|
1420
|
+
elif isinstance(target, DataFuture):
|
1421
|
+
logger.info(f"{name} for task {tid} will staged to {target.file_obj.url}")
|
1422
|
+
else:
|
1423
|
+
logger.error(f"{name} for task {tid} has unknown specification: {target!r}")
|
1424
|
+
|
1425
|
+
log_std_stream("Standard out", task_record['app_fu'].stdout)
|
1426
|
+
log_std_stream("Standard error", task_record['app_fu'].stderr)
|
1427
|
+
|
1428
|
+
def default_std_autopath(self, taskrecord, kw):
|
1429
|
+
label = taskrecord['kwargs'].get('label')
|
1430
|
+
task_id = taskrecord['id']
|
1431
|
+
return os.path.join(
|
1432
|
+
self.run_dir,
|
1433
|
+
'task_logs',
|
1434
|
+
str(int(task_id / 10000)).zfill(4), # limit logs to 10k entries per directory
|
1435
|
+
'task_{}_{}{}.{}'.format(
|
1436
|
+
str(task_id).zfill(4),
|
1437
|
+
taskrecord['func_name'],
|
1438
|
+
'' if label is None else '_{}'.format(label),
|
1439
|
+
kw))
|
1404
1440
|
|
1405
1441
|
|
1406
1442
|
class DataFlowKernelLoader:
|