parsl 2024.3.11__py3-none-any.whl → 2025.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369)
  1. parsl/__init__.py +9 -10
  2. parsl/addresses.py +29 -7
  3. parsl/app/app.py +7 -8
  4. parsl/app/bash.py +15 -8
  5. parsl/app/errors.py +10 -13
  6. parsl/app/futures.py +8 -10
  7. parsl/app/python.py +2 -1
  8. parsl/benchmark/perf.py +2 -1
  9. parsl/concurrent/__init__.py +2 -2
  10. parsl/config.py +57 -10
  11. parsl/configs/ASPIRE1.py +6 -5
  12. parsl/configs/Azure.py +9 -8
  13. parsl/configs/bridges.py +6 -4
  14. parsl/configs/cc_in2p3.py +3 -3
  15. parsl/configs/ec2.py +3 -1
  16. parsl/configs/expanse.py +4 -3
  17. parsl/configs/frontera.py +3 -4
  18. parsl/configs/htex_local.py +3 -4
  19. parsl/configs/illinoiscluster.py +3 -1
  20. parsl/configs/improv.py +34 -0
  21. parsl/configs/kubernetes.py +4 -3
  22. parsl/configs/local_threads.py +5 -1
  23. parsl/configs/midway.py +5 -3
  24. parsl/configs/osg.py +4 -2
  25. parsl/configs/polaris.py +4 -2
  26. parsl/configs/stampede2.py +6 -5
  27. parsl/configs/summit.py +3 -3
  28. parsl/configs/toss3_llnl.py +4 -3
  29. parsl/configs/vineex_local.py +6 -4
  30. parsl/configs/wqex_local.py +5 -3
  31. parsl/curvezmq.py +4 -0
  32. parsl/data_provider/data_manager.py +4 -3
  33. parsl/data_provider/file_noop.py +1 -2
  34. parsl/data_provider/files.py +3 -3
  35. parsl/data_provider/ftp.py +1 -3
  36. parsl/data_provider/globus.py +7 -6
  37. parsl/data_provider/http.py +2 -2
  38. parsl/data_provider/rsync.py +1 -1
  39. parsl/data_provider/staging.py +2 -2
  40. parsl/data_provider/zip.py +135 -0
  41. parsl/dataflow/dependency_resolvers.py +115 -0
  42. parsl/dataflow/dflow.py +262 -224
  43. parsl/dataflow/errors.py +3 -5
  44. parsl/dataflow/futures.py +27 -14
  45. parsl/dataflow/memoization.py +5 -5
  46. parsl/dataflow/rundirs.py +5 -6
  47. parsl/dataflow/taskrecord.py +4 -5
  48. parsl/executors/__init__.py +4 -2
  49. parsl/executors/base.py +45 -15
  50. parsl/executors/errors.py +13 -0
  51. parsl/executors/execute_task.py +37 -0
  52. parsl/executors/flux/execute_parsl_task.py +3 -3
  53. parsl/executors/flux/executor.py +18 -19
  54. parsl/executors/flux/flux_instance_manager.py +26 -27
  55. parsl/executors/high_throughput/errors.py +43 -3
  56. parsl/executors/high_throughput/executor.py +316 -282
  57. parsl/executors/high_throughput/interchange.py +158 -167
  58. parsl/executors/high_throughput/manager_record.py +5 -0
  59. parsl/executors/high_throughput/manager_selector.py +55 -0
  60. parsl/executors/high_throughput/monitoring_info.py +2 -1
  61. parsl/executors/high_throughput/mpi_executor.py +113 -0
  62. parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
  63. parsl/executors/high_throughput/mpi_resource_management.py +6 -17
  64. parsl/executors/high_throughput/probe.py +9 -7
  65. parsl/executors/high_throughput/process_worker_pool.py +115 -77
  66. parsl/executors/high_throughput/zmq_pipes.py +81 -23
  67. parsl/executors/radical/executor.py +130 -79
  68. parsl/executors/radical/rpex_resources.py +17 -15
  69. parsl/executors/radical/rpex_worker.py +4 -3
  70. parsl/executors/status_handling.py +157 -51
  71. parsl/executors/taskvine/__init__.py +1 -1
  72. parsl/executors/taskvine/errors.py +1 -1
  73. parsl/executors/taskvine/exec_parsl_function.py +2 -2
  74. parsl/executors/taskvine/executor.py +41 -57
  75. parsl/executors/taskvine/factory.py +1 -1
  76. parsl/executors/taskvine/factory_config.py +1 -1
  77. parsl/executors/taskvine/manager.py +18 -13
  78. parsl/executors/taskvine/manager_config.py +9 -5
  79. parsl/executors/threads.py +6 -6
  80. parsl/executors/workqueue/errors.py +1 -1
  81. parsl/executors/workqueue/exec_parsl_function.py +6 -5
  82. parsl/executors/workqueue/executor.py +64 -63
  83. parsl/executors/workqueue/parsl_coprocess.py +1 -1
  84. parsl/jobs/error_handlers.py +2 -2
  85. parsl/jobs/job_status_poller.py +30 -113
  86. parsl/jobs/states.py +7 -2
  87. parsl/jobs/strategy.py +43 -31
  88. parsl/launchers/__init__.py +12 -3
  89. parsl/launchers/errors.py +1 -1
  90. parsl/launchers/launchers.py +6 -12
  91. parsl/log_utils.py +9 -6
  92. parsl/monitoring/db_manager.py +59 -95
  93. parsl/monitoring/errors.py +6 -0
  94. parsl/monitoring/monitoring.py +87 -356
  95. parsl/monitoring/queries/pandas.py +1 -2
  96. parsl/monitoring/radios/base.py +13 -0
  97. parsl/monitoring/radios/filesystem.py +52 -0
  98. parsl/monitoring/radios/htex.py +57 -0
  99. parsl/monitoring/radios/multiprocessing.py +17 -0
  100. parsl/monitoring/radios/udp.py +56 -0
  101. parsl/monitoring/radios/zmq.py +17 -0
  102. parsl/monitoring/remote.py +33 -37
  103. parsl/monitoring/router.py +212 -0
  104. parsl/monitoring/types.py +5 -6
  105. parsl/monitoring/visualization/app.py +4 -2
  106. parsl/monitoring/visualization/models.py +0 -1
  107. parsl/monitoring/visualization/plots/default/workflow_plots.py +11 -4
  108. parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
  109. parsl/monitoring/visualization/utils.py +0 -1
  110. parsl/monitoring/visualization/views.py +16 -8
  111. parsl/multiprocessing.py +0 -1
  112. parsl/process_loggers.py +1 -2
  113. parsl/providers/__init__.py +8 -17
  114. parsl/providers/aws/aws.py +2 -3
  115. parsl/providers/azure/azure.py +4 -5
  116. parsl/providers/base.py +2 -18
  117. parsl/providers/cluster_provider.py +4 -12
  118. parsl/providers/condor/condor.py +7 -17
  119. parsl/providers/errors.py +2 -2
  120. parsl/providers/googlecloud/googlecloud.py +2 -1
  121. parsl/providers/grid_engine/grid_engine.py +5 -14
  122. parsl/providers/kubernetes/kube.py +80 -40
  123. parsl/providers/local/local.py +13 -26
  124. parsl/providers/lsf/lsf.py +5 -23
  125. parsl/providers/pbspro/pbspro.py +5 -17
  126. parsl/providers/slurm/slurm.py +81 -39
  127. parsl/providers/torque/torque.py +3 -14
  128. parsl/serialize/__init__.py +8 -3
  129. parsl/serialize/base.py +1 -2
  130. parsl/serialize/concretes.py +5 -4
  131. parsl/serialize/facade.py +3 -3
  132. parsl/serialize/proxystore.py +3 -2
  133. parsl/tests/__init__.py +1 -1
  134. parsl/tests/configs/azure_single_node.py +4 -5
  135. parsl/tests/configs/bridges.py +3 -2
  136. parsl/tests/configs/cc_in2p3.py +1 -3
  137. parsl/tests/configs/comet.py +2 -1
  138. parsl/tests/configs/ec2_single_node.py +1 -2
  139. parsl/tests/configs/ec2_spot.py +1 -2
  140. parsl/tests/configs/flux_local.py +11 -0
  141. parsl/tests/configs/frontera.py +2 -3
  142. parsl/tests/configs/htex_local.py +3 -5
  143. parsl/tests/configs/htex_local_alternate.py +11 -15
  144. parsl/tests/configs/htex_local_intask_staging.py +5 -9
  145. parsl/tests/configs/htex_local_rsync_staging.py +4 -8
  146. parsl/tests/configs/local_radical.py +1 -3
  147. parsl/tests/configs/local_radical_mpi.py +2 -2
  148. parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
  149. parsl/tests/configs/local_threads_monitoring.py +0 -1
  150. parsl/tests/configs/midway.py +2 -2
  151. parsl/tests/configs/nscc_singapore.py +3 -3
  152. parsl/tests/configs/osg_htex.py +1 -1
  153. parsl/tests/configs/petrelkube.py +3 -2
  154. parsl/tests/configs/slurm_local.py +24 -0
  155. parsl/tests/configs/summit.py +1 -0
  156. parsl/tests/configs/taskvine_ex.py +4 -7
  157. parsl/tests/configs/user_opts.py +2 -8
  158. parsl/tests/configs/workqueue_ex.py +4 -6
  159. parsl/tests/conftest.py +27 -13
  160. parsl/tests/integration/test_stress/test_python_simple.py +3 -4
  161. parsl/tests/integration/test_stress/test_python_threads.py +3 -5
  162. parsl/tests/manual_tests/htex_local.py +4 -6
  163. parsl/tests/manual_tests/test_basic.py +1 -0
  164. parsl/tests/manual_tests/test_log_filter.py +3 -1
  165. parsl/tests/manual_tests/test_memory_limits.py +6 -8
  166. parsl/tests/manual_tests/test_regression_220.py +2 -1
  167. parsl/tests/manual_tests/test_udp_simple.py +4 -4
  168. parsl/tests/manual_tests/test_worker_count.py +3 -2
  169. parsl/tests/scaling_tests/htex_local.py +2 -4
  170. parsl/tests/scaling_tests/test_scale.py +0 -9
  171. parsl/tests/scaling_tests/vineex_condor.py +1 -2
  172. parsl/tests/scaling_tests/vineex_local.py +1 -2
  173. parsl/tests/site_tests/site_config_selector.py +1 -6
  174. parsl/tests/site_tests/test_provider.py +4 -2
  175. parsl/tests/site_tests/test_site.py +2 -0
  176. parsl/tests/sites/test_affinity.py +7 -7
  177. parsl/tests/sites/test_dynamic_executor.py +3 -4
  178. parsl/tests/sites/test_ec2.py +3 -2
  179. parsl/tests/sites/test_worker_info.py +4 -5
  180. parsl/tests/test_aalst_patterns.py +0 -1
  181. parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
  182. parsl/tests/test_bash_apps/test_basic.py +10 -4
  183. parsl/tests/test_bash_apps/test_error_codes.py +5 -7
  184. parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
  185. parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
  186. parsl/tests/test_bash_apps/test_memoize.py +2 -8
  187. parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
  188. parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
  189. parsl/tests/test_bash_apps/test_multiline.py +1 -1
  190. parsl/tests/test_bash_apps/test_pipeline.py +1 -1
  191. parsl/tests/test_bash_apps/test_std_uri.py +123 -0
  192. parsl/tests/test_bash_apps/test_stdout.py +33 -8
  193. parsl/tests/test_callables.py +2 -2
  194. parsl/tests/test_checkpointing/test_periodic.py +21 -39
  195. parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
  196. parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
  197. parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
  198. parsl/tests/test_checkpointing/test_regression_239.py +1 -1
  199. parsl/tests/test_checkpointing/test_task_exit.py +2 -3
  200. parsl/tests/test_docs/test_from_slides.py +5 -2
  201. parsl/tests/test_docs/test_kwargs.py +4 -1
  202. parsl/tests/test_docs/test_tutorial_1.py +1 -2
  203. parsl/tests/test_docs/test_workflow1.py +2 -2
  204. parsl/tests/test_docs/test_workflow2.py +0 -1
  205. parsl/tests/test_error_handling/test_rand_fail.py +2 -2
  206. parsl/tests/test_error_handling/test_resource_spec.py +10 -12
  207. parsl/tests/test_error_handling/test_retries.py +6 -16
  208. parsl/tests/test_error_handling/test_retry_handler.py +1 -0
  209. parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
  210. parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
  211. parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
  212. parsl/tests/test_execute_task.py +29 -0
  213. parsl/tests/test_flux.py +1 -1
  214. parsl/tests/test_htex/test_basic.py +2 -3
  215. parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
  216. parsl/tests/test_htex/test_command_client_timeout.py +66 -0
  217. parsl/tests/test_htex/test_connected_blocks.py +3 -2
  218. parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
  219. parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
  220. parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
  221. parsl/tests/test_htex/test_drain.py +79 -0
  222. parsl/tests/test_htex/test_htex.py +51 -25
  223. parsl/tests/test_htex/test_manager_failure.py +0 -1
  224. parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
  225. parsl/tests/test_htex/test_managers_command.py +36 -0
  226. parsl/tests/test_htex/test_missing_worker.py +2 -12
  227. parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
  228. parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
  229. parsl/tests/test_htex/test_zmq_binding.py +29 -8
  230. parsl/tests/test_monitoring/test_app_names.py +86 -0
  231. parsl/tests/test_monitoring/test_basic.py +73 -25
  232. parsl/tests/test_monitoring/test_db_locks.py +6 -4
  233. parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
  234. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
  235. parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
  236. parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
  237. parsl/tests/test_monitoring/test_stdouterr.py +134 -0
  238. parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
  239. parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
  240. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
  241. parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
  242. parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
  243. parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
  244. parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
  245. parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
  246. parsl/tests/test_providers/test_local_provider.py +3 -132
  247. parsl/tests/test_providers/test_pbspro_template.py +2 -3
  248. parsl/tests/test_providers/test_slurm_template.py +2 -3
  249. parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
  250. parsl/tests/test_python_apps/test_context_manager.py +128 -0
  251. parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
  252. parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
  253. parsl/tests/test_python_apps/test_fail.py +0 -25
  254. parsl/tests/test_python_apps/test_futures.py +2 -1
  255. parsl/tests/test_python_apps/test_inputs_default.py +22 -0
  256. parsl/tests/test_python_apps/test_join.py +0 -1
  257. parsl/tests/test_python_apps/test_lifted.py +11 -7
  258. parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
  259. parsl/tests/test_python_apps/test_outputs.py +1 -1
  260. parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
  261. parsl/tests/test_radical/test_mpi_funcs.py +1 -2
  262. parsl/tests/test_regression/test_1480.py +2 -1
  263. parsl/tests/test_regression/test_1653.py +2 -1
  264. parsl/tests/test_regression/test_226.py +1 -0
  265. parsl/tests/test_regression/test_2652.py +1 -0
  266. parsl/tests/test_regression/test_69a.py +0 -1
  267. parsl/tests/test_regression/test_854.py +4 -2
  268. parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
  269. parsl/tests/test_regression/test_98.py +0 -1
  270. parsl/tests/test_scaling/test_block_error_handler.py +9 -4
  271. parsl/tests/test_scaling/test_regression_1621.py +11 -15
  272. parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
  273. parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
  274. parsl/tests/test_scaling/test_scale_down.py +2 -5
  275. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +6 -18
  276. parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
  277. parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
  278. parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
  279. parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
  280. parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
  281. parsl/tests/test_serialization/test_basic.py +2 -1
  282. parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
  283. parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
  284. parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
  285. parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
  286. parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
  287. parsl/tests/test_staging/staging_provider.py +2 -2
  288. parsl/tests/test_staging/test_1316.py +3 -4
  289. parsl/tests/test_staging/test_docs_1.py +2 -1
  290. parsl/tests/test_staging/test_docs_2.py +2 -1
  291. parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
  292. parsl/tests/{test_data → test_staging}/test_file.py +6 -6
  293. parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
  294. parsl/tests/test_staging/test_staging_ftp.py +1 -0
  295. parsl/tests/test_staging/test_staging_https.py +5 -2
  296. parsl/tests/test_staging/test_staging_stdout.py +64 -0
  297. parsl/tests/test_staging/test_zip_in.py +39 -0
  298. parsl/tests/test_staging/test_zip_out.py +110 -0
  299. parsl/tests/test_staging/test_zip_to_zip.py +41 -0
  300. parsl/tests/test_summary.py +2 -2
  301. parsl/tests/test_thread_parallelism.py +0 -1
  302. parsl/tests/test_threads/test_configs.py +1 -2
  303. parsl/tests/test_threads/test_lazy_errors.py +2 -2
  304. parsl/tests/test_utils/test_execute_wait.py +35 -0
  305. parsl/tests/test_utils/test_sanitize_dns.py +76 -0
  306. parsl/tests/unit/test_address.py +20 -0
  307. parsl/tests/unit/test_file.py +99 -0
  308. parsl/tests/unit/test_usage_tracking.py +66 -0
  309. parsl/usage_tracking/api.py +65 -0
  310. parsl/usage_tracking/levels.py +6 -0
  311. parsl/usage_tracking/usage.py +104 -62
  312. parsl/utils.py +139 -6
  313. parsl/version.py +1 -1
  314. {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
  315. parsl-2025.1.13.data/scripts/interchange.py +649 -0
  316. {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +115 -77
  317. parsl-2025.1.13.dist-info/METADATA +96 -0
  318. parsl-2025.1.13.dist-info/RECORD +462 -0
  319. {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
  320. parsl/channels/__init__.py +0 -7
  321. parsl/channels/base.py +0 -141
  322. parsl/channels/errors.py +0 -113
  323. parsl/channels/local/local.py +0 -164
  324. parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
  325. parsl/channels/ssh/ssh.py +0 -276
  326. parsl/channels/ssh_il/__init__.py +0 -0
  327. parsl/channels/ssh_il/ssh_il.py +0 -74
  328. parsl/configs/ad_hoc.py +0 -35
  329. parsl/executors/radical/rpex_master.py +0 -42
  330. parsl/monitoring/radios.py +0 -175
  331. parsl/providers/ad_hoc/__init__.py +0 -0
  332. parsl/providers/ad_hoc/ad_hoc.py +0 -248
  333. parsl/providers/cobalt/__init__.py +0 -0
  334. parsl/providers/cobalt/cobalt.py +0 -236
  335. parsl/providers/cobalt/template.py +0 -17
  336. parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
  337. parsl/tests/configs/cooley_htex.py +0 -37
  338. parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
  339. parsl/tests/configs/local_adhoc.py +0 -18
  340. parsl/tests/configs/swan_htex.py +0 -43
  341. parsl/tests/configs/theta.py +0 -37
  342. parsl/tests/integration/test_channels/__init__.py +0 -0
  343. parsl/tests/integration/test_channels/test_channels.py +0 -17
  344. parsl/tests/integration/test_channels/test_local_channel.py +0 -42
  345. parsl/tests/integration/test_channels/test_scp_1.py +0 -45
  346. parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
  347. parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
  348. parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
  349. parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
  350. parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
  351. parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
  352. parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
  353. parsl/tests/sites/test_local_adhoc.py +0 -61
  354. parsl/tests/test_channels/__init__.py +0 -0
  355. parsl/tests/test_channels/test_large_output.py +0 -22
  356. parsl/tests/test_data/__init__.py +0 -0
  357. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
  358. parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
  359. parsl-2024.3.11.dist-info/METADATA +0 -98
  360. parsl-2024.3.11.dist-info/RECORD +0 -447
  361. parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
  362. parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
  363. parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
  364. parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
  365. parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
  366. {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
  367. {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
  368. {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
  369. {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
@@ -1,43 +1,41 @@
1
- import typing
2
- from concurrent.futures import Future
3
- import typeguard
4
1
  import logging
5
- import threading
6
- import queue
7
- import datetime
8
- import pickle
9
- from dataclasses import dataclass
10
- from multiprocessing import Process, Queue
11
- from typing import Dict, Sequence
12
- from typing import List, Optional, Tuple, Union, Callable
13
2
  import math
3
+ import pickle
4
+ import subprocess
5
+ import threading
6
+ import typing
14
7
  import warnings
8
+ from collections import defaultdict
9
+ from concurrent.futures import Future
10
+ from dataclasses import dataclass
11
+ from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
12
+
13
+ import typeguard
15
14
 
16
- import parsl.launchers
17
- from parsl.serialize import pack_res_spec_apply_message, deserialize
18
- from parsl.serialize.errors import SerializationError, DeserializationError
15
+ from parsl import curvezmq
16
+ from parsl.addresses import get_all_addresses
19
17
  from parsl.app.errors import RemoteExceptionWrapper
20
- from parsl.jobs.states import JobStatus, JobState
21
- from parsl.executors.high_throughput import zmq_pipes
22
- from parsl.executors.high_throughput import interchange
18
+ from parsl.data_provider.staging import Staging
23
19
  from parsl.executors.errors import (
24
- BadMessage, ScalingFailed,
20
+ BadMessage,
21
+ InvalidResourceSpecification,
22
+ ScalingFailed,
25
23
  )
26
- from parsl.executors.high_throughput.mpi_prefix_composer import (
27
- VALID_LAUNCHERS,
28
- validate_resource_spec
24
+ from parsl.executors.high_throughput import zmq_pipes
25
+ from parsl.executors.high_throughput.errors import CommandClientTimeoutError
26
+ from parsl.executors.high_throughput.manager_selector import (
27
+ ManagerSelector,
28
+ RandomManagerSelector,
29
29
  )
30
-
31
- from parsl import curvezmq
32
30
  from parsl.executors.status_handling import BlockProviderExecutor
33
- from parsl.providers.base import ExecutionProvider
34
- from parsl.data_provider.staging import Staging
35
- from parsl.addresses import get_all_addresses
31
+ from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
36
32
  from parsl.process_loggers import wrap_with_logs
37
-
38
- from parsl.multiprocessing import ForkProcess
39
- from parsl.utils import RepresentationMixin
40
33
  from parsl.providers import LocalProvider
34
+ from parsl.providers.base import ExecutionProvider
35
+ from parsl.serialize import deserialize, pack_res_spec_apply_message
36
+ from parsl.serialize.errors import DeserializationError, SerializationError
37
+ from parsl.usage_tracking.api import UsageInformation
38
+ from parsl.utils import RepresentationMixin
41
39
 
42
40
  logger = logging.getLogger(__name__)
43
41
 
@@ -55,54 +53,16 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
55
53
  "--hb_period={heartbeat_period} "
56
54
  "{address_probe_timeout_string} "
57
55
  "--hb_threshold={heartbeat_threshold} "
56
+ "--drain_period={drain_period} "
58
57
  "--cpu-affinity {cpu_affinity} "
59
58
  "{enable_mpi_mode} "
60
59
  "--mpi-launcher={mpi_launcher} "
61
60
  "--available-accelerators {accelerators}")
62
61
 
62
+ DEFAULT_INTERCHANGE_LAUNCH_CMD = ["interchange.py"]
63
63
 
64
- class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
65
- """Executor designed for cluster-scale
66
-
67
- The HighThroughputExecutor system has the following components:
68
- 1. The HighThroughputExecutor instance which is run as part of the Parsl script.
69
- 2. The Interchange which acts as a load-balancing proxy between workers and Parsl
70
- 3. The multiprocessing based worker pool which coordinates task execution over several
71
- cores on a node.
72
- 4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
73
-
74
- Here is a diagram
75
-
76
- .. code:: python
77
-
78
-
79
- | Data | Executor | Interchange | External Process(es)
80
- | Flow | | |
81
- Task | Kernel | | |
82
- +----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
83
- | | | | batching | | |
84
- Parsl<---Fut-| | | load-balancing| result exception
85
- ^ | | | watchdogs | | |
86
- | | | Q_mngmnt | | V V
87
- | | | Thread<--|-incoming_q<---|--- +---------+
88
- | | | | | |
89
- | | | | | |
90
- +----update_fut-----+
91
-
92
-
93
- Each of the workers in each process_worker_pool has access to its local rank through
94
- an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
95
- and is an integer in the range from 0 to the number of workers in the pool minus 1.
96
- The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
97
- and the size of the worker pool as ``PARSL_WORKER_COUNT``.
98
-
99
-
100
- Parameters
101
- ----------
102
-
103
- provider : :class:`~parsl.providers.base.ExecutionProvider`
64
+ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
104
65
  Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
105
- :class:`~parsl.providers.cobalt.cobalt.Cobalt`,
106
66
  :class:`~parsl.providers.condor.condor.Condor`,
107
67
  :class:`~parsl.providers.googlecloud.googlecloud.GoogleCloud`,
108
68
  :class:`~parsl.providers.gridEngine.gridEngine.GridEngine`,
@@ -120,9 +80,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
120
80
  cores_per_worker, nodes_per_block, heartbeat_period, heartbeat_threshold, logdir). For example:
121
81
  launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}"
122
82
 
83
+ interchange_launch_cmd : Sequence[str]
84
+ Custom sequence of command line tokens to launch the interchange process from the executor. If
85
+ undefined, the executor will use the default "interchange.py" command.
86
+
123
87
  address : string
124
88
  An address to connect to the main Parsl process which is reachable from the network in which
125
- workers will be running. This field expects an IPv4 address (xxx.xxx.xxx.xxx).
89
+ workers will be running. This field expects an IPv4 or IPv6 address.
126
90
  Most login nodes on clusters have several network interfaces available, only some of which
127
91
  can be reached from the compute nodes. This field can be used to limit the executor to listen
128
92
  only on a specific interface, and limiting connections to the internal network.
@@ -130,6 +94,11 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
130
94
  Setting an address here overrides the default behavior.
131
95
  default=None
132
96
 
97
+ loopback_address: string
98
+ Specify address used for internal communication between executor and interchange.
99
+ Supports IPv4 and IPv6 addresses
100
+ default=127.0.0.1
101
+
133
102
  worker_ports : (int, int)
134
103
  Specify the ports to be used by workers to connect to Parsl. If this option is specified,
135
104
  worker_port_range will not be honored.
@@ -146,6 +115,91 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
146
115
  worker_debug : Bool
147
116
  Enables worker debug logging.
148
117
 
118
+ prefetch_capacity : int
119
+ Number of tasks that could be prefetched over available worker capacity.
120
+ When there are a few tasks (<100) or when tasks are long running, this option should
121
+ be set to 0 for better load balancing. Default is 0.
122
+
123
+ address_probe_timeout : int | None
124
+ Managers attempt connecting over many different addresses to determine a viable address.
125
+ This option sets a time limit in seconds on the connection attempt.
126
+ Default of None implies 30s timeout set on worker.
127
+
128
+ heartbeat_threshold : int
129
+ Seconds since the last message from the counterpart in the communication pair:
130
+ (interchange, manager) after which the counterpart is assumed to be un-available. Default: 120s
131
+
132
+ heartbeat_period : int
133
+ Number of seconds after which a heartbeat message indicating liveness is sent to the
134
+ counterpart (interchange, manager). Default: 30s
135
+
136
+ poll_period : int
137
+ Timeout period to be used by the executor components in milliseconds. Increasing poll_periods
138
+ trades performance for cpu efficiency. Default: 10ms
139
+
140
+ drain_period : int
141
+ The number of seconds after start when workers will begin to drain
142
+ and then exit. Set this to a time that is slightly less than the
143
+ maximum walltime of batch jobs to avoid killing tasks while they
144
+ execute. For example, you could set this to the walltime minus a grace
145
+ period for the batch job to start the workers, minus the expected
146
+ maximum length of an individual task.
147
+
148
+ worker_logdir_root : string
149
+ In case of a remote file system, specify the path to where logs will be kept.
150
+
151
+ encrypted : bool
152
+ Flag to enable/disable encryption (CurveZMQ). Default is False.
153
+
154
+ manager_selector: ManagerSelector
155
+ Determines what strategy the interchange uses to select managers during task distribution.
156
+ See API reference under "Manager Selectors" regarding the various manager selectors.
157
+ Default: 'RandomManagerSelector'
158
+ """ # Documentation for params used by both HTEx and MPIEx
159
+
160
+
161
+ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
162
+ __doc__ = f"""Executor designed for cluster-scale
163
+
164
+ The HighThroughputExecutor system has the following components:
165
+ 1. The HighThroughputExecutor instance which is run as part of the Parsl script.
166
+ 2. The Interchange which acts as a load-balancing proxy between workers and Parsl
167
+ 3. The multiprocessing based worker pool which coordinates task execution over several
168
+ cores on a node.
169
+ 4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
170
+
171
+ Here is a diagram
172
+
173
+ .. code:: python
174
+
175
+
176
+ | Data | Executor | Interchange | External Process(es)
177
+ | Flow | | |
178
+ Task | Kernel | | |
179
+ +----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
180
+ | | | | batching | | |
181
+ Parsl<---Fut-| | | load-balancing| result exception
182
+ ^ | | | watchdogs | | |
183
+ | | | Result | | | |
184
+ | | | Queue | | V V
185
+ | | | Thread<--|-incoming_q<---|--- +---------+
186
+ | | | | | |
187
+ | | | | | |
188
+ +----update_fut-----+
189
+
190
+
191
+ Each of the workers in each process_worker_pool has access to its local rank through
192
+ an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
193
+ and is an integer in the range from 0 to the number of workers in the pool minus 1.
194
+ The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
195
+ and the size of the worker pool as ``PARSL_WORKER_COUNT``.
196
+
197
+
198
+ Parameters
199
+ ----------
200
+
201
+ {GENERAL_HTEX_PARAM_DOCS}
202
+
149
203
  cores_per_worker : float
150
204
  cores to be assigned to each worker. Oversubscription is possible
151
205
  by setting cores_per_worker < 1.0. Default=1
@@ -155,9 +209,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
155
209
  will check the available memory at startup and limit the number of workers such that
156
210
  there's sufficient memory for each worker. Default: None
157
211
 
158
- max_workers : int
159
- Deprecated. Please use max_workers_per_node instead.
160
-
161
212
  max_workers_per_node : int
162
213
  Caps the number of workers launched per node. Default: None
163
214
 
@@ -179,44 +230,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
179
230
 
180
231
  default: empty list
181
232
 
182
- prefetch_capacity : int
183
- Number of tasks that could be prefetched over available worker capacity.
184
- When there are a few tasks (<100) or when tasks are long running, this option should
185
- be set to 0 for better load balancing. Default is 0.
186
-
187
- address_probe_timeout : int | None
188
- Managers attempt connecting over many different addresses to determine a viable address.
189
- This option sets a time limit in seconds on the connection attempt.
190
- Default of None implies 30s timeout set on worker.
191
-
192
- heartbeat_threshold : int
193
- Seconds since the last message from the counterpart in the communication pair:
194
- (interchange, manager) after which the counterpart is assumed to be un-available. Default: 120s
195
-
196
- heartbeat_period : int
197
- Number of seconds after which a heartbeat message indicating liveness is sent to the
198
- counterpart (interchange, manager). Default: 30s
199
-
200
- poll_period : int
201
- Timeout period to be used by the executor components in milliseconds. Increasing poll_periods
202
- trades performance for cpu efficiency. Default: 10ms
203
-
204
- worker_logdir_root : string
205
- In case of a remote file system, specify the path to where logs will be kept.
206
-
207
- enable_mpi_mode: bool
208
- If enabled, MPI launch prefixes will be composed for the batch scheduler based on
209
- the nodes available in each batch job and the resource_specification dict passed
210
- from the app. This is an experimental feature, please refer to the following doc section
211
- before use: https://parsl.readthedocs.io/en/stable/userguide/mpi_apps.html
212
-
213
- mpi_launcher: str
214
- This field is only used if enable_mpi_mode is set. Select one from the
215
- list of supported MPI launchers = ("srun", "aprun", "mpiexec").
216
- default: "mpiexec"
217
-
218
- encrypted : bool
219
- Flag to enable/disable encryption (CurveZMQ). Default is False.
220
233
  """
221
234
 
222
235
  @typeguard.typechecked
@@ -224,7 +237,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
224
237
  label: str = 'HighThroughputExecutor',
225
238
  provider: ExecutionProvider = LocalProvider(),
226
239
  launch_cmd: Optional[str] = None,
240
+ interchange_launch_cmd: Optional[Sequence[str]] = None,
227
241
  address: Optional[str] = None,
242
+ loopback_address: str = "127.0.0.1",
228
243
  worker_ports: Optional[Tuple[int, int]] = None,
229
244
  worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
230
245
  interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
@@ -233,18 +248,17 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
233
248
  worker_debug: bool = False,
234
249
  cores_per_worker: float = 1.0,
235
250
  mem_per_worker: Optional[float] = None,
236
- max_workers: Optional[Union[int, float]] = None,
237
251
  max_workers_per_node: Optional[Union[int, float]] = None,
238
252
  cpu_affinity: str = 'none',
239
253
  available_accelerators: Union[int, Sequence[str]] = (),
240
254
  prefetch_capacity: int = 0,
241
255
  heartbeat_threshold: int = 120,
242
256
  heartbeat_period: int = 30,
257
+ drain_period: Optional[int] = None,
243
258
  poll_period: int = 10,
244
259
  address_probe_timeout: Optional[int] = None,
245
260
  worker_logdir_root: Optional[str] = None,
246
- enable_mpi_mode: bool = False,
247
- mpi_launcher: str = "mpiexec",
261
+ manager_selector: ManagerSelector = RandomManagerSelector(),
248
262
  block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
249
263
  encrypted: bool = False):
250
264
 
@@ -260,14 +274,15 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
260
274
  self.prefetch_capacity = prefetch_capacity
261
275
  self.address = address
262
276
  self.address_probe_timeout = address_probe_timeout
277
+ self.manager_selector = manager_selector
278
+ self.loopback_address = loopback_address
279
+
263
280
  if self.address:
264
281
  self.all_addresses = address
265
282
  else:
266
283
  self.all_addresses = ','.join(get_all_addresses())
267
284
 
268
- if max_workers:
269
- self._warn_deprecated("max_workers", "max_workers_per_node")
270
- self.max_workers_per_node = max_workers_per_node or max_workers or float("inf")
285
+ self.max_workers_per_node = max_workers_per_node or float("inf")
271
286
 
272
287
  mem_slots = self.max_workers_per_node
273
288
  cpu_slots = self.max_workers_per_node
@@ -294,15 +309,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
294
309
  self._workers_per_node = 1 # our best guess-- we do not have any provider hints
295
310
 
296
311
  self._task_counter = 0
297
- self.run_id = None # set to the correct run_id in dfk
298
- self.hub_address = None # set to the correct hub address in dfk
299
- self.hub_port = None # set to the correct hub port in dfk
300
312
  self.worker_ports = worker_ports
301
313
  self.worker_port_range = worker_port_range
302
- self.interchange_proc: Optional[Process] = None
314
+ self.interchange_proc: Optional[subprocess.Popen] = None
303
315
  self.interchange_port_range = interchange_port_range
304
316
  self.heartbeat_threshold = heartbeat_threshold
305
317
  self.heartbeat_period = heartbeat_period
318
+ self.drain_period = drain_period
306
319
  self.poll_period = poll_period
307
320
  self.run_dir = '.'
308
321
  self.worker_logdir_root = worker_logdir_root
@@ -310,20 +323,20 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
310
323
  self.encrypted = encrypted
311
324
  self.cert_dir = None
312
325
 
313
- self.enable_mpi_mode = enable_mpi_mode
314
- assert mpi_launcher in VALID_LAUNCHERS, \
315
- f"mpi_launcher must be set to one of {VALID_LAUNCHERS}"
316
- if self.enable_mpi_mode:
317
- assert isinstance(self.provider.launcher, parsl.launchers.SingleNodeLauncher), \
318
- "mpi_mode requires the provider to be configured to use a SingleNodeLauncher"
319
-
320
- self.mpi_launcher = mpi_launcher
321
-
322
326
  if not launch_cmd:
323
327
  launch_cmd = DEFAULT_LAUNCH_CMD
324
328
  self.launch_cmd = launch_cmd
325
329
 
330
+ if not interchange_launch_cmd:
331
+ interchange_launch_cmd = DEFAULT_INTERCHANGE_LAUNCH_CMD
332
+ self.interchange_launch_cmd = interchange_launch_cmd
333
+
334
+ self._result_queue_thread_exit = threading.Event()
335
+ self._result_queue_thread: Optional[threading.Thread] = None
336
+
326
337
  radio_mode = "htex"
338
+ enable_mpi_mode: bool = False
339
+ mpi_launcher: str = "mpiexec"
327
340
 
328
341
  def _warn_deprecated(self, old: str, new: str):
329
342
  warnings.warn(
@@ -333,16 +346,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
333
346
  stacklevel=2
334
347
  )
335
348
 
336
- @property
337
- def max_workers(self):
338
- self._warn_deprecated("max_workers", "max_workers_per_node")
339
- return self.max_workers_per_node
340
-
341
- @max_workers.setter
342
- def max_workers(self, val: Union[int, float]):
343
- self._warn_deprecated("max_workers", "max_workers_per_node")
344
- self.max_workers_per_node = val
345
-
346
349
  @property
347
350
  def logdir(self):
348
351
  return "{}/{}".format(self.run_dir, self.label)
@@ -353,6 +356,20 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
353
356
  return "{}/{}".format(self.worker_logdir_root, self.label)
354
357
  return self.logdir
355
358
 
359
+ def validate_resource_spec(self, resource_specification: dict):
360
+ """HTEX supports the following *Optional* resource specifications:
361
+ priority: lower value is higher priority"""
362
+ if resource_specification:
363
+ acceptable_fields = {'priority'}
364
+ keys = set(resource_specification.keys())
365
+ invalid_keys = keys - acceptable_fields
366
+ if invalid_keys:
367
+ message = "Task resource specification only accepts these types of resources: {}".format(
368
+ ', '.join(acceptable_fields))
369
+ logger.error(message)
370
+ raise InvalidResourceSpecification(set(invalid_keys), message)
371
+ return
372
+
356
373
  def initialize_scaling(self):
357
374
  """Compose the launch command and scale out the initial blocks.
358
375
  """
@@ -376,6 +393,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
376
393
  nodes_per_block=self.provider.nodes_per_block,
377
394
  heartbeat_period=self.heartbeat_period,
378
395
  heartbeat_threshold=self.heartbeat_threshold,
396
+ drain_period=self.drain_period,
379
397
  poll_period=self.poll_period,
380
398
  cert_dir=self.cert_dir,
381
399
  logdir=self.worker_logdir,
@@ -388,16 +406,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
388
406
 
389
407
  logger.debug("Starting HighThroughputExecutor with provider:\n%s", self.provider)
390
408
 
391
- # TODO: why is this a provider property?
392
- block_ids = []
393
- if hasattr(self.provider, 'init_blocks'):
394
- try:
395
- block_ids = self.scale_out(blocks=self.provider.init_blocks)
396
- except Exception as e:
397
- logger.error("Scaling out failed: {}".format(e))
398
- raise e
399
- return block_ids
400
-
401
409
  def start(self):
402
410
  """Create the Interchange process and connect to it.
403
411
  """
@@ -412,30 +420,28 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
412
420
  )
413
421
 
414
422
  self.outgoing_q = zmq_pipes.TasksOutgoing(
415
- curvezmq.ClientContext(self.cert_dir), "127.0.0.1", self.interchange_port_range
423
+ self.loopback_address, self.interchange_port_range, self.cert_dir
416
424
  )
417
425
  self.incoming_q = zmq_pipes.ResultsIncoming(
418
- curvezmq.ClientContext(self.cert_dir), "127.0.0.1", self.interchange_port_range
426
+ self.loopback_address, self.interchange_port_range, self.cert_dir
419
427
  )
420
428
  self.command_client = zmq_pipes.CommandClient(
421
- curvezmq.ClientContext(self.cert_dir), "127.0.0.1", self.interchange_port_range
429
+ self.loopback_address, self.interchange_port_range, self.cert_dir
422
430
  )
423
431
 
424
- self._queue_management_thread = None
425
- self._start_queue_management_thread()
432
+ self._result_queue_thread = None
433
+ self._start_result_queue_thread()
426
434
  self._start_local_interchange_process()
427
435
 
428
- logger.debug("Created management thread: {}".format(self._queue_management_thread))
436
+ logger.debug("Created result queue thread: %s", self._result_queue_thread)
429
437
 
430
- block_ids = self.initialize_scaling()
431
- return block_ids
438
+ self.initialize_scaling()
432
439
 
433
440
  @wrap_with_logs
434
- def _queue_management_worker(self):
435
- """Listen to the queue for task status messages and handle them.
441
+ def _result_queue_worker(self):
442
+ """Listen to the queue for task result messages and handle them.
436
443
 
437
- Depending on the message, tasks will be updated with results, exceptions,
438
- or updates. It expects the following messages:
444
+ Depending on the message, tasks will be updated with results or exceptions.
439
445
 
440
446
  .. code:: python
441
447
 
@@ -449,14 +455,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
449
455
  "task_id" : <task_id>
450
456
  "exception" : serialized exception object, on failure
451
457
  }
452
-
453
- The `None` message is a die request.
454
458
  """
455
- logger.debug("Queue management worker starting")
459
+ logger.debug("Result queue worker starting")
456
460
 
457
- while not self.bad_state_is_set:
461
+ while not self.bad_state_is_set and not self._result_queue_thread_exit.is_set():
458
462
  try:
459
- msgs = self.incoming_q.get()
463
+ msgs = self.incoming_q.get(timeout_ms=self.poll_period)
464
+ if msgs is None: # timeout
465
+ continue
460
466
 
461
467
  except IOError as e:
462
468
  logger.exception("Caught broken queue with exception code {}: {}".format(e.errno, e))
@@ -468,109 +474,114 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
468
474
 
469
475
  else:
470
476
 
471
- if msgs is None:
472
- logger.debug("Got None, exiting")
473
- return
477
+ for serialized_msg in msgs:
478
+ try:
479
+ msg = pickle.loads(serialized_msg)
480
+ except pickle.UnpicklingError:
481
+ raise BadMessage("Message received could not be unpickled")
474
482
 
475
- else:
476
- for serialized_msg in msgs:
483
+ if msg['type'] == 'result':
477
484
  try:
478
- msg = pickle.loads(serialized_msg)
479
- except pickle.UnpicklingError:
480
- raise BadMessage("Message received could not be unpickled")
485
+ tid = msg['task_id']
486
+ except Exception:
487
+ raise BadMessage("Message received does not contain 'task_id' field")
488
+
489
+ if tid == -1 and 'exception' in msg:
490
+ logger.warning("Executor shutting down due to exception from interchange")
491
+ exception = deserialize(msg['exception'])
492
+ self.set_bad_state_and_fail_all(exception)
493
+ break
494
+
495
+ task_fut = self.tasks.pop(tid)
481
496
 
482
- if msg['type'] == 'heartbeat':
483
- continue
484
- elif msg['type'] == 'result':
497
+ if 'result' in msg:
498
+ result = deserialize(msg['result'])
499
+ task_fut.set_result(result)
500
+
501
+ elif 'exception' in msg:
485
502
  try:
486
- tid = msg['task_id']
487
- except Exception:
488
- raise BadMessage("Message received does not contain 'task_id' field")
489
-
490
- if tid == -1 and 'exception' in msg:
491
- logger.warning("Executor shutting down due to exception from interchange")
492
- exception = deserialize(msg['exception'])
493
- self.set_bad_state_and_fail_all(exception)
494
- break
495
-
496
- task_fut = self.tasks.pop(tid)
497
-
498
- if 'result' in msg:
499
- result = deserialize(msg['result'])
500
- task_fut.set_result(result)
501
-
502
- elif 'exception' in msg:
503
- try:
504
- s = deserialize(msg['exception'])
505
- # s should be a RemoteExceptionWrapper... so we can reraise it
506
- if isinstance(s, RemoteExceptionWrapper):
507
- try:
508
- s.reraise()
509
- except Exception as e:
510
- task_fut.set_exception(e)
511
- elif isinstance(s, Exception):
512
- task_fut.set_exception(s)
513
- else:
514
- raise ValueError("Unknown exception-like type received: {}".format(type(s)))
515
- except Exception as e:
516
- # TODO could be a proper wrapped exception?
517
- task_fut.set_exception(
518
- DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
519
- else:
520
- raise BadMessage("Message received is neither result or exception")
503
+ s = deserialize(msg['exception'])
504
+ # s should be a RemoteExceptionWrapper... so we can reraise it
505
+ if isinstance(s, RemoteExceptionWrapper):
506
+ try:
507
+ s.reraise()
508
+ except Exception as e:
509
+ task_fut.set_exception(e)
510
+ elif isinstance(s, Exception):
511
+ task_fut.set_exception(s)
512
+ else:
513
+ raise ValueError("Unknown exception-like type received: {}".format(type(s)))
514
+ except Exception as e:
515
+ # TODO could be a proper wrapped exception?
516
+ task_fut.set_exception(
517
+ DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
521
518
  else:
522
- raise BadMessage("Message received with unknown type {}".format(msg['type']))
519
+ raise BadMessage("Message received is neither result or exception")
520
+ else:
521
+ raise BadMessage("Message received with unknown type {}".format(msg['type']))
523
522
 
524
- logger.info("Queue management worker finished")
523
+ logger.info("Closing result ZMQ pipe")
524
+ self.incoming_q.close()
525
+ logger.info("Result queue worker finished")
525
526
 
526
- def _start_local_interchange_process(self):
527
+ def _start_local_interchange_process(self) -> None:
527
528
  """ Starts the interchange process locally
528
529
 
529
- Starts the interchange process locally and uses an internal command queue to
530
+ Starts the interchange process locally and uses the command queue to
530
531
  get the worker task and result ports that the interchange has bound to.
531
532
  """
532
- comm_q = Queue(maxsize=10)
533
- self.interchange_proc = ForkProcess(target=interchange.starter,
534
- args=(comm_q,),
535
- kwargs={"client_ports": (self.outgoing_q.port,
536
- self.incoming_q.port,
537
- self.command_client.port),
538
- "interchange_address": self.address,
539
- "worker_ports": self.worker_ports,
540
- "worker_port_range": self.worker_port_range,
541
- "hub_address": self.hub_address,
542
- "hub_port": self.hub_port,
543
- "logdir": self.logdir,
544
- "heartbeat_threshold": self.heartbeat_threshold,
545
- "poll_period": self.poll_period,
546
- "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
547
- "cert_dir": self.cert_dir,
548
- },
549
- daemon=True,
550
- name="HTEX-Interchange"
551
- )
552
- self.interchange_proc.start()
533
+
534
+ interchange_config = {"client_address": self.loopback_address,
535
+ "client_ports": (self.outgoing_q.port,
536
+ self.incoming_q.port,
537
+ self.command_client.port),
538
+ "interchange_address": self.address,
539
+ "worker_ports": self.worker_ports,
540
+ "worker_port_range": self.worker_port_range,
541
+ "hub_address": self.hub_address,
542
+ "hub_zmq_port": self.hub_zmq_port,
543
+ "logdir": self.logdir,
544
+ "heartbeat_threshold": self.heartbeat_threshold,
545
+ "poll_period": self.poll_period,
546
+ "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
547
+ "cert_dir": self.cert_dir,
548
+ "manager_selector": self.manager_selector,
549
+ "run_id": self.run_id,
550
+ }
551
+
552
+ config_pickle = pickle.dumps(interchange_config)
553
+
554
+ self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd, stdin=subprocess.PIPE)
555
+ stdin = self.interchange_proc.stdin
556
+ assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
557
+
558
+ logger.debug("Popened interchange process. Writing config object")
559
+ stdin.write(config_pickle)
560
+ stdin.flush()
561
+ stdin.close()
562
+ logger.debug("Sent config object. Requesting worker ports")
553
563
  try:
554
- (self.worker_task_port, self.worker_result_port) = comm_q.get(block=True, timeout=120)
555
- except queue.Empty:
556
- logger.error("Interchange has not completed initialization in 120s. Aborting")
564
+ (self.worker_task_port, self.worker_result_port) = self.command_client.run("WORKER_PORTS", timeout_s=120)
565
+ except CommandClientTimeoutError:
566
+ logger.error("Interchange has not completed initialization. Aborting")
557
567
  raise Exception("Interchange failed to start")
568
+ logger.debug("Got worker ports")
558
569
 
559
- def _start_queue_management_thread(self):
560
- """Method to start the management thread as a daemon.
570
+ def _start_result_queue_thread(self):
571
+ """Method to start the result queue thread as a daemon.
561
572
 
562
573
  Checks if a thread already exists, then starts it.
563
- Could be used later as a restart if the management thread dies.
574
+ Could be used later as a restart if the result queue thread dies.
564
575
  """
565
- if self._queue_management_thread is None:
566
- logger.debug("Starting queue management thread")
567
- self._queue_management_thread = threading.Thread(target=self._queue_management_worker, name="HTEX-Queue-Management-Thread")
568
- self._queue_management_thread.daemon = True
569
- self._queue_management_thread.start()
570
- logger.debug("Started queue management thread")
576
+ if self._result_queue_thread is None:
577
+ logger.debug("Starting result queue thread")
578
+ self._result_queue_thread = threading.Thread(target=self._result_queue_worker, name="HTEX-Result-Queue-Thread")
579
+ self._result_queue_thread.daemon = True
580
+ self._result_queue_thread.start()
581
+ logger.debug("Started result queue thread")
571
582
 
572
583
  else:
573
- logger.error("Management thread already exists, returning")
584
+ logger.error("Result queue thread already exists, returning")
574
585
 
575
586
  def hold_worker(self, worker_id: str) -> None:
576
587
  """Puts a worker on hold, preventing scheduling of additional tasks to it.
@@ -591,7 +602,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
591
602
  def outstanding(self) -> int:
592
603
  """Returns the count of tasks outstanding across the interchange
593
604
  and managers"""
594
- return self.command_client.run("OUTSTANDING_C")
605
+ return len(self.tasks)
595
606
 
596
607
  @property
597
608
  def connected_workers(self) -> int:
@@ -643,7 +654,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
643
654
  Returns:
644
655
  Future
645
656
  """
646
- validate_resource_spec(resource_specification)
657
+
658
+ self.validate_resource_spec(resource_specification)
647
659
 
648
660
  if self.bad_state_is_set:
649
661
  raise self.executor_exception
@@ -667,7 +679,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
667
679
  except TypeError:
668
680
  raise SerializationError(func.__name__)
669
681
 
670
- msg = {"task_id": task_id, "buffer": fn_buf}
682
+ msg = {"task_id": task_id, "resource_spec": resource_specification, "buffer": fn_buf}
671
683
 
672
684
  # Post task to the outgoing queue
673
685
  self.outgoing_q.put(msg)
@@ -675,22 +687,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
675
687
  # Return the future
676
688
  return fut
677
689
 
678
- def create_monitoring_info(self, status):
679
- """ Create a msg for monitoring based on the poll status
680
-
681
- """
682
- msg = []
683
- for bid, s in status.items():
684
- d = {}
685
- d['run_id'] = self.run_id
686
- d['status'] = s.status_name
687
- d['timestamp'] = datetime.datetime.now()
688
- d['executor_label'] = self.label
689
- d['job_id'] = self.blocks.get(bid, None)
690
- d['block_id'] = bid
691
- msg.append(d)
692
- return msg
693
-
694
690
  @property
695
691
  def workers_per_node(self) -> Union[int, float]:
696
692
  return self._workers_per_node
@@ -728,14 +724,24 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
728
724
  tasks: int # sum of tasks in this block
729
725
  idle: float # shortest idle time of any manager in this block
730
726
 
727
+ # block_info will be populated from two sources:
728
+ # the Job Status Poller mutable block list, and the list of blocks
729
+ # which have connected to the interchange.
730
+
731
+ def new_block_info():
732
+ return BlockInfo(tasks=0, idle=float('inf'))
733
+
734
+ block_info: Dict[str, BlockInfo] = defaultdict(new_block_info)
735
+
736
+ for block_id, job_status in self._status.items():
737
+ if job_status.state not in TERMINAL_STATES:
738
+ block_info[block_id] = new_block_info()
739
+
731
740
  managers = self.connected_managers()
732
- block_info: Dict[str, BlockInfo] = {}
733
741
  for manager in managers:
734
742
  if not manager['active']:
735
743
  continue
736
744
  b_id = manager['block_id']
737
- if b_id not in block_info:
738
- block_info[b_id] = BlockInfo(tasks=0, idle=float('inf'))
739
745
  block_info[b_id].tasks += manager['tasks']
740
746
  block_info[b_id].idle = min(block_info[b_id].idle, manager['idle_duration'])
741
747
 
@@ -767,14 +773,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
767
773
 
768
774
  # Now kill via provider
769
775
  # Potential issue with multiple threads trying to remove the same blocks
770
- to_kill = [self.blocks[bid] for bid in block_ids_to_kill if bid in self.blocks]
776
+ to_kill = [self.blocks_to_job_id[bid] for bid in block_ids_to_kill if bid in self.blocks_to_job_id]
771
777
 
772
778
  r = self.provider.cancel(to_kill)
773
779
  job_ids = self._filter_scale_in_ids(to_kill, r)
774
780
 
775
- # to_kill block_ids are fetched from self.blocks
776
- # If a block_id is in self.block, it must exist in self.block_mapping
777
- block_ids_killed = [self.block_mapping[jid] for jid in job_ids]
781
+ # to_kill block_ids are fetched from self.blocks_to_job_id
782
+ # If a block_id is in self.blocks_to_job_id, it must exist in self.job_ids_to_block
783
+ block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
778
784
 
779
785
  return block_ids_killed
780
786
 
@@ -789,7 +795,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
789
795
  connected_blocks = self.connected_blocks()
790
796
  for job_id in job_status:
791
797
  job_info = job_status[job_id]
792
- if job_info.terminal and job_id not in connected_blocks:
798
+ if job_info.terminal and job_id not in connected_blocks and job_info.state != JobState.SCALED_IN:
799
+ logger.debug("Rewriting job %s from status %s to MISSING", job_id, job_info)
793
800
  job_status[job_id].state = JobState.MISSING
794
801
  if job_status[job_id].message is None:
795
802
  job_status[job_id].message = (
@@ -817,10 +824,37 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
817
824
 
818
825
  logger.info("Attempting HighThroughputExecutor shutdown")
819
826
 
827
+ logger.info("Terminating interchange and result queue thread")
828
+ self._result_queue_thread_exit.set()
820
829
  self.interchange_proc.terminate()
821
- self.interchange_proc.join(timeout=timeout)
822
- if self.interchange_proc.is_alive():
823
- logger.info("Unable to terminate Interchange process; sending SIGKILL")
830
+ try:
831
+ self.interchange_proc.wait(timeout=timeout)
832
+ except subprocess.TimeoutExpired:
833
+ logger.warning("Unable to terminate Interchange process; sending SIGKILL")
824
834
  self.interchange_proc.kill()
825
835
 
836
+ logger.info("Closing ZMQ pipes")
837
+
838
+ # These pipes are used in a thread unsafe manner. If you have traced a
839
+ # problem to this block of code, you might consider what is happening
840
+ # with other threads that access these.
841
+
842
+ # incoming_q is not closed here because it is used by the results queue
843
+ # worker which is not shut down at this point.
844
+
845
+ if hasattr(self, 'outgoing_q'):
846
+ logger.info("Closing outgoing_q")
847
+ self.outgoing_q.close()
848
+
849
+ if hasattr(self, 'command_client'):
850
+ logger.info("Closing command client")
851
+ self.command_client.close()
852
+
853
+ logger.info("Waiting for result queue thread exit")
854
+ if self._result_queue_thread:
855
+ self._result_queue_thread.join()
856
+
826
857
  logger.info("Finished HighThroughputExecutor shutdown attempt")
858
+
859
+ def get_usage_information(self):
860
+ return {"mpi": self.enable_mpi_mode}