parsl 2024.3.18__py3-none-any.whl → 2025.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369)
  1. parsl/__init__.py +9 -10
  2. parsl/addresses.py +26 -6
  3. parsl/app/app.py +7 -8
  4. parsl/app/bash.py +15 -8
  5. parsl/app/errors.py +10 -13
  6. parsl/app/futures.py +8 -10
  7. parsl/app/python.py +2 -1
  8. parsl/benchmark/perf.py +2 -1
  9. parsl/concurrent/__init__.py +2 -2
  10. parsl/config.py +53 -10
  11. parsl/configs/ASPIRE1.py +6 -5
  12. parsl/configs/Azure.py +9 -8
  13. parsl/configs/bridges.py +6 -4
  14. parsl/configs/cc_in2p3.py +3 -3
  15. parsl/configs/ec2.py +3 -1
  16. parsl/configs/expanse.py +4 -3
  17. parsl/configs/frontera.py +3 -4
  18. parsl/configs/htex_local.py +3 -4
  19. parsl/configs/illinoiscluster.py +3 -1
  20. parsl/configs/improv.py +34 -0
  21. parsl/configs/kubernetes.py +4 -3
  22. parsl/configs/local_threads.py +5 -1
  23. parsl/configs/midway.py +5 -3
  24. parsl/configs/osg.py +4 -2
  25. parsl/configs/polaris.py +4 -2
  26. parsl/configs/stampede2.py +6 -5
  27. parsl/configs/summit.py +3 -3
  28. parsl/configs/toss3_llnl.py +4 -3
  29. parsl/configs/vineex_local.py +6 -4
  30. parsl/configs/wqex_local.py +5 -3
  31. parsl/curvezmq.py +4 -0
  32. parsl/data_provider/data_manager.py +4 -3
  33. parsl/data_provider/file_noop.py +1 -2
  34. parsl/data_provider/files.py +3 -3
  35. parsl/data_provider/ftp.py +1 -3
  36. parsl/data_provider/globus.py +7 -6
  37. parsl/data_provider/http.py +2 -2
  38. parsl/data_provider/rsync.py +1 -1
  39. parsl/data_provider/staging.py +2 -2
  40. parsl/data_provider/zip.py +135 -0
  41. parsl/dataflow/dependency_resolvers.py +115 -0
  42. parsl/dataflow/dflow.py +259 -223
  43. parsl/dataflow/errors.py +3 -5
  44. parsl/dataflow/futures.py +27 -14
  45. parsl/dataflow/memoization.py +5 -5
  46. parsl/dataflow/rundirs.py +5 -6
  47. parsl/dataflow/taskrecord.py +4 -5
  48. parsl/executors/__init__.py +4 -2
  49. parsl/executors/base.py +45 -15
  50. parsl/executors/errors.py +13 -0
  51. parsl/executors/execute_task.py +37 -0
  52. parsl/executors/flux/execute_parsl_task.py +3 -3
  53. parsl/executors/flux/executor.py +18 -19
  54. parsl/executors/flux/flux_instance_manager.py +26 -27
  55. parsl/executors/high_throughput/errors.py +43 -3
  56. parsl/executors/high_throughput/executor.py +307 -285
  57. parsl/executors/high_throughput/interchange.py +137 -168
  58. parsl/executors/high_throughput/manager_record.py +4 -0
  59. parsl/executors/high_throughput/manager_selector.py +55 -0
  60. parsl/executors/high_throughput/monitoring_info.py +2 -1
  61. parsl/executors/high_throughput/mpi_executor.py +113 -0
  62. parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
  63. parsl/executors/high_throughput/mpi_resource_management.py +6 -17
  64. parsl/executors/high_throughput/probe.py +9 -7
  65. parsl/executors/high_throughput/process_worker_pool.py +77 -75
  66. parsl/executors/high_throughput/zmq_pipes.py +81 -23
  67. parsl/executors/radical/executor.py +130 -79
  68. parsl/executors/radical/rpex_resources.py +17 -15
  69. parsl/executors/radical/rpex_worker.py +4 -3
  70. parsl/executors/status_handling.py +157 -51
  71. parsl/executors/taskvine/__init__.py +1 -1
  72. parsl/executors/taskvine/errors.py +1 -1
  73. parsl/executors/taskvine/exec_parsl_function.py +2 -2
  74. parsl/executors/taskvine/executor.py +38 -55
  75. parsl/executors/taskvine/factory.py +1 -1
  76. parsl/executors/taskvine/factory_config.py +1 -1
  77. parsl/executors/taskvine/manager.py +17 -13
  78. parsl/executors/taskvine/manager_config.py +7 -2
  79. parsl/executors/threads.py +6 -6
  80. parsl/executors/workqueue/errors.py +1 -1
  81. parsl/executors/workqueue/exec_parsl_function.py +6 -5
  82. parsl/executors/workqueue/executor.py +64 -63
  83. parsl/executors/workqueue/parsl_coprocess.py +1 -1
  84. parsl/jobs/error_handlers.py +2 -2
  85. parsl/jobs/job_status_poller.py +28 -112
  86. parsl/jobs/states.py +7 -2
  87. parsl/jobs/strategy.py +43 -31
  88. parsl/launchers/__init__.py +12 -3
  89. parsl/launchers/errors.py +1 -1
  90. parsl/launchers/launchers.py +0 -6
  91. parsl/log_utils.py +1 -2
  92. parsl/monitoring/db_manager.py +55 -93
  93. parsl/monitoring/errors.py +6 -0
  94. parsl/monitoring/monitoring.py +85 -311
  95. parsl/monitoring/queries/pandas.py +1 -2
  96. parsl/monitoring/radios/base.py +13 -0
  97. parsl/monitoring/radios/filesystem.py +52 -0
  98. parsl/monitoring/radios/htex.py +57 -0
  99. parsl/monitoring/radios/multiprocessing.py +17 -0
  100. parsl/monitoring/radios/udp.py +56 -0
  101. parsl/monitoring/radios/zmq.py +17 -0
  102. parsl/monitoring/remote.py +33 -37
  103. parsl/monitoring/router.py +212 -0
  104. parsl/monitoring/types.py +5 -6
  105. parsl/monitoring/visualization/app.py +4 -2
  106. parsl/monitoring/visualization/models.py +0 -1
  107. parsl/monitoring/visualization/plots/default/workflow_plots.py +8 -4
  108. parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
  109. parsl/monitoring/visualization/utils.py +0 -1
  110. parsl/monitoring/visualization/views.py +16 -9
  111. parsl/multiprocessing.py +0 -1
  112. parsl/process_loggers.py +1 -2
  113. parsl/providers/__init__.py +8 -17
  114. parsl/providers/aws/aws.py +2 -3
  115. parsl/providers/azure/azure.py +4 -5
  116. parsl/providers/base.py +2 -18
  117. parsl/providers/cluster_provider.py +3 -9
  118. parsl/providers/condor/condor.py +7 -17
  119. parsl/providers/errors.py +2 -2
  120. parsl/providers/googlecloud/googlecloud.py +2 -1
  121. parsl/providers/grid_engine/grid_engine.py +5 -14
  122. parsl/providers/kubernetes/kube.py +80 -40
  123. parsl/providers/local/local.py +13 -26
  124. parsl/providers/lsf/lsf.py +5 -23
  125. parsl/providers/pbspro/pbspro.py +5 -17
  126. parsl/providers/slurm/slurm.py +81 -39
  127. parsl/providers/torque/torque.py +3 -14
  128. parsl/serialize/__init__.py +8 -3
  129. parsl/serialize/base.py +1 -2
  130. parsl/serialize/concretes.py +5 -4
  131. parsl/serialize/facade.py +3 -3
  132. parsl/serialize/proxystore.py +3 -2
  133. parsl/tests/__init__.py +1 -1
  134. parsl/tests/configs/azure_single_node.py +4 -5
  135. parsl/tests/configs/bridges.py +3 -2
  136. parsl/tests/configs/cc_in2p3.py +1 -3
  137. parsl/tests/configs/comet.py +2 -1
  138. parsl/tests/configs/ec2_single_node.py +1 -2
  139. parsl/tests/configs/ec2_spot.py +1 -2
  140. parsl/tests/configs/flux_local.py +11 -0
  141. parsl/tests/configs/frontera.py +2 -3
  142. parsl/tests/configs/htex_local.py +3 -5
  143. parsl/tests/configs/htex_local_alternate.py +11 -15
  144. parsl/tests/configs/htex_local_intask_staging.py +5 -9
  145. parsl/tests/configs/htex_local_rsync_staging.py +4 -8
  146. parsl/tests/configs/local_radical.py +1 -3
  147. parsl/tests/configs/local_radical_mpi.py +2 -2
  148. parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
  149. parsl/tests/configs/local_threads_monitoring.py +0 -1
  150. parsl/tests/configs/midway.py +2 -2
  151. parsl/tests/configs/nscc_singapore.py +3 -3
  152. parsl/tests/configs/osg_htex.py +1 -1
  153. parsl/tests/configs/petrelkube.py +3 -2
  154. parsl/tests/configs/slurm_local.py +24 -0
  155. parsl/tests/configs/summit.py +1 -0
  156. parsl/tests/configs/taskvine_ex.py +4 -7
  157. parsl/tests/configs/user_opts.py +0 -7
  158. parsl/tests/configs/workqueue_ex.py +4 -6
  159. parsl/tests/conftest.py +27 -13
  160. parsl/tests/integration/test_stress/test_python_simple.py +3 -4
  161. parsl/tests/integration/test_stress/test_python_threads.py +3 -5
  162. parsl/tests/manual_tests/htex_local.py +4 -6
  163. parsl/tests/manual_tests/test_basic.py +1 -0
  164. parsl/tests/manual_tests/test_log_filter.py +3 -1
  165. parsl/tests/manual_tests/test_memory_limits.py +6 -8
  166. parsl/tests/manual_tests/test_regression_220.py +2 -1
  167. parsl/tests/manual_tests/test_udp_simple.py +4 -4
  168. parsl/tests/manual_tests/test_worker_count.py +3 -2
  169. parsl/tests/scaling_tests/htex_local.py +2 -4
  170. parsl/tests/scaling_tests/test_scale.py +0 -9
  171. parsl/tests/scaling_tests/vineex_condor.py +1 -2
  172. parsl/tests/scaling_tests/vineex_local.py +1 -2
  173. parsl/tests/site_tests/site_config_selector.py +1 -6
  174. parsl/tests/site_tests/test_provider.py +4 -2
  175. parsl/tests/site_tests/test_site.py +2 -0
  176. parsl/tests/sites/test_affinity.py +7 -7
  177. parsl/tests/sites/test_dynamic_executor.py +3 -4
  178. parsl/tests/sites/test_ec2.py +3 -2
  179. parsl/tests/sites/test_worker_info.py +4 -5
  180. parsl/tests/test_aalst_patterns.py +0 -1
  181. parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
  182. parsl/tests/test_bash_apps/test_basic.py +10 -4
  183. parsl/tests/test_bash_apps/test_error_codes.py +5 -7
  184. parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
  185. parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
  186. parsl/tests/test_bash_apps/test_memoize.py +2 -8
  187. parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
  188. parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
  189. parsl/tests/test_bash_apps/test_multiline.py +1 -1
  190. parsl/tests/test_bash_apps/test_pipeline.py +1 -1
  191. parsl/tests/test_bash_apps/test_std_uri.py +123 -0
  192. parsl/tests/test_bash_apps/test_stdout.py +33 -8
  193. parsl/tests/test_callables.py +2 -2
  194. parsl/tests/test_checkpointing/test_periodic.py +21 -39
  195. parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
  196. parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
  197. parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
  198. parsl/tests/test_checkpointing/test_regression_239.py +1 -1
  199. parsl/tests/test_checkpointing/test_task_exit.py +2 -3
  200. parsl/tests/test_docs/test_from_slides.py +5 -2
  201. parsl/tests/test_docs/test_kwargs.py +4 -1
  202. parsl/tests/test_docs/test_tutorial_1.py +1 -2
  203. parsl/tests/test_docs/test_workflow1.py +2 -2
  204. parsl/tests/test_docs/test_workflow2.py +0 -1
  205. parsl/tests/test_error_handling/test_rand_fail.py +2 -2
  206. parsl/tests/test_error_handling/test_resource_spec.py +10 -12
  207. parsl/tests/test_error_handling/test_retries.py +6 -16
  208. parsl/tests/test_error_handling/test_retry_handler.py +1 -0
  209. parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
  210. parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
  211. parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
  212. parsl/tests/test_execute_task.py +29 -0
  213. parsl/tests/test_flux.py +1 -1
  214. parsl/tests/test_htex/test_basic.py +2 -3
  215. parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
  216. parsl/tests/test_htex/test_command_client_timeout.py +66 -0
  217. parsl/tests/test_htex/test_connected_blocks.py +3 -2
  218. parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
  219. parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
  220. parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
  221. parsl/tests/test_htex/test_drain.py +11 -10
  222. parsl/tests/test_htex/test_htex.py +51 -25
  223. parsl/tests/test_htex/test_manager_failure.py +0 -1
  224. parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
  225. parsl/tests/test_htex/test_managers_command.py +36 -0
  226. parsl/tests/test_htex/test_missing_worker.py +2 -12
  227. parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
  228. parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
  229. parsl/tests/test_htex/test_zmq_binding.py +29 -8
  230. parsl/tests/test_monitoring/test_app_names.py +5 -5
  231. parsl/tests/test_monitoring/test_basic.py +73 -25
  232. parsl/tests/test_monitoring/test_db_locks.py +6 -4
  233. parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
  234. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
  235. parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
  236. parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
  237. parsl/tests/test_monitoring/test_stdouterr.py +134 -0
  238. parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
  239. parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
  240. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
  241. parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
  242. parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
  243. parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
  244. parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
  245. parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
  246. parsl/tests/test_providers/test_local_provider.py +3 -132
  247. parsl/tests/test_providers/test_pbspro_template.py +2 -3
  248. parsl/tests/test_providers/test_slurm_template.py +2 -3
  249. parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
  250. parsl/tests/test_python_apps/test_context_manager.py +128 -0
  251. parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
  252. parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
  253. parsl/tests/test_python_apps/test_fail.py +0 -25
  254. parsl/tests/test_python_apps/test_futures.py +2 -1
  255. parsl/tests/test_python_apps/test_inputs_default.py +22 -0
  256. parsl/tests/test_python_apps/test_join.py +0 -1
  257. parsl/tests/test_python_apps/test_lifted.py +11 -7
  258. parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
  259. parsl/tests/test_python_apps/test_outputs.py +1 -1
  260. parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
  261. parsl/tests/test_radical/test_mpi_funcs.py +1 -2
  262. parsl/tests/test_regression/test_1480.py +2 -1
  263. parsl/tests/test_regression/test_1653.py +2 -1
  264. parsl/tests/test_regression/test_226.py +1 -0
  265. parsl/tests/test_regression/test_2652.py +1 -0
  266. parsl/tests/test_regression/test_69a.py +0 -1
  267. parsl/tests/test_regression/test_854.py +4 -2
  268. parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
  269. parsl/tests/test_regression/test_98.py +0 -1
  270. parsl/tests/test_scaling/test_block_error_handler.py +9 -4
  271. parsl/tests/test_scaling/test_regression_1621.py +11 -15
  272. parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
  273. parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
  274. parsl/tests/test_scaling/test_scale_down.py +2 -5
  275. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +5 -8
  276. parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
  277. parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
  278. parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
  279. parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
  280. parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
  281. parsl/tests/test_serialization/test_basic.py +2 -1
  282. parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
  283. parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
  284. parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
  285. parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
  286. parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
  287. parsl/tests/test_staging/staging_provider.py +2 -2
  288. parsl/tests/test_staging/test_1316.py +3 -4
  289. parsl/tests/test_staging/test_docs_1.py +2 -1
  290. parsl/tests/test_staging/test_docs_2.py +2 -1
  291. parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
  292. parsl/tests/{test_data → test_staging}/test_file.py +6 -6
  293. parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
  294. parsl/tests/test_staging/test_staging_ftp.py +1 -0
  295. parsl/tests/test_staging/test_staging_https.py +5 -2
  296. parsl/tests/test_staging/test_staging_stdout.py +64 -0
  297. parsl/tests/test_staging/test_zip_in.py +39 -0
  298. parsl/tests/test_staging/test_zip_out.py +110 -0
  299. parsl/tests/test_staging/test_zip_to_zip.py +41 -0
  300. parsl/tests/test_summary.py +2 -2
  301. parsl/tests/test_thread_parallelism.py +0 -1
  302. parsl/tests/test_threads/test_configs.py +1 -2
  303. parsl/tests/test_threads/test_lazy_errors.py +2 -2
  304. parsl/tests/test_utils/test_execute_wait.py +35 -0
  305. parsl/tests/test_utils/test_sanitize_dns.py +76 -0
  306. parsl/tests/unit/test_address.py +20 -0
  307. parsl/tests/unit/test_file.py +99 -0
  308. parsl/tests/unit/test_usage_tracking.py +66 -0
  309. parsl/usage_tracking/api.py +65 -0
  310. parsl/usage_tracking/levels.py +6 -0
  311. parsl/usage_tracking/usage.py +104 -62
  312. parsl/utils.py +137 -4
  313. parsl/version.py +1 -1
  314. {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
  315. parsl-2025.1.13.data/scripts/interchange.py +649 -0
  316. {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +77 -75
  317. parsl-2025.1.13.dist-info/METADATA +96 -0
  318. parsl-2025.1.13.dist-info/RECORD +462 -0
  319. {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
  320. parsl/channels/__init__.py +0 -7
  321. parsl/channels/base.py +0 -141
  322. parsl/channels/errors.py +0 -113
  323. parsl/channels/local/local.py +0 -164
  324. parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
  325. parsl/channels/ssh/ssh.py +0 -276
  326. parsl/channels/ssh_il/__init__.py +0 -0
  327. parsl/channels/ssh_il/ssh_il.py +0 -74
  328. parsl/configs/ad_hoc.py +0 -35
  329. parsl/executors/radical/rpex_master.py +0 -42
  330. parsl/monitoring/radios.py +0 -175
  331. parsl/providers/ad_hoc/__init__.py +0 -0
  332. parsl/providers/ad_hoc/ad_hoc.py +0 -248
  333. parsl/providers/cobalt/__init__.py +0 -0
  334. parsl/providers/cobalt/cobalt.py +0 -236
  335. parsl/providers/cobalt/template.py +0 -17
  336. parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
  337. parsl/tests/configs/cooley_htex.py +0 -37
  338. parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
  339. parsl/tests/configs/local_adhoc.py +0 -18
  340. parsl/tests/configs/swan_htex.py +0 -43
  341. parsl/tests/configs/theta.py +0 -37
  342. parsl/tests/integration/test_channels/__init__.py +0 -0
  343. parsl/tests/integration/test_channels/test_channels.py +0 -17
  344. parsl/tests/integration/test_channels/test_local_channel.py +0 -42
  345. parsl/tests/integration/test_channels/test_scp_1.py +0 -45
  346. parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
  347. parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
  348. parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
  349. parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
  350. parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
  351. parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
  352. parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
  353. parsl/tests/sites/test_local_adhoc.py +0 -61
  354. parsl/tests/test_channels/__init__.py +0 -0
  355. parsl/tests/test_channels/test_large_output.py +0 -22
  356. parsl/tests/test_data/__init__.py +0 -0
  357. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
  358. parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
  359. parsl-2024.3.18.dist-info/METADATA +0 -98
  360. parsl-2024.3.18.dist-info/RECORD +0 -449
  361. parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
  362. parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
  363. parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
  364. parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
  365. parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
  366. {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
  367. {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
  368. {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
  369. {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
@@ -1,43 +1,41 @@
1
- import typing
2
- from concurrent.futures import Future
3
- import typeguard
4
1
  import logging
5
- import threading
6
- import queue
7
- import datetime
8
- import pickle
9
- from dataclasses import dataclass
10
- from multiprocessing import Process, Queue
11
- from typing import Dict, Sequence
12
- from typing import List, Optional, Tuple, Union, Callable
13
2
  import math
3
+ import pickle
4
+ import subprocess
5
+ import threading
6
+ import typing
14
7
  import warnings
8
+ from collections import defaultdict
9
+ from concurrent.futures import Future
10
+ from dataclasses import dataclass
11
+ from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
12
+
13
+ import typeguard
15
14
 
16
- import parsl.launchers
17
- from parsl.serialize import pack_res_spec_apply_message, deserialize
18
- from parsl.serialize.errors import SerializationError, DeserializationError
15
+ from parsl import curvezmq
16
+ from parsl.addresses import get_all_addresses
19
17
  from parsl.app.errors import RemoteExceptionWrapper
20
- from parsl.jobs.states import JobStatus, JobState
21
- from parsl.executors.high_throughput import zmq_pipes
22
- from parsl.executors.high_throughput import interchange
18
+ from parsl.data_provider.staging import Staging
23
19
  from parsl.executors.errors import (
24
- BadMessage, ScalingFailed,
20
+ BadMessage,
21
+ InvalidResourceSpecification,
22
+ ScalingFailed,
25
23
  )
26
- from parsl.executors.high_throughput.mpi_prefix_composer import (
27
- VALID_LAUNCHERS,
28
- validate_resource_spec
24
+ from parsl.executors.high_throughput import zmq_pipes
25
+ from parsl.executors.high_throughput.errors import CommandClientTimeoutError
26
+ from parsl.executors.high_throughput.manager_selector import (
27
+ ManagerSelector,
28
+ RandomManagerSelector,
29
29
  )
30
-
31
- from parsl import curvezmq
32
30
  from parsl.executors.status_handling import BlockProviderExecutor
33
- from parsl.providers.base import ExecutionProvider
34
- from parsl.data_provider.staging import Staging
35
- from parsl.addresses import get_all_addresses
31
+ from parsl.jobs.states import TERMINAL_STATES, JobState, JobStatus
36
32
  from parsl.process_loggers import wrap_with_logs
37
-
38
- from parsl.multiprocessing import ForkProcess
39
- from parsl.utils import RepresentationMixin
40
33
  from parsl.providers import LocalProvider
34
+ from parsl.providers.base import ExecutionProvider
35
+ from parsl.serialize import deserialize, pack_res_spec_apply_message
36
+ from parsl.serialize.errors import DeserializationError, SerializationError
37
+ from parsl.usage_tracking.api import UsageInformation
38
+ from parsl.utils import RepresentationMixin
41
39
 
42
40
  logger = logging.getLogger(__name__)
43
41
 
@@ -61,49 +59,10 @@ DEFAULT_LAUNCH_CMD = ("process_worker_pool.py {debug} {max_workers_per_node} "
61
59
  "--mpi-launcher={mpi_launcher} "
62
60
  "--available-accelerators {accelerators}")
63
61
 
62
+ DEFAULT_INTERCHANGE_LAUNCH_CMD = ["interchange.py"]
64
63
 
65
- class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
66
- """Executor designed for cluster-scale
67
-
68
- The HighThroughputExecutor system has the following components:
69
- 1. The HighThroughputExecutor instance which is run as part of the Parsl script.
70
- 2. The Interchange which acts as a load-balancing proxy between workers and Parsl
71
- 3. The multiprocessing based worker pool which coordinates task execution over several
72
- cores on a node.
73
- 4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
74
-
75
- Here is a diagram
76
-
77
- .. code:: python
78
-
79
-
80
- | Data | Executor | Interchange | External Process(es)
81
- | Flow | | |
82
- Task | Kernel | | |
83
- +----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
84
- | | | | batching | | |
85
- Parsl<---Fut-| | | load-balancing| result exception
86
- ^ | | | watchdogs | | |
87
- | | | Q_mngmnt | | V V
88
- | | | Thread<--|-incoming_q<---|--- +---------+
89
- | | | | | |
90
- | | | | | |
91
- +----update_fut-----+
92
-
93
-
94
- Each of the workers in each process_worker_pool has access to its local rank through
95
- an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
96
- and is an integer in the range from 0 to the number of workers per in the pool minus 1.
97
- The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
98
- and the size of the worker pool as ``PARSL_WORKER_COUNT``.
99
-
100
-
101
- Parameters
102
- ----------
103
-
104
- provider : :class:`~parsl.providers.base.ExecutionProvider`
64
+ GENERAL_HTEX_PARAM_DOCS = """provider : :class:`~parsl.providers.base.ExecutionProvider`
105
65
  Provider to access computation resources. Can be one of :class:`~parsl.providers.aws.aws.EC2Provider`,
106
- :class:`~parsl.providers.cobalt.cobalt.Cobalt`,
107
66
  :class:`~parsl.providers.condor.condor.Condor`,
108
67
  :class:`~parsl.providers.googlecloud.googlecloud.GoogleCloud`,
109
68
  :class:`~parsl.providers.gridEngine.gridEngine.GridEngine`,
@@ -121,9 +80,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
121
80
  cores_per_worker, nodes_per_block, heartbeat_period ,heartbeat_threshold, logdir). For example:
122
81
  launch_cmd="process_worker_pool.py {debug} -c {cores_per_worker} --task_url={task_url} --result_url={result_url}"
123
82
 
83
+ interchange_launch_cmd : Sequence[str]
84
+ Custom sequence of command line tokens to launch the interchange process from the executor. If
85
+ undefined, the executor will use the default "interchange.py" command.
86
+
124
87
  address : string
125
88
  An address to connect to the main Parsl process which is reachable from the network in which
126
- workers will be running. This field expects an IPv4 address (xxx.xxx.xxx.xxx).
89
+ workers will be running. This field expects an IPv4 or IPv6 address.
127
90
  Most login nodes on clusters have several network interfaces available, only some of which
128
91
  can be reached from the compute nodes. This field can be used to limit the executor to listen
129
92
  only on a specific interface, and limiting connections to the internal network.
@@ -131,6 +94,11 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
131
94
  Setting an address here overrides the default behavior.
132
95
  default=None
133
96
 
97
+ loopback_address: string
98
+ Specify address used for internal communication between executor and interchange.
99
+ Supports IPv4 and IPv6 addresses
100
+ default=127.0.0.1
101
+
134
102
  worker_ports : (int, int)
135
103
  Specify the ports to be used by workers to connect to Parsl. If this option is specified,
136
104
  worker_port_range will not be honored.
@@ -147,39 +115,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
147
115
  worker_debug : Bool
148
116
  Enables worker debug logging.
149
117
 
150
- cores_per_worker : float
151
- cores to be assigned to each worker. Oversubscription is possible
152
- by setting cores_per_worker < 1.0. Default=1
153
-
154
- mem_per_worker : float
155
- GB of memory required per worker. If this option is specified, the node manager
156
- will check the available memory at startup and limit the number of workers such that
157
- the there's sufficient memory for each worker. Default: None
158
-
159
- max_workers : int
160
- Deprecated. Please use max_workers_per_node instead.
161
-
162
- max_workers_per_node : int
163
- Caps the number of workers launched per node. Default: None
164
-
165
- cpu_affinity: string
166
- Whether or how each worker process sets thread affinity. Options include "none" to forgo
167
- any CPU affinity configuration, "block" to assign adjacent cores to workers
168
- (ex: assign 0-1 to worker 0, 2-3 to worker 1), and
169
- "alternating" to assign cores to workers in round-robin
170
- (ex: assign 0,2 to worker 0, 1,3 to worker 1).
171
- The "block-reverse" option assigns adjacent cores to workers, but assigns
172
- the CPUs with large indices to low index workers (ex: assign 2-3 to worker 1, 0,1 to worker 2)
173
-
174
- available_accelerators: int | list
175
- Accelerators available for workers to use. Each worker will be pinned to exactly one of the provided
176
- accelerators, and no more workers will be launched than the number of accelerators.
177
-
178
- Either provide the list of accelerator names or the number available. If a number is provided,
179
- Parsl will create names as integers starting with 0.
180
-
181
- default: empty list
182
-
183
118
  prefetch_capacity : int
184
119
  Number of tasks that could be prefetched over available worker capacity.
185
120
  When there are a few tasks (<100) or when tasks are long running, this option should
@@ -213,19 +148,88 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
213
148
  worker_logdir_root : string
214
149
  In case of a remote file system, specify the path to where logs will be kept.
215
150
 
216
- enable_mpi_mode: bool
217
- If enabled, MPI launch prefixes will be composed for the batch scheduler based on
218
- the nodes available in each batch job and the resource_specification dict passed
219
- from the app. This is an experimental feature, please refer to the following doc section
220
- before use: https://parsl.readthedocs.io/en/stable/userguide/mpi_apps.html
221
-
222
- mpi_launcher: str
223
- This field is only used if enable_mpi_mode is set. Select one from the
224
- list of supported MPI launchers = ("srun", "aprun", "mpiexec").
225
- default: "mpiexec"
226
-
227
151
  encrypted : bool
228
152
  Flag to enable/disable encryption (CurveZMQ). Default is False.
153
+
154
+ manager_selector: ManagerSelector
155
+ Determines what strategy the interchange uses to select managers during task distribution.
156
+ See API reference under "Manager Selectors" regarding the various manager selectors.
157
+ Default: 'RandomManagerSelector'
158
+ """ # Documentation for params used by both HTEx and MPIEx
159
+
160
+
161
+ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageInformation):
162
+ __doc__ = f"""Executor designed for cluster-scale
163
+
164
+ The HighThroughputExecutor system has the following components:
165
+ 1. The HighThroughputExecutor instance which is run as part of the Parsl script.
166
+ 2. The Interchange which acts as a load-balancing proxy between workers and Parsl
167
+ 3. The multiprocessing based worker pool which coordinates task execution over several
168
+ cores on a node.
169
+ 4. ZeroMQ pipes connect the HighThroughputExecutor, Interchange and the process_worker_pool
170
+
171
+ Here is a diagram
172
+
173
+ .. code:: python
174
+
175
+
176
+ | Data | Executor | Interchange | External Process(es)
177
+ | Flow | | |
178
+ Task | Kernel | | |
179
+ +----->|-------->|------------>|->outgoing_q---|-> process_worker_pool
180
+ | | | | batching | | |
181
+ Parsl<---Fut-| | | load-balancing| result exception
182
+ ^ | | | watchdogs | | |
183
+ | | | Result | | | |
184
+ | | | Queue | | V V
185
+ | | | Thread<--|-incoming_q<---|--- +---------+
186
+ | | | | | |
187
+ | | | | | |
188
+ +----update_fut-----+
189
+
190
+
191
+ Each of the workers in each process_worker_pool has access to its local rank through
192
+ an environmental variable, ``PARSL_WORKER_RANK``. The local rank is unique for each process
193
+ and is an integer in the range from 0 to the number of workers per in the pool minus 1.
194
+ The workers also have access to the ID of the worker pool as ``PARSL_WORKER_POOL_ID``
195
+ and the size of the worker pool as ``PARSL_WORKER_COUNT``.
196
+
197
+
198
+ Parameters
199
+ ----------
200
+
201
+ {GENERAL_HTEX_PARAM_DOCS}
202
+
203
+ cores_per_worker : float
204
+ cores to be assigned to each worker. Oversubscription is possible
205
+ by setting cores_per_worker < 1.0. Default=1
206
+
207
+ mem_per_worker : float
208
+ GB of memory required per worker. If this option is specified, the node manager
209
+ will check the available memory at startup and limit the number of workers such that
210
+ the there's sufficient memory for each worker. Default: None
211
+
212
+ max_workers_per_node : int
213
+ Caps the number of workers launched per node. Default: None
214
+
215
+ cpu_affinity: string
216
+ Whether or how each worker process sets thread affinity. Options include "none" to forgo
217
+ any CPU affinity configuration, "block" to assign adjacent cores to workers
218
+ (ex: assign 0-1 to worker 0, 2-3 to worker 1), and
219
+ "alternating" to assign cores to workers in round-robin
220
+ (ex: assign 0,2 to worker 0, 1,3 to worker 1).
221
+ The "block-reverse" option assigns adjacent cores to workers, but assigns
222
+ the CPUs with large indices to low index workers (ex: assign 2-3 to worker 1, 0,1 to worker 2)
223
+
224
+ available_accelerators: int | list
225
+ Accelerators available for workers to use. Each worker will be pinned to exactly one of the provided
226
+ accelerators, and no more workers will be launched than the number of accelerators.
227
+
228
+ Either provide the list of accelerator names or the number available. If a number is provided,
229
+ Parsl will create names as integers starting with 0.
230
+
231
+ default: empty list
232
+
229
233
  """
230
234
 
231
235
  @typeguard.typechecked
@@ -233,7 +237,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
233
237
  label: str = 'HighThroughputExecutor',
234
238
  provider: ExecutionProvider = LocalProvider(),
235
239
  launch_cmd: Optional[str] = None,
240
+ interchange_launch_cmd: Optional[Sequence[str]] = None,
236
241
  address: Optional[str] = None,
242
+ loopback_address: str = "127.0.0.1",
237
243
  worker_ports: Optional[Tuple[int, int]] = None,
238
244
  worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
239
245
  interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
@@ -242,7 +248,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
242
248
  worker_debug: bool = False,
243
249
  cores_per_worker: float = 1.0,
244
250
  mem_per_worker: Optional[float] = None,
245
- max_workers: Optional[Union[int, float]] = None,
246
251
  max_workers_per_node: Optional[Union[int, float]] = None,
247
252
  cpu_affinity: str = 'none',
248
253
  available_accelerators: Union[int, Sequence[str]] = (),
@@ -253,8 +258,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
253
258
  poll_period: int = 10,
254
259
  address_probe_timeout: Optional[int] = None,
255
260
  worker_logdir_root: Optional[str] = None,
256
- enable_mpi_mode: bool = False,
257
- mpi_launcher: str = "mpiexec",
261
+ manager_selector: ManagerSelector = RandomManagerSelector(),
258
262
  block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
259
263
  encrypted: bool = False):
260
264
 
@@ -270,14 +274,15 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
270
274
  self.prefetch_capacity = prefetch_capacity
271
275
  self.address = address
272
276
  self.address_probe_timeout = address_probe_timeout
277
+ self.manager_selector = manager_selector
278
+ self.loopback_address = loopback_address
279
+
273
280
  if self.address:
274
281
  self.all_addresses = address
275
282
  else:
276
283
  self.all_addresses = ','.join(get_all_addresses())
277
284
 
278
- if max_workers:
279
- self._warn_deprecated("max_workers", "max_workers_per_node")
280
- self.max_workers_per_node = max_workers_per_node or max_workers or float("inf")
285
+ self.max_workers_per_node = max_workers_per_node or float("inf")
281
286
 
282
287
  mem_slots = self.max_workers_per_node
283
288
  cpu_slots = self.max_workers_per_node
@@ -304,12 +309,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
304
309
  self._workers_per_node = 1 # our best guess-- we do not have any provider hints
305
310
 
306
311
  self._task_counter = 0
307
- self.run_id = None # set to the correct run_id in dfk
308
- self.hub_address = None # set to the correct hub address in dfk
309
- self.hub_port = None # set to the correct hub port in dfk
310
312
  self.worker_ports = worker_ports
311
313
  self.worker_port_range = worker_port_range
312
- self.interchange_proc: Optional[Process] = None
314
+ self.interchange_proc: Optional[subprocess.Popen] = None
313
315
  self.interchange_port_range = interchange_port_range
314
316
  self.heartbeat_threshold = heartbeat_threshold
315
317
  self.heartbeat_period = heartbeat_period
@@ -321,20 +323,20 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
321
323
  self.encrypted = encrypted
322
324
  self.cert_dir = None
323
325
 
324
- self.enable_mpi_mode = enable_mpi_mode
325
- assert mpi_launcher in VALID_LAUNCHERS, \
326
- f"mpi_launcher must be set to one of {VALID_LAUNCHERS}"
327
- if self.enable_mpi_mode:
328
- assert isinstance(self.provider.launcher, parsl.launchers.SingleNodeLauncher), \
329
- "mpi_mode requires the provider to be configured to use a SingleNodeLauncher"
330
-
331
- self.mpi_launcher = mpi_launcher
332
-
333
326
  if not launch_cmd:
334
327
  launch_cmd = DEFAULT_LAUNCH_CMD
335
328
  self.launch_cmd = launch_cmd
336
329
 
330
+ if not interchange_launch_cmd:
331
+ interchange_launch_cmd = DEFAULT_INTERCHANGE_LAUNCH_CMD
332
+ self.interchange_launch_cmd = interchange_launch_cmd
333
+
334
+ self._result_queue_thread_exit = threading.Event()
335
+ self._result_queue_thread: Optional[threading.Thread] = None
336
+
337
337
  radio_mode = "htex"
338
+ enable_mpi_mode: bool = False
339
+ mpi_launcher: str = "mpiexec"
338
340
 
339
341
  def _warn_deprecated(self, old: str, new: str):
340
342
  warnings.warn(
@@ -344,16 +346,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
344
346
  stacklevel=2
345
347
  )
346
348
 
347
- @property
348
- def max_workers(self):
349
- self._warn_deprecated("max_workers", "max_workers_per_node")
350
- return self.max_workers_per_node
351
-
352
- @max_workers.setter
353
- def max_workers(self, val: Union[int, float]):
354
- self._warn_deprecated("max_workers", "max_workers_per_node")
355
- self.max_workers_per_node = val
356
-
357
349
  @property
358
350
  def logdir(self):
359
351
  return "{}/{}".format(self.run_dir, self.label)
@@ -364,6 +356,20 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
364
356
  return "{}/{}".format(self.worker_logdir_root, self.label)
365
357
  return self.logdir
366
358
 
359
+ def validate_resource_spec(self, resource_specification: dict):
360
+ """HTEX supports the following *Optional* resource specifications:
361
+ priority: lower value is higher priority"""
362
+ if resource_specification:
363
+ acceptable_fields = {'priority'}
364
+ keys = set(resource_specification.keys())
365
+ invalid_keys = keys - acceptable_fields
366
+ if invalid_keys:
367
+ message = "Task resource specification only accepts these types of resources: {}".format(
368
+ ', '.join(acceptable_fields))
369
+ logger.error(message)
370
+ raise InvalidResourceSpecification(set(invalid_keys), message)
371
+ return
372
+
367
373
  def initialize_scaling(self):
368
374
  """Compose the launch command and scale out the initial blocks.
369
375
  """
@@ -400,16 +406,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
400
406
 
401
407
  logger.debug("Starting HighThroughputExecutor with provider:\n%s", self.provider)
402
408
 
403
- # TODO: why is this a provider property?
404
- block_ids = []
405
- if hasattr(self.provider, 'init_blocks'):
406
- try:
407
- block_ids = self.scale_out(blocks=self.provider.init_blocks)
408
- except Exception as e:
409
- logger.error("Scaling out failed: {}".format(e))
410
- raise e
411
- return block_ids
412
-
413
409
  def start(self):
414
410
  """Create the Interchange process and connect to it.
415
411
  """
@@ -424,30 +420,28 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
424
420
  )
425
421
 
426
422
  self.outgoing_q = zmq_pipes.TasksOutgoing(
427
- curvezmq.ClientContext(self.cert_dir), "127.0.0.1", self.interchange_port_range
423
+ self.loopback_address, self.interchange_port_range, self.cert_dir
428
424
  )
429
425
  self.incoming_q = zmq_pipes.ResultsIncoming(
430
- curvezmq.ClientContext(self.cert_dir), "127.0.0.1", self.interchange_port_range
426
+ self.loopback_address, self.interchange_port_range, self.cert_dir
431
427
  )
432
428
  self.command_client = zmq_pipes.CommandClient(
433
- curvezmq.ClientContext(self.cert_dir), "127.0.0.1", self.interchange_port_range
429
+ self.loopback_address, self.interchange_port_range, self.cert_dir
434
430
  )
435
431
 
436
- self._queue_management_thread = None
437
- self._start_queue_management_thread()
432
+ self._result_queue_thread = None
433
+ self._start_result_queue_thread()
438
434
  self._start_local_interchange_process()
439
435
 
440
- logger.debug("Created management thread: {}".format(self._queue_management_thread))
436
+ logger.debug("Created result queue thread: %s", self._result_queue_thread)
441
437
 
442
- block_ids = self.initialize_scaling()
443
- return block_ids
438
+ self.initialize_scaling()
444
439
 
445
440
  @wrap_with_logs
446
- def _queue_management_worker(self):
447
- """Listen to the queue for task status messages and handle them.
441
+ def _result_queue_worker(self):
442
+ """Listen to the queue for task result messages and handle them.
448
443
 
449
- Depending on the message, tasks will be updated with results, exceptions,
450
- or updates. It expects the following messages:
444
+ Depending on the message, tasks will be updated with results or exceptions.
451
445
 
452
446
  .. code:: python
453
447
 
@@ -461,14 +455,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
461
455
  "task_id" : <task_id>
462
456
  "exception" : serialized exception object, on failure
463
457
  }
464
-
465
- The `None` message is a die request.
466
458
  """
467
- logger.debug("Queue management worker starting")
459
+ logger.debug("Result queue worker starting")
468
460
 
469
- while not self.bad_state_is_set:
461
+ while not self.bad_state_is_set and not self._result_queue_thread_exit.is_set():
470
462
  try:
471
- msgs = self.incoming_q.get()
463
+ msgs = self.incoming_q.get(timeout_ms=self.poll_period)
464
+ if msgs is None: # timeout
465
+ continue
472
466
 
473
467
  except IOError as e:
474
468
  logger.exception("Caught broken queue with exception code {}: {}".format(e.errno, e))
@@ -480,109 +474,114 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
480
474
 
481
475
  else:
482
476
 
483
- if msgs is None:
484
- logger.debug("Got None, exiting")
485
- return
477
+ for serialized_msg in msgs:
478
+ try:
479
+ msg = pickle.loads(serialized_msg)
480
+ except pickle.UnpicklingError:
481
+ raise BadMessage("Message received could not be unpickled")
486
482
 
487
- else:
488
- for serialized_msg in msgs:
483
+ if msg['type'] == 'result':
489
484
  try:
490
- msg = pickle.loads(serialized_msg)
491
- except pickle.UnpicklingError:
492
- raise BadMessage("Message received could not be unpickled")
485
+ tid = msg['task_id']
486
+ except Exception:
487
+ raise BadMessage("Message received does not contain 'task_id' field")
488
+
489
+ if tid == -1 and 'exception' in msg:
490
+ logger.warning("Executor shutting down due to exception from interchange")
491
+ exception = deserialize(msg['exception'])
492
+ self.set_bad_state_and_fail_all(exception)
493
+ break
494
+
495
+ task_fut = self.tasks.pop(tid)
493
496
 
494
- if msg['type'] == 'heartbeat':
495
- continue
496
- elif msg['type'] == 'result':
497
+ if 'result' in msg:
498
+ result = deserialize(msg['result'])
499
+ task_fut.set_result(result)
500
+
501
+ elif 'exception' in msg:
497
502
  try:
498
- tid = msg['task_id']
499
- except Exception:
500
- raise BadMessage("Message received does not contain 'task_id' field")
501
-
502
- if tid == -1 and 'exception' in msg:
503
- logger.warning("Executor shutting down due to exception from interchange")
504
- exception = deserialize(msg['exception'])
505
- self.set_bad_state_and_fail_all(exception)
506
- break
507
-
508
- task_fut = self.tasks.pop(tid)
509
-
510
- if 'result' in msg:
511
- result = deserialize(msg['result'])
512
- task_fut.set_result(result)
513
-
514
- elif 'exception' in msg:
515
- try:
516
- s = deserialize(msg['exception'])
517
- # s should be a RemoteExceptionWrapper... so we can reraise it
518
- if isinstance(s, RemoteExceptionWrapper):
519
- try:
520
- s.reraise()
521
- except Exception as e:
522
- task_fut.set_exception(e)
523
- elif isinstance(s, Exception):
524
- task_fut.set_exception(s)
525
- else:
526
- raise ValueError("Unknown exception-like type received: {}".format(type(s)))
527
- except Exception as e:
528
- # TODO could be a proper wrapped exception?
529
- task_fut.set_exception(
530
- DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
531
- else:
532
- raise BadMessage("Message received is neither result or exception")
503
+ s = deserialize(msg['exception'])
504
+ # s should be a RemoteExceptionWrapper... so we can reraise it
505
+ if isinstance(s, RemoteExceptionWrapper):
506
+ try:
507
+ s.reraise()
508
+ except Exception as e:
509
+ task_fut.set_exception(e)
510
+ elif isinstance(s, Exception):
511
+ task_fut.set_exception(s)
512
+ else:
513
+ raise ValueError("Unknown exception-like type received: {}".format(type(s)))
514
+ except Exception as e:
515
+ # TODO could be a proper wrapped exception?
516
+ task_fut.set_exception(
517
+ DeserializationError("Received exception, but handling also threw an exception: {}".format(e)))
533
518
  else:
534
- raise BadMessage("Message received with unknown type {}".format(msg['type']))
519
+ raise BadMessage("Message received is neither result or exception")
520
+ else:
521
+ raise BadMessage("Message received with unknown type {}".format(msg['type']))
535
522
 
536
- logger.info("Queue management worker finished")
523
+ logger.info("Closing result ZMQ pipe")
524
+ self.incoming_q.close()
525
+ logger.info("Result queue worker finished")
537
526
 
538
- def _start_local_interchange_process(self):
527
+ def _start_local_interchange_process(self) -> None:
539
528
  """ Starts the interchange process locally
540
529
 
541
- Starts the interchange process locally and uses an internal command queue to
530
+ Starts the interchange process locally and uses the command queue to
542
531
  get the worker task and result ports that the interchange has bound to.
543
532
  """
544
- comm_q = Queue(maxsize=10)
545
- self.interchange_proc = ForkProcess(target=interchange.starter,
546
- args=(comm_q,),
547
- kwargs={"client_ports": (self.outgoing_q.port,
548
- self.incoming_q.port,
549
- self.command_client.port),
550
- "interchange_address": self.address,
551
- "worker_ports": self.worker_ports,
552
- "worker_port_range": self.worker_port_range,
553
- "hub_address": self.hub_address,
554
- "hub_port": self.hub_port,
555
- "logdir": self.logdir,
556
- "heartbeat_threshold": self.heartbeat_threshold,
557
- "poll_period": self.poll_period,
558
- "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
559
- "cert_dir": self.cert_dir,
560
- },
561
- daemon=True,
562
- name="HTEX-Interchange"
563
- )
564
- self.interchange_proc.start()
533
+
534
+ interchange_config = {"client_address": self.loopback_address,
535
+ "client_ports": (self.outgoing_q.port,
536
+ self.incoming_q.port,
537
+ self.command_client.port),
538
+ "interchange_address": self.address,
539
+ "worker_ports": self.worker_ports,
540
+ "worker_port_range": self.worker_port_range,
541
+ "hub_address": self.hub_address,
542
+ "hub_zmq_port": self.hub_zmq_port,
543
+ "logdir": self.logdir,
544
+ "heartbeat_threshold": self.heartbeat_threshold,
545
+ "poll_period": self.poll_period,
546
+ "logging_level": logging.DEBUG if self.worker_debug else logging.INFO,
547
+ "cert_dir": self.cert_dir,
548
+ "manager_selector": self.manager_selector,
549
+ "run_id": self.run_id,
550
+ }
551
+
552
+ config_pickle = pickle.dumps(interchange_config)
553
+
554
+ self.interchange_proc = subprocess.Popen(self.interchange_launch_cmd, stdin=subprocess.PIPE)
555
+ stdin = self.interchange_proc.stdin
556
+ assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
557
+
558
+ logger.debug("Popened interchange process. Writing config object")
559
+ stdin.write(config_pickle)
560
+ stdin.flush()
561
+ stdin.close()
562
+ logger.debug("Sent config object. Requesting worker ports")
565
563
  try:
566
- (self.worker_task_port, self.worker_result_port) = comm_q.get(block=True, timeout=120)
567
- except queue.Empty:
568
- logger.error("Interchange has not completed initialization in 120s. Aborting")
564
+ (self.worker_task_port, self.worker_result_port) = self.command_client.run("WORKER_PORTS", timeout_s=120)
565
+ except CommandClientTimeoutError:
566
+ logger.error("Interchange has not completed initialization. Aborting")
569
567
  raise Exception("Interchange failed to start")
568
+ logger.debug("Got worker ports")
570
569
 
571
- def _start_queue_management_thread(self):
572
- """Method to start the management thread as a daemon.
570
+ def _start_result_queue_thread(self):
571
+ """Method to start the result queue thread as a daemon.
573
572
 
574
573
  Checks if a thread already exists, then starts it.
575
- Could be used later as a restart if the management thread dies.
574
+ Could be used later as a restart if the result queue thread dies.
576
575
  """
577
- if self._queue_management_thread is None:
578
- logger.debug("Starting queue management thread")
579
- self._queue_management_thread = threading.Thread(target=self._queue_management_worker, name="HTEX-Queue-Management-Thread")
580
- self._queue_management_thread.daemon = True
581
- self._queue_management_thread.start()
582
- logger.debug("Started queue management thread")
576
+ if self._result_queue_thread is None:
577
+ logger.debug("Starting result queue thread")
578
+ self._result_queue_thread = threading.Thread(target=self._result_queue_worker, name="HTEX-Result-Queue-Thread")
579
+ self._result_queue_thread.daemon = True
580
+ self._result_queue_thread.start()
581
+ logger.debug("Started result queue thread")
583
582
 
584
583
  else:
585
- logger.error("Management thread already exists, returning")
584
+ logger.error("Result queue thread already exists, returning")
586
585
 
587
586
  def hold_worker(self, worker_id: str) -> None:
588
587
  """Puts a worker on hold, preventing scheduling of additional tasks to it.
@@ -603,7 +602,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
603
602
  def outstanding(self) -> int:
604
603
  """Returns the count of tasks outstanding across the interchange
605
604
  and managers"""
606
- return self.command_client.run("OUTSTANDING_C")
605
+ return len(self.tasks)
607
606
 
608
607
  @property
609
608
  def connected_workers(self) -> int:
@@ -655,7 +654,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
655
654
  Returns:
656
655
  Future
657
656
  """
658
- validate_resource_spec(resource_specification)
657
+
658
+ self.validate_resource_spec(resource_specification)
659
659
 
660
660
  if self.bad_state_is_set:
661
661
  raise self.executor_exception
@@ -679,7 +679,7 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
679
679
  except TypeError:
680
680
  raise SerializationError(func.__name__)
681
681
 
682
- msg = {"task_id": task_id, "buffer": fn_buf}
682
+ msg = {"task_id": task_id, "resource_spec": resource_specification, "buffer": fn_buf}
683
683
 
684
684
  # Post task to the outgoing queue
685
685
  self.outgoing_q.put(msg)
@@ -687,22 +687,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
687
687
  # Return the future
688
688
  return fut
689
689
 
690
- def create_monitoring_info(self, status):
691
- """ Create a msg for monitoring based on the poll status
692
-
693
- """
694
- msg = []
695
- for bid, s in status.items():
696
- d = {}
697
- d['run_id'] = self.run_id
698
- d['status'] = s.status_name
699
- d['timestamp'] = datetime.datetime.now()
700
- d['executor_label'] = self.label
701
- d['job_id'] = self.blocks.get(bid, None)
702
- d['block_id'] = bid
703
- msg.append(d)
704
- return msg
705
-
706
690
  @property
707
691
  def workers_per_node(self) -> Union[int, float]:
708
692
  return self._workers_per_node
@@ -740,14 +724,24 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
740
724
  tasks: int # sum of tasks in this block
741
725
  idle: float # shortest idle time of any manager in this block
742
726
 
727
+ # block_info will be populated from two sources:
728
+ # the Job Status Poller mutable block list, and the list of blocks
729
+ # which have connected to the interchange.
730
+
731
+ def new_block_info():
732
+ return BlockInfo(tasks=0, idle=float('inf'))
733
+
734
+ block_info: Dict[str, BlockInfo] = defaultdict(new_block_info)
735
+
736
+ for block_id, job_status in self._status.items():
737
+ if job_status.state not in TERMINAL_STATES:
738
+ block_info[block_id] = new_block_info()
739
+
743
740
  managers = self.connected_managers()
744
- block_info: Dict[str, BlockInfo] = {}
745
741
  for manager in managers:
746
742
  if not manager['active']:
747
743
  continue
748
744
  b_id = manager['block_id']
749
- if b_id not in block_info:
750
- block_info[b_id] = BlockInfo(tasks=0, idle=float('inf'))
751
745
  block_info[b_id].tasks += manager['tasks']
752
746
  block_info[b_id].idle = min(block_info[b_id].idle, manager['idle_duration'])
753
747
 
@@ -779,14 +773,14 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
779
773
 
780
774
  # Now kill via provider
781
775
  # Potential issue with multiple threads trying to remove the same blocks
782
- to_kill = [self.blocks[bid] for bid in block_ids_to_kill if bid in self.blocks]
776
+ to_kill = [self.blocks_to_job_id[bid] for bid in block_ids_to_kill if bid in self.blocks_to_job_id]
783
777
 
784
778
  r = self.provider.cancel(to_kill)
785
779
  job_ids = self._filter_scale_in_ids(to_kill, r)
786
780
 
787
- # to_kill block_ids are fetched from self.blocks
788
- # If a block_id is in self.block, it must exist in self.block_mapping
789
- block_ids_killed = [self.block_mapping[jid] for jid in job_ids]
781
+ # to_kill block_ids are fetched from self.blocks_to_job_id
782
+ # If a block_id is in self.blocks_to_job_id, it must exist in self.job_ids_to_block
783
+ block_ids_killed = [self.job_ids_to_block[jid] for jid in job_ids]
790
784
 
791
785
  return block_ids_killed
792
786
 
@@ -801,7 +795,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
801
795
  connected_blocks = self.connected_blocks()
802
796
  for job_id in job_status:
803
797
  job_info = job_status[job_id]
804
- if job_info.terminal and job_id not in connected_blocks:
798
+ if job_info.terminal and job_id not in connected_blocks and job_info.state != JobState.SCALED_IN:
799
+ logger.debug("Rewriting job %s from status %s to MISSING", job_id, job_info)
805
800
  job_status[job_id].state = JobState.MISSING
806
801
  if job_status[job_id].message is None:
807
802
  job_status[job_id].message = (
@@ -829,10 +824,37 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin):
829
824
 
830
825
  logger.info("Attempting HighThroughputExecutor shutdown")
831
826
 
827
+ logger.info("Terminating interchange and result queue thread")
828
+ self._result_queue_thread_exit.set()
832
829
  self.interchange_proc.terminate()
833
- self.interchange_proc.join(timeout=timeout)
834
- if self.interchange_proc.is_alive():
835
- logger.info("Unable to terminate Interchange process; sending SIGKILL")
830
+ try:
831
+ self.interchange_proc.wait(timeout=timeout)
832
+ except subprocess.TimeoutExpired:
833
+ logger.warning("Unable to terminate Interchange process; sending SIGKILL")
836
834
  self.interchange_proc.kill()
837
835
 
836
+ logger.info("Closing ZMQ pipes")
837
+
838
+ # These pipes are used in a thread unsafe manner. If you have traced a
839
+ # problem to this block of code, you might consider what is happening
840
+ # with other threads that access these.
841
+
842
+ # incoming_q is not closed here because it is used by the results queue
843
+ # worker which is not shut down at this point.
844
+
845
+ if hasattr(self, 'outgoing_q'):
846
+ logger.info("Closing outgoing_q")
847
+ self.outgoing_q.close()
848
+
849
+ if hasattr(self, 'command_client'):
850
+ logger.info("Closing command client")
851
+ self.command_client.close()
852
+
853
+ logger.info("Waiting for result queue thread exit")
854
+ if self._result_queue_thread:
855
+ self._result_queue_thread.join()
856
+
838
857
  logger.info("Finished HighThroughputExecutor shutdown attempt")
858
+
859
+ def get_usage_information(self):
860
+ return {"mpi": self.enable_mpi_mode}