parsl 2024.3.11__py3-none-any.whl → 2025.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369) hide show
  1. parsl/__init__.py +9 -10
  2. parsl/addresses.py +29 -7
  3. parsl/app/app.py +7 -8
  4. parsl/app/bash.py +15 -8
  5. parsl/app/errors.py +10 -13
  6. parsl/app/futures.py +8 -10
  7. parsl/app/python.py +2 -1
  8. parsl/benchmark/perf.py +2 -1
  9. parsl/concurrent/__init__.py +2 -2
  10. parsl/config.py +57 -10
  11. parsl/configs/ASPIRE1.py +6 -5
  12. parsl/configs/Azure.py +9 -8
  13. parsl/configs/bridges.py +6 -4
  14. parsl/configs/cc_in2p3.py +3 -3
  15. parsl/configs/ec2.py +3 -1
  16. parsl/configs/expanse.py +4 -3
  17. parsl/configs/frontera.py +3 -4
  18. parsl/configs/htex_local.py +3 -4
  19. parsl/configs/illinoiscluster.py +3 -1
  20. parsl/configs/improv.py +34 -0
  21. parsl/configs/kubernetes.py +4 -3
  22. parsl/configs/local_threads.py +5 -1
  23. parsl/configs/midway.py +5 -3
  24. parsl/configs/osg.py +4 -2
  25. parsl/configs/polaris.py +4 -2
  26. parsl/configs/stampede2.py +6 -5
  27. parsl/configs/summit.py +3 -3
  28. parsl/configs/toss3_llnl.py +4 -3
  29. parsl/configs/vineex_local.py +6 -4
  30. parsl/configs/wqex_local.py +5 -3
  31. parsl/curvezmq.py +4 -0
  32. parsl/data_provider/data_manager.py +4 -3
  33. parsl/data_provider/file_noop.py +1 -2
  34. parsl/data_provider/files.py +3 -3
  35. parsl/data_provider/ftp.py +1 -3
  36. parsl/data_provider/globus.py +7 -6
  37. parsl/data_provider/http.py +2 -2
  38. parsl/data_provider/rsync.py +1 -1
  39. parsl/data_provider/staging.py +2 -2
  40. parsl/data_provider/zip.py +135 -0
  41. parsl/dataflow/dependency_resolvers.py +115 -0
  42. parsl/dataflow/dflow.py +262 -224
  43. parsl/dataflow/errors.py +3 -5
  44. parsl/dataflow/futures.py +27 -14
  45. parsl/dataflow/memoization.py +5 -5
  46. parsl/dataflow/rundirs.py +5 -6
  47. parsl/dataflow/taskrecord.py +4 -5
  48. parsl/executors/__init__.py +4 -2
  49. parsl/executors/base.py +45 -15
  50. parsl/executors/errors.py +13 -0
  51. parsl/executors/execute_task.py +37 -0
  52. parsl/executors/flux/execute_parsl_task.py +3 -3
  53. parsl/executors/flux/executor.py +18 -19
  54. parsl/executors/flux/flux_instance_manager.py +26 -27
  55. parsl/executors/high_throughput/errors.py +43 -3
  56. parsl/executors/high_throughput/executor.py +316 -282
  57. parsl/executors/high_throughput/interchange.py +158 -167
  58. parsl/executors/high_throughput/manager_record.py +5 -0
  59. parsl/executors/high_throughput/manager_selector.py +55 -0
  60. parsl/executors/high_throughput/monitoring_info.py +2 -1
  61. parsl/executors/high_throughput/mpi_executor.py +113 -0
  62. parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
  63. parsl/executors/high_throughput/mpi_resource_management.py +6 -17
  64. parsl/executors/high_throughput/probe.py +9 -7
  65. parsl/executors/high_throughput/process_worker_pool.py +115 -77
  66. parsl/executors/high_throughput/zmq_pipes.py +81 -23
  67. parsl/executors/radical/executor.py +130 -79
  68. parsl/executors/radical/rpex_resources.py +17 -15
  69. parsl/executors/radical/rpex_worker.py +4 -3
  70. parsl/executors/status_handling.py +157 -51
  71. parsl/executors/taskvine/__init__.py +1 -1
  72. parsl/executors/taskvine/errors.py +1 -1
  73. parsl/executors/taskvine/exec_parsl_function.py +2 -2
  74. parsl/executors/taskvine/executor.py +41 -57
  75. parsl/executors/taskvine/factory.py +1 -1
  76. parsl/executors/taskvine/factory_config.py +1 -1
  77. parsl/executors/taskvine/manager.py +18 -13
  78. parsl/executors/taskvine/manager_config.py +9 -5
  79. parsl/executors/threads.py +6 -6
  80. parsl/executors/workqueue/errors.py +1 -1
  81. parsl/executors/workqueue/exec_parsl_function.py +6 -5
  82. parsl/executors/workqueue/executor.py +64 -63
  83. parsl/executors/workqueue/parsl_coprocess.py +1 -1
  84. parsl/jobs/error_handlers.py +2 -2
  85. parsl/jobs/job_status_poller.py +30 -113
  86. parsl/jobs/states.py +7 -2
  87. parsl/jobs/strategy.py +43 -31
  88. parsl/launchers/__init__.py +12 -3
  89. parsl/launchers/errors.py +1 -1
  90. parsl/launchers/launchers.py +6 -12
  91. parsl/log_utils.py +9 -6
  92. parsl/monitoring/db_manager.py +59 -95
  93. parsl/monitoring/errors.py +6 -0
  94. parsl/monitoring/monitoring.py +87 -356
  95. parsl/monitoring/queries/pandas.py +1 -2
  96. parsl/monitoring/radios/base.py +13 -0
  97. parsl/monitoring/radios/filesystem.py +52 -0
  98. parsl/monitoring/radios/htex.py +57 -0
  99. parsl/monitoring/radios/multiprocessing.py +17 -0
  100. parsl/monitoring/radios/udp.py +56 -0
  101. parsl/monitoring/radios/zmq.py +17 -0
  102. parsl/monitoring/remote.py +33 -37
  103. parsl/monitoring/router.py +212 -0
  104. parsl/monitoring/types.py +5 -6
  105. parsl/monitoring/visualization/app.py +4 -2
  106. parsl/monitoring/visualization/models.py +0 -1
  107. parsl/monitoring/visualization/plots/default/workflow_plots.py +11 -4
  108. parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
  109. parsl/monitoring/visualization/utils.py +0 -1
  110. parsl/monitoring/visualization/views.py +16 -8
  111. parsl/multiprocessing.py +0 -1
  112. parsl/process_loggers.py +1 -2
  113. parsl/providers/__init__.py +8 -17
  114. parsl/providers/aws/aws.py +2 -3
  115. parsl/providers/azure/azure.py +4 -5
  116. parsl/providers/base.py +2 -18
  117. parsl/providers/cluster_provider.py +4 -12
  118. parsl/providers/condor/condor.py +7 -17
  119. parsl/providers/errors.py +2 -2
  120. parsl/providers/googlecloud/googlecloud.py +2 -1
  121. parsl/providers/grid_engine/grid_engine.py +5 -14
  122. parsl/providers/kubernetes/kube.py +80 -40
  123. parsl/providers/local/local.py +13 -26
  124. parsl/providers/lsf/lsf.py +5 -23
  125. parsl/providers/pbspro/pbspro.py +5 -17
  126. parsl/providers/slurm/slurm.py +81 -39
  127. parsl/providers/torque/torque.py +3 -14
  128. parsl/serialize/__init__.py +8 -3
  129. parsl/serialize/base.py +1 -2
  130. parsl/serialize/concretes.py +5 -4
  131. parsl/serialize/facade.py +3 -3
  132. parsl/serialize/proxystore.py +3 -2
  133. parsl/tests/__init__.py +1 -1
  134. parsl/tests/configs/azure_single_node.py +4 -5
  135. parsl/tests/configs/bridges.py +3 -2
  136. parsl/tests/configs/cc_in2p3.py +1 -3
  137. parsl/tests/configs/comet.py +2 -1
  138. parsl/tests/configs/ec2_single_node.py +1 -2
  139. parsl/tests/configs/ec2_spot.py +1 -2
  140. parsl/tests/configs/flux_local.py +11 -0
  141. parsl/tests/configs/frontera.py +2 -3
  142. parsl/tests/configs/htex_local.py +3 -5
  143. parsl/tests/configs/htex_local_alternate.py +11 -15
  144. parsl/tests/configs/htex_local_intask_staging.py +5 -9
  145. parsl/tests/configs/htex_local_rsync_staging.py +4 -8
  146. parsl/tests/configs/local_radical.py +1 -3
  147. parsl/tests/configs/local_radical_mpi.py +2 -2
  148. parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
  149. parsl/tests/configs/local_threads_monitoring.py +0 -1
  150. parsl/tests/configs/midway.py +2 -2
  151. parsl/tests/configs/nscc_singapore.py +3 -3
  152. parsl/tests/configs/osg_htex.py +1 -1
  153. parsl/tests/configs/petrelkube.py +3 -2
  154. parsl/tests/configs/slurm_local.py +24 -0
  155. parsl/tests/configs/summit.py +1 -0
  156. parsl/tests/configs/taskvine_ex.py +4 -7
  157. parsl/tests/configs/user_opts.py +2 -8
  158. parsl/tests/configs/workqueue_ex.py +4 -6
  159. parsl/tests/conftest.py +27 -13
  160. parsl/tests/integration/test_stress/test_python_simple.py +3 -4
  161. parsl/tests/integration/test_stress/test_python_threads.py +3 -5
  162. parsl/tests/manual_tests/htex_local.py +4 -6
  163. parsl/tests/manual_tests/test_basic.py +1 -0
  164. parsl/tests/manual_tests/test_log_filter.py +3 -1
  165. parsl/tests/manual_tests/test_memory_limits.py +6 -8
  166. parsl/tests/manual_tests/test_regression_220.py +2 -1
  167. parsl/tests/manual_tests/test_udp_simple.py +4 -4
  168. parsl/tests/manual_tests/test_worker_count.py +3 -2
  169. parsl/tests/scaling_tests/htex_local.py +2 -4
  170. parsl/tests/scaling_tests/test_scale.py +0 -9
  171. parsl/tests/scaling_tests/vineex_condor.py +1 -2
  172. parsl/tests/scaling_tests/vineex_local.py +1 -2
  173. parsl/tests/site_tests/site_config_selector.py +1 -6
  174. parsl/tests/site_tests/test_provider.py +4 -2
  175. parsl/tests/site_tests/test_site.py +2 -0
  176. parsl/tests/sites/test_affinity.py +7 -7
  177. parsl/tests/sites/test_dynamic_executor.py +3 -4
  178. parsl/tests/sites/test_ec2.py +3 -2
  179. parsl/tests/sites/test_worker_info.py +4 -5
  180. parsl/tests/test_aalst_patterns.py +0 -1
  181. parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
  182. parsl/tests/test_bash_apps/test_basic.py +10 -4
  183. parsl/tests/test_bash_apps/test_error_codes.py +5 -7
  184. parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
  185. parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
  186. parsl/tests/test_bash_apps/test_memoize.py +2 -8
  187. parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
  188. parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
  189. parsl/tests/test_bash_apps/test_multiline.py +1 -1
  190. parsl/tests/test_bash_apps/test_pipeline.py +1 -1
  191. parsl/tests/test_bash_apps/test_std_uri.py +123 -0
  192. parsl/tests/test_bash_apps/test_stdout.py +33 -8
  193. parsl/tests/test_callables.py +2 -2
  194. parsl/tests/test_checkpointing/test_periodic.py +21 -39
  195. parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
  196. parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
  197. parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
  198. parsl/tests/test_checkpointing/test_regression_239.py +1 -1
  199. parsl/tests/test_checkpointing/test_task_exit.py +2 -3
  200. parsl/tests/test_docs/test_from_slides.py +5 -2
  201. parsl/tests/test_docs/test_kwargs.py +4 -1
  202. parsl/tests/test_docs/test_tutorial_1.py +1 -2
  203. parsl/tests/test_docs/test_workflow1.py +2 -2
  204. parsl/tests/test_docs/test_workflow2.py +0 -1
  205. parsl/tests/test_error_handling/test_rand_fail.py +2 -2
  206. parsl/tests/test_error_handling/test_resource_spec.py +10 -12
  207. parsl/tests/test_error_handling/test_retries.py +6 -16
  208. parsl/tests/test_error_handling/test_retry_handler.py +1 -0
  209. parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
  210. parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
  211. parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
  212. parsl/tests/test_execute_task.py +29 -0
  213. parsl/tests/test_flux.py +1 -1
  214. parsl/tests/test_htex/test_basic.py +2 -3
  215. parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
  216. parsl/tests/test_htex/test_command_client_timeout.py +66 -0
  217. parsl/tests/test_htex/test_connected_blocks.py +3 -2
  218. parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
  219. parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
  220. parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
  221. parsl/tests/test_htex/test_drain.py +79 -0
  222. parsl/tests/test_htex/test_htex.py +51 -25
  223. parsl/tests/test_htex/test_manager_failure.py +0 -1
  224. parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
  225. parsl/tests/test_htex/test_managers_command.py +36 -0
  226. parsl/tests/test_htex/test_missing_worker.py +2 -12
  227. parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
  228. parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
  229. parsl/tests/test_htex/test_zmq_binding.py +29 -8
  230. parsl/tests/test_monitoring/test_app_names.py +86 -0
  231. parsl/tests/test_monitoring/test_basic.py +73 -25
  232. parsl/tests/test_monitoring/test_db_locks.py +6 -4
  233. parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
  234. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
  235. parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
  236. parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
  237. parsl/tests/test_monitoring/test_stdouterr.py +134 -0
  238. parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
  239. parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
  240. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
  241. parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
  242. parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
  243. parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
  244. parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
  245. parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
  246. parsl/tests/test_providers/test_local_provider.py +3 -132
  247. parsl/tests/test_providers/test_pbspro_template.py +2 -3
  248. parsl/tests/test_providers/test_slurm_template.py +2 -3
  249. parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
  250. parsl/tests/test_python_apps/test_context_manager.py +128 -0
  251. parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
  252. parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
  253. parsl/tests/test_python_apps/test_fail.py +0 -25
  254. parsl/tests/test_python_apps/test_futures.py +2 -1
  255. parsl/tests/test_python_apps/test_inputs_default.py +22 -0
  256. parsl/tests/test_python_apps/test_join.py +0 -1
  257. parsl/tests/test_python_apps/test_lifted.py +11 -7
  258. parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
  259. parsl/tests/test_python_apps/test_outputs.py +1 -1
  260. parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
  261. parsl/tests/test_radical/test_mpi_funcs.py +1 -2
  262. parsl/tests/test_regression/test_1480.py +2 -1
  263. parsl/tests/test_regression/test_1653.py +2 -1
  264. parsl/tests/test_regression/test_226.py +1 -0
  265. parsl/tests/test_regression/test_2652.py +1 -0
  266. parsl/tests/test_regression/test_69a.py +0 -1
  267. parsl/tests/test_regression/test_854.py +4 -2
  268. parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
  269. parsl/tests/test_regression/test_98.py +0 -1
  270. parsl/tests/test_scaling/test_block_error_handler.py +9 -4
  271. parsl/tests/test_scaling/test_regression_1621.py +11 -15
  272. parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
  273. parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
  274. parsl/tests/test_scaling/test_scale_down.py +2 -5
  275. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +6 -18
  276. parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
  277. parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
  278. parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
  279. parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
  280. parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
  281. parsl/tests/test_serialization/test_basic.py +2 -1
  282. parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
  283. parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
  284. parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
  285. parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
  286. parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
  287. parsl/tests/test_staging/staging_provider.py +2 -2
  288. parsl/tests/test_staging/test_1316.py +3 -4
  289. parsl/tests/test_staging/test_docs_1.py +2 -1
  290. parsl/tests/test_staging/test_docs_2.py +2 -1
  291. parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
  292. parsl/tests/{test_data → test_staging}/test_file.py +6 -6
  293. parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
  294. parsl/tests/test_staging/test_staging_ftp.py +1 -0
  295. parsl/tests/test_staging/test_staging_https.py +5 -2
  296. parsl/tests/test_staging/test_staging_stdout.py +64 -0
  297. parsl/tests/test_staging/test_zip_in.py +39 -0
  298. parsl/tests/test_staging/test_zip_out.py +110 -0
  299. parsl/tests/test_staging/test_zip_to_zip.py +41 -0
  300. parsl/tests/test_summary.py +2 -2
  301. parsl/tests/test_thread_parallelism.py +0 -1
  302. parsl/tests/test_threads/test_configs.py +1 -2
  303. parsl/tests/test_threads/test_lazy_errors.py +2 -2
  304. parsl/tests/test_utils/test_execute_wait.py +35 -0
  305. parsl/tests/test_utils/test_sanitize_dns.py +76 -0
  306. parsl/tests/unit/test_address.py +20 -0
  307. parsl/tests/unit/test_file.py +99 -0
  308. parsl/tests/unit/test_usage_tracking.py +66 -0
  309. parsl/usage_tracking/api.py +65 -0
  310. parsl/usage_tracking/levels.py +6 -0
  311. parsl/usage_tracking/usage.py +104 -62
  312. parsl/utils.py +139 -6
  313. parsl/version.py +1 -1
  314. {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
  315. parsl-2025.1.13.data/scripts/interchange.py +649 -0
  316. {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +115 -77
  317. parsl-2025.1.13.dist-info/METADATA +96 -0
  318. parsl-2025.1.13.dist-info/RECORD +462 -0
  319. {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
  320. parsl/channels/__init__.py +0 -7
  321. parsl/channels/base.py +0 -141
  322. parsl/channels/errors.py +0 -113
  323. parsl/channels/local/local.py +0 -164
  324. parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
  325. parsl/channels/ssh/ssh.py +0 -276
  326. parsl/channels/ssh_il/__init__.py +0 -0
  327. parsl/channels/ssh_il/ssh_il.py +0 -74
  328. parsl/configs/ad_hoc.py +0 -35
  329. parsl/executors/radical/rpex_master.py +0 -42
  330. parsl/monitoring/radios.py +0 -175
  331. parsl/providers/ad_hoc/__init__.py +0 -0
  332. parsl/providers/ad_hoc/ad_hoc.py +0 -248
  333. parsl/providers/cobalt/__init__.py +0 -0
  334. parsl/providers/cobalt/cobalt.py +0 -236
  335. parsl/providers/cobalt/template.py +0 -17
  336. parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
  337. parsl/tests/configs/cooley_htex.py +0 -37
  338. parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
  339. parsl/tests/configs/local_adhoc.py +0 -18
  340. parsl/tests/configs/swan_htex.py +0 -43
  341. parsl/tests/configs/theta.py +0 -37
  342. parsl/tests/integration/test_channels/__init__.py +0 -0
  343. parsl/tests/integration/test_channels/test_channels.py +0 -17
  344. parsl/tests/integration/test_channels/test_local_channel.py +0 -42
  345. parsl/tests/integration/test_channels/test_scp_1.py +0 -45
  346. parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
  347. parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
  348. parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
  349. parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
  350. parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
  351. parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
  352. parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
  353. parsl/tests/sites/test_local_adhoc.py +0 -61
  354. parsl/tests/test_channels/__init__.py +0 -0
  355. parsl/tests/test_channels/test_large_output.py +0 -22
  356. parsl/tests/test_data/__init__.py +0 -0
  357. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
  358. parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
  359. parsl-2024.3.11.dist-info/METADATA +0 -98
  360. parsl-2024.3.11.dist-info/RECORD +0 -447
  361. parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
  362. parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
  363. parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
  364. parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
  365. parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
  366. {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
  367. {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
  368. {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
  369. {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
@@ -1,64 +1,39 @@
1
1
  #!/usr/bin/env python
2
- import multiprocessing
3
- import zmq
4
- import os
5
- import sys
6
- import platform
7
- import random
8
- import time
9
2
  import datetime
10
- import pickle
11
- import signal
3
+ import json
12
4
  import logging
5
+ import os
6
+ import pickle
7
+ import platform
13
8
  import queue
9
+ import sys
14
10
  import threading
15
- import json
11
+ import time
12
+ from typing import Any, Dict, List, NoReturn, Optional, Sequence, Set, Tuple, cast
16
13
 
17
- from typing import cast, Any, Dict, NoReturn, Sequence, Set, Optional, Tuple, List
14
+ import zmq
18
15
 
19
16
  from parsl import curvezmq
20
- from parsl.utils import setproctitle
21
- from parsl.version import VERSION as PARSL_VERSION
22
- from parsl.serialize import serialize as serialize_object
23
-
17
+ from parsl.addresses import tcp_url
24
18
  from parsl.app.errors import RemoteExceptionWrapper
19
+ from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch
25
20
  from parsl.executors.high_throughput.manager_record import ManagerRecord
21
+ from parsl.executors.high_throughput.manager_selector import ManagerSelector
26
22
  from parsl.monitoring.message_type import MessageType
23
+ from parsl.monitoring.radios.base import MonitoringRadioSender
24
+ from parsl.monitoring.radios.zmq import ZMQRadioSender
27
25
  from parsl.process_loggers import wrap_with_logs
28
-
26
+ from parsl.serialize import serialize as serialize_object
27
+ from parsl.utils import setproctitle
28
+ from parsl.version import VERSION as PARSL_VERSION
29
29
 
30
30
  PKL_HEARTBEAT_CODE = pickle.dumps((2 ** 32) - 1)
31
+ PKL_DRAINED_CODE = pickle.dumps((2 ** 32) - 2)
31
32
 
32
33
  LOGGER_NAME = "interchange"
33
34
  logger = logging.getLogger(LOGGER_NAME)
34
35
 
35
36
 
36
- class ManagerLost(Exception):
37
- ''' Task lost due to manager loss. Manager is considered lost when multiple heartbeats
38
- have been missed.
39
- '''
40
- def __init__(self, manager_id: bytes, hostname: str) -> None:
41
- self.manager_id = manager_id
42
- self.tstamp = time.time()
43
- self.hostname = hostname
44
-
45
- def __str__(self) -> str:
46
- return "Task failure due to loss of manager {} on host {}".format(self.manager_id.decode(), self.hostname)
47
-
48
-
49
- class VersionMismatch(Exception):
50
- ''' Manager and Interchange versions do not match
51
- '''
52
- def __init__(self, interchange_version: str, manager_version: str):
53
- self.interchange_version = interchange_version
54
- self.manager_version = manager_version
55
-
56
- def __str__(self) -> str:
57
- return "Manager version info {} does not match interchange version info {}, causing a critical failure".format(
58
- self.manager_version,
59
- self.interchange_version)
60
-
61
-
62
37
  class Interchange:
63
38
  """ Interchange is a task orchestrator for distributed systems.
64
39
 
@@ -67,18 +42,21 @@ class Interchange:
67
42
  3. Detect workers that have failed using heartbeats
68
43
  """
69
44
  def __init__(self,
70
- client_address: str = "127.0.0.1",
71
- interchange_address: Optional[str] = None,
72
- client_ports: Tuple[int, int, int] = (50055, 50056, 50057),
73
- worker_ports: Optional[Tuple[int, int]] = None,
74
- worker_port_range: Tuple[int, int] = (54000, 55000),
75
- hub_address: Optional[str] = None,
76
- hub_port: Optional[int] = None,
77
- heartbeat_threshold: int = 60,
78
- logdir: str = ".",
79
- logging_level: int = logging.INFO,
80
- poll_period: int = 10,
81
- cert_dir: Optional[str] = None,
45
+ *,
46
+ client_address: str,
47
+ interchange_address: Optional[str],
48
+ client_ports: Tuple[int, int, int],
49
+ worker_ports: Optional[Tuple[int, int]],
50
+ worker_port_range: Tuple[int, int],
51
+ hub_address: Optional[str],
52
+ hub_zmq_port: Optional[int],
53
+ heartbeat_threshold: int,
54
+ logdir: str,
55
+ logging_level: int,
56
+ poll_period: int,
57
+ cert_dir: Optional[str],
58
+ manager_selector: ManagerSelector,
59
+ run_id: str,
82
60
  ) -> None:
83
61
  """
84
62
  Parameters
@@ -90,45 +68,44 @@ class Interchange:
90
68
  If specified the interchange will only listen on this address for connections from workers
91
69
  else, it binds to all addresses.
92
70
 
93
- client_ports : triple(int, int, int)
71
+ client_ports : tuple(int, int, int)
94
72
  The ports at which the client can be reached
95
73
 
96
74
  worker_ports : tuple(int, int)
97
- The specific two ports at which workers will connect to the Interchange. Default: None
75
+ The specific two ports at which workers will connect to the Interchange.
98
76
 
99
77
  worker_port_range : tuple(int, int)
100
78
  The interchange picks ports at random from the range which will be used by workers.
101
- This is overridden when the worker_ports option is set. Default: (54000, 55000)
79
+ This is overridden when the worker_ports option is set.
102
80
 
103
81
  hub_address : str
104
- The ip address at which the interchange can send info about managers to when monitoring is enabled.
105
- This is passed via dfk and executor automatically. Default: None (meaning monitoring disabled)
82
+ The IP address at which the interchange can send info about managers to when monitoring is enabled.
83
+ When None, monitoring is disabled.
106
84
 
107
- hub_port : str
85
+ hub_zmq_port : str
108
86
  The port at which the interchange can send info about managers to when monitoring is enabled.
109
- This is passed via dfk and executor automatically. Default: None (meaning monitoring disabled)
87
+ When None, monitoring is disabled.
110
88
 
111
89
  heartbeat_threshold : int
112
90
  Number of seconds since the last heartbeat after which worker is considered lost.
113
91
 
114
92
  logdir : str
115
- Parsl log directory paths. Logs and temp files go here. Default: '.'
93
+ Parsl log directory paths. Logs and temp files go here.
116
94
 
117
95
  logging_level : int
118
- Logging level as defined in the logging module. Default: logging.INFO
96
+ Logging level as defined in the logging module.
119
97
 
120
98
  poll_period : int
121
- The main thread polling period, in milliseconds. Default: 10ms
99
+ The main thread polling period, in milliseconds.
122
100
 
123
101
  cert_dir : str | None
124
- Path to the certificate directory. Default: None
102
+ Path to the certificate directory.
125
103
  """
126
104
  self.cert_dir = cert_dir
127
105
  self.logdir = logdir
128
106
  os.makedirs(self.logdir, exist_ok=True)
129
107
 
130
108
  start_file_logger("{}/interchange.log".format(self.logdir), level=logging_level)
131
- logger.propagate = False
132
109
  logger.debug("Initializing Interchange process")
133
110
 
134
111
  self.client_address = client_address
@@ -140,17 +117,19 @@ class Interchange:
140
117
  self.zmq_context = curvezmq.ServerContext(self.cert_dir)
141
118
  self.task_incoming = self.zmq_context.socket(zmq.DEALER)
142
119
  self.task_incoming.set_hwm(0)
143
- self.task_incoming.connect("tcp://{}:{}".format(client_address, client_ports[0]))
120
+ self.task_incoming.connect(tcp_url(client_address, client_ports[0]))
144
121
  self.results_outgoing = self.zmq_context.socket(zmq.DEALER)
145
122
  self.results_outgoing.set_hwm(0)
146
- self.results_outgoing.connect("tcp://{}:{}".format(client_address, client_ports[1]))
123
+ self.results_outgoing.connect(tcp_url(client_address, client_ports[1]))
147
124
 
148
125
  self.command_channel = self.zmq_context.socket(zmq.REP)
149
- self.command_channel.connect("tcp://{}:{}".format(client_address, client_ports[2]))
126
+ self.command_channel.connect(tcp_url(client_address, client_ports[2]))
150
127
  logger.info("Connected to client")
151
128
 
129
+ self.run_id = run_id
130
+
152
131
  self.hub_address = hub_address
153
- self.hub_port = hub_port
132
+ self.hub_zmq_port = hub_zmq_port
154
133
 
155
134
  self.pending_task_queue: queue.Queue[Any] = queue.Queue(maxsize=10 ** 6)
156
135
  self.count = 0
@@ -167,14 +146,14 @@ class Interchange:
167
146
  self.worker_task_port = self.worker_ports[0]
168
147
  self.worker_result_port = self.worker_ports[1]
169
148
 
170
- self.task_outgoing.bind(f"tcp://{self.interchange_address}:{self.worker_task_port}")
171
- self.results_incoming.bind(f"tcp://{self.interchange_address}:{self.worker_result_port}")
149
+ self.task_outgoing.bind(tcp_url(self.interchange_address, self.worker_task_port))
150
+ self.results_incoming.bind(tcp_url(self.interchange_address, self.worker_result_port))
172
151
 
173
152
  else:
174
- self.worker_task_port = self.task_outgoing.bind_to_random_port(f"tcp://{self.interchange_address}",
153
+ self.worker_task_port = self.task_outgoing.bind_to_random_port(tcp_url(self.interchange_address),
175
154
  min_port=worker_port_range[0],
176
155
  max_port=worker_port_range[1], max_tries=100)
177
- self.worker_result_port = self.results_incoming.bind_to_random_port(f"tcp://{self.interchange_address}",
156
+ self.worker_result_port = self.results_incoming.bind_to_random_port(tcp_url(self.interchange_address),
178
157
  min_port=worker_port_range[0],
179
158
  max_port=worker_port_range[1], max_tries=100)
180
159
 
@@ -186,6 +165,8 @@ class Interchange:
186
165
 
187
166
  self.heartbeat_threshold = heartbeat_threshold
188
167
 
168
+ self.manager_selector = manager_selector
169
+
189
170
  self.current_platform = {'parsl_v': PARSL_VERSION,
190
171
  'python_v': "{}.{}.{}".format(sys.version_info.major,
191
172
  sys.version_info.minor,
@@ -242,27 +223,16 @@ class Interchange:
242
223
  task_counter += 1
243
224
  logger.debug(f"Fetched {task_counter} tasks so far")
244
225
 
245
- def _create_monitoring_channel(self) -> Optional[zmq.Socket]:
246
- if self.hub_address and self.hub_port:
247
- logger.info("Connecting to monitoring")
248
- # This is a one-off because monitoring is unencrypted
249
- hub_channel = zmq.Context().socket(zmq.DEALER)
250
- hub_channel.set_hwm(0)
251
- hub_channel.connect("tcp://{}:{}".format(self.hub_address, self.hub_port))
252
- logger.info("Monitoring enabled and connected to hub")
253
- return hub_channel
254
- else:
255
- return None
256
-
257
- def _send_monitoring_info(self, hub_channel: Optional[zmq.Socket], manager: ManagerRecord) -> None:
258
- if hub_channel:
259
- logger.info("Sending message {} to hub".format(manager))
226
+ def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender], manager: ManagerRecord) -> None:
227
+ if monitoring_radio:
228
+ logger.info("Sending message {} to MonitoringHub".format(manager))
260
229
 
261
230
  d: Dict = cast(Dict, manager.copy())
262
231
  d['timestamp'] = datetime.datetime.now()
263
232
  d['last_heartbeat'] = datetime.datetime.fromtimestamp(d['last_heartbeat'])
233
+ d['run_id'] = self.run_id
264
234
 
265
- hub_channel.send_pyobj((MessageType.NODE_INFO, d))
235
+ monitoring_radio.send((MessageType.NODE_INFO, d))
266
236
 
267
237
  @wrap_with_logs(target="interchange")
268
238
  def _command_server(self) -> NoReturn:
@@ -270,8 +240,11 @@ class Interchange:
270
240
  """
271
241
  logger.debug("Command Server Starting")
272
242
 
273
- # Need to create a new ZMQ socket for command server thread
274
- hub_channel = self._create_monitoring_channel()
243
+ if self.hub_address is not None and self.hub_zmq_port is not None:
244
+ logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
245
+ monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
246
+ else:
247
+ monitoring_radio = None
275
248
 
276
249
  reply: Any # the type of reply depends on the command_req received (aka this needs dependent types...)
277
250
 
@@ -279,13 +252,7 @@ class Interchange:
279
252
  try:
280
253
  command_req = self.command_channel.recv_pyobj()
281
254
  logger.debug("Received command request: {}".format(command_req))
282
- if command_req == "OUTSTANDING_C":
283
- outstanding = self.pending_task_queue.qsize()
284
- for manager in self._ready_managers.values():
285
- outstanding += len(manager['tasks'])
286
- reply = outstanding
287
-
288
- elif command_req == "CONNECTED_BLOCKS":
255
+ if command_req == "CONNECTED_BLOCKS":
289
256
  reply = self.connected_block_history
290
257
 
291
258
  elif command_req == "WORKERS":
@@ -308,7 +275,10 @@ class Interchange:
308
275
  'worker_count': m['worker_count'],
309
276
  'tasks': len(m['tasks']),
310
277
  'idle_duration': idle_duration,
311
- 'active': m['active']}
278
+ 'active': m['active'],
279
+ 'parsl_version': m['parsl_version'],
280
+ 'python_version': m['python_version'],
281
+ 'draining': m['draining']}
312
282
  reply.append(resp)
313
283
 
314
284
  elif command_req.startswith("HOLD_WORKER"):
@@ -318,13 +288,17 @@ class Interchange:
318
288
  if manager_id in self._ready_managers:
319
289
  m = self._ready_managers[manager_id]
320
290
  m['active'] = False
321
- self._send_monitoring_info(hub_channel, m)
291
+ self._send_monitoring_info(monitoring_radio, m)
322
292
  else:
323
293
  logger.warning("Worker to hold was not in ready managers list")
324
294
 
325
295
  reply = None
326
296
 
297
+ elif command_req == "WORKER_PORTS":
298
+ reply = (self.worker_task_port, self.worker_result_port)
299
+
327
300
  else:
301
+ logger.error(f"Received unknown command: {command_req}")
328
302
  reply = None
329
303
 
330
304
  logger.debug("Reply: {}".format(reply))
@@ -339,19 +313,14 @@ class Interchange:
339
313
  """ Start the interchange
340
314
  """
341
315
 
342
- # If a user workflow has set its own signal handler for sigterm, that
343
- # handler will be inherited by the interchange process because it is
344
- # launched as a multiprocessing fork process.
345
- # That can interfere with the interchange shutdown mechanism, which is
346
- # to receive a SIGTERM and exit immediately.
347
- # See Parsl issue #2343 (Threads and multiprocessing cannot be
348
- # intermingled without deadlocks) which talks about other fork-related
349
- # parent-process-inheritance problems.
350
- signal.signal(signal.SIGTERM, signal.SIG_DFL)
351
-
352
- logger.info("Incoming ports bound")
316
+ logger.info("Starting main interchange method")
353
317
 
354
- hub_channel = self._create_monitoring_channel()
318
+ if self.hub_address is not None and self.hub_zmq_port is not None:
319
+ logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
320
+ monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
321
+ logger.debug("Created monitoring radio")
322
+ else:
323
+ monitoring_radio = None
355
324
 
356
325
  poll_period = self.poll_period
357
326
 
@@ -382,20 +351,21 @@ class Interchange:
382
351
  while not kill_event.is_set():
383
352
  self.socks = dict(poller.poll(timeout=poll_period))
384
353
 
385
- self.process_task_outgoing_incoming(interesting_managers, hub_channel, kill_event)
386
- self.process_results_incoming(interesting_managers, hub_channel)
387
- self.expire_bad_managers(interesting_managers, hub_channel)
354
+ self.process_task_outgoing_incoming(interesting_managers, monitoring_radio, kill_event)
355
+ self.process_results_incoming(interesting_managers, monitoring_radio)
356
+ self.expire_bad_managers(interesting_managers, monitoring_radio)
357
+ self.expire_drained_managers(interesting_managers, monitoring_radio)
388
358
  self.process_tasks_to_send(interesting_managers)
389
359
 
390
360
  self.zmq_context.destroy()
391
361
  delta = time.time() - start
392
- logger.info("Processed {} tasks in {} seconds".format(self.count, delta))
362
+ logger.info(f"Processed {self.count} tasks in {delta} seconds")
393
363
  logger.warning("Exiting")
394
364
 
395
365
  def process_task_outgoing_incoming(
396
366
  self,
397
367
  interesting_managers: Set[bytes],
398
- hub_channel: Optional[zmq.Socket],
368
+ monitoring_radio: Optional[MonitoringRadioSender],
399
369
  kill_event: threading.Event
400
370
  ) -> None:
401
371
  """Process one message from manager on the task_outgoing channel.
@@ -410,9 +380,8 @@ class Interchange:
410
380
  try:
411
381
  msg = json.loads(message[1].decode('utf-8'))
412
382
  except Exception:
413
- logger.warning("Got Exception reading message from manager: {!r}".format(
414
- manager_id), exc_info=True)
415
- logger.debug("Message: \n{!r}\n".format(message[1]))
383
+ logger.warning(f"Got Exception reading message from manager: {manager_id!r}", exc_info=True)
384
+ logger.debug("Message:\n %r\n", message[1])
416
385
  return
417
386
 
418
387
  # perform a bit of validation on the structure of the deserialized
@@ -420,7 +389,7 @@ class Interchange:
420
389
  # in obviously malformed cases
421
390
  if not isinstance(msg, dict) or 'type' not in msg:
422
391
  logger.error(f"JSON message was not correctly formatted from manager: {manager_id!r}")
423
- logger.debug("Message: \n{!r}\n".format(message[1]))
392
+ logger.debug("Message:\n %r\n", message[1])
424
393
  return
425
394
 
426
395
  if msg['type'] == 'registration':
@@ -428,14 +397,18 @@ class Interchange:
428
397
  self._ready_managers[manager_id] = {'last_heartbeat': time.time(),
429
398
  'idle_since': time.time(),
430
399
  'block_id': None,
400
+ 'start_time': msg['start_time'],
431
401
  'max_capacity': 0,
432
402
  'worker_count': 0,
433
403
  'active': True,
404
+ 'draining': False,
405
+ 'parsl_version': msg['parsl_v'],
406
+ 'python_version': msg['python_v'],
434
407
  'tasks': []}
435
408
  self.connected_block_history.append(msg['block_id'])
436
409
 
437
410
  interesting_managers.add(manager_id)
438
- logger.info("Adding manager: {!r} to ready queue".format(manager_id))
411
+ logger.info(f"Adding manager: {manager_id!r} to ready queue")
439
412
  m = self._ready_managers[manager_id]
440
413
 
441
414
  # m is a ManagerRecord, but msg is a dict[Any,Any] and so can
@@ -444,12 +417,12 @@ class Interchange:
444
417
  # later.
445
418
  m.update(msg) # type: ignore[typeddict-item]
446
419
 
447
- logger.info("Registration info for manager {!r}: {}".format(manager_id, msg))
448
- self._send_monitoring_info(hub_channel, m)
420
+ logger.info(f"Registration info for manager {manager_id!r}: {msg}")
421
+ self._send_monitoring_info(monitoring_radio, m)
449
422
 
450
423
  if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
451
424
  msg['parsl_v'] != self.current_platform['parsl_v']):
452
- logger.error("Manager {!r} has incompatible version info with the interchange".format(manager_id))
425
+ logger.error(f"Manager {manager_id!r} has incompatible version info with the interchange")
453
426
  logger.debug("Setting kill event")
454
427
  kill_event.set()
455
428
  e = VersionMismatch("py.v={} parsl.v={}".format(self.current_platform['python_v'].rsplit(".", 1)[0],
@@ -462,27 +435,49 @@ class Interchange:
462
435
  self.results_outgoing.send(pkl_package)
463
436
  logger.error("Sent failure reports, shutting down interchange")
464
437
  else:
465
- logger.info("Manager {!r} has compatible Parsl version {}".format(manager_id, msg['parsl_v']))
466
- logger.info("Manager {!r} has compatible Python version {}".format(manager_id,
467
- msg['python_v'].rsplit(".", 1)[0]))
438
+ logger.info(f"Manager {manager_id!r} has compatible Parsl version {msg['parsl_v']}")
439
+ logger.info(f"Manager {manager_id!r} has compatible Python version {msg['python_v'].rsplit('.', 1)[0]}")
468
440
  elif msg['type'] == 'heartbeat':
469
- self._ready_managers[manager_id]['last_heartbeat'] = time.time()
470
- logger.debug("Manager {!r} sent heartbeat via tasks connection".format(manager_id))
471
- self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
441
+ manager = self._ready_managers.get(manager_id)
442
+ if manager:
443
+ manager['last_heartbeat'] = time.time()
444
+ logger.debug("Manager %r sent heartbeat via tasks connection", manager_id)
445
+ self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
446
+ else:
447
+ logger.warning("Received heartbeat via tasks connection for not-registered manager %r", manager_id)
448
+ elif msg['type'] == 'drain':
449
+ self._ready_managers[manager_id]['draining'] = True
450
+ logger.debug("Manager %r requested drain", manager_id)
472
451
  else:
473
452
  logger.error(f"Unexpected message type received from manager: {msg['type']}")
474
453
  logger.debug("leaving task_outgoing section")
475
454
 
455
+ def expire_drained_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
456
+
457
+ for manager_id in list(interesting_managers):
458
+ # is it always true that a draining manager will be in interesting managers?
459
+ # i think so because it will have outstanding capacity?
460
+ m = self._ready_managers[manager_id]
461
+ if m['draining'] and len(m['tasks']) == 0:
462
+ logger.info(f"Manager {manager_id!r} is drained - sending drained message to manager")
463
+ self.task_outgoing.send_multipart([manager_id, b'', PKL_DRAINED_CODE])
464
+ interesting_managers.remove(manager_id)
465
+ self._ready_managers.pop(manager_id)
466
+
467
+ m['active'] = False
468
+ self._send_monitoring_info(monitoring_radio, m)
469
+
476
470
  def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
477
471
  # Check if there are tasks that could be sent to managers
478
472
 
479
- logger.debug("Managers count (interesting/total): {interesting}/{total}".format(
480
- total=len(self._ready_managers),
481
- interesting=len(interesting_managers)))
473
+ logger.debug(
474
+ "Managers count (interesting/total): %d/%d",
475
+ len(interesting_managers),
476
+ len(self._ready_managers)
477
+ )
482
478
 
483
479
  if interesting_managers and not self.pending_task_queue.empty():
484
- shuffled_managers = list(interesting_managers)
485
- random.shuffle(shuffled_managers)
480
+ shuffled_managers = self.manager_selector.sort_managers(self._ready_managers, interesting_managers)
486
481
 
487
482
  while shuffled_managers and not self.pending_task_queue.empty(): # cf. the if statement above...
488
483
  manager_id = shuffled_managers.pop()
@@ -490,7 +485,7 @@ class Interchange:
490
485
  tasks_inflight = len(m['tasks'])
491
486
  real_capacity = m['max_capacity'] - tasks_inflight
492
487
 
493
- if (real_capacity and m['active']):
488
+ if real_capacity and m["active"] and not m["draining"]:
494
489
  tasks = self.get_tasks(real_capacity)
495
490
  if tasks:
496
491
  self.task_outgoing.send_multipart([manager_id, b'', pickle.dumps(tasks)])
@@ -499,31 +494,31 @@ class Interchange:
499
494
  tids = [t['task_id'] for t in tasks]
500
495
  m['tasks'].extend(tids)
501
496
  m['idle_since'] = None
502
- logger.debug("Sent tasks: {} to manager {!r}".format(tids, manager_id))
497
+ logger.debug("Sent tasks: %s to manager %r", tids, manager_id)
503
498
  # recompute real_capacity after sending tasks
504
499
  real_capacity = m['max_capacity'] - tasks_inflight
505
500
  if real_capacity > 0:
506
- logger.debug("Manager {!r} has free capacity {}".format(manager_id, real_capacity))
501
+ logger.debug("Manager %r has free capacity %s", manager_id, real_capacity)
507
502
  # ... so keep it in the interesting_managers list
508
503
  else:
509
- logger.debug("Manager {!r} is now saturated".format(manager_id))
504
+ logger.debug("Manager %r is now saturated", manager_id)
510
505
  interesting_managers.remove(manager_id)
511
506
  else:
512
507
  interesting_managers.remove(manager_id)
513
508
  # logger.debug("Nothing to send to manager {}".format(manager_id))
514
- logger.debug("leaving _ready_managers section, with {} managers still interesting".format(len(interesting_managers)))
509
+ logger.debug("leaving _ready_managers section, with %s managers still interesting", len(interesting_managers))
515
510
  else:
516
511
  logger.debug("either no interesting managers or no tasks, so skipping manager pass")
517
512
 
518
- def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
513
+ def process_results_incoming(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
519
514
  # Receive any results and forward to client
520
515
  if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
521
516
  logger.debug("entering results_incoming section")
522
517
  manager_id, *all_messages = self.results_incoming.recv_multipart()
523
518
  if manager_id not in self._ready_managers:
524
- logger.warning("Received a result from a un-registered manager: {!r}".format(manager_id))
519
+ logger.warning(f"Received a result from a un-registered manager: {manager_id!r}")
525
520
  else:
526
- logger.debug(f"Got {len(all_messages)} result items in batch from manager {manager_id!r}")
521
+ logger.debug("Got %s result items in batch from manager %r", len(all_messages), manager_id)
527
522
 
528
523
  b_messages = []
529
524
 
@@ -535,16 +530,15 @@ class Interchange:
535
530
  elif r['type'] == 'monitoring':
536
531
  # the monitoring code makes the assumption that no
537
532
  # monitoring messages will be received if monitoring
538
- # is not configured, and that hub_channel will only
533
+ # is not configured, and that monitoring_radio will only
539
534
  # be None when monitoring is not configurated.
540
- assert hub_channel is not None
535
+ assert monitoring_radio is not None
541
536
 
542
- hub_channel.send_pyobj(r['payload'])
537
+ monitoring_radio.send(r['payload'])
543
538
  elif r['type'] == 'heartbeat':
544
- logger.debug(f"Manager {manager_id!r} sent heartbeat via results connection")
545
- b_messages.append((p_message, r))
539
+ logger.debug("Manager %r sent heartbeat via results connection", manager_id)
546
540
  else:
547
- logger.error("Interchange discarding result_queue message of unknown type: {}".format(r['type']))
541
+ logger.error("Interchange discarding result_queue message of unknown type: %s", r["type"])
548
542
 
549
543
  got_result = False
550
544
  m = self._ready_managers[manager_id]
@@ -553,14 +547,16 @@ class Interchange:
553
547
  if r['type'] == 'result':
554
548
  got_result = True
555
549
  try:
556
- logger.debug(f"Removing task {r['task_id']} from manager record {manager_id!r}")
550
+ logger.debug("Removing task %s from manager record %r", r["task_id"], manager_id)
557
551
  m['tasks'].remove(r['task_id'])
558
552
  except Exception:
559
553
  # If we reach here, there's something very wrong.
560
- logger.exception("Ignoring exception removing task_id {} for manager {!r} with task list {}".format(
554
+ logger.exception(
555
+ "Ignoring exception removing task_id %s for manager %r with task list %s",
561
556
  r['task_id'],
562
557
  manager_id,
563
- m['tasks']))
558
+ m["tasks"]
559
+ )
564
560
 
565
561
  b_messages_to_send = []
566
562
  for (b_message, _) in b_messages:
@@ -571,7 +567,7 @@ class Interchange:
571
567
  self.results_outgoing.send_multipart(b_messages_to_send)
572
568
  logger.debug("Sent messages on results_outgoing")
573
569
 
574
- logger.debug(f"Current tasks on manager {manager_id!r}: {m['tasks']}")
570
+ logger.debug("Current tasks on manager %r: %s", manager_id, m["tasks"])
575
571
  if len(m['tasks']) == 0 and m['idle_since'] is None:
576
572
  m['idle_since'] = time.time()
577
573
 
@@ -583,7 +579,7 @@ class Interchange:
583
579
  interesting_managers.add(manager_id)
584
580
  logger.debug("leaving results_incoming section")
585
581
 
586
- def expire_bad_managers(self, interesting_managers: Set[bytes], hub_channel: Optional[zmq.Socket]) -> None:
582
+ def expire_bad_managers(self, interesting_managers: Set[bytes], monitoring_radio: Optional[MonitoringRadioSender]) -> None:
587
583
  bad_managers = [(manager_id, m) for (manager_id, m) in self._ready_managers.items() if
588
584
  time.time() - m['last_heartbeat'] > self.heartbeat_threshold]
589
585
  for (manager_id, m) in bad_managers:
@@ -591,7 +587,7 @@ class Interchange:
591
587
  logger.warning(f"Too many heartbeats missed for manager {manager_id!r} - removing manager")
592
588
  if m['active']:
593
589
  m['active'] = False
594
- self._send_monitoring_info(hub_channel, m)
590
+ self._send_monitoring_info(monitoring_radio, m)
595
591
 
596
592
  logger.warning(f"Cancelling htex tasks {m['tasks']} on removed manager")
597
593
  for tid in m['tasks']:
@@ -644,15 +640,10 @@ def start_file_logger(filename: str, level: int = logging.DEBUG, format_string:
644
640
  logger.addHandler(handler)
645
641
 
646
642
 
647
- @wrap_with_logs(target="interchange")
648
- def starter(comm_q: multiprocessing.Queue, *args: Any, **kwargs: Any) -> None:
649
- """Start the interchange process
650
-
651
- The executor is expected to call this function. The args, kwargs match that of the Interchange.__init__
652
- """
643
+ if __name__ == "__main__":
653
644
  setproctitle("parsl: HTEX interchange")
654
- # logger = multiprocessing.get_logger()
655
- ic = Interchange(*args, **kwargs)
656
- comm_q.put((ic.worker_task_port,
657
- ic.worker_result_port))
645
+
646
+ config = pickle.load(sys.stdin.buffer)
647
+
648
+ ic = Interchange(**config)
658
649
  ic.start()
@@ -1,15 +1,20 @@
1
1
  from datetime import datetime
2
2
  from typing import Any, List, Optional
3
+
3
4
  from typing_extensions import TypedDict
4
5
 
5
6
 
6
7
  class ManagerRecord(TypedDict, total=False):
7
8
  block_id: Optional[str]
9
+ start_time: float
8
10
  tasks: List[Any]
9
11
  worker_count: int
10
12
  max_capacity: int
11
13
  active: bool
14
+ draining: bool
12
15
  hostname: str
13
16
  last_heartbeat: float
14
17
  idle_since: Optional[float]
15
18
  timestamp: datetime
19
+ parsl_version: str
20
+ python_version: str