parsl 2024.3.18__py3-none-any.whl → 2025.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369)
  1. parsl/__init__.py +9 -10
  2. parsl/addresses.py +26 -6
  3. parsl/app/app.py +7 -8
  4. parsl/app/bash.py +15 -8
  5. parsl/app/errors.py +10 -13
  6. parsl/app/futures.py +8 -10
  7. parsl/app/python.py +2 -1
  8. parsl/benchmark/perf.py +2 -1
  9. parsl/concurrent/__init__.py +2 -2
  10. parsl/config.py +53 -10
  11. parsl/configs/ASPIRE1.py +6 -5
  12. parsl/configs/Azure.py +9 -8
  13. parsl/configs/bridges.py +6 -4
  14. parsl/configs/cc_in2p3.py +3 -3
  15. parsl/configs/ec2.py +3 -1
  16. parsl/configs/expanse.py +4 -3
  17. parsl/configs/frontera.py +3 -4
  18. parsl/configs/htex_local.py +3 -4
  19. parsl/configs/illinoiscluster.py +3 -1
  20. parsl/configs/improv.py +34 -0
  21. parsl/configs/kubernetes.py +4 -3
  22. parsl/configs/local_threads.py +5 -1
  23. parsl/configs/midway.py +5 -3
  24. parsl/configs/osg.py +4 -2
  25. parsl/configs/polaris.py +4 -2
  26. parsl/configs/stampede2.py +6 -5
  27. parsl/configs/summit.py +3 -3
  28. parsl/configs/toss3_llnl.py +4 -3
  29. parsl/configs/vineex_local.py +6 -4
  30. parsl/configs/wqex_local.py +5 -3
  31. parsl/curvezmq.py +4 -0
  32. parsl/data_provider/data_manager.py +4 -3
  33. parsl/data_provider/file_noop.py +1 -2
  34. parsl/data_provider/files.py +3 -3
  35. parsl/data_provider/ftp.py +1 -3
  36. parsl/data_provider/globus.py +7 -6
  37. parsl/data_provider/http.py +2 -2
  38. parsl/data_provider/rsync.py +1 -1
  39. parsl/data_provider/staging.py +2 -2
  40. parsl/data_provider/zip.py +135 -0
  41. parsl/dataflow/dependency_resolvers.py +115 -0
  42. parsl/dataflow/dflow.py +259 -223
  43. parsl/dataflow/errors.py +3 -5
  44. parsl/dataflow/futures.py +27 -14
  45. parsl/dataflow/memoization.py +5 -5
  46. parsl/dataflow/rundirs.py +5 -6
  47. parsl/dataflow/taskrecord.py +4 -5
  48. parsl/executors/__init__.py +4 -2
  49. parsl/executors/base.py +45 -15
  50. parsl/executors/errors.py +13 -0
  51. parsl/executors/execute_task.py +37 -0
  52. parsl/executors/flux/execute_parsl_task.py +3 -3
  53. parsl/executors/flux/executor.py +18 -19
  54. parsl/executors/flux/flux_instance_manager.py +26 -27
  55. parsl/executors/high_throughput/errors.py +43 -3
  56. parsl/executors/high_throughput/executor.py +307 -285
  57. parsl/executors/high_throughput/interchange.py +137 -168
  58. parsl/executors/high_throughput/manager_record.py +4 -0
  59. parsl/executors/high_throughput/manager_selector.py +55 -0
  60. parsl/executors/high_throughput/monitoring_info.py +2 -1
  61. parsl/executors/high_throughput/mpi_executor.py +113 -0
  62. parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
  63. parsl/executors/high_throughput/mpi_resource_management.py +6 -17
  64. parsl/executors/high_throughput/probe.py +9 -7
  65. parsl/executors/high_throughput/process_worker_pool.py +77 -75
  66. parsl/executors/high_throughput/zmq_pipes.py +81 -23
  67. parsl/executors/radical/executor.py +130 -79
  68. parsl/executors/radical/rpex_resources.py +17 -15
  69. parsl/executors/radical/rpex_worker.py +4 -3
  70. parsl/executors/status_handling.py +157 -51
  71. parsl/executors/taskvine/__init__.py +1 -1
  72. parsl/executors/taskvine/errors.py +1 -1
  73. parsl/executors/taskvine/exec_parsl_function.py +2 -2
  74. parsl/executors/taskvine/executor.py +38 -55
  75. parsl/executors/taskvine/factory.py +1 -1
  76. parsl/executors/taskvine/factory_config.py +1 -1
  77. parsl/executors/taskvine/manager.py +17 -13
  78. parsl/executors/taskvine/manager_config.py +7 -2
  79. parsl/executors/threads.py +6 -6
  80. parsl/executors/workqueue/errors.py +1 -1
  81. parsl/executors/workqueue/exec_parsl_function.py +6 -5
  82. parsl/executors/workqueue/executor.py +64 -63
  83. parsl/executors/workqueue/parsl_coprocess.py +1 -1
  84. parsl/jobs/error_handlers.py +2 -2
  85. parsl/jobs/job_status_poller.py +28 -112
  86. parsl/jobs/states.py +7 -2
  87. parsl/jobs/strategy.py +43 -31
  88. parsl/launchers/__init__.py +12 -3
  89. parsl/launchers/errors.py +1 -1
  90. parsl/launchers/launchers.py +0 -6
  91. parsl/log_utils.py +1 -2
  92. parsl/monitoring/db_manager.py +55 -93
  93. parsl/monitoring/errors.py +6 -0
  94. parsl/monitoring/monitoring.py +85 -311
  95. parsl/monitoring/queries/pandas.py +1 -2
  96. parsl/monitoring/radios/base.py +13 -0
  97. parsl/monitoring/radios/filesystem.py +52 -0
  98. parsl/monitoring/radios/htex.py +57 -0
  99. parsl/monitoring/radios/multiprocessing.py +17 -0
  100. parsl/monitoring/radios/udp.py +56 -0
  101. parsl/monitoring/radios/zmq.py +17 -0
  102. parsl/monitoring/remote.py +33 -37
  103. parsl/monitoring/router.py +212 -0
  104. parsl/monitoring/types.py +5 -6
  105. parsl/monitoring/visualization/app.py +4 -2
  106. parsl/monitoring/visualization/models.py +0 -1
  107. parsl/monitoring/visualization/plots/default/workflow_plots.py +8 -4
  108. parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
  109. parsl/monitoring/visualization/utils.py +0 -1
  110. parsl/monitoring/visualization/views.py +16 -9
  111. parsl/multiprocessing.py +0 -1
  112. parsl/process_loggers.py +1 -2
  113. parsl/providers/__init__.py +8 -17
  114. parsl/providers/aws/aws.py +2 -3
  115. parsl/providers/azure/azure.py +4 -5
  116. parsl/providers/base.py +2 -18
  117. parsl/providers/cluster_provider.py +3 -9
  118. parsl/providers/condor/condor.py +7 -17
  119. parsl/providers/errors.py +2 -2
  120. parsl/providers/googlecloud/googlecloud.py +2 -1
  121. parsl/providers/grid_engine/grid_engine.py +5 -14
  122. parsl/providers/kubernetes/kube.py +80 -40
  123. parsl/providers/local/local.py +13 -26
  124. parsl/providers/lsf/lsf.py +5 -23
  125. parsl/providers/pbspro/pbspro.py +5 -17
  126. parsl/providers/slurm/slurm.py +81 -39
  127. parsl/providers/torque/torque.py +3 -14
  128. parsl/serialize/__init__.py +8 -3
  129. parsl/serialize/base.py +1 -2
  130. parsl/serialize/concretes.py +5 -4
  131. parsl/serialize/facade.py +3 -3
  132. parsl/serialize/proxystore.py +3 -2
  133. parsl/tests/__init__.py +1 -1
  134. parsl/tests/configs/azure_single_node.py +4 -5
  135. parsl/tests/configs/bridges.py +3 -2
  136. parsl/tests/configs/cc_in2p3.py +1 -3
  137. parsl/tests/configs/comet.py +2 -1
  138. parsl/tests/configs/ec2_single_node.py +1 -2
  139. parsl/tests/configs/ec2_spot.py +1 -2
  140. parsl/tests/configs/flux_local.py +11 -0
  141. parsl/tests/configs/frontera.py +2 -3
  142. parsl/tests/configs/htex_local.py +3 -5
  143. parsl/tests/configs/htex_local_alternate.py +11 -15
  144. parsl/tests/configs/htex_local_intask_staging.py +5 -9
  145. parsl/tests/configs/htex_local_rsync_staging.py +4 -8
  146. parsl/tests/configs/local_radical.py +1 -3
  147. parsl/tests/configs/local_radical_mpi.py +2 -2
  148. parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
  149. parsl/tests/configs/local_threads_monitoring.py +0 -1
  150. parsl/tests/configs/midway.py +2 -2
  151. parsl/tests/configs/nscc_singapore.py +3 -3
  152. parsl/tests/configs/osg_htex.py +1 -1
  153. parsl/tests/configs/petrelkube.py +3 -2
  154. parsl/tests/configs/slurm_local.py +24 -0
  155. parsl/tests/configs/summit.py +1 -0
  156. parsl/tests/configs/taskvine_ex.py +4 -7
  157. parsl/tests/configs/user_opts.py +0 -7
  158. parsl/tests/configs/workqueue_ex.py +4 -6
  159. parsl/tests/conftest.py +27 -13
  160. parsl/tests/integration/test_stress/test_python_simple.py +3 -4
  161. parsl/tests/integration/test_stress/test_python_threads.py +3 -5
  162. parsl/tests/manual_tests/htex_local.py +4 -6
  163. parsl/tests/manual_tests/test_basic.py +1 -0
  164. parsl/tests/manual_tests/test_log_filter.py +3 -1
  165. parsl/tests/manual_tests/test_memory_limits.py +6 -8
  166. parsl/tests/manual_tests/test_regression_220.py +2 -1
  167. parsl/tests/manual_tests/test_udp_simple.py +4 -4
  168. parsl/tests/manual_tests/test_worker_count.py +3 -2
  169. parsl/tests/scaling_tests/htex_local.py +2 -4
  170. parsl/tests/scaling_tests/test_scale.py +0 -9
  171. parsl/tests/scaling_tests/vineex_condor.py +1 -2
  172. parsl/tests/scaling_tests/vineex_local.py +1 -2
  173. parsl/tests/site_tests/site_config_selector.py +1 -6
  174. parsl/tests/site_tests/test_provider.py +4 -2
  175. parsl/tests/site_tests/test_site.py +2 -0
  176. parsl/tests/sites/test_affinity.py +7 -7
  177. parsl/tests/sites/test_dynamic_executor.py +3 -4
  178. parsl/tests/sites/test_ec2.py +3 -2
  179. parsl/tests/sites/test_worker_info.py +4 -5
  180. parsl/tests/test_aalst_patterns.py +0 -1
  181. parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
  182. parsl/tests/test_bash_apps/test_basic.py +10 -4
  183. parsl/tests/test_bash_apps/test_error_codes.py +5 -7
  184. parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
  185. parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
  186. parsl/tests/test_bash_apps/test_memoize.py +2 -8
  187. parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
  188. parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
  189. parsl/tests/test_bash_apps/test_multiline.py +1 -1
  190. parsl/tests/test_bash_apps/test_pipeline.py +1 -1
  191. parsl/tests/test_bash_apps/test_std_uri.py +123 -0
  192. parsl/tests/test_bash_apps/test_stdout.py +33 -8
  193. parsl/tests/test_callables.py +2 -2
  194. parsl/tests/test_checkpointing/test_periodic.py +21 -39
  195. parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
  196. parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
  197. parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
  198. parsl/tests/test_checkpointing/test_regression_239.py +1 -1
  199. parsl/tests/test_checkpointing/test_task_exit.py +2 -3
  200. parsl/tests/test_docs/test_from_slides.py +5 -2
  201. parsl/tests/test_docs/test_kwargs.py +4 -1
  202. parsl/tests/test_docs/test_tutorial_1.py +1 -2
  203. parsl/tests/test_docs/test_workflow1.py +2 -2
  204. parsl/tests/test_docs/test_workflow2.py +0 -1
  205. parsl/tests/test_error_handling/test_rand_fail.py +2 -2
  206. parsl/tests/test_error_handling/test_resource_spec.py +10 -12
  207. parsl/tests/test_error_handling/test_retries.py +6 -16
  208. parsl/tests/test_error_handling/test_retry_handler.py +1 -0
  209. parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
  210. parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
  211. parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
  212. parsl/tests/test_execute_task.py +29 -0
  213. parsl/tests/test_flux.py +1 -1
  214. parsl/tests/test_htex/test_basic.py +2 -3
  215. parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
  216. parsl/tests/test_htex/test_command_client_timeout.py +66 -0
  217. parsl/tests/test_htex/test_connected_blocks.py +3 -2
  218. parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
  219. parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
  220. parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
  221. parsl/tests/test_htex/test_drain.py +11 -10
  222. parsl/tests/test_htex/test_htex.py +51 -25
  223. parsl/tests/test_htex/test_manager_failure.py +0 -1
  224. parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
  225. parsl/tests/test_htex/test_managers_command.py +36 -0
  226. parsl/tests/test_htex/test_missing_worker.py +2 -12
  227. parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
  228. parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
  229. parsl/tests/test_htex/test_zmq_binding.py +29 -8
  230. parsl/tests/test_monitoring/test_app_names.py +5 -5
  231. parsl/tests/test_monitoring/test_basic.py +73 -25
  232. parsl/tests/test_monitoring/test_db_locks.py +6 -4
  233. parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
  234. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
  235. parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
  236. parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
  237. parsl/tests/test_monitoring/test_stdouterr.py +134 -0
  238. parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
  239. parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
  240. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
  241. parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
  242. parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
  243. parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
  244. parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
  245. parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
  246. parsl/tests/test_providers/test_local_provider.py +3 -132
  247. parsl/tests/test_providers/test_pbspro_template.py +2 -3
  248. parsl/tests/test_providers/test_slurm_template.py +2 -3
  249. parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
  250. parsl/tests/test_python_apps/test_context_manager.py +128 -0
  251. parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
  252. parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
  253. parsl/tests/test_python_apps/test_fail.py +0 -25
  254. parsl/tests/test_python_apps/test_futures.py +2 -1
  255. parsl/tests/test_python_apps/test_inputs_default.py +22 -0
  256. parsl/tests/test_python_apps/test_join.py +0 -1
  257. parsl/tests/test_python_apps/test_lifted.py +11 -7
  258. parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
  259. parsl/tests/test_python_apps/test_outputs.py +1 -1
  260. parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
  261. parsl/tests/test_radical/test_mpi_funcs.py +1 -2
  262. parsl/tests/test_regression/test_1480.py +2 -1
  263. parsl/tests/test_regression/test_1653.py +2 -1
  264. parsl/tests/test_regression/test_226.py +1 -0
  265. parsl/tests/test_regression/test_2652.py +1 -0
  266. parsl/tests/test_regression/test_69a.py +0 -1
  267. parsl/tests/test_regression/test_854.py +4 -2
  268. parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
  269. parsl/tests/test_regression/test_98.py +0 -1
  270. parsl/tests/test_scaling/test_block_error_handler.py +9 -4
  271. parsl/tests/test_scaling/test_regression_1621.py +11 -15
  272. parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
  273. parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
  274. parsl/tests/test_scaling/test_scale_down.py +2 -5
  275. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +5 -8
  276. parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
  277. parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
  278. parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
  279. parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
  280. parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
  281. parsl/tests/test_serialization/test_basic.py +2 -1
  282. parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
  283. parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
  284. parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
  285. parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
  286. parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
  287. parsl/tests/test_staging/staging_provider.py +2 -2
  288. parsl/tests/test_staging/test_1316.py +3 -4
  289. parsl/tests/test_staging/test_docs_1.py +2 -1
  290. parsl/tests/test_staging/test_docs_2.py +2 -1
  291. parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
  292. parsl/tests/{test_data → test_staging}/test_file.py +6 -6
  293. parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
  294. parsl/tests/test_staging/test_staging_ftp.py +1 -0
  295. parsl/tests/test_staging/test_staging_https.py +5 -2
  296. parsl/tests/test_staging/test_staging_stdout.py +64 -0
  297. parsl/tests/test_staging/test_zip_in.py +39 -0
  298. parsl/tests/test_staging/test_zip_out.py +110 -0
  299. parsl/tests/test_staging/test_zip_to_zip.py +41 -0
  300. parsl/tests/test_summary.py +2 -2
  301. parsl/tests/test_thread_parallelism.py +0 -1
  302. parsl/tests/test_threads/test_configs.py +1 -2
  303. parsl/tests/test_threads/test_lazy_errors.py +2 -2
  304. parsl/tests/test_utils/test_execute_wait.py +35 -0
  305. parsl/tests/test_utils/test_sanitize_dns.py +76 -0
  306. parsl/tests/unit/test_address.py +20 -0
  307. parsl/tests/unit/test_file.py +99 -0
  308. parsl/tests/unit/test_usage_tracking.py +66 -0
  309. parsl/usage_tracking/api.py +65 -0
  310. parsl/usage_tracking/levels.py +6 -0
  311. parsl/usage_tracking/usage.py +104 -62
  312. parsl/utils.py +137 -4
  313. parsl/version.py +1 -1
  314. {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
  315. parsl-2025.1.13.data/scripts/interchange.py +649 -0
  316. {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +77 -75
  317. parsl-2025.1.13.dist-info/METADATA +96 -0
  318. parsl-2025.1.13.dist-info/RECORD +462 -0
  319. {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
  320. parsl/channels/__init__.py +0 -7
  321. parsl/channels/base.py +0 -141
  322. parsl/channels/errors.py +0 -113
  323. parsl/channels/local/local.py +0 -164
  324. parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
  325. parsl/channels/ssh/ssh.py +0 -276
  326. parsl/channels/ssh_il/__init__.py +0 -0
  327. parsl/channels/ssh_il/ssh_il.py +0 -74
  328. parsl/configs/ad_hoc.py +0 -35
  329. parsl/executors/radical/rpex_master.py +0 -42
  330. parsl/monitoring/radios.py +0 -175
  331. parsl/providers/ad_hoc/__init__.py +0 -0
  332. parsl/providers/ad_hoc/ad_hoc.py +0 -248
  333. parsl/providers/cobalt/__init__.py +0 -0
  334. parsl/providers/cobalt/cobalt.py +0 -236
  335. parsl/providers/cobalt/template.py +0 -17
  336. parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
  337. parsl/tests/configs/cooley_htex.py +0 -37
  338. parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
  339. parsl/tests/configs/local_adhoc.py +0 -18
  340. parsl/tests/configs/swan_htex.py +0 -43
  341. parsl/tests/configs/theta.py +0 -37
  342. parsl/tests/integration/test_channels/__init__.py +0 -0
  343. parsl/tests/integration/test_channels/test_channels.py +0 -17
  344. parsl/tests/integration/test_channels/test_local_channel.py +0 -42
  345. parsl/tests/integration/test_channels/test_scp_1.py +0 -45
  346. parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
  347. parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
  348. parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
  349. parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
  350. parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
  351. parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
  352. parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
  353. parsl/tests/sites/test_local_adhoc.py +0 -61
  354. parsl/tests/test_channels/__init__.py +0 -0
  355. parsl/tests/test_channels/test_large_output.py +0 -22
  356. parsl/tests/test_data/__init__.py +0 -0
  357. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
  358. parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
  359. parsl-2024.3.18.dist-info/METADATA +0 -98
  360. parsl-2024.3.18.dist-info/RECORD +0 -449
  361. parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
  362. parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
  363. parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
  364. parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
  365. parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
  366. {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
  367. {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
  368. {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
  369. {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
parsl/executors/high_throughput/manager_selector.py
@@ -0,0 +1,55 @@
+ import random
+ from abc import ABCMeta, abstractmethod
+ from typing import Dict, List, Set
+
+ from parsl.executors.high_throughput.manager_record import ManagerRecord
+
+
+ class ManagerSelector(metaclass=ABCMeta):
+
+     @abstractmethod
+     def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+         """ Sort a given list of managers.
+
+         Any operations pertaining to the sorting and rearrangement of the
+         interesting_managers Set should be performed here.
+         """
+         pass
+
+
+ class RandomManagerSelector(ManagerSelector):
+
+     """Returns a shuffled list of interesting_managers
+
+     By default this strategy is used by the interchange. Works well
+     in distributing workloads equally across all availble compute
+     resources. The random workload strategy is not effective in
+     conjunction with elastic scaling behavior as the even task
+     distribution does not allow the scaling down of blocks, leading
+     to wasted resource consumption.
+     """
+
+     def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+         c_manager_list = list(manager_list)
+         random.shuffle(c_manager_list)
+         return c_manager_list
+
+
+ class BlockIdManagerSelector(ManagerSelector):
+
+     """Returns an interesting_managers list sorted by block ID
+
+     Observations:
+     1. BlockID manager selector helps with workloads that see a varying
+        amount of tasks over time. New blocks are prioritized with the
+        blockID manager selector, when used with 'htex_auto_scaling', results
+        in compute cost savings.
+
+     2. Doesn't really work with bag-of-tasks workloads. When all the tasks
+        are put into the queue upfront, all blocks operate at near full
+        utilization for the majority of the workload, which task goes where
+        doesn't really matter.
+     """
+
+     def sort_managers(self, ready_managers: Dict[bytes, ManagerRecord], manager_list: Set[bytes]) -> List[bytes]:
+         return sorted(manager_list, key=lambda x: (ready_managers[x]['block_id'] is not None, ready_managers[x]['block_id']))
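
The manager selector plugs into the high-throughput executor (see the executor.py changes, entry 56 above, and test_manager_selector_by_block.py, entry 224). A minimal configuration sketch follows; it assumes the new manager_selector keyword on HighThroughputExecutor suggested by those entries, so confirm the parameter name against the executor.py diff before copying it.

# Hypothetical configuration sketch, not taken verbatim from this diff.
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.executors.high_throughput.manager_selector import BlockIdManagerSelector

config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_block_ordered",
            # Per the docstring above: ordering by block ID helps blocks
            # scale in when used with the 'htex_auto_scale' strategy.
            manager_selector=BlockIdManagerSelector(),
        )
    ],
)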
parsl/executors/high_throughput/monitoring_info.py
@@ -3,6 +3,7 @@
  # then be acquired by any other code running in
  # a worker context - specifically the monitoring
  # wrapper code.
- from typing import Optional
  from queue import Queue
+ from typing import Optional
+
  result_queue: Optional[Queue] = None
parsl/executors/high_throughput/mpi_executor.py
@@ -0,0 +1,113 @@
+ """A simplified interface for HTEx when running in MPI mode"""
+ from typing import Callable, Dict, List, Optional, Tuple, Union
+
+ import typeguard
+
+ from parsl.data_provider.staging import Staging
+ from parsl.executors.high_throughput.executor import (
+     GENERAL_HTEX_PARAM_DOCS,
+     HighThroughputExecutor,
+ )
+ from parsl.executors.high_throughput.mpi_prefix_composer import (
+     VALID_LAUNCHERS,
+     validate_resource_spec,
+ )
+ from parsl.executors.status_handling import BlockProviderExecutor
+ from parsl.jobs.states import JobStatus
+ from parsl.launchers import SimpleLauncher
+ from parsl.providers import LocalProvider
+ from parsl.providers.base import ExecutionProvider
+
+
+ class MPIExecutor(HighThroughputExecutor):
+     __doc__ = f"""A version of :class:`~parsl.HighThroughputExecutor` tuned for executing multi-node (e.g., MPI) tasks.
+
+     The Provider _must_ use the :class:`~parsl.launchers.SimpleLauncher`,
+     which places a single pool of workers on the first node of a block.
+     Each worker can then make system calls which use an MPI launcher (e.g., ``mpirun``, ``srun``)
+     to spawn multi-node tasks.
+
+     Specify the maximum number of multi-node tasks to run at once using ``max_workers_per_block``.
+     The value should be less than or equal to the ``nodes_per_block`` in the Provider.
+
+     Parameters
+     ----------
+     max_workers_per_block: int
+         Maximum number of MPI applications to run at once per block
+
+     mpi_launcher: str
+         Select one from the list of supported MPI launchers:
+         ("srun", "aprun", "mpiexec").
+         default: "mpiexec"
+
+     {GENERAL_HTEX_PARAM_DOCS}
+     """
+
+     @typeguard.typechecked
+     def __init__(self,
+                  label: str = 'MPIExecutor',
+                  provider: ExecutionProvider = LocalProvider(),
+                  launch_cmd: Optional[str] = None,
+                  interchange_launch_cmd: Optional[str] = None,
+                  address: Optional[str] = None,
+                  loopback_address: str = "127.0.0.1",
+                  worker_ports: Optional[Tuple[int, int]] = None,
+                  worker_port_range: Optional[Tuple[int, int]] = (54000, 55000),
+                  interchange_port_range: Optional[Tuple[int, int]] = (55000, 56000),
+                  storage_access: Optional[List[Staging]] = None,
+                  working_dir: Optional[str] = None,
+                  worker_debug: bool = False,
+                  max_workers_per_block: int = 1,
+                  prefetch_capacity: int = 0,
+                  heartbeat_threshold: int = 120,
+                  heartbeat_period: int = 30,
+                  drain_period: Optional[int] = None,
+                  poll_period: int = 10,
+                  address_probe_timeout: Optional[int] = None,
+                  worker_logdir_root: Optional[str] = None,
+                  mpi_launcher: str = "mpiexec",
+                  block_error_handler: Union[bool, Callable[[BlockProviderExecutor, Dict[str, JobStatus]], None]] = True,
+                  encrypted: bool = False):
+         super().__init__(
+             # Hard-coded settings
+             cores_per_worker=1e-9,  # Ensures there will be at least an absurd number of workers
+             max_workers_per_node=max_workers_per_block,
+
+             # Everything else
+             label=label,
+             provider=provider,
+             launch_cmd=launch_cmd,
+             interchange_launch_cmd=interchange_launch_cmd,
+             address=address,
+             loopback_address=loopback_address,
+             worker_ports=worker_ports,
+             worker_port_range=worker_port_range,
+             interchange_port_range=interchange_port_range,
+             storage_access=storage_access,
+             working_dir=working_dir,
+             worker_debug=worker_debug,
+             prefetch_capacity=prefetch_capacity,
+             heartbeat_threshold=heartbeat_threshold,
+             heartbeat_period=heartbeat_period,
+             drain_period=drain_period,
+             poll_period=poll_period,
+             address_probe_timeout=address_probe_timeout,
+             worker_logdir_root=worker_logdir_root,
+             block_error_handler=block_error_handler,
+             encrypted=encrypted
+         )
+         self.enable_mpi_mode = True
+         self.mpi_launcher = mpi_launcher
+
+         self.max_workers_per_block = max_workers_per_block
+
+         if not isinstance(self.provider.launcher, SimpleLauncher):
+             raise TypeError("mpi_mode requires the provider to be configured to use a SimpleLauncher")
+
+         if mpi_launcher not in VALID_LAUNCHERS:
+             raise ValueError(f"mpi_launcher set to:{mpi_launcher} must be set to one of {VALID_LAUNCHERS}")
+
+         self.mpi_launcher = mpi_launcher
+
+     def validate_resource_spec(self, resource_specification: dict):
+         return validate_resource_spec(resource_specification)
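
Using the new executor requires a provider whose launcher is SimpleLauncher, as enforced in __init__ above. A hedged configuration sketch follows; the SlurmProvider arguments (partition name, node counts) are illustrative placeholders, not values taken from this diff.

# Illustrative sketch only; provider settings are placeholders.
from parsl.config import Config
from parsl.executors.high_throughput.mpi_executor import MPIExecutor
from parsl.launchers import SimpleLauncher
from parsl.providers import SlurmProvider

config = Config(
    executors=[
        MPIExecutor(
            label="mpi_apps",
            max_workers_per_block=2,        # at most two concurrent multi-node tasks per block
            mpi_launcher="srun",            # must be one of VALID_LAUNCHERS
            provider=SlurmProvider(
                partition="compute",        # placeholder partition name
                nodes_per_block=4,          # >= max_workers_per_block, per the docstring above
                launcher=SimpleLauncher(),  # required, as checked in __init__ above
            ),
        )
    ],
)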
parsl/executors/high_throughput/mpi_prefix_composer.py
@@ -1,5 +1,7 @@
  import logging
- from typing import Dict, List, Tuple, Set
+ from typing import Dict, List, Tuple
+
+ from parsl.executors.errors import InvalidResourceSpecification
 
  logger = logging.getLogger(__name__)
 
@@ -8,16 +10,6 @@ VALID_LAUNCHERS = ('srun',
                     'mpiexec')
 
 
- class InvalidResourceSpecification(Exception):
-     """Exception raised when Invalid keys are supplied via resource specification"""
-
-     def __init__(self, invalid_keys: Set[str]):
-         self.invalid_keys = invalid_keys
-
-     def __str__(self):
-         return f"Invalid resource specification options supplied: {self.invalid_keys}"
-
-
  def validate_resource_spec(resource_spec: Dict[str, str]):
      """Basic validation of keys in the resource_spec
 
@@ -25,6 +17,13 @@ def validate_resource_spec(resource_spec: Dict[str, str]):
      is invalid (e.g, contains invalid keys)
      """
      user_keys = set(resource_spec.keys())
+
+     # empty resource_spec when mpi_mode is set causes parsl to hang
+     # ref issue #3427
+     if len(user_keys) == 0:
+         raise InvalidResourceSpecification(user_keys,
+                                            'MPI mode requires optional parsl_resource_specification keyword argument to be configured')
+
      legal_keys = set(("ranks_per_node",
                        "num_nodes",
                        "num_ranks",
parsl/executors/high_throughput/mpi_resource_management.py
@@ -8,8 +8,7 @@ from enum import Enum
  from typing import Dict, List
 
  from parsl.multiprocessing import SpawnContext
- from parsl.serialize import (pack_res_spec_apply_message,
-                              unpack_res_spec_apply_message)
+ from parsl.serialize import pack_res_spec_apply_message, unpack_res_spec_apply_message
 
  logger = logging.getLogger(__name__)
 
@@ -18,7 +17,6 @@ class Scheduler(Enum):
      Unknown = 0
      Slurm = 1
      PBS = 2
-     Cobalt = 3
 
 
  def get_slurm_hosts_list() -> List[str]:
@@ -38,13 +36,6 @@ def get_pbs_hosts_list() -> List[str]:
          return [line.strip() for line in f.readlines()]
 
 
- def get_cobalt_hosts_list() -> List[str]:
-     """Get list of COBALT hosts from envvar: COBALT_NODEFILE"""
-     nodefile_name = os.environ["COBALT_NODEFILE"]
-     with open(nodefile_name) as f:
-         return [line.strip() for line in f.readlines()]
-
-
  def get_nodes_in_batchjob(scheduler: Scheduler) -> List[str]:
      """Get nodelist from all supported schedulers"""
      nodelist = []
@@ -52,8 +43,6 @@ def get_nodes_in_batchjob(scheduler: Scheduler) -> List[str]:
          nodelist = get_slurm_hosts_list()
      elif scheduler == Scheduler.PBS:
          nodelist = get_pbs_hosts_list()
-     elif scheduler == Scheduler.Cobalt:
-         nodelist = get_cobalt_hosts_list()
      else:
          raise RuntimeError(f"mpi_mode does not support scheduler:{scheduler}")
      return nodelist
@@ -65,8 +54,6 @@ def identify_scheduler() -> Scheduler:
          return Scheduler.Slurm
      elif os.environ.get("PBS_NODEFILE"):
          return Scheduler.PBS
-     elif os.environ.get("COBALT_NODEFILE"):
-         return Scheduler.Cobalt
      else:
          return Scheduler.Unknown
 
@@ -173,9 +160,7 @@ class MPITaskScheduler(TaskScheduler):
          """Schedule task if resources are available otherwise backlog the task"""
          user_ns = locals()
          user_ns.update({"__builtins__": __builtins__})
-         _f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(
-             task_package["buffer"], user_ns, copy=False
-         )
+         _f, _args, _kwargs, resource_spec = unpack_res_spec_apply_message(task_package["buffer"])
 
          nodes_needed = resource_spec.get("num_nodes")
          if nodes_needed:
@@ -190,6 +175,7 @@
                  self._map_tasks_to_nodes[task_package["task_id"]] = allocated_nodes
                  buffer = pack_res_spec_apply_message(_f, _args, _kwargs, resource_spec)
                  task_package["buffer"] = buffer
+                 task_package["resource_spec"] = resource_spec
 
          self.pending_task_q.put(task_package)
 
@@ -208,8 +194,11 @@
          """Return result and relinquish provisioned nodes"""
          result_pkl = self.pending_result_q.get(block, timeout=timeout)
          result_dict = pickle.loads(result_pkl)
+         # TODO (wardlt): If the task did not request nodes, it won't be in `self._map_tasks_to_nodes`.
+         #  Causes Parsl to hang. See Issue #3427
          if result_dict["type"] == "result":
              task_id = result_dict["task_id"]
+             assert task_id in self._map_tasks_to_nodes, "You are about to experience issue #3427"
              nodes_to_reallocate = self._map_tasks_to_nodes[task_id]
              self._return_nodes(nodes_to_reallocate)
              self._schedule_backlog_tasks()
parsl/executors/high_throughput/probe.py
@@ -1,11 +1,13 @@
- import zmq
  import argparse
- import uuid
- import time
  import logging
- from parsl.addresses import get_all_addresses
+ import time
+ import uuid
+
+ import zmq
  from zmq.utils.monitor import recv_monitor_message
 
+ from parsl.addresses import get_all_addresses, tcp_url
+
  logger = logging.getLogger(__name__)
 
 
@@ -30,7 +32,8 @@ def probe_addresses(addresses, task_port, timeout=120):
      for addr in addresses:
          socket = context.socket(zmq.DEALER)
          socket.setsockopt(zmq.LINGER, 0)
-         url = "tcp://{}:{}".format(addr, task_port)
+         socket.setsockopt(zmq.IPV6, True)
+         url = tcp_url(addr, task_port)
          logger.debug("Trying to connect back on {}".format(url))
          socket.connect(url)
          addr_map[addr] = {'sock': socket,
@@ -69,8 +72,7 @@ class TestWorker:
 
          address = probe_addresses(addresses, port)
          print("Viable address :", address)
-         self.task_incoming.connect("tcp://{}:{}".format(address, port))
-         print("Here")
+         self.task_incoming.connect(tcp_url(address, port))
 
      def heartbeat(self):
          """ Send heartbeat to the incoming task queue
parsl/executors/high_throughput/process_worker_pool.py
@@ -1,39 +1,44 @@
  #!/usr/bin/env python3
 
  import argparse
+ import json
  import logging
+ import math
+ import multiprocessing
  import os
- import sys
+ import pickle
  import platform
+ import queue
+ import subprocess
+ import sys
  import threading
- import pickle
  import time
- import queue
  import uuid
- from typing import Sequence, Optional, Dict, List
-
- import zmq
- import math
- import json
- import psutil
- import multiprocessing
  from multiprocessing.managers import DictProxy
  from multiprocessing.sharedctypes import Synchronized
+ from typing import Dict, List, Optional, Sequence
+
+ import psutil
+ import zmq
 
  from parsl import curvezmq
- from parsl.process_loggers import wrap_with_logs
- from parsl.version import VERSION as PARSL_VERSION
+ from parsl.addresses import tcp_url
  from parsl.app.errors import RemoteExceptionWrapper
+ from parsl.executors.execute_task import execute_task
  from parsl.executors.high_throughput.errors import WorkerLost
- from parsl.executors.high_throughput.probe import probe_addresses
- from parsl.multiprocessing import SpawnContext
- from parsl.serialize import unpack_res_spec_apply_message, serialize
+ from parsl.executors.high_throughput.mpi_prefix_composer import (
+     VALID_LAUNCHERS,
+     compose_all,
+ )
  from parsl.executors.high_throughput.mpi_resource_management import (
+     MPITaskScheduler,
      TaskScheduler,
-     MPITaskScheduler
  )
-
- from parsl.executors.high_throughput.mpi_prefix_composer import compose_all, VALID_LAUNCHERS
+ from parsl.executors.high_throughput.probe import probe_addresses
+ from parsl.multiprocessing import SpawnContext
+ from parsl.process_loggers import wrap_with_logs
+ from parsl.serialize import serialize
+ from parsl.version import VERSION as PARSL_VERSION
 
  HEARTBEAT_CODE = (2 ** 32) - 1
  DRAINED_CODE = (2 ** 32) - 2
@@ -155,8 +160,8 @@ class Manager:
                  raise Exception("No viable address found")
              else:
                  logger.info("Connection to Interchange successful on {}".format(ix_address))
-                 task_q_url = "tcp://{}:{}".format(ix_address, task_port)
-                 result_q_url = "tcp://{}:{}".format(ix_address, result_port)
+                 task_q_url = tcp_url(ix_address, task_port)
+                 result_q_url = tcp_url(ix_address, result_port)
                  logger.info("Task url : {}".format(task_q_url))
                  logger.info("Result url : {}".format(result_q_url))
          except Exception:
@@ -181,6 +186,7 @@
 
          self.uid = uid
          self.block_id = block_id
+         self.start_time = time.time()
 
          self.enable_mpi_mode = enable_mpi_mode
          self.mpi_launcher = mpi_launcher
@@ -260,6 +266,7 @@
                 'worker_count': self.worker_count,
                 'uid': self.uid,
                 'block_id': self.block_id,
+                'start_time': self.start_time,
                 'prefetch_capacity': self.prefetch_capacity,
                 'max_capacity': self.worker_count + self.prefetch_capacity,
                 'os': platform.system(),
@@ -335,14 +342,17 @@
                  self.heartbeat_to_incoming()
                  last_beat = time.time()
 
-             if self.drain_time and time.time() > self.drain_time:
+             if time.time() > self.drain_time:
                  logger.info("Requesting drain")
                  self.drain_to_incoming()
-                 self.drain_time = None
                  # This will start the pool draining...
                  # Drained exit behaviour does not happen here. It will be
                  # driven by the interchange sending a DRAINED_CODE message.
 
+                 # now set drain time to the far future so we don't send a drain
+                 # message every iteration.
+                 self.drain_time = float('inf')
+
              poll_duration_s = max(0, next_interesting_event_time - time.time())
              socks = dict(poller.poll(timeout=poll_duration_s * 1000))
 
@@ -354,11 +364,13 @@
                  if tasks == HEARTBEAT_CODE:
                      logger.debug("Got heartbeat from interchange")
                  elif tasks == DRAINED_CODE:
-                     logger.info("Got fulled drained message from interchange - setting kill flag")
+                     logger.info("Got fully drained message from interchange - setting kill flag")
                      kill_event.set()
                  else:
                      task_recv_counter += len(tasks)
-                     logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format([t['task_id'] for t in tasks], task_recv_counter))
+                     logger.debug("Got executor tasks: {}, cumulative count of tasks: {}".format(
+                         [t['task_id'] for t in tasks], task_recv_counter
+                     ))
 
                      for task in tasks:
                          self.task_scheduler.put_task(task)
@@ -580,45 +592,13 @@ def update_resource_spec_env_vars(mpi_launcher: str, resource_spec: Dict, node_i
          os.environ[key] = prefix_table[key]
 
 
- def execute_task(bufs, mpi_launcher: Optional[str] = None):
-     """Deserialize the buffer and execute the task.
-
-     Returns the result or throws exception.
-     """
-     user_ns = locals()
-     user_ns.update({'__builtins__': __builtins__})
-
-     f, args, kwargs, resource_spec = unpack_res_spec_apply_message(bufs, user_ns, copy=False)
-
-     for varname in resource_spec:
-         envname = "PARSL_" + str(varname).upper()
-         os.environ[envname] = str(resource_spec[varname])
-
-     if resource_spec.get("MPI_NODELIST"):
-         worker_id = os.environ['PARSL_WORKER_RANK']
-         nodes_for_task = resource_spec["MPI_NODELIST"].split(',')
-         logger.info(f"Launching task on provisioned nodes: {nodes_for_task}")
-         assert mpi_launcher
-         update_resource_spec_env_vars(mpi_launcher,
-                                       resource_spec=resource_spec,
-                                       node_info=nodes_for_task)
-     # We might need to look into callability of the function from itself
-     # since we change it's name in the new namespace
-     prefix = "parsl_"
-     fname = prefix + "f"
-     argname = prefix + "args"
-     kwargname = prefix + "kwargs"
-     resultname = prefix + "result"
-
-     user_ns.update({fname: f,
-                     argname: args,
-                     kwargname: kwargs,
-                     resultname: resultname})
-
-     code = "{0} = {1}(*{2}, **{3})".format(resultname, fname,
-                                            argname, kwargname)
-     exec(code, user_ns, user_ns)
-     return user_ns.get(resultname)
+ def _init_mpi_env(mpi_launcher: str, resource_spec: Dict):
+     node_list = resource_spec.get("MPI_NODELIST")
+     if node_list is None:
+         return
+     nodes_for_task = node_list.split(',')
+     logger.info(f"Launching task on provisioned nodes: {nodes_for_task}")
+     update_resource_spec_env_vars(mpi_launcher=mpi_launcher, resource_spec=resource_spec, node_info=nodes_for_task)
 
 
  @wrap_with_logs(target="worker_log")
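
The worker-pool-local execute_task removed above is replaced by a shared helper in the new parsl/executors/execute_task.py (+37 lines, entry 51), which is not shown in this diff. Based on the removed code, a minimal sketch of such a helper would unpack the buffer, export PARSL_* resource variables, and call the function directly rather than exec()-ing a generated string; treat the following as an approximation, not the shipped module.

# Sketch only; the real implementation lives in parsl/executors/execute_task.py.
import os

from parsl.serialize import unpack_res_spec_apply_message


def execute_task_sketch(bufs):
    """Deserialize a task buffer and execute it, returning the result."""
    f, args, kwargs, resource_spec = unpack_res_spec_apply_message(bufs)

    # Expose the resource specification to the task, as the removed code did.
    for varname in resource_spec:
        envname = "PARSL_" + str(varname).upper()
        os.environ[envname] = str(resource_spec[varname])

    return f(*args, **kwargs)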
@@ -640,14 +620,6 @@ def worker(
          debug: bool,
          mpi_launcher: str,
  ):
-     """
-
-     Put request token into queue
-     Get task from task_queue
-     Pop request from queue
-     Put result into result_queue
-     """
-
      # override the global logger inherited from the __main__ process (which
      # usually logs to manager.log) with one specific to this worker.
      global logger
@@ -672,7 +644,8 @@
      # If desired, set process affinity
      if cpu_affinity != "none":
          # Count the number of cores per worker
-         avail_cores = sorted(os.sched_getaffinity(0))  # Get the available threads
+         # OSX does not implement os.sched_getaffinity
+         avail_cores = sorted(os.sched_getaffinity(0))  # type: ignore[attr-defined, unused-ignore]
          cores_per_worker = len(avail_cores) // pool_size
          assert cores_per_worker > 0, "Affinity does not work if there are more workers than cores"
 
@@ -712,12 +685,39 @@
          os.environ["KMP_AFFINITY"] = f"explicit,proclist=[{proc_list}]"  # For Intel OpenMP
 
          # Set the affinity for this worker
-         os.sched_setaffinity(0, my_cores)
+         # OSX does not implement os.sched_setaffinity so type checking
+         # is ignored here in two ways:
+         # On a platform without sched_setaffinity, that attribute will not
+         # be defined, so ignore[attr-defined] will tell mypy to ignore this
+         # incorrect-for-OS X attribute access.
+         # On a platform with sched_setaffinity, that type: ignore message
+         # will be redundant, and ignore[unused-ignore] tells mypy to ignore
+         # that this ignore is unneeded.
+         os.sched_setaffinity(0, my_cores)  # type: ignore[attr-defined, unused-ignore]
          logger.info("Set worker CPU affinity to {}".format(my_cores))
 
      # If desired, pin to accelerator
      if accelerator is not None:
-         os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
+
+         # If CUDA devices, find total number of devices to allow for MPS
+         # See: https://developer.nvidia.com/system-management-interface
+         nvidia_smi_cmd = "nvidia-smi -L > /dev/null && nvidia-smi -L | wc -l"
+         nvidia_smi_ret = subprocess.run(nvidia_smi_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+         if nvidia_smi_ret.returncode == 0:
+             num_cuda_devices = int(nvidia_smi_ret.stdout.split()[0])
+         else:
+             num_cuda_devices = None
+
+         try:
+             if num_cuda_devices is not None:
+                 procs_per_cuda_device = pool_size // num_cuda_devices
+                 partitioned_accelerator = str(int(accelerator) // procs_per_cuda_device)  # multiple workers will share a GPU
+                 os.environ["CUDA_VISIBLE_DEVICES"] = partitioned_accelerator
+                 logger.info(f'Pinned worker to partitioned cuda device: {partitioned_accelerator}')
+             else:
+                 os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
+         except (TypeError, ValueError, ZeroDivisionError):
+             os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
          os.environ["ROCR_VISIBLE_DEVICES"] = accelerator
          os.environ["ZE_AFFINITY_MASK"] = accelerator
          os.environ["ZE_ENABLE_PCI_ID_DEVICE_ORDER"] = '1'
@@ -756,8 +756,10 @@
              ready_worker_count.value -= 1
              worker_enqueued = False
 
+             _init_mpi_env(mpi_launcher=mpi_launcher, resource_spec=req["resource_spec"])
+
              try:
-                 result = execute_task(req['buffer'], mpi_launcher=mpi_launcher)
+                 result = execute_task(req['buffer'])
                  serialized_result = serialize(result, buffer_threshold=1000000)
              except Exception as e:
                  logger.info('Caught an exception: {}'.format(e))