parsl 2024.3.18__py3-none-any.whl → 2025.1.13__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (369)
  1. parsl/__init__.py +9 -10
  2. parsl/addresses.py +26 -6
  3. parsl/app/app.py +7 -8
  4. parsl/app/bash.py +15 -8
  5. parsl/app/errors.py +10 -13
  6. parsl/app/futures.py +8 -10
  7. parsl/app/python.py +2 -1
  8. parsl/benchmark/perf.py +2 -1
  9. parsl/concurrent/__init__.py +2 -2
  10. parsl/config.py +53 -10
  11. parsl/configs/ASPIRE1.py +6 -5
  12. parsl/configs/Azure.py +9 -8
  13. parsl/configs/bridges.py +6 -4
  14. parsl/configs/cc_in2p3.py +3 -3
  15. parsl/configs/ec2.py +3 -1
  16. parsl/configs/expanse.py +4 -3
  17. parsl/configs/frontera.py +3 -4
  18. parsl/configs/htex_local.py +3 -4
  19. parsl/configs/illinoiscluster.py +3 -1
  20. parsl/configs/improv.py +34 -0
  21. parsl/configs/kubernetes.py +4 -3
  22. parsl/configs/local_threads.py +5 -1
  23. parsl/configs/midway.py +5 -3
  24. parsl/configs/osg.py +4 -2
  25. parsl/configs/polaris.py +4 -2
  26. parsl/configs/stampede2.py +6 -5
  27. parsl/configs/summit.py +3 -3
  28. parsl/configs/toss3_llnl.py +4 -3
  29. parsl/configs/vineex_local.py +6 -4
  30. parsl/configs/wqex_local.py +5 -3
  31. parsl/curvezmq.py +4 -0
  32. parsl/data_provider/data_manager.py +4 -3
  33. parsl/data_provider/file_noop.py +1 -2
  34. parsl/data_provider/files.py +3 -3
  35. parsl/data_provider/ftp.py +1 -3
  36. parsl/data_provider/globus.py +7 -6
  37. parsl/data_provider/http.py +2 -2
  38. parsl/data_provider/rsync.py +1 -1
  39. parsl/data_provider/staging.py +2 -2
  40. parsl/data_provider/zip.py +135 -0
  41. parsl/dataflow/dependency_resolvers.py +115 -0
  42. parsl/dataflow/dflow.py +259 -223
  43. parsl/dataflow/errors.py +3 -5
  44. parsl/dataflow/futures.py +27 -14
  45. parsl/dataflow/memoization.py +5 -5
  46. parsl/dataflow/rundirs.py +5 -6
  47. parsl/dataflow/taskrecord.py +4 -5
  48. parsl/executors/__init__.py +4 -2
  49. parsl/executors/base.py +45 -15
  50. parsl/executors/errors.py +13 -0
  51. parsl/executors/execute_task.py +37 -0
  52. parsl/executors/flux/execute_parsl_task.py +3 -3
  53. parsl/executors/flux/executor.py +18 -19
  54. parsl/executors/flux/flux_instance_manager.py +26 -27
  55. parsl/executors/high_throughput/errors.py +43 -3
  56. parsl/executors/high_throughput/executor.py +307 -285
  57. parsl/executors/high_throughput/interchange.py +137 -168
  58. parsl/executors/high_throughput/manager_record.py +4 -0
  59. parsl/executors/high_throughput/manager_selector.py +55 -0
  60. parsl/executors/high_throughput/monitoring_info.py +2 -1
  61. parsl/executors/high_throughput/mpi_executor.py +113 -0
  62. parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
  63. parsl/executors/high_throughput/mpi_resource_management.py +6 -17
  64. parsl/executors/high_throughput/probe.py +9 -7
  65. parsl/executors/high_throughput/process_worker_pool.py +77 -75
  66. parsl/executors/high_throughput/zmq_pipes.py +81 -23
  67. parsl/executors/radical/executor.py +130 -79
  68. parsl/executors/radical/rpex_resources.py +17 -15
  69. parsl/executors/radical/rpex_worker.py +4 -3
  70. parsl/executors/status_handling.py +157 -51
  71. parsl/executors/taskvine/__init__.py +1 -1
  72. parsl/executors/taskvine/errors.py +1 -1
  73. parsl/executors/taskvine/exec_parsl_function.py +2 -2
  74. parsl/executors/taskvine/executor.py +38 -55
  75. parsl/executors/taskvine/factory.py +1 -1
  76. parsl/executors/taskvine/factory_config.py +1 -1
  77. parsl/executors/taskvine/manager.py +17 -13
  78. parsl/executors/taskvine/manager_config.py +7 -2
  79. parsl/executors/threads.py +6 -6
  80. parsl/executors/workqueue/errors.py +1 -1
  81. parsl/executors/workqueue/exec_parsl_function.py +6 -5
  82. parsl/executors/workqueue/executor.py +64 -63
  83. parsl/executors/workqueue/parsl_coprocess.py +1 -1
  84. parsl/jobs/error_handlers.py +2 -2
  85. parsl/jobs/job_status_poller.py +28 -112
  86. parsl/jobs/states.py +7 -2
  87. parsl/jobs/strategy.py +43 -31
  88. parsl/launchers/__init__.py +12 -3
  89. parsl/launchers/errors.py +1 -1
  90. parsl/launchers/launchers.py +0 -6
  91. parsl/log_utils.py +1 -2
  92. parsl/monitoring/db_manager.py +55 -93
  93. parsl/monitoring/errors.py +6 -0
  94. parsl/monitoring/monitoring.py +85 -311
  95. parsl/monitoring/queries/pandas.py +1 -2
  96. parsl/monitoring/radios/base.py +13 -0
  97. parsl/monitoring/radios/filesystem.py +52 -0
  98. parsl/monitoring/radios/htex.py +57 -0
  99. parsl/monitoring/radios/multiprocessing.py +17 -0
  100. parsl/monitoring/radios/udp.py +56 -0
  101. parsl/monitoring/radios/zmq.py +17 -0
  102. parsl/monitoring/remote.py +33 -37
  103. parsl/monitoring/router.py +212 -0
  104. parsl/monitoring/types.py +5 -6
  105. parsl/monitoring/visualization/app.py +4 -2
  106. parsl/monitoring/visualization/models.py +0 -1
  107. parsl/monitoring/visualization/plots/default/workflow_plots.py +8 -4
  108. parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
  109. parsl/monitoring/visualization/utils.py +0 -1
  110. parsl/monitoring/visualization/views.py +16 -9
  111. parsl/multiprocessing.py +0 -1
  112. parsl/process_loggers.py +1 -2
  113. parsl/providers/__init__.py +8 -17
  114. parsl/providers/aws/aws.py +2 -3
  115. parsl/providers/azure/azure.py +4 -5
  116. parsl/providers/base.py +2 -18
  117. parsl/providers/cluster_provider.py +3 -9
  118. parsl/providers/condor/condor.py +7 -17
  119. parsl/providers/errors.py +2 -2
  120. parsl/providers/googlecloud/googlecloud.py +2 -1
  121. parsl/providers/grid_engine/grid_engine.py +5 -14
  122. parsl/providers/kubernetes/kube.py +80 -40
  123. parsl/providers/local/local.py +13 -26
  124. parsl/providers/lsf/lsf.py +5 -23
  125. parsl/providers/pbspro/pbspro.py +5 -17
  126. parsl/providers/slurm/slurm.py +81 -39
  127. parsl/providers/torque/torque.py +3 -14
  128. parsl/serialize/__init__.py +8 -3
  129. parsl/serialize/base.py +1 -2
  130. parsl/serialize/concretes.py +5 -4
  131. parsl/serialize/facade.py +3 -3
  132. parsl/serialize/proxystore.py +3 -2
  133. parsl/tests/__init__.py +1 -1
  134. parsl/tests/configs/azure_single_node.py +4 -5
  135. parsl/tests/configs/bridges.py +3 -2
  136. parsl/tests/configs/cc_in2p3.py +1 -3
  137. parsl/tests/configs/comet.py +2 -1
  138. parsl/tests/configs/ec2_single_node.py +1 -2
  139. parsl/tests/configs/ec2_spot.py +1 -2
  140. parsl/tests/configs/flux_local.py +11 -0
  141. parsl/tests/configs/frontera.py +2 -3
  142. parsl/tests/configs/htex_local.py +3 -5
  143. parsl/tests/configs/htex_local_alternate.py +11 -15
  144. parsl/tests/configs/htex_local_intask_staging.py +5 -9
  145. parsl/tests/configs/htex_local_rsync_staging.py +4 -8
  146. parsl/tests/configs/local_radical.py +1 -3
  147. parsl/tests/configs/local_radical_mpi.py +2 -2
  148. parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
  149. parsl/tests/configs/local_threads_monitoring.py +0 -1
  150. parsl/tests/configs/midway.py +2 -2
  151. parsl/tests/configs/nscc_singapore.py +3 -3
  152. parsl/tests/configs/osg_htex.py +1 -1
  153. parsl/tests/configs/petrelkube.py +3 -2
  154. parsl/tests/configs/slurm_local.py +24 -0
  155. parsl/tests/configs/summit.py +1 -0
  156. parsl/tests/configs/taskvine_ex.py +4 -7
  157. parsl/tests/configs/user_opts.py +0 -7
  158. parsl/tests/configs/workqueue_ex.py +4 -6
  159. parsl/tests/conftest.py +27 -13
  160. parsl/tests/integration/test_stress/test_python_simple.py +3 -4
  161. parsl/tests/integration/test_stress/test_python_threads.py +3 -5
  162. parsl/tests/manual_tests/htex_local.py +4 -6
  163. parsl/tests/manual_tests/test_basic.py +1 -0
  164. parsl/tests/manual_tests/test_log_filter.py +3 -1
  165. parsl/tests/manual_tests/test_memory_limits.py +6 -8
  166. parsl/tests/manual_tests/test_regression_220.py +2 -1
  167. parsl/tests/manual_tests/test_udp_simple.py +4 -4
  168. parsl/tests/manual_tests/test_worker_count.py +3 -2
  169. parsl/tests/scaling_tests/htex_local.py +2 -4
  170. parsl/tests/scaling_tests/test_scale.py +0 -9
  171. parsl/tests/scaling_tests/vineex_condor.py +1 -2
  172. parsl/tests/scaling_tests/vineex_local.py +1 -2
  173. parsl/tests/site_tests/site_config_selector.py +1 -6
  174. parsl/tests/site_tests/test_provider.py +4 -2
  175. parsl/tests/site_tests/test_site.py +2 -0
  176. parsl/tests/sites/test_affinity.py +7 -7
  177. parsl/tests/sites/test_dynamic_executor.py +3 -4
  178. parsl/tests/sites/test_ec2.py +3 -2
  179. parsl/tests/sites/test_worker_info.py +4 -5
  180. parsl/tests/test_aalst_patterns.py +0 -1
  181. parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
  182. parsl/tests/test_bash_apps/test_basic.py +10 -4
  183. parsl/tests/test_bash_apps/test_error_codes.py +5 -7
  184. parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
  185. parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
  186. parsl/tests/test_bash_apps/test_memoize.py +2 -8
  187. parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
  188. parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
  189. parsl/tests/test_bash_apps/test_multiline.py +1 -1
  190. parsl/tests/test_bash_apps/test_pipeline.py +1 -1
  191. parsl/tests/test_bash_apps/test_std_uri.py +123 -0
  192. parsl/tests/test_bash_apps/test_stdout.py +33 -8
  193. parsl/tests/test_callables.py +2 -2
  194. parsl/tests/test_checkpointing/test_periodic.py +21 -39
  195. parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
  196. parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
  197. parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
  198. parsl/tests/test_checkpointing/test_regression_239.py +1 -1
  199. parsl/tests/test_checkpointing/test_task_exit.py +2 -3
  200. parsl/tests/test_docs/test_from_slides.py +5 -2
  201. parsl/tests/test_docs/test_kwargs.py +4 -1
  202. parsl/tests/test_docs/test_tutorial_1.py +1 -2
  203. parsl/tests/test_docs/test_workflow1.py +2 -2
  204. parsl/tests/test_docs/test_workflow2.py +0 -1
  205. parsl/tests/test_error_handling/test_rand_fail.py +2 -2
  206. parsl/tests/test_error_handling/test_resource_spec.py +10 -12
  207. parsl/tests/test_error_handling/test_retries.py +6 -16
  208. parsl/tests/test_error_handling/test_retry_handler.py +1 -0
  209. parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
  210. parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
  211. parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
  212. parsl/tests/test_execute_task.py +29 -0
  213. parsl/tests/test_flux.py +1 -1
  214. parsl/tests/test_htex/test_basic.py +2 -3
  215. parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
  216. parsl/tests/test_htex/test_command_client_timeout.py +66 -0
  217. parsl/tests/test_htex/test_connected_blocks.py +3 -2
  218. parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
  219. parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
  220. parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
  221. parsl/tests/test_htex/test_drain.py +11 -10
  222. parsl/tests/test_htex/test_htex.py +51 -25
  223. parsl/tests/test_htex/test_manager_failure.py +0 -1
  224. parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
  225. parsl/tests/test_htex/test_managers_command.py +36 -0
  226. parsl/tests/test_htex/test_missing_worker.py +2 -12
  227. parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
  228. parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
  229. parsl/tests/test_htex/test_zmq_binding.py +29 -8
  230. parsl/tests/test_monitoring/test_app_names.py +5 -5
  231. parsl/tests/test_monitoring/test_basic.py +73 -25
  232. parsl/tests/test_monitoring/test_db_locks.py +6 -4
  233. parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
  234. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
  235. parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
  236. parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
  237. parsl/tests/test_monitoring/test_stdouterr.py +134 -0
  238. parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
  239. parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
  240. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
  241. parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
  242. parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
  243. parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
  244. parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
  245. parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
  246. parsl/tests/test_providers/test_local_provider.py +3 -132
  247. parsl/tests/test_providers/test_pbspro_template.py +2 -3
  248. parsl/tests/test_providers/test_slurm_template.py +2 -3
  249. parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
  250. parsl/tests/test_python_apps/test_context_manager.py +128 -0
  251. parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
  252. parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
  253. parsl/tests/test_python_apps/test_fail.py +0 -25
  254. parsl/tests/test_python_apps/test_futures.py +2 -1
  255. parsl/tests/test_python_apps/test_inputs_default.py +22 -0
  256. parsl/tests/test_python_apps/test_join.py +0 -1
  257. parsl/tests/test_python_apps/test_lifted.py +11 -7
  258. parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
  259. parsl/tests/test_python_apps/test_outputs.py +1 -1
  260. parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
  261. parsl/tests/test_radical/test_mpi_funcs.py +1 -2
  262. parsl/tests/test_regression/test_1480.py +2 -1
  263. parsl/tests/test_regression/test_1653.py +2 -1
  264. parsl/tests/test_regression/test_226.py +1 -0
  265. parsl/tests/test_regression/test_2652.py +1 -0
  266. parsl/tests/test_regression/test_69a.py +0 -1
  267. parsl/tests/test_regression/test_854.py +4 -2
  268. parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
  269. parsl/tests/test_regression/test_98.py +0 -1
  270. parsl/tests/test_scaling/test_block_error_handler.py +9 -4
  271. parsl/tests/test_scaling/test_regression_1621.py +11 -15
  272. parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
  273. parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
  274. parsl/tests/test_scaling/test_scale_down.py +2 -5
  275. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +5 -8
  276. parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
  277. parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
  278. parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
  279. parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
  280. parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
  281. parsl/tests/test_serialization/test_basic.py +2 -1
  282. parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
  283. parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
  284. parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
  285. parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
  286. parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
  287. parsl/tests/test_staging/staging_provider.py +2 -2
  288. parsl/tests/test_staging/test_1316.py +3 -4
  289. parsl/tests/test_staging/test_docs_1.py +2 -1
  290. parsl/tests/test_staging/test_docs_2.py +2 -1
  291. parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
  292. parsl/tests/{test_data → test_staging}/test_file.py +6 -6
  293. parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
  294. parsl/tests/test_staging/test_staging_ftp.py +1 -0
  295. parsl/tests/test_staging/test_staging_https.py +5 -2
  296. parsl/tests/test_staging/test_staging_stdout.py +64 -0
  297. parsl/tests/test_staging/test_zip_in.py +39 -0
  298. parsl/tests/test_staging/test_zip_out.py +110 -0
  299. parsl/tests/test_staging/test_zip_to_zip.py +41 -0
  300. parsl/tests/test_summary.py +2 -2
  301. parsl/tests/test_thread_parallelism.py +0 -1
  302. parsl/tests/test_threads/test_configs.py +1 -2
  303. parsl/tests/test_threads/test_lazy_errors.py +2 -2
  304. parsl/tests/test_utils/test_execute_wait.py +35 -0
  305. parsl/tests/test_utils/test_sanitize_dns.py +76 -0
  306. parsl/tests/unit/test_address.py +20 -0
  307. parsl/tests/unit/test_file.py +99 -0
  308. parsl/tests/unit/test_usage_tracking.py +66 -0
  309. parsl/usage_tracking/api.py +65 -0
  310. parsl/usage_tracking/levels.py +6 -0
  311. parsl/usage_tracking/usage.py +104 -62
  312. parsl/utils.py +137 -4
  313. parsl/version.py +1 -1
  314. {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
  315. parsl-2025.1.13.data/scripts/interchange.py +649 -0
  316. {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +77 -75
  317. parsl-2025.1.13.dist-info/METADATA +96 -0
  318. parsl-2025.1.13.dist-info/RECORD +462 -0
  319. {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
  320. parsl/channels/__init__.py +0 -7
  321. parsl/channels/base.py +0 -141
  322. parsl/channels/errors.py +0 -113
  323. parsl/channels/local/local.py +0 -164
  324. parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
  325. parsl/channels/ssh/ssh.py +0 -276
  326. parsl/channels/ssh_il/__init__.py +0 -0
  327. parsl/channels/ssh_il/ssh_il.py +0 -74
  328. parsl/configs/ad_hoc.py +0 -35
  329. parsl/executors/radical/rpex_master.py +0 -42
  330. parsl/monitoring/radios.py +0 -175
  331. parsl/providers/ad_hoc/__init__.py +0 -0
  332. parsl/providers/ad_hoc/ad_hoc.py +0 -248
  333. parsl/providers/cobalt/__init__.py +0 -0
  334. parsl/providers/cobalt/cobalt.py +0 -236
  335. parsl/providers/cobalt/template.py +0 -17
  336. parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
  337. parsl/tests/configs/cooley_htex.py +0 -37
  338. parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
  339. parsl/tests/configs/local_adhoc.py +0 -18
  340. parsl/tests/configs/swan_htex.py +0 -43
  341. parsl/tests/configs/theta.py +0 -37
  342. parsl/tests/integration/test_channels/__init__.py +0 -0
  343. parsl/tests/integration/test_channels/test_channels.py +0 -17
  344. parsl/tests/integration/test_channels/test_local_channel.py +0 -42
  345. parsl/tests/integration/test_channels/test_scp_1.py +0 -45
  346. parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
  347. parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
  348. parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
  349. parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
  350. parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
  351. parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
  352. parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
  353. parsl/tests/sites/test_local_adhoc.py +0 -61
  354. parsl/tests/test_channels/__init__.py +0 -0
  355. parsl/tests/test_channels/test_large_output.py +0 -22
  356. parsl/tests/test_data/__init__.py +0 -0
  357. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
  358. parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
  359. parsl-2024.3.18.dist-info/METADATA +0 -98
  360. parsl-2024.3.18.dist-info/RECORD +0 -449
  361. parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
  362. parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
  363. parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
  364. parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
  365. parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
  366. {parsl-2024.3.18.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
  367. {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
  368. {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
  369. {parsl-2024.3.18.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
--- a/parsl/monitoring/monitoring.py
+++ b/parsl/monitoring/monitoring.py
@@ -1,30 +1,25 @@
  from __future__ import annotations

+ import logging
+ import multiprocessing.synchronize as ms
  import os
- import socket
- import time
  import pickle
- import logging
- import typeguard
- import zmq
-
  import queue
+ import time
+ from multiprocessing import Event
+ from multiprocessing.queues import Queue
+ from typing import TYPE_CHECKING, Literal, Optional, Tuple, Union, cast

- import parsl.monitoring.remote
+ import typeguard

- from parsl.multiprocessing import ForkProcess, SizedQueue
- from multiprocessing import Process
- from multiprocessing.queues import Queue
  from parsl.log_utils import set_file_logger
- from parsl.utils import RepresentationMixin
+ from parsl.monitoring.errors import MonitoringHubStartError
+ from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
+ from parsl.monitoring.router import router_starter
+ from parsl.monitoring.types import TaggedMonitoringMessage
+ from parsl.multiprocessing import ForkProcess, SizedQueue
  from parsl.process_loggers import wrap_with_logs
- from parsl.utils import setproctitle
-
- from parsl.serialize import deserialize
-
- from parsl.monitoring.message_type import MessageType
- from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
- from typing import cast, Any, Callable, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING
+ from parsl.utils import RepresentationMixin, setproctitle

  _db_manager_excepts: Optional[Exception]

@@ -49,7 +44,6 @@ class MonitoringHub(RepresentationMixin):
  workflow_name: Optional[str] = None,
  workflow_version: Optional[str] = None,
  logging_endpoint: Optional[str] = None,
- logdir: Optional[str] = None,
  monitoring_debug: bool = False,
  resource_monitoring_enabled: bool = True,
  resource_monitoring_interval: float = 30): # in seconds
@@ -78,8 +72,6 @@
  The database connection url for monitoring to log the information.
  These URLs follow RFC-1738, and can include username, password, hostname, database name.
  Default: sqlite, in the configured run_dir.
- logdir : str
- Parsl log directory paths. Logs and temp files go here. Default: '.'
  monitoring_debug : Bool
  Enable monitoring debug logging. Default: False
  resource_monitoring_enabled : boolean
@@ -93,14 +85,6 @@
  Default: 30 seconds
  """

- self.logger = logger
-
- # Any is used to disable typechecking on uses of _dfk_channel,
- # because it is used in the code as if it points to a channel, but
- # the static type is that it can also be None. The code relies on
- # .start() being called and initialising this to a real channel.
- self._dfk_channel = None # type: Any
-
  if _db_manager_excepts:
  raise _db_manager_excepts

@@ -109,7 +93,6 @@
  self.hub_port_range = hub_port_range

  self.logging_endpoint = logging_endpoint
- self.logdir = logdir
  self.monitoring_debug = monitoring_debug

  self.workflow_name = workflow_name
@@ -118,19 +101,15 @@
  self.resource_monitoring_enabled = resource_monitoring_enabled
  self.resource_monitoring_interval = resource_monitoring_interval

- def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int:
+ def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:

- if self.logdir is None:
- self.logdir = "."
+ logger.debug("Starting MonitoringHub")

  if self.logging_endpoint is None:
  self.logging_endpoint = f"sqlite:///{os.fspath(config_run_dir)}/monitoring.db"

- os.makedirs(self.logdir, exist_ok=True)
+ os.makedirs(dfk_run_dir, exist_ok=True)

- # Initialize the ZMQ pipe to the Parsl Client
-
- self.logger.debug("Initializing ZMQ Pipes to client")
  self.monitoring_hub_active = True

  # This annotation is incompatible with typeguard 4.x instrumentation
@@ -151,26 +130,22 @@
  self.exception_q: Queue[Tuple[str, str]]
  self.exception_q = SizedQueue(maxsize=10)

- self.priority_msgs: Queue[Tuple[Any, int]]
- self.priority_msgs = SizedQueue()
-
- self.resource_msgs: Queue[AddressedMonitoringMessage]
+ self.resource_msgs: Queue[Union[TaggedMonitoringMessage, Literal["STOP"]]]
  self.resource_msgs = SizedQueue()

- self.node_msgs: Queue[AddressedMonitoringMessage]
- self.node_msgs = SizedQueue()
-
- self.block_msgs: Queue[AddressedMonitoringMessage]
- self.block_msgs = SizedQueue()
+ self.router_exit_event: ms.Event
+ self.router_exit_event = Event()

  self.router_proc = ForkProcess(target=router_starter,
- args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs),
- kwargs={"hub_address": self.hub_address,
- "hub_port": self.hub_port,
- "hub_port_range": self.hub_port_range,
- "logdir": self.logdir,
+ kwargs={"comm_q": comm_q,
+ "exception_q": self.exception_q,
+ "resource_msgs": self.resource_msgs,
+ "exit_event": self.router_exit_event,
+ "hub_address": self.hub_address,
+ "udp_port": self.hub_port,
+ "zmq_port_range": self.hub_port_range,
+ "run_dir": dfk_run_dir,
  "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
- "run_id": run_id
  },
  name="Monitoring-Router-Process",
  daemon=True,
@@ -178,8 +153,8 @@
  self.router_proc.start()

  self.dbm_proc = ForkProcess(target=dbm_starter,
- args=(self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs,),
- kwargs={"logdir": self.logdir,
+ args=(self.exception_q, self.resource_msgs,),
+ kwargs={"run_dir": dfk_run_dir,
  "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
  "db_url": self.logging_endpoint,
  },
@@ -187,113 +162,97 @@
  daemon=True,
  )
  self.dbm_proc.start()
- self.logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))
+ logger.info("Started the router process %s and DBM process %s", self.router_proc.pid, self.dbm_proc.pid)

- self.filesystem_proc = Process(target=filesystem_receiver,
- args=(self.logdir, self.resource_msgs, dfk_run_dir),
- name="Monitoring-Filesystem-Process",
- daemon=True
- )
+ self.filesystem_proc = ForkProcess(target=filesystem_receiver,
+ args=(self.resource_msgs, dfk_run_dir),
+ name="Monitoring-Filesystem-Process",
+ daemon=True
+ )
  self.filesystem_proc.start()
- self.logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
+ logger.info("Started filesystem radio receiver process %s", self.filesystem_proc.pid)
+
+ self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)

  try:
  comm_q_result = comm_q.get(block=True, timeout=120)
+ comm_q.close()
+ comm_q.join_thread()
  except queue.Empty:
- self.logger.error("Hub has not completed initialization in 120s. Aborting")
- raise Exception("Hub failed to start")
+ logger.error("Hub has not completed initialization in 120s. Aborting")
+ raise MonitoringHubStartError()

  if isinstance(comm_q_result, str):
- self.logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
+ logger.error("MonitoringRouter sent an error message: %s", comm_q_result)
  raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")

- udp_port, ic_port = comm_q_result
+ udp_port, zmq_port = comm_q_result

  self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)

- context = zmq.Context()
- self.dfk_channel_timeout = 10000 # in milliseconds
- self._dfk_channel = context.socket(zmq.DEALER)
- self._dfk_channel.setsockopt(zmq.LINGER, 0)
- self._dfk_channel.set_hwm(0)
- self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
- self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, ic_port))
+ logger.info("Monitoring Hub initialized")

- self.logger.info("Monitoring Hub initialized")
+ self.hub_zmq_port = zmq_port

- return ic_port
-
- # TODO: tighten the Any message format
- def send(self, mtype: MessageType, message: Any) -> None:
- self.logger.debug("Sending message type {}".format(mtype))
- try:
- self._dfk_channel.send_pyobj((mtype, message))
- except zmq.Again:
- self.logger.exception(
- "The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))
+ def send(self, message: TaggedMonitoringMessage) -> None:
+ logger.debug("Sending message type %s", message[0])
+ self.radio.send(message)

  def close(self) -> None:
- self.logger.info("Terminating Monitoring Hub")
+ logger.info("Terminating Monitoring Hub")
  exception_msgs = []
  while True:
  try:
  exception_msgs.append(self.exception_q.get(block=False))
- self.logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
+ logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
  except queue.Empty:
  break
- if self._dfk_channel and self.monitoring_hub_active:
+ if self.monitoring_hub_active:
  self.monitoring_hub_active = False
- self._dfk_channel.close()
  if exception_msgs:
  for exception_msg in exception_msgs:
- self.logger.error(
- "{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(
- exception_msg[0],
- exception_msg[1]
- )
+ logger.error(
+ "%s process delivered an exception: %s. Terminating all monitoring processes immediately.",
+ exception_msg[0],
+ exception_msg[1]
  )
  self.router_proc.terminate()
  self.dbm_proc.terminate()
  self.filesystem_proc.terminate()
- self.logger.info("Waiting for router to terminate")
+ logger.info("Setting router termination event")
+ self.router_exit_event.set()
+ logger.info("Waiting for router to terminate")
  self.router_proc.join()
- self.logger.debug("Finished waiting for router termination")
+ self.router_proc.close()
+ logger.debug("Finished waiting for router termination")
  if len(exception_msgs) == 0:
- self.logger.debug("Sending STOP to DBM")
- self.priority_msgs.put(("STOP", 0))
+ logger.debug("Sending STOP to DBM")
+ self.resource_msgs.put("STOP")
  else:
- self.logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
- self.logger.debug("Waiting for DB termination")
+ logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
+ logger.debug("Waiting for DB termination")
  self.dbm_proc.join()
- self.logger.debug("Finished waiting for DBM termination")
+ self.dbm_proc.close()
+ logger.debug("Finished waiting for DBM termination")

  # should this be message based? it probably doesn't need to be if
  # we believe we've received all messages
- self.logger.info("Terminating filesystem radio receiver process")
+ logger.info("Terminating filesystem radio receiver process")
  self.filesystem_proc.terminate()
  self.filesystem_proc.join()
+ self.filesystem_proc.close()

- @staticmethod
- def monitor_wrapper(f: Any,
- args: Sequence,
- kwargs: Dict,
- try_id: int,
- task_id: int,
- monitoring_hub_url: str,
- run_id: str,
- logging_level: int,
- sleep_dur: float,
- radio_mode: str,
- monitor_resources: bool,
- run_dir: str) -> Tuple[Callable, Sequence, Dict]:
- return parsl.monitoring.remote.monitor_wrapper(f, args, kwargs, try_id, task_id, monitoring_hub_url,
- run_id, logging_level, sleep_dur, radio_mode,
- monitor_resources, run_dir)
+ logger.info("Closing monitoring multiprocessing queues")
+ self.exception_q.close()
+ self.exception_q.join_thread()
+ self.resource_msgs.close()
+ self.resource_msgs.join_thread()
+ logger.info("Closed monitoring multiprocessing queues")


  @wrap_with_logs
- def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
- logger = set_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
+ def filesystem_receiver(q: Queue[TaggedMonitoringMessage], run_dir: str) -> None:
+ logger = set_file_logger(f"{run_dir}/monitoring_filesystem_radio.log",
  name="monitoring_filesystem_radio",
  level=logging.INFO)

@@ -302,7 +261,9 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
  base_path = f"{run_dir}/monitor-fs-radio/"
  tmp_dir = f"{base_path}/tmp/"
  new_dir = f"{base_path}/new/"
- logger.debug(f"Creating new and tmp paths under {base_path}")
+ logger.debug("Creating new and tmp paths under %s", base_path)
+
+ target_radio = MultiprocessingQueueRadioSender(q)

  os.makedirs(tmp_dir, exist_ok=True)
  os.makedirs(new_dir, exist_ok=True)
@@ -313,202 +274,15 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
  # iterate over files in new_dir
  for filename in os.listdir(new_dir):
  try:
- logger.info(f"Processing filesystem radio file {filename}")
+ logger.info("Processing filesystem radio file %s", filename)
  full_path_filename = f"{new_dir}/{filename}"
  with open(full_path_filename, "rb") as f:
- message = deserialize(f.read())
- logger.debug(f"Message received is: {message}")
+ message = pickle.load(f)
+ logger.debug("Message received is: %s", message)
  assert isinstance(message, tuple)
- q.put(cast(AddressedMonitoringMessage, message))
+ target_radio.send(cast(TaggedMonitoringMessage, message))
  os.remove(full_path_filename)
  except Exception:
- logger.exception(f"Exception processing {filename} - probably will be retried next iteration")
+ logger.exception("Exception processing %s - probably will be retried next iteration", filename)

  time.sleep(1) # whats a good time for this poll?
-
-
- class MonitoringRouter:
-
- def __init__(self,
- *,
- hub_address: str,
- hub_port: Optional[int] = None,
- hub_port_range: Tuple[int, int] = (55050, 56000),
-
- monitoring_hub_address: str = "127.0.0.1",
- logdir: str = ".",
- run_id: str,
- logging_level: int = logging.INFO,
- atexit_timeout: int = 3 # in seconds
- ):
- """ Initializes a monitoring configuration class.
-
- Parameters
- ----------
- hub_address : str
- The ip address at which the workers will be able to reach the Hub.
- hub_port : int
- The specific port at which workers will be able to reach the Hub via UDP. Default: None
- hub_port_range : tuple(int, int)
- The MonitoringHub picks ports at random from the range which will be used by Hub.
- This is overridden when the hub_port option is set. Default: (55050, 56000)
- logdir : str
- Parsl log directory paths. Logs and temp files go here. Default: '.'
- logging_level : int
- Logging level as defined in the logging module. Default: logging.INFO
- atexit_timeout : float, optional
- The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
-
- """
- os.makedirs(logdir, exist_ok=True)
- self.logger = set_file_logger("{}/monitoring_router.log".format(logdir),
- name="monitoring_router",
- level=logging_level)
- self.logger.debug("Monitoring router starting")
-
- self.hub_address = hub_address
- self.atexit_timeout = atexit_timeout
- self.run_id = run_id
-
- self.loop_freq = 10.0 # milliseconds
-
- # Initialize the UDP socket
- self.sock = socket.socket(socket.AF_INET,
- socket.SOCK_DGRAM,
- socket.IPPROTO_UDP)
-
- # We are trying to bind to all interfaces with 0.0.0.0
- if not hub_port:
- self.sock.bind(('0.0.0.0', 0))
- self.hub_port = self.sock.getsockname()[1]
- else:
- self.hub_port = hub_port
- try:
- self.sock.bind(('0.0.0.0', self.hub_port))
- except Exception as e:
- raise RuntimeError(f"Could not bind to hub_port {hub_port} because: {e}")
- self.sock.settimeout(self.loop_freq / 1000)
- self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.hub_port))
-
- self._context = zmq.Context()
- self.ic_channel = self._context.socket(zmq.DEALER)
- self.ic_channel.setsockopt(zmq.LINGER, 0)
- self.ic_channel.set_hwm(0)
- self.ic_channel.RCVTIMEO = int(self.loop_freq) # in milliseconds
- self.logger.debug("hub_address: {}. hub_port_range {}".format(hub_address, hub_port_range))
- self.ic_port = self.ic_channel.bind_to_random_port("tcp://*",
- min_port=hub_port_range[0],
- max_port=hub_port_range[1])
-
- def start(self,
- priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
- node_msgs: "queue.Queue[AddressedMonitoringMessage]",
- block_msgs: "queue.Queue[AddressedMonitoringMessage]",
- resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
- try:
- router_keep_going = True
- while router_keep_going:
- try:
- data, addr = self.sock.recvfrom(2048)
- resource_msg = pickle.loads(data)
- self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
- resource_msgs.put((resource_msg, addr))
- except socket.timeout:
- pass
-
- try:
- dfk_loop_start = time.time()
- while time.time() - dfk_loop_start < 1.0: # TODO make configurable
- # note that nothing checks that msg really is of the annotated type
- msg: TaggedMonitoringMessage
- msg = self.ic_channel.recv_pyobj()
-
- assert isinstance(msg, tuple), "IC Channel expects only tuples, got {}".format(msg)
- assert len(msg) >= 1, "IC Channel expects tuples of length at least 1, got {}".format(msg)
- assert len(msg) == 2, "IC Channel expects message tuples of exactly length 2, got {}".format(msg)
-
- msg_0: AddressedMonitoringMessage
- msg_0 = (msg, 0)
-
- if msg[0] == MessageType.NODE_INFO:
- msg[1]['run_id'] = self.run_id
- node_msgs.put(msg_0)
- elif msg[0] == MessageType.RESOURCE_INFO:
- resource_msgs.put(msg_0)
- elif msg[0] == MessageType.BLOCK_INFO:
- block_msgs.put(msg_0)
- elif msg[0] == MessageType.TASK_INFO:
- priority_msgs.put(msg_0)
- elif msg[0] == MessageType.WORKFLOW_INFO:
- priority_msgs.put(msg_0)
- if 'exit_now' in msg[1] and msg[1]['exit_now']:
- router_keep_going = False
- else:
- # There is a type: ignore here because if msg[0]
- # is of the correct type, this code is unreachable,
- # but there is no verification that the message
- # received from ic_channel.recv_pyobj() is actually
- # of that type.
- self.logger.error("Discarding message " # type: ignore[unreachable]
- f"from interchange with unknown type {msg[0].value}")
- except zmq.Again:
- pass
- except Exception:
- # This will catch malformed messages. What happens if the
- # channel is broken in such a way that it always raises
- # an exception? Looping on this would maybe be the wrong
- # thing to do.
- self.logger.warning("Failure processing a ZMQ message", exc_info=True)
-
- self.logger.info("Monitoring router draining")
- last_msg_received_time = time.time()
- while time.time() - last_msg_received_time < self.atexit_timeout:
- try:
- data, addr = self.sock.recvfrom(2048)
- msg = pickle.loads(data)
- self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
- resource_msgs.put((msg, addr))
- last_msg_received_time = time.time()
- except socket.timeout:
- pass
-
- self.logger.info("Monitoring router finishing normally")
- finally:
- self.logger.info("Monitoring router finished")
-
-
- @wrap_with_logs
- def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
- exception_q: "queue.Queue[Tuple[str, str]]",
- priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
- node_msgs: "queue.Queue[AddressedMonitoringMessage]",
- block_msgs: "queue.Queue[AddressedMonitoringMessage]",
- resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
-
- hub_address: str,
- hub_port: Optional[int],
- hub_port_range: Tuple[int, int],
-
- logdir: str,
- logging_level: int,
- run_id: str) -> None:
- setproctitle("parsl: monitoring router")
- try:
- router = MonitoringRouter(hub_address=hub_address,
- hub_port=hub_port,
- hub_port_range=hub_port_range,
- logdir=logdir,
- logging_level=logging_level,
- run_id=run_id)
- except Exception as e:
- logger.error("MonitoringRouter construction failed.", exc_info=True)
- comm_q.put(f"Monitoring router construction failed: {e}")
- else:
- comm_q.put((router.hub_port, router.ic_port))
-
- router.logger.info("Starting MonitoringRouter in router_starter")
- try:
- router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
- except Exception as e:
- router.logger.exception("router.start exception")
- exception_q.put(('Hub', str(e)))
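Taken together, these hunks replace MonitoringHub's private ZMQ DEALER channel to the router with a shared multiprocessing queue: MonitoringHub.send() now forwards a TaggedMonitoringMessage to a MultiprocessingQueueRadioSender wrapped around resource_msgs, and shutdown is coordinated through router_exit_event plus a "STOP" sentinel instead of a separate priority queue. Below is a minimal sketch of a queue-backed sender consistent with how it is used above; the class name is hypothetical, and the shipped implementation is parsl/monitoring/radios/multiprocessing.py (+17 in this release), which may differ in detail.

from multiprocessing.queues import Queue

from parsl.monitoring.radios.base import MonitoringRadioSender


class QueueBackedRadioSender(MonitoringRadioSender):
    """Hypothetical sketch: forward each monitoring message onto a
    multiprocessing queue shared with the router and DBM processes."""

    def __init__(self, queue: Queue) -> None:
        self._queue = queue

    def send(self, message: object) -> None:
        # The hub-side caller passes tagged (message-type, payload) tuples straight through.
        self._queue.put(message)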
--- a/parsl/monitoring/queries/pandas.py
+++ b/parsl/monitoring/queries/pandas.py
@@ -1,7 +1,6 @@
- import pandas as pd
-
  from typing import Any

+ import pandas as pd

  # pandas can take several different types of database connection,
  # and itself exposes its connection parameters as "Any".
--- /dev/null
+++ b/parsl/monitoring/radios/base.py
@@ -0,0 +1,13 @@
+ import logging
+ from abc import ABCMeta, abstractmethod
+ from typing import Optional
+
+ _db_manager_excepts: Optional[Exception]
+
+ logger = logging.getLogger(__name__)
+
+
+ class MonitoringRadioSender(metaclass=ABCMeta):
+     @abstractmethod
+     def send(self, message: object) -> None:
+         pass
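The filesystem and htex radios shown below subclass this one-method interface, so adding a custom transport only requires implementing send(). A small illustrative subclass (hypothetical, not part of the package) that just records messages in memory, for example in tests:

from typing import List

from parsl.monitoring.radios.base import MonitoringRadioSender


class InMemoryRadioSender(MonitoringRadioSender):
    """Hypothetical sender: keep every message in a list instead of transmitting it."""

    def __init__(self) -> None:
        self.messages: List[object] = []

    def send(self, message: object) -> None:
        self.messages.append(message)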
--- /dev/null
+++ b/parsl/monitoring/radios/filesystem.py
@@ -0,0 +1,52 @@
+ import logging
+ import os
+ import pickle
+ import uuid
+
+ from parsl.monitoring.radios.base import MonitoringRadioSender
+
+ logger = logging.getLogger(__name__)
+
+
+ class FilesystemRadioSender(MonitoringRadioSender):
+     """A MonitoringRadioSender that sends messages over a shared filesystem.
+
+     The messsage directory structure is based on maildir,
+     https://en.wikipedia.org/wiki/Maildir
+
+     The writer creates a message in tmp/ and then when it is fully
+     written, moves it atomically into new/
+
+     The reader ignores tmp/ and only reads and deletes messages from
+     new/
+
+     This avoids a race condition of reading partially written messages.
+
+     This radio is likely to give higher shared filesystem load compared to
+     the UDP radio, but should be much more reliable.
+     """
+
+     def __init__(self, *, monitoring_url: str, timeout: int = 10, run_dir: str):
+         logger.info("filesystem based monitoring channel initializing")
+         self.base_path = f"{run_dir}/monitor-fs-radio/"
+         self.tmp_path = f"{self.base_path}/tmp"
+         self.new_path = f"{self.base_path}/new"
+
+         os.makedirs(self.tmp_path, exist_ok=True)
+         os.makedirs(self.new_path, exist_ok=True)
+
+     def send(self, message: object) -> None:
+         logger.info("Sending a monitoring message via filesystem")
+
+         unique_id = str(uuid.uuid4())
+
+         tmp_filename = f"{self.tmp_path}/{unique_id}"
+         new_filename = f"{self.new_path}/{unique_id}"
+         buffer = message
+
+         # this will write the message out then atomically
+         # move it into new/, so that a partially written
+         # file will never be observed in new/
+         with open(tmp_filename, "wb") as f:
+             pickle.dump(buffer, f)
+         os.rename(tmp_filename, new_filename)
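A hedged usage sketch: the sender pickles each message into <run_dir>/monitor-fs-radio/new/ via the tmp-then-rename maildir trick, and the filesystem_receiver loop in the monitoring.py hunks above is the matching reader. The tuple payload below is only an illustrative stand-in for a real monitoring message, and monitoring_url is accepted by the constructor but, judging from the hunk above, not used by this sender.

import tempfile

from parsl.monitoring.message_type import MessageType
from parsl.monitoring.radios.filesystem import FilesystemRadioSender

run_dir = tempfile.mkdtemp()  # stands in for the DFK run directory shared with filesystem_receiver

sender = FilesystemRadioSender(monitoring_url="udp://127.0.0.1:0",  # accepted but unused here
                               run_dir=run_dir)

# Illustrative payload only; real callers send (MessageType, dict) messages built by parsl.
sender.send((MessageType.RESOURCE_INFO, {"example": "payload"}))

# The pickled message now sits under <run_dir>/monitor-fs-radio/new/ until a receiver consumes it.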
--- /dev/null
+++ b/parsl/monitoring/radios/htex.py
@@ -0,0 +1,57 @@
+ import logging
+ import pickle
+
+ from parsl.monitoring.radios.base import MonitoringRadioSender
+
+ logger = logging.getLogger(__name__)
+
+
+ class HTEXRadioSender(MonitoringRadioSender):
+
+     def __init__(self, monitoring_url: str, timeout: int = 10):
+         """
+         Parameters
+         ----------
+
+         monitoring_url : str
+             URL of the form <scheme>://<IP>:<PORT>
+         timeout : int
+             timeout, default=10s
+         """
+         logger.info("htex-based monitoring channel initialising")
+
+     def send(self, message: object) -> None:
+         """ Sends a message to the UDP receiver
+
+         Parameter
+         ---------
+
+         message: object
+             Arbitrary pickle-able object that is to be sent
+
+         Returns:
+             None
+         """
+
+         import parsl.executors.high_throughput.monitoring_info
+
+         result_queue = parsl.executors.high_throughput.monitoring_info.result_queue
+
+         # this message needs to go in the result queue tagged so that it is treated
+         # i) as a monitoring message by the interchange, and then further more treated
+         # as a RESOURCE_INFO message when received by monitoring (rather than a NODE_INFO
+         # which is the implicit default for messages from the interchange)
+
+         # for the interchange, the outer wrapper, this needs to be a dict:
+
+         interchange_msg = {
+             'type': 'monitoring',
+             'payload': message
+         }
+
+         if result_queue:
+             result_queue.put(pickle.dumps(interchange_msg))
+         else:
+             logger.error("result_queue is uninitialized - cannot put monitoring message")
+
+         return
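HTEXRadioSender opens no connection of its own: send() looks up the module-level result_queue in parsl/executors/high_throughput/monitoring_info.py (also touched in this release) and, when the worker pool has set it, enqueues a pickled {'type': 'monitoring', 'payload': ...} wrapper for the interchange to forward. A rough sketch of that dependence, run outside a real worker, with an ordinary queue.Queue standing in for the worker's result queue:

import pickle
import queue

import parsl.executors.high_throughput.monitoring_info as monitoring_info
from parsl.monitoring.radios.htex import HTEXRadioSender

# In a real htex worker the process worker pool assigns its own result queue here;
# this stand-in only lets the sketch run locally.
monitoring_info.result_queue = queue.Queue()

sender = HTEXRadioSender(monitoring_url="udp://127.0.0.1:0")
sender.send(("RESOURCE_INFO", {"example": "payload"}))  # illustrative payload only

wrapped = pickle.loads(monitoring_info.result_queue.get())
assert wrapped["type"] == "monitoring"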