parsl 2024.3.11__py3-none-any.whl → 2025.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369)
  1. parsl/__init__.py +9 -10
  2. parsl/addresses.py +29 -7
  3. parsl/app/app.py +7 -8
  4. parsl/app/bash.py +15 -8
  5. parsl/app/errors.py +10 -13
  6. parsl/app/futures.py +8 -10
  7. parsl/app/python.py +2 -1
  8. parsl/benchmark/perf.py +2 -1
  9. parsl/concurrent/__init__.py +2 -2
  10. parsl/config.py +57 -10
  11. parsl/configs/ASPIRE1.py +6 -5
  12. parsl/configs/Azure.py +9 -8
  13. parsl/configs/bridges.py +6 -4
  14. parsl/configs/cc_in2p3.py +3 -3
  15. parsl/configs/ec2.py +3 -1
  16. parsl/configs/expanse.py +4 -3
  17. parsl/configs/frontera.py +3 -4
  18. parsl/configs/htex_local.py +3 -4
  19. parsl/configs/illinoiscluster.py +3 -1
  20. parsl/configs/improv.py +34 -0
  21. parsl/configs/kubernetes.py +4 -3
  22. parsl/configs/local_threads.py +5 -1
  23. parsl/configs/midway.py +5 -3
  24. parsl/configs/osg.py +4 -2
  25. parsl/configs/polaris.py +4 -2
  26. parsl/configs/stampede2.py +6 -5
  27. parsl/configs/summit.py +3 -3
  28. parsl/configs/toss3_llnl.py +4 -3
  29. parsl/configs/vineex_local.py +6 -4
  30. parsl/configs/wqex_local.py +5 -3
  31. parsl/curvezmq.py +4 -0
  32. parsl/data_provider/data_manager.py +4 -3
  33. parsl/data_provider/file_noop.py +1 -2
  34. parsl/data_provider/files.py +3 -3
  35. parsl/data_provider/ftp.py +1 -3
  36. parsl/data_provider/globus.py +7 -6
  37. parsl/data_provider/http.py +2 -2
  38. parsl/data_provider/rsync.py +1 -1
  39. parsl/data_provider/staging.py +2 -2
  40. parsl/data_provider/zip.py +135 -0
  41. parsl/dataflow/dependency_resolvers.py +115 -0
  42. parsl/dataflow/dflow.py +262 -224
  43. parsl/dataflow/errors.py +3 -5
  44. parsl/dataflow/futures.py +27 -14
  45. parsl/dataflow/memoization.py +5 -5
  46. parsl/dataflow/rundirs.py +5 -6
  47. parsl/dataflow/taskrecord.py +4 -5
  48. parsl/executors/__init__.py +4 -2
  49. parsl/executors/base.py +45 -15
  50. parsl/executors/errors.py +13 -0
  51. parsl/executors/execute_task.py +37 -0
  52. parsl/executors/flux/execute_parsl_task.py +3 -3
  53. parsl/executors/flux/executor.py +18 -19
  54. parsl/executors/flux/flux_instance_manager.py +26 -27
  55. parsl/executors/high_throughput/errors.py +43 -3
  56. parsl/executors/high_throughput/executor.py +316 -282
  57. parsl/executors/high_throughput/interchange.py +158 -167
  58. parsl/executors/high_throughput/manager_record.py +5 -0
  59. parsl/executors/high_throughput/manager_selector.py +55 -0
  60. parsl/executors/high_throughput/monitoring_info.py +2 -1
  61. parsl/executors/high_throughput/mpi_executor.py +113 -0
  62. parsl/executors/high_throughput/mpi_prefix_composer.py +10 -11
  63. parsl/executors/high_throughput/mpi_resource_management.py +6 -17
  64. parsl/executors/high_throughput/probe.py +9 -7
  65. parsl/executors/high_throughput/process_worker_pool.py +115 -77
  66. parsl/executors/high_throughput/zmq_pipes.py +81 -23
  67. parsl/executors/radical/executor.py +130 -79
  68. parsl/executors/radical/rpex_resources.py +17 -15
  69. parsl/executors/radical/rpex_worker.py +4 -3
  70. parsl/executors/status_handling.py +157 -51
  71. parsl/executors/taskvine/__init__.py +1 -1
  72. parsl/executors/taskvine/errors.py +1 -1
  73. parsl/executors/taskvine/exec_parsl_function.py +2 -2
  74. parsl/executors/taskvine/executor.py +41 -57
  75. parsl/executors/taskvine/factory.py +1 -1
  76. parsl/executors/taskvine/factory_config.py +1 -1
  77. parsl/executors/taskvine/manager.py +18 -13
  78. parsl/executors/taskvine/manager_config.py +9 -5
  79. parsl/executors/threads.py +6 -6
  80. parsl/executors/workqueue/errors.py +1 -1
  81. parsl/executors/workqueue/exec_parsl_function.py +6 -5
  82. parsl/executors/workqueue/executor.py +64 -63
  83. parsl/executors/workqueue/parsl_coprocess.py +1 -1
  84. parsl/jobs/error_handlers.py +2 -2
  85. parsl/jobs/job_status_poller.py +30 -113
  86. parsl/jobs/states.py +7 -2
  87. parsl/jobs/strategy.py +43 -31
  88. parsl/launchers/__init__.py +12 -3
  89. parsl/launchers/errors.py +1 -1
  90. parsl/launchers/launchers.py +6 -12
  91. parsl/log_utils.py +9 -6
  92. parsl/monitoring/db_manager.py +59 -95
  93. parsl/monitoring/errors.py +6 -0
  94. parsl/monitoring/monitoring.py +87 -356
  95. parsl/monitoring/queries/pandas.py +1 -2
  96. parsl/monitoring/radios/base.py +13 -0
  97. parsl/monitoring/radios/filesystem.py +52 -0
  98. parsl/monitoring/radios/htex.py +57 -0
  99. parsl/monitoring/radios/multiprocessing.py +17 -0
  100. parsl/monitoring/radios/udp.py +56 -0
  101. parsl/monitoring/radios/zmq.py +17 -0
  102. parsl/monitoring/remote.py +33 -37
  103. parsl/monitoring/router.py +212 -0
  104. parsl/monitoring/types.py +5 -6
  105. parsl/monitoring/visualization/app.py +4 -2
  106. parsl/monitoring/visualization/models.py +0 -1
  107. parsl/monitoring/visualization/plots/default/workflow_plots.py +11 -4
  108. parsl/monitoring/visualization/plots/default/workflow_resource_plots.py +1 -0
  109. parsl/monitoring/visualization/utils.py +0 -1
  110. parsl/monitoring/visualization/views.py +16 -8
  111. parsl/multiprocessing.py +0 -1
  112. parsl/process_loggers.py +1 -2
  113. parsl/providers/__init__.py +8 -17
  114. parsl/providers/aws/aws.py +2 -3
  115. parsl/providers/azure/azure.py +4 -5
  116. parsl/providers/base.py +2 -18
  117. parsl/providers/cluster_provider.py +4 -12
  118. parsl/providers/condor/condor.py +7 -17
  119. parsl/providers/errors.py +2 -2
  120. parsl/providers/googlecloud/googlecloud.py +2 -1
  121. parsl/providers/grid_engine/grid_engine.py +5 -14
  122. parsl/providers/kubernetes/kube.py +80 -40
  123. parsl/providers/local/local.py +13 -26
  124. parsl/providers/lsf/lsf.py +5 -23
  125. parsl/providers/pbspro/pbspro.py +5 -17
  126. parsl/providers/slurm/slurm.py +81 -39
  127. parsl/providers/torque/torque.py +3 -14
  128. parsl/serialize/__init__.py +8 -3
  129. parsl/serialize/base.py +1 -2
  130. parsl/serialize/concretes.py +5 -4
  131. parsl/serialize/facade.py +3 -3
  132. parsl/serialize/proxystore.py +3 -2
  133. parsl/tests/__init__.py +1 -1
  134. parsl/tests/configs/azure_single_node.py +4 -5
  135. parsl/tests/configs/bridges.py +3 -2
  136. parsl/tests/configs/cc_in2p3.py +1 -3
  137. parsl/tests/configs/comet.py +2 -1
  138. parsl/tests/configs/ec2_single_node.py +1 -2
  139. parsl/tests/configs/ec2_spot.py +1 -2
  140. parsl/tests/configs/flux_local.py +11 -0
  141. parsl/tests/configs/frontera.py +2 -3
  142. parsl/tests/configs/htex_local.py +3 -5
  143. parsl/tests/configs/htex_local_alternate.py +11 -15
  144. parsl/tests/configs/htex_local_intask_staging.py +5 -9
  145. parsl/tests/configs/htex_local_rsync_staging.py +4 -8
  146. parsl/tests/configs/local_radical.py +1 -3
  147. parsl/tests/configs/local_radical_mpi.py +2 -2
  148. parsl/tests/configs/local_threads_checkpoint_periodic.py +8 -10
  149. parsl/tests/configs/local_threads_monitoring.py +0 -1
  150. parsl/tests/configs/midway.py +2 -2
  151. parsl/tests/configs/nscc_singapore.py +3 -3
  152. parsl/tests/configs/osg_htex.py +1 -1
  153. parsl/tests/configs/petrelkube.py +3 -2
  154. parsl/tests/configs/slurm_local.py +24 -0
  155. parsl/tests/configs/summit.py +1 -0
  156. parsl/tests/configs/taskvine_ex.py +4 -7
  157. parsl/tests/configs/user_opts.py +2 -8
  158. parsl/tests/configs/workqueue_ex.py +4 -6
  159. parsl/tests/conftest.py +27 -13
  160. parsl/tests/integration/test_stress/test_python_simple.py +3 -4
  161. parsl/tests/integration/test_stress/test_python_threads.py +3 -5
  162. parsl/tests/manual_tests/htex_local.py +4 -6
  163. parsl/tests/manual_tests/test_basic.py +1 -0
  164. parsl/tests/manual_tests/test_log_filter.py +3 -1
  165. parsl/tests/manual_tests/test_memory_limits.py +6 -8
  166. parsl/tests/manual_tests/test_regression_220.py +2 -1
  167. parsl/tests/manual_tests/test_udp_simple.py +4 -4
  168. parsl/tests/manual_tests/test_worker_count.py +3 -2
  169. parsl/tests/scaling_tests/htex_local.py +2 -4
  170. parsl/tests/scaling_tests/test_scale.py +0 -9
  171. parsl/tests/scaling_tests/vineex_condor.py +1 -2
  172. parsl/tests/scaling_tests/vineex_local.py +1 -2
  173. parsl/tests/site_tests/site_config_selector.py +1 -6
  174. parsl/tests/site_tests/test_provider.py +4 -2
  175. parsl/tests/site_tests/test_site.py +2 -0
  176. parsl/tests/sites/test_affinity.py +7 -7
  177. parsl/tests/sites/test_dynamic_executor.py +3 -4
  178. parsl/tests/sites/test_ec2.py +3 -2
  179. parsl/tests/sites/test_worker_info.py +4 -5
  180. parsl/tests/test_aalst_patterns.py +0 -1
  181. parsl/tests/test_bash_apps/test_apptimeout.py +2 -2
  182. parsl/tests/test_bash_apps/test_basic.py +10 -4
  183. parsl/tests/test_bash_apps/test_error_codes.py +5 -7
  184. parsl/tests/test_bash_apps/test_inputs_default.py +25 -0
  185. parsl/tests/test_bash_apps/test_kwarg_storage.py +1 -1
  186. parsl/tests/test_bash_apps/test_memoize.py +2 -8
  187. parsl/tests/test_bash_apps/test_memoize_ignore_args.py +9 -14
  188. parsl/tests/test_bash_apps/test_memoize_ignore_args_regr.py +9 -14
  189. parsl/tests/test_bash_apps/test_multiline.py +1 -1
  190. parsl/tests/test_bash_apps/test_pipeline.py +1 -1
  191. parsl/tests/test_bash_apps/test_std_uri.py +123 -0
  192. parsl/tests/test_bash_apps/test_stdout.py +33 -8
  193. parsl/tests/test_callables.py +2 -2
  194. parsl/tests/test_checkpointing/test_periodic.py +21 -39
  195. parsl/tests/test_checkpointing/test_python_checkpoint_1.py +1 -0
  196. parsl/tests/test_checkpointing/test_python_checkpoint_2.py +2 -2
  197. parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -1
  198. parsl/tests/test_checkpointing/test_regression_239.py +1 -1
  199. parsl/tests/test_checkpointing/test_task_exit.py +2 -3
  200. parsl/tests/test_docs/test_from_slides.py +5 -2
  201. parsl/tests/test_docs/test_kwargs.py +4 -1
  202. parsl/tests/test_docs/test_tutorial_1.py +1 -2
  203. parsl/tests/test_docs/test_workflow1.py +2 -2
  204. parsl/tests/test_docs/test_workflow2.py +0 -1
  205. parsl/tests/test_error_handling/test_rand_fail.py +2 -2
  206. parsl/tests/test_error_handling/test_resource_spec.py +10 -12
  207. parsl/tests/test_error_handling/test_retries.py +6 -16
  208. parsl/tests/test_error_handling/test_retry_handler.py +1 -0
  209. parsl/tests/test_error_handling/test_retry_handler_failure.py +2 -1
  210. parsl/tests/test_error_handling/test_serialization_fail.py +1 -1
  211. parsl/tests/test_error_handling/test_wrap_with_logs.py +1 -0
  212. parsl/tests/test_execute_task.py +29 -0
  213. parsl/tests/test_flux.py +1 -1
  214. parsl/tests/test_htex/test_basic.py +2 -3
  215. parsl/tests/test_htex/test_block_manager_selector_unit.py +20 -0
  216. parsl/tests/test_htex/test_command_client_timeout.py +66 -0
  217. parsl/tests/test_htex/test_connected_blocks.py +3 -2
  218. parsl/tests/test_htex/test_cpu_affinity_explicit.py +6 -10
  219. parsl/tests/test_htex/test_disconnected_blocks.py +6 -5
  220. parsl/tests/test_htex/test_disconnected_blocks_failing_provider.py +71 -0
  221. parsl/tests/test_htex/test_drain.py +79 -0
  222. parsl/tests/test_htex/test_htex.py +51 -25
  223. parsl/tests/test_htex/test_manager_failure.py +0 -1
  224. parsl/tests/test_htex/test_manager_selector_by_block.py +51 -0
  225. parsl/tests/test_htex/test_managers_command.py +36 -0
  226. parsl/tests/test_htex/test_missing_worker.py +2 -12
  227. parsl/tests/test_htex/test_multiple_disconnected_blocks.py +9 -9
  228. parsl/tests/test_htex/test_resource_spec_validation.py +45 -0
  229. parsl/tests/test_htex/test_zmq_binding.py +29 -8
  230. parsl/tests/test_monitoring/test_app_names.py +86 -0
  231. parsl/tests/test_monitoring/test_basic.py +73 -25
  232. parsl/tests/test_monitoring/test_db_locks.py +6 -4
  233. parsl/tests/test_monitoring/test_fuzz_zmq.py +19 -8
  234. parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +80 -0
  235. parsl/tests/test_monitoring/test_incomplete_futures.py +5 -4
  236. parsl/tests/test_monitoring/test_memoization_representation.py +4 -2
  237. parsl/tests/test_monitoring/test_stdouterr.py +134 -0
  238. parsl/tests/test_monitoring/test_viz_colouring.py +1 -0
  239. parsl/tests/test_mpi_apps/test_bad_mpi_config.py +33 -26
  240. parsl/tests/test_mpi_apps/test_mpi_mode_enabled.py +28 -11
  241. parsl/tests/test_mpi_apps/test_mpi_prefix.py +4 -4
  242. parsl/tests/test_mpi_apps/test_mpi_scheduler.py +7 -2
  243. parsl/tests/test_mpi_apps/test_mpiex.py +64 -0
  244. parsl/tests/test_mpi_apps/test_resource_spec.py +42 -49
  245. parsl/tests/test_providers/test_kubernetes_provider.py +102 -0
  246. parsl/tests/test_providers/test_local_provider.py +3 -132
  247. parsl/tests/test_providers/test_pbspro_template.py +2 -3
  248. parsl/tests/test_providers/test_slurm_template.py +2 -3
  249. parsl/tests/test_providers/test_submiterror_deprecation.py +2 -1
  250. parsl/tests/test_python_apps/test_context_manager.py +128 -0
  251. parsl/tests/test_python_apps/test_dep_standard_futures.py +2 -1
  252. parsl/tests/test_python_apps/test_dependencies_deep.py +59 -0
  253. parsl/tests/test_python_apps/test_fail.py +0 -25
  254. parsl/tests/test_python_apps/test_futures.py +2 -1
  255. parsl/tests/test_python_apps/test_inputs_default.py +22 -0
  256. parsl/tests/test_python_apps/test_join.py +0 -1
  257. parsl/tests/test_python_apps/test_lifted.py +11 -7
  258. parsl/tests/test_python_apps/test_memoize_bad_id_for_memo.py +1 -0
  259. parsl/tests/test_python_apps/test_outputs.py +1 -1
  260. parsl/tests/test_python_apps/test_pluggable_future_resolution.py +161 -0
  261. parsl/tests/test_radical/test_mpi_funcs.py +1 -2
  262. parsl/tests/test_regression/test_1480.py +2 -1
  263. parsl/tests/test_regression/test_1653.py +2 -1
  264. parsl/tests/test_regression/test_226.py +1 -0
  265. parsl/tests/test_regression/test_2652.py +1 -0
  266. parsl/tests/test_regression/test_69a.py +0 -1
  267. parsl/tests/test_regression/test_854.py +4 -2
  268. parsl/tests/test_regression/test_97_parallelism_0.py +1 -2
  269. parsl/tests/test_regression/test_98.py +0 -1
  270. parsl/tests/test_scaling/test_block_error_handler.py +9 -4
  271. parsl/tests/test_scaling/test_regression_1621.py +11 -15
  272. parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +84 -0
  273. parsl/tests/test_scaling/test_regression_3696_oscillation.py +103 -0
  274. parsl/tests/test_scaling/test_scale_down.py +2 -5
  275. parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +6 -18
  276. parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +71 -0
  277. parsl/tests/test_scaling/test_shutdown_scalein.py +73 -0
  278. parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +90 -0
  279. parsl/tests/test_serialization/test_2555_caching_deserializer.py +1 -1
  280. parsl/tests/test_serialization/test_3495_deserialize_managerlost.py +47 -0
  281. parsl/tests/test_serialization/test_basic.py +2 -1
  282. parsl/tests/test_serialization/test_htex_code_cache.py +3 -4
  283. parsl/tests/test_serialization/test_pack_resource_spec.py +2 -1
  284. parsl/tests/test_serialization/test_proxystore_configured.py +10 -6
  285. parsl/tests/test_serialization/test_proxystore_impl.py +5 -3
  286. parsl/tests/test_shutdown/test_kill_monitoring.py +64 -0
  287. parsl/tests/test_staging/staging_provider.py +2 -2
  288. parsl/tests/test_staging/test_1316.py +3 -4
  289. parsl/tests/test_staging/test_docs_1.py +2 -1
  290. parsl/tests/test_staging/test_docs_2.py +2 -1
  291. parsl/tests/test_staging/test_elaborate_noop_file.py +2 -3
  292. parsl/tests/{test_data → test_staging}/test_file.py +6 -6
  293. parsl/tests/{test_data → test_staging}/test_output_chain_filenames.py +3 -0
  294. parsl/tests/test_staging/test_staging_ftp.py +1 -0
  295. parsl/tests/test_staging/test_staging_https.py +5 -2
  296. parsl/tests/test_staging/test_staging_stdout.py +64 -0
  297. parsl/tests/test_staging/test_zip_in.py +39 -0
  298. parsl/tests/test_staging/test_zip_out.py +110 -0
  299. parsl/tests/test_staging/test_zip_to_zip.py +41 -0
  300. parsl/tests/test_summary.py +2 -2
  301. parsl/tests/test_thread_parallelism.py +0 -1
  302. parsl/tests/test_threads/test_configs.py +1 -2
  303. parsl/tests/test_threads/test_lazy_errors.py +2 -2
  304. parsl/tests/test_utils/test_execute_wait.py +35 -0
  305. parsl/tests/test_utils/test_sanitize_dns.py +76 -0
  306. parsl/tests/unit/test_address.py +20 -0
  307. parsl/tests/unit/test_file.py +99 -0
  308. parsl/tests/unit/test_usage_tracking.py +66 -0
  309. parsl/usage_tracking/api.py +65 -0
  310. parsl/usage_tracking/levels.py +6 -0
  311. parsl/usage_tracking/usage.py +104 -62
  312. parsl/utils.py +139 -6
  313. parsl/version.py +1 -1
  314. {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/exec_parsl_function.py +6 -5
  315. parsl-2025.1.13.data/scripts/interchange.py +649 -0
  316. {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/process_worker_pool.py +115 -77
  317. parsl-2025.1.13.dist-info/METADATA +96 -0
  318. parsl-2025.1.13.dist-info/RECORD +462 -0
  319. {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/WHEEL +1 -1
  320. parsl/channels/__init__.py +0 -7
  321. parsl/channels/base.py +0 -141
  322. parsl/channels/errors.py +0 -113
  323. parsl/channels/local/local.py +0 -164
  324. parsl/channels/oauth_ssh/oauth_ssh.py +0 -110
  325. parsl/channels/ssh/ssh.py +0 -276
  326. parsl/channels/ssh_il/__init__.py +0 -0
  327. parsl/channels/ssh_il/ssh_il.py +0 -74
  328. parsl/configs/ad_hoc.py +0 -35
  329. parsl/executors/radical/rpex_master.py +0 -42
  330. parsl/monitoring/radios.py +0 -175
  331. parsl/providers/ad_hoc/__init__.py +0 -0
  332. parsl/providers/ad_hoc/ad_hoc.py +0 -248
  333. parsl/providers/cobalt/__init__.py +0 -0
  334. parsl/providers/cobalt/cobalt.py +0 -236
  335. parsl/providers/cobalt/template.py +0 -17
  336. parsl/tests/configs/ad_hoc_cluster_htex.py +0 -35
  337. parsl/tests/configs/cooley_htex.py +0 -37
  338. parsl/tests/configs/htex_ad_hoc_cluster.py +0 -28
  339. parsl/tests/configs/local_adhoc.py +0 -18
  340. parsl/tests/configs/swan_htex.py +0 -43
  341. parsl/tests/configs/theta.py +0 -37
  342. parsl/tests/integration/test_channels/__init__.py +0 -0
  343. parsl/tests/integration/test_channels/test_channels.py +0 -17
  344. parsl/tests/integration/test_channels/test_local_channel.py +0 -42
  345. parsl/tests/integration/test_channels/test_scp_1.py +0 -45
  346. parsl/tests/integration/test_channels/test_ssh_1.py +0 -40
  347. parsl/tests/integration/test_channels/test_ssh_errors.py +0 -46
  348. parsl/tests/integration/test_channels/test_ssh_file_transport.py +0 -41
  349. parsl/tests/integration/test_channels/test_ssh_interactive.py +0 -24
  350. parsl/tests/manual_tests/test_ad_hoc_htex.py +0 -48
  351. parsl/tests/manual_tests/test_fan_in_out_htex_remote.py +0 -88
  352. parsl/tests/manual_tests/test_oauth_ssh.py +0 -13
  353. parsl/tests/sites/test_local_adhoc.py +0 -61
  354. parsl/tests/test_channels/__init__.py +0 -0
  355. parsl/tests/test_channels/test_large_output.py +0 -22
  356. parsl/tests/test_data/__init__.py +0 -0
  357. parsl/tests/test_mpi_apps/test_mpi_mode_disabled.py +0 -51
  358. parsl/tests/test_providers/test_cobalt_deprecation_warning.py +0 -16
  359. parsl-2024.3.11.dist-info/METADATA +0 -98
  360. parsl-2024.3.11.dist-info/RECORD +0 -447
  361. parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
  362. parsl/{channels/oauth_ssh → tests/test_shutdown}/__init__.py +0 -0
  363. parsl/tests/{test_data → test_staging}/test_file_apps.py +0 -0
  364. parsl/tests/{test_data → test_staging}/test_file_staging.py +0 -0
  365. parsl/{channels/ssh → tests/unit}/__init__.py +0 -0
  366. {parsl-2024.3.11.data → parsl-2025.1.13.data}/scripts/parsl_coprocess.py +1 -1
  367. {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/LICENSE +0 -0
  368. {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/entry_points.txt +0 -0
  369. {parsl-2024.3.11.dist-info → parsl-2025.1.13.dist-info}/top_level.txt +0 -0
parsl/monitoring/monitoring.py

@@ -1,29 +1,25 @@
  from __future__ import annotations

+ import logging
+ import multiprocessing.synchronize as ms
  import os
- import socket
- import time
  import pickle
- import logging
- import typeguard
- import zmq
-
  import queue
+ import time
+ from multiprocessing import Event
+ from multiprocessing.queues import Queue
+ from typing import TYPE_CHECKING, Literal, Optional, Tuple, Union, cast

- import parsl.monitoring.remote
+ import typeguard

+ from parsl.log_utils import set_file_logger
+ from parsl.monitoring.errors import MonitoringHubStartError
+ from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
+ from parsl.monitoring.router import router_starter
+ from parsl.monitoring.types import TaggedMonitoringMessage
  from parsl.multiprocessing import ForkProcess, SizedQueue
- from multiprocessing import Process
- from multiprocessing.queues import Queue
- from parsl.utils import RepresentationMixin
  from parsl.process_loggers import wrap_with_logs
- from parsl.utils import setproctitle
-
- from parsl.serialize import deserialize
-
- from parsl.monitoring.message_type import MessageType
- from parsl.monitoring.types import AddressedMonitoringMessage, TaggedMonitoringMessage
- from typing import cast, Any, Callable, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING
+ from parsl.utils import RepresentationMixin, setproctitle

  _db_manager_excepts: Optional[Exception]

@@ -38,40 +34,6 @@ else:
  logger = logging.getLogger(__name__)


- def start_file_logger(filename: str, name: str = 'monitoring', level: int = logging.DEBUG, format_string: Optional[str] = None) -> logging.Logger:
- """Add a stream log handler.
-
- Parameters
- ---------
-
- filename: string
- Name of the file to write logs to. Required.
- name: string
- Logger name.
- level: logging.LEVEL
- Set the logging level. Default=logging.DEBUG
- - format_string (string): Set the format string
- format_string: string
- Format string to use.
-
- Returns
- -------
- None.
- """
- if format_string is None:
- format_string = "%(asctime)s.%(msecs)03d %(name)s:%(lineno)d [%(levelname)s] %(message)s"
-
- logger = logging.getLogger(name)
- logger.setLevel(level)
- logger.propagate = False
- handler = logging.FileHandler(filename)
- handler.setLevel(level)
- formatter = logging.Formatter(format_string, datefmt='%Y-%m-%d %H:%M:%S')
- handler.setFormatter(formatter)
- logger.addHandler(handler)
- return logger
-
-
  @typeguard.typechecked
  class MonitoringHub(RepresentationMixin):
  def __init__(self,
@@ -79,13 +41,9 @@ class MonitoringHub(RepresentationMixin):
  hub_port: Optional[int] = None,
  hub_port_range: Tuple[int, int] = (55050, 56000),

- client_address: str = "127.0.0.1",
- client_port_range: Tuple[int, int] = (55000, 56000),
-
  workflow_name: Optional[str] = None,
  workflow_version: Optional[str] = None,
  logging_endpoint: Optional[str] = None,
- logdir: Optional[str] = None,
  monitoring_debug: bool = False,
  resource_monitoring_enabled: bool = True,
  resource_monitoring_interval: float = 30): # in seconds
@@ -106,11 +64,6 @@
  to deliver monitoring messages to the monitoring router.
  Note that despite the similar name, this is not related to hub_port.
  Default: (55050, 56000)
- client_address : str
- The ip address at which the dfk will be able to reach Hub. Default: "127.0.0.1"
- client_port_range : tuple(int, int)
- The MonitoringHub picks ports at random from the range which will be used by Hub.
- Default: (55000, 56000)
  workflow_name : str
  The name for the workflow. Default to the name of the parsl script
  workflow_version : str
@@ -119,8 +72,6 @@
  The database connection url for monitoring to log the information.
  These URLs follow RFC-1738, and can include username, password, hostname, database name.
  Default: sqlite, in the configured run_dir.
- logdir : str
- Parsl log directory paths. Logs and temp files go here. Default: '.'
  monitoring_debug : Bool
  Enable monitoring debug logging. Default: False
  resource_monitoring_enabled : boolean
@@ -134,26 +85,14 @@
  Default: 30 seconds
  """

- self.logger = logger
-
- # Any is used to disable typechecking on uses of _dfk_channel,
- # because it is used in the code as if it points to a channel, but
- # the static type is that it can also be None. The code relies on
- # .start() being called and initialising this to a real channel.
- self._dfk_channel = None # type: Any
-
  if _db_manager_excepts:
  raise _db_manager_excepts

- self.client_address = client_address
- self.client_port_range = client_port_range
-
  self.hub_address = hub_address
  self.hub_port = hub_port
  self.hub_port_range = hub_port_range

  self.logging_endpoint = logging_endpoint
- self.logdir = logdir
  self.monitoring_debug = monitoring_debug

  self.workflow_name = workflow_name
@@ -162,19 +101,15 @@ class MonitoringHub(RepresentationMixin):
  self.resource_monitoring_enabled = resource_monitoring_enabled
  self.resource_monitoring_interval = resource_monitoring_interval

- def start(self, run_id: str, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> int:
+ def start(self, dfk_run_dir: str, config_run_dir: Union[str, os.PathLike]) -> None:

- if self.logdir is None:
- self.logdir = "."
+ logger.debug("Starting MonitoringHub")

  if self.logging_endpoint is None:
  self.logging_endpoint = f"sqlite:///{os.fspath(config_run_dir)}/monitoring.db"

- os.makedirs(self.logdir, exist_ok=True)
-
- # Initialize the ZMQ pipe to the Parsl Client
+ os.makedirs(dfk_run_dir, exist_ok=True)

- self.logger.debug("Initializing ZMQ Pipes to client")
  self.monitoring_hub_active = True

  # This annotation is incompatible with typeguard 4.x instrumentation
@@ -195,26 +130,22 @@
  self.exception_q: Queue[Tuple[str, str]]
  self.exception_q = SizedQueue(maxsize=10)

- self.priority_msgs: Queue[Tuple[Any, int]]
- self.priority_msgs = SizedQueue()
-
- self.resource_msgs: Queue[AddressedMonitoringMessage]
+ self.resource_msgs: Queue[Union[TaggedMonitoringMessage, Literal["STOP"]]]
  self.resource_msgs = SizedQueue()

- self.node_msgs: Queue[AddressedMonitoringMessage]
- self.node_msgs = SizedQueue()
-
- self.block_msgs: Queue[AddressedMonitoringMessage]
- self.block_msgs = SizedQueue()
+ self.router_exit_event: ms.Event
+ self.router_exit_event = Event()

  self.router_proc = ForkProcess(target=router_starter,
- args=(comm_q, self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs),
- kwargs={"hub_address": self.hub_address,
- "hub_port": self.hub_port,
- "hub_port_range": self.hub_port_range,
- "logdir": self.logdir,
+ kwargs={"comm_q": comm_q,
+ "exception_q": self.exception_q,
+ "resource_msgs": self.resource_msgs,
+ "exit_event": self.router_exit_event,
+ "hub_address": self.hub_address,
+ "udp_port": self.hub_port,
+ "zmq_port_range": self.hub_port_range,
+ "run_dir": dfk_run_dir,
  "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
- "run_id": run_id
  },
  name="Monitoring-Router-Process",
  daemon=True,
@@ -222,8 +153,8 @@ class MonitoringHub(RepresentationMixin):
  self.router_proc.start()

  self.dbm_proc = ForkProcess(target=dbm_starter,
- args=(self.exception_q, self.priority_msgs, self.node_msgs, self.block_msgs, self.resource_msgs,),
- kwargs={"logdir": self.logdir,
+ args=(self.exception_q, self.resource_msgs,),
+ kwargs={"run_dir": dfk_run_dir,
  "logging_level": logging.DEBUG if self.monitoring_debug else logging.INFO,
  "db_url": self.logging_endpoint,
  },
@@ -231,122 +162,108 @@
  daemon=True,
  )
  self.dbm_proc.start()
- self.logger.info("Started the router process {} and DBM process {}".format(self.router_proc.pid, self.dbm_proc.pid))
+ logger.info("Started the router process %s and DBM process %s", self.router_proc.pid, self.dbm_proc.pid)

- self.filesystem_proc = Process(target=filesystem_receiver,
- args=(self.logdir, self.resource_msgs, dfk_run_dir),
- name="Monitoring-Filesystem-Process",
- daemon=True
- )
+ self.filesystem_proc = ForkProcess(target=filesystem_receiver,
+ args=(self.resource_msgs, dfk_run_dir),
+ name="Monitoring-Filesystem-Process",
+ daemon=True
+ )
  self.filesystem_proc.start()
- self.logger.info(f"Started filesystem radio receiver process {self.filesystem_proc.pid}")
+ logger.info("Started filesystem radio receiver process %s", self.filesystem_proc.pid)
+
+ self.radio = MultiprocessingQueueRadioSender(self.resource_msgs)

  try:
  comm_q_result = comm_q.get(block=True, timeout=120)
+ comm_q.close()
+ comm_q.join_thread()
  except queue.Empty:
- self.logger.error("Hub has not completed initialization in 120s. Aborting")
- raise Exception("Hub failed to start")
+ logger.error("Hub has not completed initialization in 120s. Aborting")
+ raise MonitoringHubStartError()

  if isinstance(comm_q_result, str):
- self.logger.error(f"MonitoringRouter sent an error message: {comm_q_result}")
+ logger.error("MonitoringRouter sent an error message: %s", comm_q_result)
  raise RuntimeError(f"MonitoringRouter failed to start: {comm_q_result}")

- udp_port, ic_port = comm_q_result
+ udp_port, zmq_port = comm_q_result

  self.monitoring_hub_url = "udp://{}:{}".format(self.hub_address, udp_port)

- context = zmq.Context()
- self.dfk_channel_timeout = 10000 # in milliseconds
- self._dfk_channel = context.socket(zmq.DEALER)
- self._dfk_channel.setsockopt(zmq.LINGER, 0)
- self._dfk_channel.set_hwm(0)
- self._dfk_channel.setsockopt(zmq.SNDTIMEO, self.dfk_channel_timeout)
- self._dfk_channel.connect("tcp://{}:{}".format(self.hub_address, ic_port))
-
- self.logger.info("Monitoring Hub initialized")
+ logger.info("Monitoring Hub initialized")

- return ic_port
+ self.hub_zmq_port = zmq_port

- # TODO: tighten the Any message format
- def send(self, mtype: MessageType, message: Any) -> None:
- self.logger.debug("Sending message type {}".format(mtype))
- try:
- self._dfk_channel.send_pyobj((mtype, message))
- except zmq.Again:
- self.logger.exception(
- "The monitoring message sent from DFK to router timed-out after {}ms".format(self.dfk_channel_timeout))
+ def send(self, message: TaggedMonitoringMessage) -> None:
+ logger.debug("Sending message type %s", message[0])
+ self.radio.send(message)

  def close(self) -> None:
- self.logger.info("Terminating Monitoring Hub")
+ logger.info("Terminating Monitoring Hub")
  exception_msgs = []
  while True:
  try:
  exception_msgs.append(self.exception_q.get(block=False))
- self.logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
+ logger.error("There was a queued exception (Either router or DBM process got exception much earlier?)")
  except queue.Empty:
  break
- if self._dfk_channel and self.monitoring_hub_active:
+ if self.monitoring_hub_active:
  self.monitoring_hub_active = False
- self._dfk_channel.close()
  if exception_msgs:
  for exception_msg in exception_msgs:
- self.logger.error(
- "{} process delivered an exception: {}. Terminating all monitoring processes immediately.".format(
- exception_msg[0],
- exception_msg[1]
- )
+ logger.error(
+ "%s process delivered an exception: %s. Terminating all monitoring processes immediately.",
+ exception_msg[0],
+ exception_msg[1]
  )
  self.router_proc.terminate()
  self.dbm_proc.terminate()
  self.filesystem_proc.terminate()
- self.logger.info("Waiting for router to terminate")
+ logger.info("Setting router termination event")
+ self.router_exit_event.set()
+ logger.info("Waiting for router to terminate")
  self.router_proc.join()
- self.logger.debug("Finished waiting for router termination")
+ self.router_proc.close()
+ logger.debug("Finished waiting for router termination")
  if len(exception_msgs) == 0:
- self.logger.debug("Sending STOP to DBM")
- self.priority_msgs.put(("STOP", 0))
+ logger.debug("Sending STOP to DBM")
+ self.resource_msgs.put("STOP")
  else:
- self.logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
- self.logger.debug("Waiting for DB termination")
+ logger.debug("Not sending STOP to DBM, because there were DBM exceptions")
+ logger.debug("Waiting for DB termination")
  self.dbm_proc.join()
- self.logger.debug("Finished waiting for DBM termination")
+ self.dbm_proc.close()
+ logger.debug("Finished waiting for DBM termination")

  # should this be message based? it probably doesn't need to be if
  # we believe we've received all messages
- self.logger.info("Terminating filesystem radio receiver process")
+ logger.info("Terminating filesystem radio receiver process")
  self.filesystem_proc.terminate()
  self.filesystem_proc.join()
+ self.filesystem_proc.close()

- @staticmethod
- def monitor_wrapper(f: Any,
- args: Sequence,
- kwargs: Dict,
- try_id: int,
- task_id: int,
- monitoring_hub_url: str,
- run_id: str,
- logging_level: int,
- sleep_dur: float,
- radio_mode: str,
- monitor_resources: bool,
- run_dir: str) -> Tuple[Callable, Sequence, Dict]:
- return parsl.monitoring.remote.monitor_wrapper(f, args, kwargs, try_id, task_id, monitoring_hub_url,
- run_id, logging_level, sleep_dur, radio_mode,
- monitor_resources, run_dir)
+ logger.info("Closing monitoring multiprocessing queues")
+ self.exception_q.close()
+ self.exception_q.join_thread()
+ self.resource_msgs.close()
+ self.resource_msgs.join_thread()
+ logger.info("Closed monitoring multiprocessing queues")


  @wrap_with_logs
- def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]", run_dir: str) -> None:
- logger = start_file_logger("{}/monitoring_filesystem_radio.log".format(logdir),
- name="monitoring_filesystem_radio",
- level=logging.INFO)
+ def filesystem_receiver(q: Queue[TaggedMonitoringMessage], run_dir: str) -> None:
+ logger = set_file_logger(f"{run_dir}/monitoring_filesystem_radio.log",
+ name="monitoring_filesystem_radio",
+ level=logging.INFO)

  logger.info("Starting filesystem radio receiver")
  setproctitle("parsl: monitoring filesystem receiver")
  base_path = f"{run_dir}/monitor-fs-radio/"
  tmp_dir = f"{base_path}/tmp/"
  new_dir = f"{base_path}/new/"
- logger.debug(f"Creating new and tmp paths under {base_path}")
+ logger.debug("Creating new and tmp paths under %s", base_path)
+
+ target_radio = MultiprocessingQueueRadioSender(q)

  os.makedirs(tmp_dir, exist_ok=True)
  os.makedirs(new_dir, exist_ok=True)
@@ -357,201 +274,15 @@ def filesystem_receiver(logdir: str, q: "queue.Queue[AddressedMonitoringMessage]
  # iterate over files in new_dir
  for filename in os.listdir(new_dir):
  try:
- logger.info(f"Processing filesystem radio file {filename}")
+ logger.info("Processing filesystem radio file %s", filename)
  full_path_filename = f"{new_dir}/{filename}"
  with open(full_path_filename, "rb") as f:
- message = deserialize(f.read())
- logger.debug(f"Message received is: {message}")
+ message = pickle.load(f)
+ logger.debug("Message received is: %s", message)
  assert isinstance(message, tuple)
- q.put(cast(AddressedMonitoringMessage, message))
+ target_radio.send(cast(TaggedMonitoringMessage, message))
  os.remove(full_path_filename)
  except Exception:
- logger.exception(f"Exception processing {filename} - probably will be retried next iteration")
+ logger.exception("Exception processing %s - probably will be retried next iteration", filename)

  time.sleep(1) # whats a good time for this poll?
-
-
- class MonitoringRouter:
-
- def __init__(self,
- *,
- hub_address: str,
- hub_port: Optional[int] = None,
- hub_port_range: Tuple[int, int] = (55050, 56000),
-
- monitoring_hub_address: str = "127.0.0.1",
- logdir: str = ".",
- run_id: str,
- logging_level: int = logging.INFO,
- atexit_timeout: int = 3 # in seconds
- ):
- """ Initializes a monitoring configuration class.
-
- Parameters
- ----------
- hub_address : str
- The ip address at which the workers will be able to reach the Hub.
- hub_port : int
- The specific port at which workers will be able to reach the Hub via UDP. Default: None
- hub_port_range : tuple(int, int)
- The MonitoringHub picks ports at random from the range which will be used by Hub.
- This is overridden when the hub_port option is set. Default: (55050, 56000)
- logdir : str
- Parsl log directory paths. Logs and temp files go here. Default: '.'
- logging_level : int
- Logging level as defined in the logging module. Default: logging.INFO
- atexit_timeout : float, optional
- The amount of time in seconds to terminate the hub without receiving any messages, after the last dfk workflow message is received.
-
- """
- os.makedirs(logdir, exist_ok=True)
- self.logger = start_file_logger("{}/monitoring_router.log".format(logdir),
- name="monitoring_router",
- level=logging_level)
- self.logger.debug("Monitoring router starting")
-
- self.hub_address = hub_address
- self.atexit_timeout = atexit_timeout
- self.run_id = run_id
-
- self.loop_freq = 10.0 # milliseconds
-
- # Initialize the UDP socket
- self.sock = socket.socket(socket.AF_INET,
- socket.SOCK_DGRAM,
- socket.IPPROTO_UDP)
-
- # We are trying to bind to all interfaces with 0.0.0.0
- if not hub_port:
- self.sock.bind(('0.0.0.0', 0))
- self.hub_port = self.sock.getsockname()[1]
- else:
- self.hub_port = hub_port
- try:
- self.sock.bind(('0.0.0.0', self.hub_port))
- except Exception as e:
- raise RuntimeError(f"Could not bind to hub_port {hub_port} because: {e}")
- self.sock.settimeout(self.loop_freq / 1000)
- self.logger.info("Initialized the UDP socket on 0.0.0.0:{}".format(self.hub_port))
-
- self._context = zmq.Context()
- self.ic_channel = self._context.socket(zmq.DEALER)
- self.ic_channel.setsockopt(zmq.LINGER, 0)
- self.ic_channel.set_hwm(0)
- self.ic_channel.RCVTIMEO = int(self.loop_freq) # in milliseconds
- self.logger.debug("hub_address: {}. hub_port_range {}".format(hub_address, hub_port_range))
- self.ic_port = self.ic_channel.bind_to_random_port("tcp://*",
- min_port=hub_port_range[0],
- max_port=hub_port_range[1])
-
- def start(self,
- priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
- node_msgs: "queue.Queue[AddressedMonitoringMessage]",
- block_msgs: "queue.Queue[AddressedMonitoringMessage]",
- resource_msgs: "queue.Queue[AddressedMonitoringMessage]") -> None:
- try:
- router_keep_going = True
- while router_keep_going:
- try:
- data, addr = self.sock.recvfrom(2048)
- resource_msg = pickle.loads(data)
- self.logger.debug("Got UDP Message from {}: {}".format(addr, resource_msg))
- resource_msgs.put((resource_msg, addr))
- except socket.timeout:
- pass
-
- try:
- dfk_loop_start = time.time()
- while time.time() - dfk_loop_start < 1.0: # TODO make configurable
- # note that nothing checks that msg really is of the annotated type
- msg: TaggedMonitoringMessage
- msg = self.ic_channel.recv_pyobj()
-
- assert isinstance(msg, tuple), "IC Channel expects only tuples, got {}".format(msg)
- assert len(msg) >= 1, "IC Channel expects tuples of length at least 1, got {}".format(msg)
- assert len(msg) == 2, "IC Channel expects message tuples of exactly length 2, got {}".format(msg)
-
- msg_0: AddressedMonitoringMessage
- msg_0 = (msg, 0)
-
- if msg[0] == MessageType.NODE_INFO:
- msg[1]['run_id'] = self.run_id
- node_msgs.put(msg_0)
- elif msg[0] == MessageType.RESOURCE_INFO:
- resource_msgs.put(msg_0)
- elif msg[0] == MessageType.BLOCK_INFO:
- block_msgs.put(msg_0)
- elif msg[0] == MessageType.TASK_INFO:
- priority_msgs.put(msg_0)
- elif msg[0] == MessageType.WORKFLOW_INFO:
- priority_msgs.put(msg_0)
- if 'exit_now' in msg[1] and msg[1]['exit_now']:
- router_keep_going = False
- else:
- # There is a type: ignore here because if msg[0]
- # is of the correct type, this code is unreachable,
- # but there is no verification that the message
- # received from ic_channel.recv_pyobj() is actually
- # of that type.
- self.logger.error(f"Discarding message from interchange with unknown type {msg[0].value}") # type: ignore[unreachable]
- except zmq.Again:
- pass
- except Exception:
- # This will catch malformed messages. What happens if the
- # channel is broken in such a way that it always raises
- # an exception? Looping on this would maybe be the wrong
- # thing to do.
- self.logger.warning("Failure processing a ZMQ message", exc_info=True)
-
- self.logger.info("Monitoring router draining")
- last_msg_received_time = time.time()
- while time.time() - last_msg_received_time < self.atexit_timeout:
- try:
- data, addr = self.sock.recvfrom(2048)
- msg = pickle.loads(data)
- self.logger.debug("Got UDP Message from {}: {}".format(addr, msg))
- resource_msgs.put((msg, addr))
- last_msg_received_time = time.time()
- except socket.timeout:
- pass
-
- self.logger.info("Monitoring router finishing normally")
- finally:
- self.logger.info("Monitoring router finished")
-
-
- @wrap_with_logs
- def router_starter(comm_q: "queue.Queue[Union[Tuple[int, int], str]]",
- exception_q: "queue.Queue[Tuple[str, str]]",
- priority_msgs: "queue.Queue[AddressedMonitoringMessage]",
- node_msgs: "queue.Queue[AddressedMonitoringMessage]",
- block_msgs: "queue.Queue[AddressedMonitoringMessage]",
- resource_msgs: "queue.Queue[AddressedMonitoringMessage]",
-
- hub_address: str,
- hub_port: Optional[int],
- hub_port_range: Tuple[int, int],
-
- logdir: str,
- logging_level: int,
- run_id: str) -> None:
- setproctitle("parsl: monitoring router")
- try:
- router = MonitoringRouter(hub_address=hub_address,
- hub_port=hub_port,
- hub_port_range=hub_port_range,
- logdir=logdir,
- logging_level=logging_level,
- run_id=run_id)
- except Exception as e:
- logger.error("MonitoringRouter construction failed.", exc_info=True)
- comm_q.put(f"Monitoring router construction failed: {e}")
- else:
- comm_q.put((router.hub_port, router.ic_port))
-
- router.logger.info("Starting MonitoringRouter in router_starter")
- try:
- router.start(priority_msgs, node_msgs, block_msgs, resource_msgs)
- except Exception as e:
- router.logger.exception("router.start exception")
- exception_q.put(('Hub', str(e)))
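To orient readers of the MonitoringHub changes above, here is a minimal usage sketch (not part of the diff): start() no longer takes a run_id and no longer returns the ZMQ port, which is instead exposed afterwards as hub_zmq_port, and send() now takes a single tagged message that is forwarded over a multiprocessing queue radio rather than a per-DFK ZMQ DEALER socket. The run directory paths and the message payload below are illustrative values, not taken from the package.

    from parsl.monitoring.message_type import MessageType
    from parsl.monitoring.monitoring import MonitoringHub

    hub = MonitoringHub(hub_address="127.0.0.1")

    # start() forks the router, database manager and filesystem-receiver
    # processes; the router's ZMQ port is recorded on the hub object
    # rather than returned.
    hub.start(dfk_run_dir="runinfo/000", config_run_dir="runinfo/000")
    print(hub.hub_zmq_port)

    # send() takes one (MessageType, payload) tuple; the payload dict here
    # is illustrative only.
    hub.send((MessageType.WORKFLOW_INFO, {"tasks_failed_count": 0}))

    hub.close()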
parsl/monitoring/queries/pandas.py

@@ -1,7 +1,6 @@
- import pandas as pd
-
  from typing import Any

+ import pandas as pd

  # pandas can take several different types of database connection,
  # and itself exposes its connection parameters as "Any".
parsl/monitoring/radios/base.py

@@ -0,0 +1,13 @@
+ import logging
+ from abc import ABCMeta, abstractmethod
+ from typing import Optional
+
+ _db_manager_excepts: Optional[Exception]
+
+ logger = logging.getLogger(__name__)
+
+
+ class MonitoringRadioSender(metaclass=ABCMeta):
+ @abstractmethod
+ def send(self, message: object) -> None:
+ pass
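The abstract base class added above defines the send() interface that the concrete radios introduced in this release (filesystem, htex, multiprocessing, udp, zmq) implement. A minimal illustrative subclass, not shipped in the package, could look like this:

    from parsl.monitoring.radios.base import MonitoringRadioSender


    class ListRadioSender(MonitoringRadioSender):
        """Hypothetical sender that simply collects messages in memory."""

        def __init__(self) -> None:
            self.messages: list = []

        def send(self, message: object) -> None:
            self.messages.append(message)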
parsl/monitoring/radios/filesystem.py

@@ -0,0 +1,52 @@
+ import logging
+ import os
+ import pickle
+ import uuid
+
+ from parsl.monitoring.radios.base import MonitoringRadioSender
+
+ logger = logging.getLogger(__name__)
+
+
+ class FilesystemRadioSender(MonitoringRadioSender):
+ """A MonitoringRadioSender that sends messages over a shared filesystem.
+
+ The messsage directory structure is based on maildir,
+ https://en.wikipedia.org/wiki/Maildir
+
+ The writer creates a message in tmp/ and then when it is fully
+ written, moves it atomically into new/
+
+ The reader ignores tmp/ and only reads and deletes messages from
+ new/
+
+ This avoids a race condition of reading partially written messages.
+
+ This radio is likely to give higher shared filesystem load compared to
+ the UDP radio, but should be much more reliable.
+ """
+
+ def __init__(self, *, monitoring_url: str, timeout: int = 10, run_dir: str):
+ logger.info("filesystem based monitoring channel initializing")
+ self.base_path = f"{run_dir}/monitor-fs-radio/"
+ self.tmp_path = f"{self.base_path}/tmp"
+ self.new_path = f"{self.base_path}/new"
+
+ os.makedirs(self.tmp_path, exist_ok=True)
+ os.makedirs(self.new_path, exist_ok=True)
+
+ def send(self, message: object) -> None:
+ logger.info("Sending a monitoring message via filesystem")
+
+ unique_id = str(uuid.uuid4())
+
+ tmp_filename = f"{self.tmp_path}/{unique_id}"
+ new_filename = f"{self.new_path}/{unique_id}"
+ buffer = message
+
+ # this will write the message out then atomically
+ # move it into new/, so that a partially written
+ # file will never be observed in new/
+ with open(tmp_filename, "wb") as f:
+ pickle.dump(buffer, f)
+ os.rename(tmp_filename, new_filename)
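A rough sketch (not part of the diff) of the maildir-style handoff this docstring describes: the sender pickles each message into tmp/ and atomically renames it into new/, and the filesystem_receiver loop shown in the monitoring.py diff above polls new/ and forwards each message onto the multiprocessing queue. The URL, run directory and payload values are illustrative; monitoring_url is accepted by the constructor but not referenced in the code above.

    from parsl.monitoring.message_type import MessageType
    from parsl.monitoring.radios.filesystem import FilesystemRadioSender

    # run_dir must be on a filesystem shared with the process running
    # filesystem_receiver; the payload below is an illustrative example.
    sender = FilesystemRadioSender(monitoring_url="udp://127.0.0.1:55055",
                                   run_dir="runinfo/000")
    sender.send((MessageType.RESOURCE_INFO, {"psutil_process_memory_percent": 1.5}))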