slurm-sdk 0.4.5.dev0__tar.gz → 0.4.6.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/AGENTS.md +64 -0
  2. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/PKG-INFO +2 -1
  3. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/CHANGELOG.md +111 -0
  4. slurm_sdk-0.4.6.dev0/docs/explanation/callbacks_and_events.md +215 -0
  5. slurm_sdk-0.4.6.dev0/docs/explanation/container_packaging.md +87 -0
  6. slurm_sdk-0.4.6.dev0/docs/how-to/container_dependencies.md +160 -0
  7. slurm_sdk-0.4.6.dev0/docs/how-to/custom-task-decorators.md +178 -0
  8. slurm_sdk-0.4.6.dev0/docs/how-to/hello_torch.md +153 -0
  9. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/how-to/index.md +1 -0
  10. slurm_sdk-0.4.6.dev0/docs/how-to/parallelization_patterns.md +173 -0
  11. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/how-to/remote-debugging.md +1 -2
  12. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/reference/api/tasks_workflows.md +2 -0
  13. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/tutorials/container_basics_hello_container.md +1 -1
  14. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/tutorials/getting_started_hello_world.md +1 -1
  15. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/tutorials/parallel-train-eval-workflow.md +5 -0
  16. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/tutorials/workflow_graph_visualization.md +0 -2
  17. slurm_sdk-0.4.6.dev0/llms.txt +559 -0
  18. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/mkdocs.yml +4 -3
  19. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/pyproject.toml +2 -1
  20. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/__init__.py +8 -3
  21. slurm_sdk-0.4.6.dev0/src/slurm/_polling.py +235 -0
  22. slurm_sdk-0.4.6.dev0/src/slurm/_serialization.py +108 -0
  23. slurm_sdk-0.4.6.dev0/src/slurm/_submission.py +461 -0
  24. slurm_sdk-0.4.6.dev0/src/slurm/_workflow.py +376 -0
  25. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/api/base.py +81 -1
  26. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/api/local.py +83 -2
  27. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/api/ssh.py +160 -35
  28. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/array_items.py +6 -6
  29. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/array_job.py +46 -28
  30. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/cli/jobs.py +51 -0
  31. slurm_sdk-0.4.6.dev0/src/slurm/cluster.py +1237 -0
  32. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/context.py +34 -0
  33. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/decorators.py +34 -12
  34. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/hello_world.py +8 -6
  35. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/job.py +255 -42
  36. slurm_sdk-0.4.6.dev0/src/slurm/packaging/__init__.py +128 -0
  37. slurm_sdk-0.4.6.dev0/src/slurm/packaging/_registry.py +192 -0
  38. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/packaging/container.py +82 -72
  39. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/rendering.py +45 -31
  40. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/runner/__init__.py +1 -1
  41. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/runner/__main__.py +1 -1
  42. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/runner/argument_loader.py +10 -11
  43. slurm_sdk-0.4.5.dev0/src/slurm/_runner_impl.py → slurm_sdk-0.4.6.dev0/src/slurm/runner/main.py +280 -184
  44. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/runner/placeholder.py +3 -4
  45. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/runner/result_saver.py +8 -7
  46. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/runtime.py +20 -2
  47. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/task.py +84 -316
  48. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/workflow.py +22 -8
  49. slurm_sdk-0.4.6.dev0/tests/helpers/cluster_factory.py +49 -0
  50. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/helpers/local_backend.py +24 -0
  51. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/integration/conftest.py +2 -0
  52. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_after_dependencies.py +6 -12
  53. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_array_jobs.py +6 -12
  54. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_backend_mock.py +2 -4
  55. slurm_sdk-0.4.6.dev0/tests/test_backend_tail.py +165 -0
  56. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_callbacks.py +5 -11
  57. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_cluster_submit.py +5 -12
  58. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_container_packaging.py +10 -0
  59. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_context.py +10 -18
  60. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_context_execution.py +6 -12
  61. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_dependencies.py +6 -12
  62. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_job_get_result.py +51 -7
  63. slurm_sdk-0.4.6.dev0/tests/test_job_snapshot.py +162 -0
  64. slurm_sdk-0.4.6.dev0/tests/test_job_tail.py +138 -0
  65. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_parallel_train_eval_example.py +6 -11
  66. slurm_sdk-0.4.6.dev0/tests/test_parse_packaging_config.py +71 -0
  67. slurm_sdk-0.4.6.dev0/tests/test_registry.py +160 -0
  68. slurm_sdk-0.4.6.dev0/tests/test_rendering.py +218 -0
  69. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_runner.py +6 -8
  70. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_submission_and_download_errors.py +8 -9
  71. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_type_safety.py +6 -12
  72. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_union_type_signatures.py +6 -12
  73. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_with_dependencies.py +8 -5
  74. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_workflow_container_integration.py +14 -11
  75. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_workflow_context.py +10 -11
  76. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_workflow_integration.py +10 -18
  77. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_workflow_slurmfile_generation.py +6 -1
  78. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/uv.lock +33 -23
  79. slurm_sdk-0.4.5.dev0/docs/explanation/callbacks_and_events.md +0 -61
  80. slurm_sdk-0.4.5.dev0/docs/explanation/container_packaging.md +0 -35
  81. slurm_sdk-0.4.5.dev0/docs/how-to/container_dependencies.md +0 -29
  82. slurm_sdk-0.4.5.dev0/docs/how-to/hello_torch.md +0 -27
  83. slurm_sdk-0.4.5.dev0/docs/how-to/parallelization_patterns.md +0 -106
  84. slurm_sdk-0.4.5.dev0/src/slurm/cluster.py +0 -2209
  85. slurm_sdk-0.4.5.dev0/src/slurm/packaging/__init__.py +0 -70
  86. slurm_sdk-0.4.5.dev0/src/slurm/runner/main.py +0 -322
  87. slurm_sdk-0.4.5.dev0/tests/test_rendering.py +0 -163
  88. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/.claude/settings.json +0 -0
  89. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/.devcontainer/devcontainer.json +0 -0
  90. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/.dockerignore +0 -0
  91. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/.github/workflows/ci.yml +0 -0
  92. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/.github/workflows/publish-docs.yml +0 -0
  93. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/.github/workflows/publish.yml +0 -0
  94. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/.gitignore +0 -0
  95. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/.python-version +0 -0
  96. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/CLAUDE.md +0 -0
  97. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/DOCS_PUBLISHING.md +0 -0
  98. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/LICENSE.md +0 -0
  99. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/README.md +0 -0
  100. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/containers/dev/Containerfile +0 -0
  101. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/containers/docker-compose.yml +0 -0
  102. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/containers/slurm-pyxis-integration/Containerfile +0 -0
  103. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/containers/slurm-pyxis-integration/cgroup.conf +0 -0
  104. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/containers/slurm-pyxis-integration/install-enroot.sh +0 -0
  105. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/containers/slurm-pyxis-integration/install-pyxis.sh +0 -0
  106. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/containers/slurm-pyxis-integration/slurm.conf +0 -0
  107. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/CONTRIBUTING.md +0 -0
  108. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/explanation/cli-and-tui.md +0 -0
  109. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/explanation/index.md +0 -0
  110. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/explanation/rendering_and_runner.md +0 -0
  111. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/explanation/security-model.md +0 -0
  112. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/explanation/slurm_concepts.md +0 -0
  113. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/explanation/system_overview.md +0 -0
  114. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/explanation/workflow_execution.md +0 -0
  115. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/how-to/cli.md +0 -0
  116. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/how-to/ssh_security.md +0 -0
  117. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/index.md +0 -0
  118. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/reference/api/callbacks.md +0 -0
  119. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/reference/api/cluster.md +0 -0
  120. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/reference/api/errors.md +0 -0
  121. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/reference/api/index.md +0 -0
  122. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/reference/api/jobs_arrays.md +0 -0
  123. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/reference/api/packaging_container.md +0 -0
  124. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/reference/cli.md +0 -0
  125. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/reference/index.md +0 -0
  126. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/tutorials/hello_torch.md +0 -0
  127. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/tutorials/index.md +0 -0
  128. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/tutorials/map_reduce.md +0 -0
  129. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/docs/tutorials/parallelization_patterns.md +0 -0
  130. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/scripts/run-integration-tests.sh +0 -0
  131. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/api/__init__.py +0 -0
  132. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/callbacks/__init__.py +0 -0
  133. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/callbacks/callbacks.py +0 -0
  134. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/callbacks/debug.py +0 -0
  135. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/cli/__init__.py +0 -0
  136. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/cli/app.py +0 -0
  137. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/cli/cluster.py +0 -0
  138. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/cli/dash.py +0 -0
  139. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/cli/docs.py +0 -0
  140. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/cli/formatters.py +0 -0
  141. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/cli/live/__init__.py +0 -0
  142. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/cli/live/cluster_dashboard.py +0 -0
  143. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/cli/live/jobs_dashboard.py +0 -0
  144. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/cli/mcp.py +0 -0
  145. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/cli/utils.py +0 -0
  146. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/config.py +0 -0
  147. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/errors.py +0 -0
  148. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/__init__.py +0 -0
  149. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/container_test_functions.py +0 -0
  150. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/hello_container.Dockerfile +0 -0
  151. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/hello_container.py +0 -0
  152. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/hello_torch.Dockerfile +0 -0
  153. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/hello_torch.py +0 -0
  154. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/hello_torch.uv.Dockerfile +0 -0
  155. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/hello_world.Dockerfile +0 -0
  156. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/integration_test_task.py +0 -0
  157. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/integration_test_workflow.py +0 -0
  158. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/map_reduce.Dockerfile +0 -0
  159. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/map_reduce.py +0 -0
  160. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/parallel_train_eval/README.md +0 -0
  161. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/parallel_train_eval/__init__.py +0 -0
  162. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/parallel_train_eval/eval_task.py +0 -0
  163. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/parallel_train_eval/state.py +0 -0
  164. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/parallel_train_eval/train_task.py +0 -0
  165. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/parallel_train_eval/workflow.py +0 -0
  166. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/parallelization_patterns.py +0 -0
  167. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/workflow_graph_visualization.Dockerfile +0 -0
  168. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/examples/workflow_graph_visualization.py +0 -0
  169. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/logging.py +0 -0
  170. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/mcp_server.py +0 -0
  171. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/packaging/base.py +0 -0
  172. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/packaging/inherit.py +0 -0
  173. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/packaging/none.py +0 -0
  174. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/packaging/wheel.py +0 -0
  175. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/py.typed +0 -0
  176. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/runner/callbacks.py +0 -0
  177. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/runner/context_manager.py +0 -0
  178. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/runner/workflow_builder.py +0 -0
  179. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/__init__.py +0 -0
  180. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/common/__init__.py +0 -0
  181. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/common/styles.py +0 -0
  182. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/common/widgets.py +0 -0
  183. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/dashboard/__init__.py +0 -0
  184. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/dashboard/app.py +0 -0
  185. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/dashboard/data.py +0 -0
  186. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/dashboard/widgets/__init__.py +0 -0
  187. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/dashboard/widgets/cluster_tree.py +0 -0
  188. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/dashboard/widgets/detail_panel.py +0 -0
  189. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/dashboard/widgets/status_bar.py +0 -0
  190. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/docs/__init__.py +0 -0
  191. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/docs/app.py +0 -0
  192. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/docs/loader.py +0 -0
  193. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/docs/search.py +0 -0
  194. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/docs/widgets/__init__.py +0 -0
  195. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/docs/widgets/nav_tree.py +0 -0
  196. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/tui/docs/widgets/search_results.py +0 -0
  197. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/ui.py +0 -0
  198. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/src/slurm/validation.py +0 -0
  199. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/__init__.py +0 -0
  200. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/conftest.py +0 -0
  201. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/integration/__init__.py +0 -0
  202. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/integration/container_test_tasks.py +0 -0
  203. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/integration/test_container_packaging_advanced.py +0 -0
  204. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/integration/test_container_packaging_basic.py +0 -0
  205. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/integration/test_container_packaging_comprehensive.py +0 -0
  206. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/integration/test_examples_end_to_end.py +0 -0
  207. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/integration/test_job_script_persistence.py +0 -0
  208. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/integration/test_native_array_jobs.py +0 -0
  209. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/integration/test_output_dir.py +0 -0
  210. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/integration/test_slurm_container.py +0 -0
  211. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/integration/test_workflow_callbacks_integration.py +0 -0
  212. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_account_jobs.py +0 -0
  213. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_callbacks_logging.py +0 -0
  214. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_cli.py +0 -0
  215. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_cluster_from_env.py +0 -0
  216. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_debug_callback.py +0 -0
  217. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_dynamic_task.py +0 -0
  218. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_job_status.py +0 -0
  219. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_local_backend.py +0 -0
  220. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_mcp_server.py +0 -0
  221. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_metadata_resolution.py +0 -0
  222. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_packaging_parse.py +0 -0
  223. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_runtime.py +0 -0
  224. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_task_decorator.py +0 -0
  225. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_wheel_packaging.py +0 -0
  226. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_wheel_packaging_errors.py +0 -0
  227. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_workflow_callbacks.py +0 -0
  228. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_workflow_event_emission.py +0 -0
  229. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/test_workflow_example_callbacks.py +0 -0
  230. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/tui/__init__.py +0 -0
  231. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/tui/test_dashboard_data.py +0 -0
  232. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/tui/test_docs_loader.py +0 -0
  233. {slurm_sdk-0.4.5.dev0 → slurm_sdk-0.4.6.dev0}/tests/tui/test_docs_search.py +0 -0
@@ -90,6 +90,70 @@ gh pr create --fill
90
90
  - The PR description should summarize changes and reference any related issues
91
91
  - Wait for CI to pass before requesting human review
92
92
 
93
+ ## Publishing to PyPI
94
+
95
+ The package is published to PyPI via GitHub Actions using trusted publishing (no API tokens needed).
96
+
97
+ ### Dev Releases
98
+
99
+ Dev releases publish the current version in `pyproject.toml` (e.g., `0.4.5-dev`) for testing:
100
+
101
+ ```bash
102
+ gh workflow run publish.yml -f version_type=dev
103
+ ```
104
+
105
+ To test the build without uploading:
106
+
107
+ ```bash
108
+ gh workflow run publish.yml -f version_type=dev -f dry_run=true
109
+ ```
110
+
111
+ ### Production Releases
112
+
113
+ Production releases require a clean version number and updated changelog:
114
+
115
+ 1. **Update version** in `pyproject.toml` (remove `-dev` suffix):
116
+
117
+ ```toml
118
+ version = "0.4.5" # was "0.4.5-dev"
119
+ ```
120
+
121
+ 1. **Update changelog** in `docs/CHANGELOG.md`:
122
+
123
+ - Move entries from `## [Unreleased]` to new section `## [0.4.5] - YYYY-MM-DD`
124
+ - Keep an empty `## [Unreleased]` section at the top
125
+
126
+ 1. **Commit, tag, and create GitHub release**:
127
+
128
+ ```bash
129
+ git add pyproject.toml docs/CHANGELOG.md
130
+ git commit -m "chore: release v0.4.5"
131
+ git tag v0.4.5
132
+ git push origin main --tags
133
+ gh release create v0.4.5 --generate-notes
134
+ ```
135
+
136
+ The GitHub release event automatically triggers PyPI publishing.
137
+
138
+ 1. **Prepare for next development cycle**:
139
+
140
+ ```bash
141
+ # Update version to next dev version
142
+ # version = "0.4.6-dev"
143
+ git commit -am "chore: bump version to 0.4.6-dev"
144
+ git push
145
+ ```
146
+
147
+ ### Manual Production Release
148
+
149
+ If you need to publish a release without creating a GitHub release:
150
+
151
+ ```bash
152
+ gh workflow run publish.yml -f version_type=release
153
+ ```
154
+
155
+ The workflow validates that the version doesn't contain a `-dev`, `-alpha`, or `-beta` suffix.
156
+
93
157
  ## Coding Style & Naming Conventions
94
158
 
95
159
  - Use 4-space indentation and type hints throughout; the package ships `py.typed`.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: slurm-sdk
3
- Version: 0.4.5.dev0
3
+ Version: 0.4.6.dev0
4
4
  Summary: Pythonic SDK for Slurm.
5
5
  Author-email: Ville Kallioniemi <ville.kallioniemi@gmail.com>
6
6
  License-Expression: MIT
@@ -12,6 +12,7 @@ Requires-Dist: paramiko>=3.5.1
12
12
  Requires-Dist: requests>=2.32.3
13
13
  Requires-Dist: rich>=13.9.4
14
14
  Requires-Dist: tomli>=2.0.0; python_version < '3.11'
15
+ Requires-Dist: tomlkit>=0.12
15
16
  Provides-Extra: tui
16
17
  Requires-Dist: pyyaml>=6.0; extra == 'tui'
17
18
  Requires-Dist: textual>=0.89.0; extra == 'tui'
@@ -7,6 +7,117 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ### Fixed
11
+
12
+ - Fixed missing imports in parallel train-eval workflow tutorial
13
+ - Added `JobContext` to API reference documentation
14
+ - Updated import paths in tutorials and how-to guides to use public API
15
+ (`from slurm.callbacks import ...`) instead of internal modules
16
+ - Removed unused imports from workflow graph visualization tutorial
17
+
18
+ ### Added
19
+
20
+ - `parse_packaging_config()` as a public API for parsing packaging specification
21
+ strings into configuration dicts (previously private as `_parse_packaging_config()`)
22
+ - `PackagingConfig` TypedDict in `slurm.packaging` documenting all valid packaging
23
+ configuration keys
24
+ - `Job.snapshot()` method returning a frozen `JobSnapshot` dataclass with current
25
+ state, output tails, elapsed time, and terminal/success flags
26
+ - `Job.tail()` method for live log streaming with configurable `output` parameter
27
+ accepting any writable IO object (`sys.stdout`, `io.StringIO`, file objects)
28
+ - `BackendBase.tail_file()` method with implementations for SSH and local backends
29
+ - `slurm jobs tail <job-id>` CLI command with `--stderr`, `--no-follow`, and
30
+ `--lines` options
31
+ - Container image digest pinning via registry HTTP API; resolves digests with
32
+ a single HEAD request instead of pulling the full image
33
+ - Usage examples in docstrings for `SlurmTask.__call__()`, `ArrayJob.get_results()`,
34
+ `WorkflowContext`, and `JobContext`
35
+ - `llms.txt` file with complete API recipes, decision tree, and method signatures
36
+ for AI coding agent consumption
37
+ - How-to guide for creating custom task and workflow decorators using existing
38
+ `@task`, `@workflow`, and `with_options()` APIs
39
+ - `write_file()` and `close()` methods on `BackendBase` interface for unified
40
+ file operations and explicit resource cleanup
41
+ - Pickle version headers for cross-version mismatch detection; result files now
42
+ include Python version and SDK version metadata, with clear warnings on mismatch
43
+ - SSH lazy reconnection on transport errors (automatic retry once) and explicit
44
+ `cluster.reconnect()` for long-lived sessions (e.g. Jupyter notebooks)
45
+ - `reconnect()` method on `BackendBase` interface (no-op for local backend)
46
+
47
+ ### Changed
48
+
49
+ - Container packaging `use_digest` default changed from `False` to `True` for
50
+ reproducible deployments; pass `use_digest=False` to restore previous behavior
51
+ - Pre-existing container images no longer require `docker pull` for digest
52
+ resolution when the registry API is accessible
53
+ - Expanded container packaging explanation with details on multi-word Python
54
+ executables, container mounts, working directory, and array job naming
55
+ - Restructured GPU, container dependency, and parallelization how-to guides
56
+ with proper problem statements, prerequisites, steps, and verification
57
+ sections following Diataxis how-to guide format
58
+ - Input validation for `account` and `partition` sbatch options is now enforced
59
+ at submission time
60
+ - Removed redundant SBATCH option normalization in `render_job_script()`
61
+ - Merged `SlurmTaskWithDependencies` into `SlurmTask`; `.after()` now returns a
62
+ `SlurmTask` with bound dependencies. The `SlurmTaskWithDependencies` name is
63
+ kept as an alias for backward compatibility
64
+ - Extracted `_resolve_cluster()` helper in context module, eliminating duplicated
65
+ context resolution logic across task submission methods
66
+ - Consolidated packaging config resolution into `resolve_packaging_config()` with
67
+ documented precedence; eliminates duplicated logic between submission and
68
+ workflow dependency building
69
+ - Replaced 12 positional parameters on `render_job_script()` with structured
70
+ `RenderContext` dataclass
71
+ - Removed submission pipeline wrapper methods from `Cluster`; internal modules
72
+ now call extracted functions directly
73
+ - `Job` now depends on `BackendBase` interface instead of `Cluster`; accepts
74
+ `backend` and `on_completed` keyword arguments. The `cluster` parameter is
75
+ kept for backward compatibility
76
+ - `BackendBase` now provides `download_file()` (default: local copy) and
77
+ `hostname` class attribute (default: `"localhost"`)
78
+ - Decomposed `cluster.py` into private modules (`_polling`, `_submission`,
79
+ `_workflow`) for maintainability; public API unchanged
80
+ - Extracted private methods from `ContainerPackagingStrategy.prepare()` for
81
+ improved testability
82
+ - Callback exceptions are now logged at WARNING level with full tracebacks
83
+ (previously logged at DEBUG)
84
+ - SSH backend `host_key_policy` default changed from `"warn"` to `"reject"` for
85
+ improved security; pass `host_key_policy="warn"` to restore previous behavior
86
+ - Extracted `_dispatch_callbacks()` helper on `Cluster` to deduplicate callback
87
+ dispatch logic
88
+ - Consolidated `_runner_impl.py` into the `runner/` package; a thin
89
+ backwards-compatible shim was kept temporarily and has since been removed (see "Removed")
90
+ - Slurmfile TOML modification now uses `tomlkit` for proper round-trip parsing
91
+ instead of fragile line-by-line string manipulation
92
+
93
+ ### Removed
94
+
95
+ - Removed unused `_runner_impl.py` backward-compatibility shim
96
+
97
+ ### Fixed
98
+
99
+ - Corrected callback method names in callbacks and events explanation; expanded
100
+ from stub to comprehensive coverage of all 11 hooks, execution loci, and
101
+ serialization behavior
102
+ - Moved `base64` import to module level in rendering to prevent potential
103
+ `NameError`
104
+ - Temp file leak in `Job.get_result()` when downloading results via SSH; files
105
+ are now cleaned up in a `finally` block
106
+ - Thread-safety issue with `Job` status cache; reads and writes to
107
+ `_status_cache`, `_status_cache_time`, and `_completed` are now protected
108
+ by an `RLock`
109
+ - Race condition in `_job_pollers` dict access between main and poller threads
110
+ - Job name validation and quoting in rendered sbatch scripts
111
+ - Replaced deprecated `datetime.utcnow()` with `datetime.now(timezone.utc)`
112
+ - Environment metadata files are now written with `0o600` permissions
113
+ - `LocalBackend.execute_command()` no longer uses `shell=True`
114
+
115
+ ### Dependencies
116
+
117
+ - Added `tomlkit>=0.12` as a required dependency
118
+
119
+ ## [0.4.5] - 2026-02-05
120
+
10
121
  ### Added
11
122
 
12
123
  - Interactive TUI commands (requires `pip install slurm-sdk[tui]`):
@@ -0,0 +1,215 @@
1
+ # Callbacks and Events
2
+
3
+ Callbacks let you observe packaging, submission, execution, and workflow events without changing task code. The SDK fires hooks at well-defined points in the job lifecycle, passing a typed context object that carries relevant metadata.
4
+
5
+ ## Lifecycle stages
6
+
7
+ A single job passes through up to five stages. Packaging, submission, and execution each have a begin/end hook pair; status polling and completion each have a single hook:
8
+
9
+ - **Packaging** (`on_begin_package_ctx` / `on_end_package_ctx`): Fires on the client while the SDK builds or resolves the deployment artifact (wheel or container image).
10
+ - **Submission** (`on_begin_submit_job_ctx` / `on_end_submit_job_ctx`): Fires on the client immediately before and after the `sbatch` call.
11
+ - **Execution** (`on_begin_run_job_ctx` / `on_end_run_job_ctx`): Fires on the runner (compute node) around the user function invocation.
12
+ - **Status polling** (`on_job_status_update_ctx`): Fires on the client when the SDK polls SLURM and observes a state change, or when the polling interval elapses.
13
+ - **Completion** (`on_completed_ctx`): Fires when a job reaches a terminal state. By default this runs on both client and runner.
14
+
15
+ Workflow orchestration adds three more hooks:
16
+
17
+ - **Workflow begin/end** (`on_workflow_begin_ctx` / `on_workflow_end_ctx`): Fires on the runner around the workflow orchestrator logic, after the workflow job itself has started.
18
+ - **Workflow task submitted** (`on_workflow_task_submitted_ctx`): Fires on the client each time the workflow submits a child task, enabling dependency-graph tracking.
19
+
20
+ ## All hooks
21
+
22
+ The `BaseCallback` class defines 11 hooks. Each receives a single typed context argument:
23
+
24
+ | # | Hook method | Context type | Description |
25
+ | --- | -------------------------------- | --------------------------- | ---------------------------------------------------- |
26
+ | 1 | `on_begin_package_ctx` | `PackagingBeginContext` | Packaging is about to start |
27
+ | 2 | `on_end_package_ctx` | `PackagingEndContext` | Packaging has completed |
28
+ | 3 | `on_begin_submit_job_ctx` | `SubmitBeginContext` | Job is about to be submitted via sbatch |
29
+ | 4 | `on_end_submit_job_ctx` | `SubmitEndContext` | Job has been submitted; job ID is available |
30
+ | 5 | `on_job_status_update_ctx` | `JobStatusUpdatedContext` | Polling detected a status change or interval elapsed |
31
+ | 6 | `on_begin_run_job_ctx` | `RunBeginContext` | Runner is about to invoke the user function |
32
+ | 7 | `on_end_run_job_ctx` | `RunEndContext` | User function has returned or raised |
33
+ | 8 | `on_completed_ctx` | `CompletedContext` | Job reached a terminal SLURM state |
34
+ | 9 | `on_workflow_begin_ctx` | `WorkflowCallbackContext` | Workflow orchestrator is starting |
35
+ | 10 | `on_workflow_end_ctx` | `WorkflowCallbackContext` | Workflow orchestrator has finished |
36
+ | 11 | `on_workflow_task_submitted_ctx` | `WorkflowTaskSubmitContext` | Workflow submitted a child task |
37
+
38
+ ## Callback timeline
39
+
40
+ The diagram below shows the order in which hooks fire for a single job submission, with an optional workflow layer:
41
+
42
+ ```mermaid
43
+ sequenceDiagram
44
+ participant Client
45
+ participant SLURM
46
+ participant Runner
47
+
48
+ rect rgb(230, 245, 255)
49
+ Note over Client: Client-side callbacks
50
+ Client->>Client: on_begin_package_ctx
51
+ Client->>Client: on_end_package_ctx
52
+ Client->>Client: on_begin_submit_job_ctx
53
+ Client->>SLURM: sbatch
54
+ SLURM-->>Client: job_id
55
+ Client->>Client: on_end_submit_job_ctx
56
+ end
57
+
58
+ rect rgb(240, 240, 255)
59
+ Note over Client: Client-side polling
60
+ loop poll_interval_secs
61
+ Client->>SLURM: squeue / sacct
62
+ SLURM-->>Client: status
63
+ Client->>Client: on_job_status_update_ctx
64
+ end
65
+ end
66
+
67
+ rect rgb(255, 245, 230)
68
+ Note over Runner: Runner-side callbacks
69
+ SLURM->>Runner: Start job
70
+ Runner->>Runner: on_begin_run_job_ctx
71
+ Runner->>Runner: Execute task function
72
+ Runner->>Runner: on_end_run_job_ctx
73
+ Runner->>Runner: on_completed_ctx (runner side)
74
+ end
75
+
76
+ rect rgb(230, 255, 230)
77
+ Note over Client: Completion
78
+ Client->>Client: on_completed_ctx (client side)
79
+ end
80
+
81
+ rect rgb(255, 240, 245)
82
+ Note over Runner: Workflow callbacks (runner-side)
83
+ Runner->>Runner: on_workflow_begin_ctx
84
+ loop For each child task
85
+ Runner->>Runner: on_workflow_task_submitted_ctx
86
+ end
87
+ Runner->>Runner: on_workflow_end_ctx
88
+ end
89
+ ```
90
+
91
+ ## Execution loci
92
+
93
+ Every hook has a **default execution locus** that determines whether it fires on the client process, on the runner (compute node), or both. The SDK stores these defaults in `_DEFAULT_HOOK_LOCI`:
94
+
95
+ | Hook | Default locus | Context type |
96
+ | -------------------------------- | ------------- | --------------------------- |
97
+ | `on_begin_package_ctx` | `CLIENT` | `PackagingBeginContext` |
98
+ | `on_end_package_ctx` | `CLIENT` | `PackagingEndContext` |
99
+ | `on_begin_submit_job_ctx` | `CLIENT` | `SubmitBeginContext` |
100
+ | `on_end_submit_job_ctx` | `CLIENT` | `SubmitEndContext` |
101
+ | `on_job_status_update_ctx` | `CLIENT` | `JobStatusUpdatedContext` |
102
+ | `on_begin_run_job_ctx` | `RUNNER` | `RunBeginContext` |
103
+ | `on_end_run_job_ctx` | `RUNNER` | `RunEndContext` |
104
+ | `on_completed_ctx` | `BOTH` | `CompletedContext` |
105
+ | `on_workflow_begin_ctx` | `RUNNER` | `WorkflowCallbackContext` |
106
+ | `on_workflow_end_ctx` | `RUNNER` | `WorkflowCallbackContext` |
107
+ | `on_workflow_task_submitted_ctx` | `CLIENT` | `WorkflowTaskSubmitContext` |
108
+
109
+ The SDK calls `should_run_on_client(hook_name)` and `should_run_on_runner(hook_name)` to decide where each hook fires. For hooks with locus `BOTH`, the hook fires in both locations, and the `CompletedContext.emitted_by` field tells you which side emitted the current invocation.
110
+
111
+ ### Overriding the default locus with `execution_loci`
112
+
113
+ You can override the default locus for any hook by setting the `execution_loci` dict on your callback subclass:
114
+
115
+ ```python
116
+ class MyCallback(BaseCallback):
117
+ execution_loci = {
118
+ "on_completed_ctx": ExecutionLocus.CLIENT, # only fire on client
119
+ }
120
+ ```
121
+
122
+ This is a per-hook override. Any hook not listed in the dict falls back to its default from `_DEFAULT_HOOK_LOCI`. If a hook is not in either dict, it defaults to `CLIENT`.
123
+
124
+ ## `requires_pickling`
125
+
126
+ The `requires_pickling` class attribute controls whether the SDK serializes the callback and ships it to the runner alongside the job script. It defaults to `True`.
127
+
128
+ Set `requires_pickling = False` when your callback only needs client-side hooks (packaging, submission, polling). This avoids serialization failures for callbacks that hold unpicklable references such as open file handles, database connections, or Rich consoles.
129
+
130
+ When `requires_pickling` is `False`, runner-side hooks (`on_begin_run_job_ctx`, `on_end_run_job_ctx`, `on_workflow_begin_ctx`, `on_workflow_end_ctx`, and the runner-side invocation of `on_completed_ctx`) will never fire for that callback because the callback object is not present on the compute node.
131
+
132
+ ## `poll_interval_secs`
133
+
134
+ The `poll_interval_secs` class attribute controls SDK-managed status polling. When set to a positive number, the SDK spawns a background thread that periodically queries SLURM for the job's current state and fires `on_job_status_update_ctx` on each poll cycle.
135
+
136
+ ```python
137
+ class ProgressCallback(BaseCallback):
138
+ requires_pickling = False
139
+ poll_interval_secs = 30.0 # check every 30 seconds
140
+ ```
141
+
142
+ If `poll_interval_secs` is `None` (the default), no automatic polling occurs and `on_job_status_update_ctx` is never called.
143
+
144
+ The `JobStatusUpdatedContext` passed to the hook includes the current SLURM status dict, the previous state string, and a boolean `is_terminal` flag that is `True` when the job has reached a final state (COMPLETED, FAILED, CANCELLED, etc.).
145
+
146
+ ## Serialization rules
147
+
148
+ Callbacks that need to run on the runner must survive pickling. The SDK serializes them into the job directory so the runner process can reconstruct them. The rules are:
149
+
150
+ 1. **`requires_pickling = True` (default)**: The callback is pickled and sent to the runner. All runner-side hooks fire normally. If pickling fails, the SDK raises an error at submission time.
151
+ 1. **`requires_pickling = False`**: The callback stays on the client only. Runner-side hooks are silently skipped. Client-side hooks (packaging, submission, polling, and the client side of `on_completed_ctx`) still fire.
152
+ 1. **Hooks with locus `BOTH`**: Currently only `on_completed_ctx` defaults to `BOTH`. When `requires_pickling = False`, only the client-side invocation fires. When `requires_pickling = True`, the hook fires both on the runner (immediately after `on_end_run_job_ctx`) and on the client (when polling detects the terminal state).
153
+
154
+ The runner reconstructs callbacks from the pickled file, calls the runner-side hooks in order, and discards the callback objects when the job finishes. The client-side callback instances are the original objects held in memory by the submitting process.
155
+
156
+ ## Custom callback example
157
+
158
+ Below is a complete `BaseCallback` subclass that logs the packaging duration, the job submission, and polled status updates on the client, without needing to be serialized to the runner:
159
+
160
+ ```python
161
+ import logging
162
+ from slurm.callbacks import (
163
+ BaseCallback,
164
+ PackagingBeginContext,
165
+ PackagingEndContext,
166
+ SubmitEndContext,
167
+ JobStatusUpdatedContext,
168
+ )
169
+
170
+ logger = logging.getLogger(__name__)
171
+
172
+
173
+ class TimingCallback(BaseCallback):
174
+ """Logs wall-clock durations for packaging and submission."""
175
+
176
+ requires_pickling = False
177
+ poll_interval_secs = 60.0
178
+
179
+ def on_begin_package_ctx(self, ctx: PackagingBeginContext) -> None:
180
+ self._pack_start = ctx.timestamp
181
+ logger.info("Packaging started for %s", ctx.task)
182
+
183
+ def on_end_package_ctx(self, ctx: PackagingEndContext) -> None:
184
+ duration = ctx.duration or (ctx.timestamp - self._pack_start)
185
+ logger.info("Packaging finished in %.1fs", duration)
186
+
187
+ def on_end_submit_job_ctx(self, ctx: SubmitEndContext) -> None:
188
+ logger.info("Job %s submitted to %s", ctx.job_id, ctx.target_job_dir)
189
+
190
+ def on_job_status_update_ctx(self, ctx: JobStatusUpdatedContext) -> None:
191
+ state = ctx.status.get("job_state", "UNKNOWN")
192
+ logger.info(
193
+ "Job %s state: %s (terminal=%s)", ctx.job_id, state, ctx.is_terminal
194
+ )
195
+ ```
196
+
197
+ Register the callback when creating the cluster or submitting a job. The example below registers it when creating the cluster:
198
+
199
+ ```python
200
+ cluster = Cluster.from_file("Slurmfile", callbacks=[TimingCallback()])
201
+ job = cluster.submit(my_task)
202
+ ```
203
+
204
+ ## Typical uses
205
+
206
+ - **Structured logging and progress output**: Use client-side hooks to print Rich progress bars or write structured log lines.
207
+ - **Dependency graph visualization**: Use `on_workflow_task_submitted_ctx` to capture parent-child edges and render a DAG.
208
+ - **Custom metrics and telemetry**: Fire metrics to Prometheus, Datadog, or MLflow from `on_end_run_job_ctx`.
209
+ - **Alerting on failure**: Check `RunEndContext.status` or `CompletedContext.job_state` and send notifications.
210
+ - **Benchmarking**: Measure end-to-end wall time from `on_begin_package_ctx` through `on_completed_ctx`.
211
+
212
+ ## Further reading
213
+
214
+ - [Callbacks reference](../reference/api/callbacks.md) for the full API surface of `BaseCallback` and all context dataclasses.
215
+ - [How to create custom task and workflow decorators](../how-to/custom-task-decorators.md) for extending the SDK's decorator system.
@@ -0,0 +1,87 @@
1
+ # Container Packaging
2
+
3
+ Container packaging is the default execution model. Tasks are built into a container image, pushed to a registry if needed, and executed on Slurm via Pyxis/enroot.
4
+
5
+ ## Build and resolve flow
6
+
7
+ 1. **Resolve image reference**: `ContainerPackagingStrategy._resolve_image_reference` picks a registry/name:tag.
8
+ 1. **Build image**: If a Dockerfile or build context is provided, the SDK runs `docker build` or `podman build`.
9
+ 1. **Push image**: Controlled by `packaging_push` and `packaging_registry`.
10
+ 1. **Convert for Pyxis**: Registry references are converted to enroot format when needed.
11
+
12
+ ## Runtime behavior
13
+
14
+ - The job script exports `CONTAINER_IMAGE` for Pyxis.
15
+ - `PY_EXEC` is set to the configured Python executable inside the container.
16
+ - The runner executes with `srun --container-image` under the hood.
17
+
18
+ ## Multi-word Python executables
19
+
20
+ When the configured Python executable (`packaging_python_executable`) is a single word like `python`, the SDK sets `PY_EXEC` as a simple shell variable. However, when it contains multiple words (e.g., `uv run python`), the SDK stores it as a **bash array**:
21
+
22
+ ```bash
23
+ # Single-word executable
24
+ PY_EXEC='python'
25
+
26
+ # Multi-word executable
27
+ PY_EXEC=('uv' 'run' 'python')
28
+ ```
29
+
30
+ The array is resolved with `PY_EXEC_RESOLVED="${PY_EXEC[*]}"` and expanded using `${PY_EXEC[@]}` in the execution command. This approach avoids the quoting and word-splitting issues that would occur if a multi-word command were stored in a plain string variable: when such a variable is expanded inside quotes, the shell attempts to find an executable literally named `uv run python` rather than running `uv` with arguments `run python`.
31
+
32
+ ## Container mounts
33
+
34
+ The SDK automatically mounts the **job base directory** (the parent of the task-level directory tree) into the container with read-write access. This allows the runner to locate result files from dependent jobs when resolving `JobResultPlaceholder` objects.
35
+
36
+ Additional mounts can be configured via the `packaging_mounts` task option. Mounts follow the standard `source:target:options` format:
37
+
38
+ ```python
39
+ packaging_mounts=["/data:/data:ro", "/scratch:/scratch:rw"]
40
+ ```
41
+
42
+ The SDK resolves shell expressions in mount paths so that job directory references remain valid inside the container.
43
+
44
+ ## Container working directory
45
+
46
+ The container's working directory is set to the job directory via the `--container-workdir` flag on `srun`. This means task code that uses relative paths will resolve them against the job directory inside the container. If `container_workdir` is explicitly configured, the SDK uses that value instead, and `{job_dir}` can be used as a placeholder token.
47
+
48
+ ## Array job container naming
49
+
50
+ Each container gets a unique name based on the job's pre-submission identifier: `slurm-sdk-{pre_submission_id}`. For array jobs, the SLURM array task ID is appended as a suffix: `slurm-sdk-{pre_submission_id}_{task_id}`. This naming scheme prevents container name collisions across array elements and enables `slurm jobs connect` to find and attach to the correct container.
51
+
52
+ ## Configuration knobs
53
+
54
+ - `packaging_dockerfile`: Dockerfile path for builds.
55
+ - `packaging_context`: Build context directory.
56
+ - `packaging_registry`: Registry host/path for pushes and pulls.
57
+ - `packaging_platform`: Target platform (e.g., `linux/amd64`).
58
+ - `packaging_tls_verify`: TLS verification for registry access.
59
+ - `packaging_runtime`: Explicit runtime (`docker` or `podman`).
60
+ - `packaging_python_executable`: Python command inside the container (supports multi-word).
61
+ - `packaging_mounts`: Additional bind mounts for the container.
62
+
63
+ ### Configuration example
64
+
65
+ A complete task definition with container packaging options:
66
+
67
+ ```python
68
+ @task(
69
+ time="01:00:00",
70
+ gpus_per_node=4,
71
+ packaging="container:my-registry.com/training:latest",
72
+ packaging_python_executable="uv run python",
73
+ packaging_mounts=["/data:/data:ro"],
74
+ )
75
+ def train(config: dict) -> dict:
76
+ return run_training(config)
77
+ ```
78
+
79
+ ## How workflows reuse images
80
+
81
+ Workflow jobs export packaging config into `SLURM_SDK_PACKAGING_CONFIG`. Child tasks inherit the resolved image reference so they do not rebuild containers mid-workflow.
82
+
83
+ ## Design goals
84
+
85
+ - Reproducible environments with minimal host coupling.
86
+ - Explicit control over build/push/pull behavior.
87
+ - Compatibility with Slurm + Pyxis/enroot deployments.
@@ -0,0 +1,160 @@
1
+ # How to chain containerized tasks with dependencies
2
+
3
+ ## Problem
4
+
5
+ You need to run a multi-phase pipeline (e.g., prepare, map, reduce) where all
6
+ tasks run in the same container and each phase depends on the previous one
7
+ completing successfully.
8
+
9
+ ## Prerequisites
10
+
11
+ - A Slurm cluster with Pyxis/enroot installed
12
+ - A container registry accessible from compute nodes
13
+ - `slurm-sdk` installed locally
14
+
15
+ ## Steps
16
+
17
+ ### 1. Define a shared container image
18
+
19
+ Create a single Dockerfile for all tasks in the pipeline:
20
+
21
+ ```dockerfile
22
+ FROM python:3.11-slim
23
+
24
+ WORKDIR /workspace
25
+
26
+ COPY pyproject.toml README.md mkdocs.yml ./
27
+ COPY src/ src/
28
+ COPY docs/ docs/
29
+
30
+ RUN pip install --no-cache-dir .
31
+ ```
32
+
33
+ Set this as the cluster default so all tasks share it:
34
+
35
+ ```python
36
+ from slurm import Cluster
37
+
38
+ cluster = Cluster.from_args(
39
+ args,
40
+ default_packaging="container",
41
+ default_packaging_dockerfile="path/to/pipeline.Dockerfile",
42
+ )
43
+ ```
44
+
45
+ ### 2. Define the pipeline tasks
46
+
47
+ Define each phase as a separate task with the `@task` decorator:
48
+
49
+ ```python
50
+ from slurm.decorators import task
51
+ from typing import List
52
+
53
+
54
+ @task(time="00:02:00", mem="256M", cpus_per_task=1)
55
+ def prepare_data(num_chunks: int) -> List[dict]:
56
+ """Create data chunks for parallel processing."""
57
+ return [
58
+ {"chunk_id": i, "data": list(range(i * 100, (i + 1) * 100))}
59
+ for i in range(num_chunks)
60
+ ]
61
+
62
+
63
+ @task(time="00:03:00", mem="256M", cpus_per_task=1)
64
+ def process_chunk(chunk_id: int, data: List[int]) -> dict:
65
+ """Process a single data chunk (map phase)."""
66
+ return {
67
+ "chunk_id": chunk_id,
68
+ "count": len(data),
69
+ "sum": sum(data),
70
+ }
71
+
72
+
73
+ @task(time="00:05:00", mem="512M", cpus_per_task=1)
74
+ def aggregate_results(results: List[dict]) -> dict:
75
+ """Combine results from all chunks (reduce phase)."""
76
+ return {
77
+ "total_chunks": len(results),
78
+ "total_count": sum(r["count"] for r in results),
79
+ "total_sum": sum(r["sum"] for r in results),
80
+ }
81
+ ```
82
+
83
+ ### 3. Chain the tasks with dependencies
84
+
85
+ Use `.after()` for sequential dependencies and `.map()` for the parallel phase:
86
+
87
+ ```python
88
+ from slurm import Job
89
+ from typing import List
90
+
91
+ with cluster:
92
+ # Phase 1: Prepare data
93
+ prep_job: Job[List[dict]] = prepare_data(num_chunks=5)
94
+ prep_job.wait()
95
+ chunks = prep_job.get_result()
96
+
97
+ # Phase 2: Process chunks in parallel (array job)
98
+ # .after(prep_job) ensures map tasks wait for preparation
99
+ # .map(chunks) submits one task per chunk
100
+ map_jobs = process_chunk.after(prep_job).map(chunks)
101
+ map_jobs.wait()
102
+ map_results = map_jobs.get_results()
103
+
104
+ # Phase 3: Aggregate all results
105
+ # .after(map_jobs) waits for ALL map tasks to complete
106
+ reduce_job: Job[dict] = aggregate_results.after(map_jobs)(map_results)
107
+ reduce_job.wait()
108
+ final = reduce_job.get_result()
109
+ ```
110
+
111
+ ### 4. Run the built-in example
112
+
113
+ The SDK includes a complete map-reduce example:
114
+
115
+ ```bash
116
+ uv run python -m slurm.examples.map_reduce \
117
+ --hostname your-slurm-host \
118
+ --username $USER \
119
+ --partition debug \
120
+ --num-chunks 5 \
121
+ --packaging container \
122
+ --packaging-registry registry:5000/map-reduce \
123
+ --packaging-platform linux/amd64 \
124
+ --packaging-tls-verify false
125
+ ```
126
+
127
+ Use `--num-chunks` to control the parallelism level.
128
+
129
+ ## Verification
130
+
131
+ - All three phases should complete successfully in sequence.
132
+ - The map phase should show tasks distributed across available nodes.
133
+ - The final result should contain aggregated statistics:
134
+
135
+ ```
136
+ Final Results:
137
+ Total Chunks: 5
138
+ Total Items: 500
139
+ Sum: 124750
140
+ Hosts Used: 3 (node001, node002, node003)
141
+ ```
142
+
143
+ ## Troubleshooting
144
+
145
+ - **Map tasks fail to start**: Verify the prepare task completed
146
+ successfully before map tasks are submitted. Check that `.after(prep_job)`
147
+ is called before `.map(chunks)`.
148
+ - **Reduce runs before map completes**: Ensure you pass the `map_jobs` array
149
+ to `.after()`, not a single job.
150
+ - **Registry pull errors**: If compute nodes cannot pull images, use
151
+ `--packaging-registry` to point at a registry that the compute nodes can reach.
152
+
153
+ ## See also
154
+
155
+ - [Map-reduce tutorial](../tutorials/map_reduce.md) for a guided walkthrough
156
+ of the full example
157
+ - [Choosing a parallelization pattern](parallelization_patterns.md) for other
158
+ orchestration patterns
159
+ - [Tasks and Workflows reference](../reference/api/tasks_workflows.md) for
160
+ `.map()` and `.after()` API details