xmanager-slurm 0.4.8__tar.gz → 0.4.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xmanager-slurm has been flagged as potentially problematic; review the release details before depending on it.

Files changed (127)
  1. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/PKG-INFO +1 -1
  2. xmanager_slurm-0.4.10/examples/job-timeout/launch.py +53 -0
  3. xmanager_slurm-0.4.10/examples/job-timeout/main.py +11 -0
  4. xmanager_slurm-0.4.10/examples/uv/pyproject.toml +6 -0
  5. xmanager_slurm-0.4.10/examples/uv/uv.lock +66 -0
  6. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/pyproject.toml +1 -1
  7. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/uv.lock +2 -1
  8. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/executors.py +5 -0
  9. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/packaging/docker.py +5 -8
  10. xmanager_slurm-0.4.10/xm_slurm/templates/slurm/fragments/monitor.bash.j2 +76 -0
  11. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/job.bash.j2 +1 -1
  12. xmanager_slurm-0.4.8/xm_slurm/templates/slurm/fragments/monitor.bash.j2 +0 -37
  13. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.devcontainer.json +0 -0
  14. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.github/workflows/ci.yml +0 -0
  15. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.github/workflows/deploy-docs.yml +0 -0
  16. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.gitignore +0 -0
  17. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.pdm-build/.gitignore +0 -0
  18. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.pdm-build/xm_slurm-0.1+editable.dist-info/METADATA +0 -0
  19. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.pdm-build/xm_slurm-0.1+editable.dist-info/WHEEL +0 -0
  20. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.pdm-build/xm_slurm.pth +0 -0
  21. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.pre-commit-config.yaml +0 -0
  22. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.python-version +0 -0
  23. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.vscode/settings.json +0 -0
  24. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/LICENSE.md +0 -0
  25. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/README.md +0 -0
  26. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/api/executables.rst +0 -0
  27. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/api/executors.rst +0 -0
  28. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/api/packageables.rst +0 -0
  29. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/assets/workflow-dark.svg +0 -0
  30. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/assets/workflow-light.svg +0 -0
  31. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/conf.py +0 -0
  32. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/getting-started/xmanager.md +0 -0
  33. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/guides/index.md +0 -0
  34. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/guides/remote-dev.md +0 -0
  35. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/index.md +0 -0
  36. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/conda/environment.yml +0 -0
  37. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/conda/launch.py +0 -0
  38. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/conda/main.py +0 -0
  39. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/conda/pyproject.toml +0 -0
  40. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/custom-dockerfile/Dockerfile +0 -0
  41. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/custom-dockerfile/launch.py +0 -0
  42. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/custom-dockerfile/pyproject.toml +0 -0
  43. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-array-sweep/launch.py +0 -0
  44. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-array-sweep/main.py +0 -0
  45. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-array-sweep/pyproject.toml +0 -0
  46. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-array-sweep/uv.lock +0 -0
  47. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-dependencies/eval.py +0 -0
  48. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-dependencies/launch.py +0 -0
  49. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-dependencies/pyproject.toml +0 -0
  50. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-dependencies/train.py +0 -0
  51. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-dependencies/uv.lock +0 -0
  52. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-group/Dockerfile +0 -0
  53. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-group/launch.py +0 -0
  54. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-group/pyproject.toml +0 -0
  55. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-group/uv.lock +0 -0
  56. {xmanager_slurm-0.4.8/examples/uv → xmanager_slurm-0.4.10/examples/job-timeout}/pyproject.toml +0 -0
  57. {xmanager_slurm-0.4.8/examples/uv → xmanager_slurm-0.4.10/examples/job-timeout}/uv.lock +0 -0
  58. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/metadata/launch.py +0 -0
  59. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/metadata/main.py +0 -0
  60. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/metadata/pyproject.toml +0 -0
  61. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/metadata/requirements.txt +0 -0
  62. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/parameter-controller/launch.py +0 -0
  63. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/parameter-controller/main.py +0 -0
  64. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/parameter-controller/pyproject.toml +0 -0
  65. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/parameter-controller/requirements.txt +0 -0
  66. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/pip/launch.py +0 -0
  67. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/pip/main.py +0 -0
  68. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/pip/pyproject.toml +0 -0
  69. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/pip/requirements.txt +0 -0
  70. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/uv/launch.py +0 -0
  71. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/conftest.py +0 -0
  72. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/Dockerfile +0 -0
  73. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/README.md +0 -0
  74. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/cgroup.conf +0 -0
  75. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/docker-compose.yml +0 -0
  76. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/docker-entrypoint.sh +0 -0
  77. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/host_ed25519 +0 -0
  78. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/host_ed25519.pub +0 -0
  79. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/id_ed25519 +0 -0
  80. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/id_ed25519.pub +0 -0
  81. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/slurm.conf +0 -0
  82. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/slurmdbd.conf +0 -0
  83. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/sshd_config +0 -0
  84. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/test_remote_execution.py +0 -0
  85. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/test_dependencies.py +0 -0
  86. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/__init__.py +0 -0
  87. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/api/__init__.py +0 -0
  88. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/api/abc.py +0 -0
  89. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/api/models.py +0 -0
  90. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/api/sqlite/client.py +0 -0
  91. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/api/web/client.py +0 -0
  92. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/batching.py +0 -0
  93. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/config.py +0 -0
  94. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/console.py +0 -0
  95. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/constants.py +0 -0
  96. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/contrib/__init__.py +0 -0
  97. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/contrib/clusters/__init__.py +0 -0
  98. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/contrib/clusters/drac.py +0 -0
  99. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/dependencies.py +0 -0
  100. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/executables.py +0 -0
  101. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/execution.py +0 -0
  102. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/experiment.py +0 -0
  103. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/experimental/parameter_controller.py +0 -0
  104. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/filesystem.py +0 -0
  105. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/job_blocks.py +0 -0
  106. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/metadata_context.py +0 -0
  107. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/packageables.py +0 -0
  108. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/packaging/__init__.py +0 -0
  109. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/packaging/registry.py +0 -0
  110. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/packaging/router.py +0 -0
  111. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/packaging/utils.py +0 -0
  112. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/resources.py +0 -0
  113. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/scripts/_cloudpickle.py +0 -0
  114. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/scripts/cli.py +0 -0
  115. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/status.py +0 -0
  116. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/docker/docker-bake.hcl.j2 +0 -0
  117. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/docker/mamba.Dockerfile +0 -0
  118. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/docker/python.Dockerfile +0 -0
  119. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/docker/uv.Dockerfile +0 -0
  120. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/entrypoint.bash.j2 +0 -0
  121. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/fragments/proxy.bash.j2 +0 -0
  122. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/job-array.bash.j2 +0 -0
  123. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/job-group.bash.j2 +0 -0
  124. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +0 -0
  125. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/runtimes/podman.bash.j2 +0 -0
  126. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/types.py +0 -0
  127. {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xmanager-slurm
3
- Version: 0.4.8
3
+ Version: 0.4.10
4
4
  Summary: Slurm backend for XManager.
5
5
  Project-URL: GitHub, https://github.com/jessefarebro/xm-slurm
6
6
  Author-email: Jesse Farebrother <jfarebro@cs.mcgill.ca>
@@ -0,0 +1,53 @@
1
+ import datetime as dt
2
+
3
+ from absl import app
4
+ from xmanager import xm
5
+
6
+ import xm_slurm
7
+ import xm_slurm.contrib.clusters
8
+
9
+
10
+ @xm.run_in_asyncio_loop
11
+ async def main(_):
12
+ async with xm_slurm.create_experiment("My Experiment") as experiment:
13
+ # Step 1: Specify the executor specification
14
+ executor_spec = xm_slurm.Slurm.Spec(tag="ghcr.io/jessefarebro/xm-slurm/test:latest")
15
+
16
+ # Step 2: Specify the executable and package it
17
+ [executable] = experiment.package(
18
+ [
19
+ xm_slurm.uv_container(
20
+ executor_spec=executor_spec,
21
+ # Equivalent of `-m rich.status`
22
+ entrypoint=xm.CommandList(["main.py", "900"]),
23
+ ),
24
+ ],
25
+ )
26
+
27
+ # Step 3: Construct requirements & executor
28
+ requirements = xm_slurm.JobRequirements(
29
+ CPU=1,
30
+ RAM=1.0 * xm.GiB,
31
+ GPU=1,
32
+ replicas=1,
33
+ cluster=xm_slurm.contrib.clusters.mila(),
34
+ )
35
+ executor = xm_slurm.Slurm(
36
+ requirements=requirements,
37
+ time=dt.timedelta(minutes=10),
38
+ )
39
+
40
+ # Step 4: Schedule job
41
+ wu = await experiment.add(
42
+ xm.Job(
43
+ executable=executable,
44
+ executor=executor,
45
+ )
46
+ )
47
+
48
+ await wu.wait_until_complete()
49
+ print(f"Job finished executing with status {await wu.get_status()}")
50
+
51
+
52
+ if __name__ == "__main__":
53
+ app.run(main)
@@ -0,0 +1,11 @@
1
+ import sys
2
+ import time
3
+
4
+
5
+ def main():
6
+ print("Starting job...")
7
+ time.sleep(int(sys.argv[1]))
8
+
9
+
10
+ if __name__ == "__main__":
11
+ main()
@@ -0,0 +1,6 @@
1
+ [project]
2
+ name = "xm-slurm-example"
3
+ description = "XManager Slurm test project"
4
+ version = "0.0.1"
5
+ requires-python = ">=3.10"
6
+ dependencies = ["rich"]
@@ -0,0 +1,66 @@
1
+ version = 1
2
+ requires-python = ">=3.10"
3
+
4
+ [[package]]
5
+ name = "markdown-it-py"
6
+ version = "3.0.0"
7
+ source = { registry = "https://pypi.org/simple" }
8
+ dependencies = [
9
+ { name = "mdurl" },
10
+ ]
11
+ sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596 }
12
+ wheels = [
13
+ { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528 },
14
+ ]
15
+
16
+ [[package]]
17
+ name = "mdurl"
18
+ version = "0.1.2"
19
+ source = { registry = "https://pypi.org/simple" }
20
+ sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729 }
21
+ wheels = [
22
+ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 },
23
+ ]
24
+
25
+ [[package]]
26
+ name = "pygments"
27
+ version = "2.18.0"
28
+ source = { registry = "https://pypi.org/simple" }
29
+ sdist = { url = "https://files.pythonhosted.org/packages/8e/62/8336eff65bcbc8e4cb5d05b55faf041285951b6e80f33e2bff2024788f31/pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199", size = 4891905 }
30
+ wheels = [
31
+ { url = "https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a", size = 1205513 },
32
+ ]
33
+
34
+ [[package]]
35
+ name = "rich"
36
+ version = "13.9.2"
37
+ source = { registry = "https://pypi.org/simple" }
38
+ dependencies = [
39
+ { name = "markdown-it-py" },
40
+ { name = "pygments" },
41
+ { name = "typing-extensions", marker = "python_full_version < '3.11'" },
42
+ ]
43
+ sdist = { url = "https://files.pythonhosted.org/packages/aa/9e/1784d15b057b0075e5136445aaea92d23955aad2c93eaede673718a40d95/rich-13.9.2.tar.gz", hash = "sha256:51a2c62057461aaf7152b4d611168f93a9fc73068f8ded2790f29fe2b5366d0c", size = 222843 }
44
+ wheels = [
45
+ { url = "https://files.pythonhosted.org/packages/67/91/5474b84e505a6ccc295b2d322d90ff6aa0746745717839ee0c5fb4fdcceb/rich-13.9.2-py3-none-any.whl", hash = "sha256:8c82a3d3f8dcfe9e734771313e606b39d8247bb6b826e196f4914b333b743cf1", size = 242117 },
46
+ ]
47
+
48
+ [[package]]
49
+ name = "typing-extensions"
50
+ version = "4.12.2"
51
+ source = { registry = "https://pypi.org/simple" }
52
+ sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 }
53
+ wheels = [
54
+ { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 },
55
+ ]
56
+
57
+ [[package]]
58
+ name = "xm-slurm-example"
59
+ version = "0.0.1"
60
+ source = { virtual = "." }
61
+ dependencies = [
62
+ { name = "rich" },
63
+ ]
64
+
65
+ [package.metadata]
66
+ requires-dist = [{ name = "rich" }]
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
5
5
  [project]
6
6
  name = "xmanager-slurm"
7
7
  description = "Slurm backend for XManager."
8
- version = "0.4.8"
8
+ version = "0.4.10"
9
9
  # readme = "README.md"
10
10
  requires-python = ">=3.10"
11
11
  license = { text = "MIT" }
@@ -1,4 +1,5 @@
1
1
  version = 1
2
+ revision = 1
2
3
  requires-python = ">=3.10"
3
4
  resolution-markers = [
4
5
  "python_full_version < '3.11'",
@@ -2250,7 +2251,7 @@ wheels = [
2250
2251
 
2251
2252
  [[package]]
2252
2253
  name = "xmanager-slurm"
2253
- version = "0.4.8"
2254
+ version = "0.4.10"
2254
2255
  source = { editable = "." }
2255
2256
  dependencies = [
2256
2257
  { name = "aiofile" },
@@ -57,8 +57,13 @@ class Slurm(xm.Executor):
57
57
 
58
58
  requeue: bool = True # Is this job ellible for requeueing?
59
59
  requeue_on_exit_code: int = 42 # The exit code that triggers requeueing
60
+ requeue_on_timeout: bool = True # Should the job requeue upon timeout minus the grace period
60
61
  requeue_max_attempts: int = 5 # How many times to attempt requeueing
61
62
 
63
+ @property
64
+ def requeue_timeout(self) -> dt.timedelta:
65
+ return self.time - self.timeout_signal_grace_period
66
+
62
67
  def __post_init__(self) -> None:
63
68
  if not isinstance(self.time, dt.timedelta):
64
69
  raise TypeError(f"time must be a `datetime.timedelta`, got {type(self.time)}")
@@ -139,15 +139,12 @@ class DockerClient:
139
139
  )
140
140
  return _parse_credentials_from_config(podman_config_path)
141
141
 
142
- def inspect(
143
- self, image: ImageURI, element: str | None = None, type: tp.Literal["image"] = "image"
144
- ) -> dict[str, tp.Any]:
142
+ def inspect(self, image: ImageURI, element: str) -> dict[str, tp.Any]:
145
143
  output = utils.run_command(
146
144
  xm.merge_args(
147
145
  self._client_call,
148
- ["inspect"],
149
- ["--format", f"{{{{json .{element}}}}}"] if element else [],
150
- ["--type", type] if type else [],
146
+ ["buildx", "imagetools", "inspect"],
147
+ ["--format", f"{{{{json .{element}}}}}"],
151
148
  [str(image)],
152
149
  ),
153
150
  check=True,
@@ -259,7 +256,7 @@ class DockerClient:
259
256
  uri = ImageURI(target.value.executor_spec.tag).with_digest(
260
257
  executable_metadata["containerimage.digest"]
261
258
  )
262
- config = self.inspect(uri, "Config")
259
+ config = self.inspect(uri, "Image.Config")
263
260
  if "WorkingDir" not in config:
264
261
  raise ValueError(
265
262
  "Docker image does not have a working directory. "
@@ -320,7 +317,7 @@ def _(
320
317
 
321
318
  uri = ImageURI(target.value.executable_spec.image)
322
319
 
323
- config = client.inspect(uri, "Config")
320
+ config = client.inspect(uri, "Image.Config")
324
321
  if "WorkingDir" not in config:
325
322
  raise ValueError(
326
323
  "Docker image does not have a working directory. "
@@ -0,0 +1,76 @@
1
+ {% macro monitor(requeue_max_attempts, requeue_exit_code, requeue_on_timeout, requeue_timeout) -%}
2
+ __xm_slurm_wait_for_children() {
3
+ if [[ -n "${SLURM_ARRAY_JOB_ID:-}" ]]; then
4
+ local -r JOB_ID="${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}"
5
+ else
6
+ local -r JOB_ID="${SLURM_JOB_ID}"
7
+ fi
8
+
9
+ # If there are no child jobs we should error out
10
+ children=( $(jobs -p) )
11
+ {% raw %}
12
+ if [ ${#children[@]} -eq 0 ]; then
13
+ {% endraw %}
14
+ echo "ERROR: no child jobs exist..." >&2
15
+ exit 1
16
+ fi
17
+
18
+ {% if requeue_on_timeout %}
19
+ # Start a watchdog process to signal timeout.
20
+ sleep {{ requeue_timeout }} &
21
+ timeout_pid=$!
22
+ {% endif %}
23
+
24
+ {% raw %}
25
+ while [ ${#children[@]} -gt 0 ]; do
26
+ {% endraw %}
27
+ echo "INFO: Waiting for child processes to finish..."
28
+ {% if requeue_on_timeout %}
29
+ # Wait on either one of the child processes or the timeout process.
30
+ wait -n -p child_pid "${children[@]}" "${timeout_pid}"
31
+ {% else %}
32
+ wait -n -p child_pid "${children[@]}"
33
+ {% endif %}
34
+ local child_exit_code=$?
35
+
36
+ {% if requeue_on_timeout %}
37
+ # If the finished process is the watchdog, trigger the timeout handling.
38
+ if [ "${child_pid}" = "${timeout_pid}" ]; then
39
+ echo "INFO: Timeout of {{ requeue_timeout }} seconds reached. Killing remaining processes: ${children[*]}" >&2
40
+ kill "${children[@]}" 2>/dev/null || true
41
+ scontrol requeue "${JOB_ID}"
42
+ exit {{ requeue_exit_code }}
43
+ fi
44
+ {% endif %}
45
+
46
+ echo "INFO: Process ${child_pid} finished with exit code ${child_exit_code}."
47
+
48
+ # Handle the exit code of the finished process.
49
+ if [ "${child_exit_code}" -eq "{{ requeue_exit_code }}" ] && [ "${SLURM_RESTART_COUNT:-0}" -le "{{ requeue_max_attempts }}" ]; then
50
+ echo "INFO: Received requeue exit code {{ requeue_exit_code }} from process ${child_pid}. Requeuing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
51
+ scontrol requeue "${JOB_ID}"
52
+ exit {{ requeue_exit_code }}
53
+ elif [ "${child_exit_code}" -ne 0 ]; then
54
+ echo "ERROR: Process ${child_pid} exited with code ${child_exit_code}." >&2
55
+ exit "${child_exit_code}"
56
+ fi
57
+
58
+ # Remove the finished PID from the array in a concise way.
59
+ for i in "${!children[@]}"; do
60
+ if [ "${children[i]}" = "$child_pid" ]; then
61
+ unset 'children[i]'
62
+ break
63
+ fi
64
+ done
65
+
66
+ # Reindex the array.
67
+ children=( "${children[@]}" )
68
+ done
69
+
70
+ {% if requeue_on_timeout %}
71
+ kill "$timeout_pid" 2>/dev/null || true
72
+ {% endif %}
73
+ }
74
+
75
+ __xm_slurm_wait_for_children
76
+ {%- endmacro %}
@@ -73,7 +73,7 @@ echo "[INFO] Start timestamp: $(date)"
73
73
 
74
74
  {% block monitor -%}
75
75
  {% from 'fragments/monitor.bash.j2' import monitor %}
76
- {{ monitor(job.executor.requeue_max_attempts, job.executor.requeue_on_exit_code) }}
76
+ {{ monitor(job.executor.requeue_max_attempts, job.executor.requeue_on_exit_code, job.executor.requeue_on_timeout, job.executor.requeue_timeout.seconds) }}
77
77
  {%- endblock monitor %}
78
78
 
79
79
 
@@ -1,37 +0,0 @@
1
- {% macro monitor(requeue_max_attempts, requeue_exit_code) -%}
2
- __xm_slurm_wait_for_children() {
3
- if [[ -n "${SLURM_ARRAY_JOB_ID:-}" ]]; then
4
- local -r JOB_ID="${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}"
5
- else
6
- local -r JOB_ID="${SLURM_JOB_ID}"
7
- fi
8
-
9
- # If there are no child jobs we should error out
10
- if [ -z "$(jobs -p)" ]; then
11
- echo "ERROR: no child jobs exist..." >&2
12
- exit -1
13
- fi
14
-
15
- # Loop through all job IDs in the background job list and wait for them to finish
16
- for job in "$(jobs -p)"; do
17
- echo "INFO: Waiting for job ${job} to finish..."
18
- set +e
19
- wait "${job}"
20
- local -r JOB_EXIT_CODE="${?}"
21
- set -e
22
-
23
- if [ "${JOB_EXIT_CODE}" -eq "{{ requeue_exit_code }}" ] && [ "${SLURM_RESTART_COUNT-0}" -le "{{ requeue_max_attempts }}" ]; then
24
- echo "INFO: Received requeue exit code {{ requeue_exit_code }} from job ${job}. Requeing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
25
- scontrol requeue "${JOB_ID}"
26
- exit {{ requeue_exit_code }}
27
- elif [ "${JOB_EXIT_CODE}" -ne 0 ]; then
28
- echo "ERROR: Job ${job} exited with code ${JOB_EXIT_CODE}." >&2
29
- exit "${JOB_EXIT_CODE}"
30
- else
31
- echo "INFO: Job ${job} exited successfully." >&2
32
- fi
33
- done
34
- }
35
-
36
- __xm_slurm_wait_for_children
37
- {%- endmacro %}