xmanager-slurm 0.4.1__tar.gz → 0.4.2__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xmanager-slurm might be problematic. Click here for more details.

Files changed (110)
  1. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/PKG-INFO +1 -1
  2. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/pyproject.toml +1 -1
  3. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/uv.lock +1 -1
  4. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/contrib/clusters/__init__.py +2 -18
  5. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/resources.py +2 -0
  6. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/scripts/cli.py +29 -4
  7. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +1 -0
  8. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/.devcontainer.json +0 -0
  9. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/.github/workflows/ci.yml +0 -0
  10. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/.github/workflows/deploy-docs.yml +0 -0
  11. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/.gitignore +0 -0
  12. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/.pre-commit-config.yaml +0 -0
  13. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/.vscode/settings.json +0 -0
  14. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/LICENSE.md +0 -0
  15. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/README.md +0 -0
  16. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/docs/api/executables.rst +0 -0
  17. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/docs/api/executors.rst +0 -0
  18. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/docs/api/packageables.rst +0 -0
  19. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/docs/assets/workflow-dark.svg +0 -0
  20. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/docs/assets/workflow-light.svg +0 -0
  21. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/docs/conf.py +0 -0
  22. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/docs/getting-started/xmanager.md +0 -0
  23. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/docs/guides/index.md +0 -0
  24. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/docs/guides/remote-dev.md +0 -0
  25. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/docs/index.md +0 -0
  26. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/conda/environment.yml +0 -0
  27. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/conda/launch.py +0 -0
  28. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/conda/main.py +0 -0
  29. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/conda/pyproject.toml +0 -0
  30. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/custom-dockerfile/Dockerfile +0 -0
  31. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/custom-dockerfile/launch.py +0 -0
  32. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/custom-dockerfile/pyproject.toml +0 -0
  33. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-array-sweep/launch.py +0 -0
  34. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-array-sweep/main.py +0 -0
  35. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-array-sweep/pyproject.toml +0 -0
  36. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-array-sweep/uv.lock +0 -0
  37. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-dependencies/eval.py +0 -0
  38. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-dependencies/launch.py +0 -0
  39. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-dependencies/pyproject.toml +0 -0
  40. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-dependencies/train.py +0 -0
  41. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-dependencies/uv.lock +0 -0
  42. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-group/Dockerfile +0 -0
  43. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-group/launch.py +0 -0
  44. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-group/pyproject.toml +0 -0
  45. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/job-group/uv.lock +0 -0
  46. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/metadata/launch.py +0 -0
  47. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/metadata/main.py +0 -0
  48. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/metadata/pyproject.toml +0 -0
  49. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/metadata/requirements.txt +0 -0
  50. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/parameter-controller/launch.py +0 -0
  51. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/parameter-controller/main.py +0 -0
  52. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/parameter-controller/pyproject.toml +0 -0
  53. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/parameter-controller/requirements.txt +0 -0
  54. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/pip/launch.py +0 -0
  55. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/pip/main.py +0 -0
  56. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/pip/pyproject.toml +0 -0
  57. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/pip/requirements.txt +0 -0
  58. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/uv/launch.py +0 -0
  59. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/uv/pyproject.toml +0 -0
  60. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/examples/uv/uv.lock +0 -0
  61. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/conftest.py +0 -0
  62. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/fixtures/slurm/Dockerfile +0 -0
  63. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/fixtures/slurm/README.md +0 -0
  64. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/fixtures/slurm/cgroup.conf +0 -0
  65. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/fixtures/slurm/docker-compose.yml +0 -0
  66. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/fixtures/slurm/docker-entrypoint.sh +0 -0
  67. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/fixtures/slurm/host_ed25519 +0 -0
  68. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/fixtures/slurm/host_ed25519.pub +0 -0
  69. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/fixtures/slurm/id_ed25519 +0 -0
  70. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/fixtures/slurm/id_ed25519.pub +0 -0
  71. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/fixtures/slurm/slurm.conf +0 -0
  72. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/fixtures/slurm/slurmdbd.conf +0 -0
  73. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/fixtures/slurm/sshd_config +0 -0
  74. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/integration/test_remote_execution.py +0 -0
  75. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/tests/test_dependencies.py +0 -0
  76. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/__init__.py +0 -0
  77. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/api.py +0 -0
  78. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/batching.py +0 -0
  79. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/config.py +0 -0
  80. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/console.py +0 -0
  81. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/constants.py +0 -0
  82. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/contrib/__init__.py +0 -0
  83. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/contrib/clusters/drac.py +0 -0
  84. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/dependencies.py +0 -0
  85. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/executables.py +0 -0
  86. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/execution.py +0 -0
  87. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/executors.py +0 -0
  88. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/experiment.py +0 -0
  89. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/experimental/parameter_controller.py +0 -0
  90. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/job_blocks.py +0 -0
  91. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/packageables.py +0 -0
  92. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/packaging/__init__.py +0 -0
  93. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/packaging/docker.py +0 -0
  94. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/packaging/registry.py +0 -0
  95. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/packaging/router.py +0 -0
  96. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/packaging/utils.py +0 -0
  97. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/scripts/_cloudpickle.py +0 -0
  98. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/status.py +0 -0
  99. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/templates/docker/docker-bake.hcl.j2 +0 -0
  100. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/templates/docker/mamba.Dockerfile +0 -0
  101. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/templates/docker/python.Dockerfile +0 -0
  102. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/templates/docker/uv.Dockerfile +0 -0
  103. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/templates/slurm/fragments/monitor.bash.j2 +0 -0
  104. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/templates/slurm/fragments/proxy.bash.j2 +0 -0
  105. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/templates/slurm/job-array.bash.j2 +0 -0
  106. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/templates/slurm/job-group.bash.j2 +0 -0
  107. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/templates/slurm/job.bash.j2 +0 -0
  108. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/templates/slurm/runtimes/podman.bash.j2 +0 -0
  109. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/types.py +0 -0
  110. {xmanager_slurm-0.4.1 → xmanager_slurm-0.4.2}/xm_slurm/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: xmanager-slurm
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Slurm backend for XManager.
5
5
  Project-URL: GitHub, https://github.com/jessefarebro/xm-slurm
6
6
  Author-email: Jesse Farebrother <jfarebro@cs.mcgill.ca>
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
5
5
  [project]
6
6
  name = "xmanager-slurm"
7
7
  description = "Slurm backend for XManager."
8
- version = "0.4.1"
8
+ version = "0.4.2"
9
9
  # readme = "README.md"
10
10
  requires-python = ">=3.10"
11
11
  license = { text = "MIT" }
@@ -2196,7 +2196,7 @@ wheels = [
2196
2196
 
2197
2197
  [[package]]
2198
2198
  name = "xmanager-slurm"
2199
- version = "0.4.0"
2199
+ version = "0.4.2"
2200
2200
  source = { editable = "." }
2201
2201
  dependencies = [
2202
2202
  { name = "asyncssh" },
@@ -31,23 +31,6 @@ def mila(
31
31
  "/home/mila/${USER:0:1}/$USER/.ssh": "/home/mila/${USER:0:1}/$USER/.ssh",
32
32
  }
33
33
 
34
- def validate(job: xm.Job) -> None:
35
- assert isinstance(job.executor, Slurm)
36
-
37
- wants_requeue_with_grace_period = (
38
- job.executor.requeue and job.executor.timeout_signal_grace_period > dt.timedelta(0)
39
- )
40
- partition = job.executor.partition or "main"
41
-
42
- if wants_requeue_with_grace_period and (
43
- partition is None or not partition.endswith("-grace")
44
- ):
45
- logger.warning(
46
- f"Job {job.name} wants requeue with grace period, but partition `{partition}` does not end with '-grace'. "
47
- "Mila Cluster requires you specify a grace partition. "
48
- "This may result in the job not being requeued properly."
49
- )
50
-
51
34
  return config.SlurmClusterConfig(
52
35
  name="mila",
53
36
  ssh=config.SlurmSSHConfig(
@@ -77,10 +60,11 @@ def mila(
77
60
  resources.ResourceType.A100: "a100",
78
61
  resources.ResourceType.A100_80GIB: "a100l",
79
62
  resources.ResourceType.A6000: "a6000",
63
+ resources.ResourceType.L40S: "l40s",
64
+ resources.ResourceType.H100: "h100",
80
65
  },
81
66
  features={
82
67
  resources.FeatureType.NVIDIA_MIG: "mig",
83
68
  resources.FeatureType.NVIDIA_NVLINK: "nvlink",
84
69
  },
85
- validate=validate,
86
70
  )
@@ -33,6 +33,7 @@ class ResourceType(enum.IntEnum):
33
33
  A6000 = 1033
34
34
 
35
35
  H100 = 1040
36
+ L40S = 1041
36
37
 
37
38
 
38
39
  AcceleratorType = set([
@@ -47,6 +48,7 @@ AcceleratorType = set([
47
48
  ResourceType.A5000,
48
49
  ResourceType.A6000,
49
50
  ResourceType.H100,
51
+ ResourceType.L40S,
50
52
  ])
51
53
 
52
54
  assert AcceleratorType | {
@@ -1,4 +1,5 @@
1
1
  import argparse
2
+ import sys
2
3
 
3
4
  from xmanager import xm
4
5
 
@@ -8,13 +9,26 @@ from xm_slurm.console import console
8
9
 
9
10
  async def logs(
10
11
  experiment_id: int,
11
- wid: int,
12
12
  *,
13
+ wid: int | None,
14
+ identity: str | None,
13
15
  follow: bool = True,
14
16
  num_lines: int = 10,
15
17
  block_size: int = 1024,
16
18
  ):
17
- wu = xm_slurm.get_experiment(experiment_id).work_units()[wid]
19
+ xp = xm_slurm.get_experiment(experiment_id)
20
+
21
+ if wid is not None:
22
+ wu = xp.work_units()[wid]
23
+ elif identity is not None:
24
+ wu = xp._get_work_unit_by_identity(identity)
25
+ if wu is None:
26
+ console.print(f"[red]Work Unit with identity {identity} not found.[/red]")
27
+ sys.exit(1)
28
+ else:
29
+ raise ValueError("Must specify either wid or identity.")
30
+ assert wu is not None
31
+
18
32
  async for log in wu.logs(num_lines=num_lines, block_size=block_size, wait=True, follow=follow):
19
33
  console.print(log, end="\n")
20
34
 
@@ -26,7 +40,12 @@ async def main():
26
40
 
27
41
  logs_parser = subparsers.add_parser("logs", help="Display logs for a specific experiment.")
28
42
  logs_parser.add_argument("xid", type=int, help="Experiment ID.")
29
- logs_parser.add_argument("wid", type=int, help="Work Unit ID.")
43
+
44
+ # Create a mutually exclusive group for wid and identity
45
+ group = logs_parser.add_mutually_exclusive_group()
46
+ group.add_argument("--wid", type=int, help="Work Unit ID.")
47
+ group.add_argument("--identity", type=str, help="Work Unit identity.")
48
+
30
49
  logs_parser.add_argument(
31
50
  "-n",
32
51
  "--n-lines",
@@ -45,7 +64,13 @@ async def main():
45
64
  args = parser.parse_args()
46
65
  match args.subcommand:
47
66
  case "logs":
48
- await logs(args.xid, args.wid, follow=args.follow, num_lines=args.n_lines)
67
+ await logs(
68
+ args.xid,
69
+ wid=args.wid,
70
+ identity=args.identity,
71
+ follow=args.follow,
72
+ num_lines=args.n_lines,
73
+ )
49
74
 
50
75
 
51
76
  if __name__ == "__main__":
@@ -6,6 +6,7 @@ export {{ key }}="{{ value }}"
6
6
  {% else %}
7
7
  export APPTAINERENV_{{ key }}="{{ value }}"
8
8
  export SINGULARITYENV_{{ key }}="{{ value }}"
9
+ export {{ key }}="{{ value }}"
9
10
  {% endif %}
10
11
  {% endfor %}
11
12
  {%- endmacro %}
File without changes