xmanager-slurm 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xmanager-slurm might be problematic. Click here for more details.

xm_slurm/contrib/clusters/__init__.py CHANGED
@@ -31,23 +31,6 @@ def mila(
31
31
  "/home/mila/${USER:0:1}/$USER/.ssh": "/home/mila/${USER:0:1}/$USER/.ssh",
32
32
  }
33
33
 
34
- def validate(job: xm.Job) -> None:
35
- assert isinstance(job.executor, Slurm)
36
-
37
- wants_requeue_with_grace_period = (
38
- job.executor.requeue and job.executor.timeout_signal_grace_period > dt.timedelta(0)
39
- )
40
- partition = job.executor.partition or "main"
41
-
42
- if wants_requeue_with_grace_period and (
43
- partition is None or not partition.endswith("-grace")
44
- ):
45
- logger.warning(
46
- f"Job {job.name} wants requeue with grace period, but partition `{partition}` does not end with '-grace'. "
47
- "Mila Cluster requires you specify a grace partition. "
48
- "This may result in the job not being requeued properly."
49
- )
50
-
51
34
  return config.SlurmClusterConfig(
52
35
  name="mila",
53
36
  ssh=config.SlurmSSHConfig(
@@ -77,10 +60,11 @@ def mila(
77
60
  resources.ResourceType.A100: "a100",
78
61
  resources.ResourceType.A100_80GIB: "a100l",
79
62
  resources.ResourceType.A6000: "a6000",
63
+ resources.ResourceType.L40S: "l40s",
64
+ resources.ResourceType.H100: "h100",
80
65
  },
81
66
  features={
82
67
  resources.FeatureType.NVIDIA_MIG: "mig",
83
68
  resources.FeatureType.NVIDIA_NVLINK: "nvlink",
84
69
  },
85
- validate=validate,
86
70
  )
xm_slurm/resources.py CHANGED
@@ -33,6 +33,7 @@ class ResourceType(enum.IntEnum):
33
33
  A6000 = 1033
34
34
 
35
35
  H100 = 1040
36
+ L40S = 1041
36
37
 
37
38
 
38
39
  AcceleratorType = set([
@@ -47,6 +48,7 @@ AcceleratorType = set([
47
48
  ResourceType.A5000,
48
49
  ResourceType.A6000,
49
50
  ResourceType.H100,
51
+ ResourceType.L40S,
50
52
  ])
51
53
 
52
54
  assert AcceleratorType | {
xm_slurm/scripts/cli.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import argparse
2
+ import sys
2
3
 
3
4
  from xmanager import xm
4
5
 
@@ -8,13 +9,26 @@ from xm_slurm.console import console
8
9
 
9
10
  async def logs(
10
11
  experiment_id: int,
11
- wid: int,
12
12
  *,
13
+ wid: int | None,
14
+ identity: str | None,
13
15
  follow: bool = True,
14
16
  num_lines: int = 10,
15
17
  block_size: int = 1024,
16
18
  ):
17
- wu = xm_slurm.get_experiment(experiment_id).work_units()[wid]
19
+ xp = xm_slurm.get_experiment(experiment_id)
20
+
21
+ if wid is not None:
22
+ wu = xp.work_units()[wid]
23
+ elif identity is not None:
24
+ wu = xp._get_work_unit_by_identity(identity)
25
+ if wu is None:
26
+ console.print(f"[red]Work Unit with identity {identity} not found.[/red]")
27
+ sys.exit(1)
28
+ else:
29
+ raise ValueError("Must specify either wid or identity.")
30
+ assert wu is not None
31
+
18
32
  async for log in wu.logs(num_lines=num_lines, block_size=block_size, wait=True, follow=follow):
19
33
  console.print(log, end="\n")
20
34
 
@@ -26,7 +40,12 @@ async def main():
26
40
 
27
41
  logs_parser = subparsers.add_parser("logs", help="Display logs for a specific experiment.")
28
42
  logs_parser.add_argument("xid", type=int, help="Experiment ID.")
29
- logs_parser.add_argument("wid", type=int, help="Work Unit ID.")
43
+
44
+ # Create a mutually exclusive group for wid and identity
45
+ group = logs_parser.add_mutually_exclusive_group()
46
+ group.add_argument("--wid", type=int, help="Work Unit ID.")
47
+ group.add_argument("--identity", type=str, help="Work Unit identity.")
48
+
30
49
  logs_parser.add_argument(
31
50
  "-n",
32
51
  "--n-lines",
@@ -45,7 +64,13 @@ async def main():
45
64
  args = parser.parse_args()
46
65
  match args.subcommand:
47
66
  case "logs":
48
- await logs(args.xid, args.wid, follow=args.follow, num_lines=args.n_lines)
67
+ await logs(
68
+ args.xid,
69
+ wid=args.wid,
70
+ identity=args.identity,
71
+ follow=args.follow,
72
+ num_lines=args.n_lines,
73
+ )
49
74
 
50
75
 
51
76
  if __name__ == "__main__":
xm_slurm/templates/docker/uv.Dockerfile CHANGED
@@ -32,4 +32,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
32
32
 
33
33
  ENV PATH="/workspace/.venv/bin:$PATH"
34
34
 
35
- ENTRYPOINT [ "uv", "run", "python" ]
35
+ ENTRYPOINT [ "python" ]
xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 CHANGED
@@ -6,6 +6,7 @@ export {{ key }}="{{ value }}"
6
6
  {% else %}
7
7
  export APPTAINERENV_{{ key }}="{{ value }}"
8
8
  export SINGULARITYENV_{{ key }}="{{ value }}"
9
+ export {{ key }}="{{ value }}"
9
10
  {% endif %}
10
11
  {% endfor %}
11
12
  {%- endmacro %}
@@ -1,11 +1,10 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: xmanager-slurm
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: Slurm backend for XManager.
5
5
  Project-URL: GitHub, https://github.com/jessefarebro/xm-slurm
6
6
  Author-email: Jesse Farebrother <jfarebro@cs.mcgill.ca>
7
7
  License: MIT
8
- License-File: LICENSE.md
9
8
  Classifier: License :: OSI Approved :: Apache Software License
10
9
  Classifier: License :: OSI Approved :: MIT License
11
10
  Classifier: Operating System :: OS Independent
@@ -11,12 +11,12 @@ xm_slurm/executors.py,sha256=fMtxGUCi4vEKmb_p4JEpqPUTh7L_f1LcR_TamMLAWNg,4667
11
11
  xm_slurm/experiment.py,sha256=trHapcYxPNKofzSqu7KZawML59tZ8FVjoEZYe2Wal7w,44521
12
12
  xm_slurm/job_blocks.py,sha256=_F8CKCs5BQFj40a2-mjG71HfacvWoBXBDPDKEaKTbXc,616
13
13
  xm_slurm/packageables.py,sha256=YZFTL6UWx9A_zyztTy1THUlj3pW1rA0cBPHJxD1LOJk,12884
14
- xm_slurm/resources.py,sha256=EaYDATVudrEDPKKdSZoWgfqPiidc6DMjIctmzLQmiH0,5683
14
+ xm_slurm/resources.py,sha256=tET3TPOQ8nXYE_SxAs2fiHt9UKJsCLW1vFktJTH0xG4,5722
15
15
  xm_slurm/status.py,sha256=WTWiDHi-ZHtwHRnDP0cGa-27zTSm6LkA-GCKsN-zBgg,6916
16
16
  xm_slurm/types.py,sha256=TsVykDm-LazVkrjeJrTwCMs4Q8APKhy7BTk0yKIhFNg,805
17
17
  xm_slurm/utils.py,sha256=ESjOkGT7bRSzIeZrUtZplSHP4oaH6VZ92y2woYdcyKM,2239
18
18
  xm_slurm/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- xm_slurm/contrib/clusters/__init__.py,sha256=vugR50D9fPJQN5bTd7cSArDGrA6pC-YJHMXrEyvr_Uw,2980
19
+ xm_slurm/contrib/clusters/__init__.py,sha256=JI_zTMfozanpfbBcNjPpgGi5Ppc6jyjM05X3rvFODLs,2321
20
20
  xm_slurm/contrib/clusters/drac.py,sha256=tJeQFWFIpeZ1gD3j6AAJssNoLSiDkB-3lz1_ObnkRhc,5905
21
21
  xm_slurm/experimental/parameter_controller.py,sha256=b5LfglHV307F6QcPrHeZX5GJBtyOK9aQydke_SZ3Wto,8457
22
22
  xm_slurm/packaging/__init__.py,sha256=dh307yLpUT9KN7rJ1e9fYC6hegGKfZcGboUq9nGpDVQ,233
@@ -25,20 +25,20 @@ xm_slurm/packaging/registry.py,sha256=GrdmQg9MgSo38OiqOzMKWSkQyBuyryOfc3zcdgZ4CU
25
25
  xm_slurm/packaging/router.py,sha256=yPbdA9clrhly97cLgDsSRZG2LZRKE-oz8Hhdb7WtYqk,2070
26
26
  xm_slurm/packaging/utils.py,sha256=KI5s32rNTCfgwzY_7Ghck27jHKvKg5sl5_NEEqJbJqI,3999
27
27
  xm_slurm/scripts/_cloudpickle.py,sha256=dlJYf2SceOuUn8wi-ozuoYAQg71wqD2MUVOUCyOwWIY,647
28
- xm_slurm/scripts/cli.py,sha256=ZXqYOs8X23TYDdKxvV-wIa-0mTfpxSl4_Pli6TiKI7s,1435
28
+ xm_slurm/scripts/cli.py,sha256=xA4SqcMtX_NXXdUDgJ47qNHw2uGvmn_JA3XiDXk-jFA,2152
29
29
  xm_slurm/templates/docker/docker-bake.hcl.j2,sha256=ClsFpj91Mr1VfA8L6eqBG3HQz0Z8VenF6mEfmAhQgUo,1498
30
30
  xm_slurm/templates/docker/mamba.Dockerfile,sha256=Sgxr5IA5T-pT1Shumb5k3JngoG4pgCdBXjzqslFJdZI,753
31
31
  xm_slurm/templates/docker/python.Dockerfile,sha256=U4b4QVkopckQ0o9jJIE7d_M6TvExEYlYDirNwCoZ7W4,865
32
- xm_slurm/templates/docker/uv.Dockerfile,sha256=kYD32oUS1jUaARsNV1o6EFnIfLCNh5GMmck27b-5NRU,969
32
+ xm_slurm/templates/docker/uv.Dockerfile,sha256=YB4LTs42ycDw8EHyz3U0_fR3lRAjmjrnXGlfV1Um394,956
33
33
  xm_slurm/templates/slurm/job-array.bash.j2,sha256=iYtGMRDXgwwc2_8E3v4a30f3fKuq4zWgZHkxCXJ9iXc,567
34
34
  xm_slurm/templates/slurm/job-group.bash.j2,sha256=UkjfBE7jg9mepcUWaHZEAjkiXsIM1j_sLxLzxkteD-Y,1120
35
35
  xm_slurm/templates/slurm/job.bash.j2,sha256=v0xGYzagDdWW6Tg44qobGJLNSUP1Cf4CcekrPibYdrE,1864
36
36
  xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=HYqYhXsTv8TCed5UaGCZVGIYsqxSKHcnPyNNTHWNvxc,1279
37
37
  xm_slurm/templates/slurm/fragments/proxy.bash.j2,sha256=VJLglZo-Nvx9R-qe3rHTxr07CylTQ6Z9NwBzvIpAZrA,814
38
- xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=dMntzelhs8DqKyIpO9S6wzMfH2PDevmgvyjCW8Xc2dY,3222
38
+ xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=ggSsAxv-2_Ct3hSxFJgwgwa3Wu8xH3JqLxWtJOYYrsA,3253
39
39
  xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=xKXYFvQvazMx0PgvmlRXR6eecoiBUl8y52dIzQtWkBE,1469
40
- xmanager_slurm-0.4.1.dist-info/METADATA,sha256=3mT4XIm8evv-5qw7oney4nYn3IasIA_l1rWz86XNOY8,954
41
- xmanager_slurm-0.4.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
42
- xmanager_slurm-0.4.1.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
43
- xmanager_slurm-0.4.1.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
44
- xmanager_slurm-0.4.1.dist-info/RECORD,,
40
+ xmanager_slurm-0.4.3.dist-info/METADATA,sha256=l-9s8x9MuROBJcpwD9qQqkQyU6Jq3WkU0uL5r6qEKJE,929
41
+ xmanager_slurm-0.4.3.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
42
+ xmanager_slurm-0.4.3.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
43
+ xmanager_slurm-0.4.3.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
44
+ xmanager_slurm-0.4.3.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.26.3
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any