xmanager-slurm: xmanager_slurm-0.4.1-py3-none-any.whl → xmanager_slurm-0.4.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xmanager-slurm might be problematic. Click here for more details.
- xm_slurm/contrib/clusters/__init__.py +2 -18
- xm_slurm/resources.py +2 -0
- xm_slurm/scripts/cli.py +29 -4
- xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +1 -0
- {xmanager_slurm-0.4.1.dist-info → xmanager_slurm-0.4.2.dist-info}/METADATA +1 -1
- {xmanager_slurm-0.4.1.dist-info → xmanager_slurm-0.4.2.dist-info}/RECORD +9 -9
- {xmanager_slurm-0.4.1.dist-info → xmanager_slurm-0.4.2.dist-info}/WHEEL +0 -0
- {xmanager_slurm-0.4.1.dist-info → xmanager_slurm-0.4.2.dist-info}/entry_points.txt +0 -0
- {xmanager_slurm-0.4.1.dist-info → xmanager_slurm-0.4.2.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -31,23 +31,6 @@ def mila(
|
|
|
31
31
|
"/home/mila/${USER:0:1}/$USER/.ssh": "/home/mila/${USER:0:1}/$USER/.ssh",
|
|
32
32
|
}
|
|
33
33
|
|
|
34
|
-
def validate(job: xm.Job) -> None:
|
|
35
|
-
assert isinstance(job.executor, Slurm)
|
|
36
|
-
|
|
37
|
-
wants_requeue_with_grace_period = (
|
|
38
|
-
job.executor.requeue and job.executor.timeout_signal_grace_period > dt.timedelta(0)
|
|
39
|
-
)
|
|
40
|
-
partition = job.executor.partition or "main"
|
|
41
|
-
|
|
42
|
-
if wants_requeue_with_grace_period and (
|
|
43
|
-
partition is None or not partition.endswith("-grace")
|
|
44
|
-
):
|
|
45
|
-
logger.warning(
|
|
46
|
-
f"Job {job.name} wants requeue with grace period, but partition `{partition}` does not end with '-grace'. "
|
|
47
|
-
"Mila Cluster requires you specify a grace partition. "
|
|
48
|
-
"This may result in the job not being requeued properly."
|
|
49
|
-
)
|
|
50
|
-
|
|
51
34
|
return config.SlurmClusterConfig(
|
|
52
35
|
name="mila",
|
|
53
36
|
ssh=config.SlurmSSHConfig(
|
|
@@ -77,10 +60,11 @@ def mila(
|
|
|
77
60
|
resources.ResourceType.A100: "a100",
|
|
78
61
|
resources.ResourceType.A100_80GIB: "a100l",
|
|
79
62
|
resources.ResourceType.A6000: "a6000",
|
|
63
|
+
resources.ResourceType.L40S: "l40s",
|
|
64
|
+
resources.ResourceType.H100: "h100",
|
|
80
65
|
},
|
|
81
66
|
features={
|
|
82
67
|
resources.FeatureType.NVIDIA_MIG: "mig",
|
|
83
68
|
resources.FeatureType.NVIDIA_NVLINK: "nvlink",
|
|
84
69
|
},
|
|
85
|
-
validate=validate,
|
|
86
70
|
)
|
xm_slurm/resources.py
CHANGED
|
@@ -33,6 +33,7 @@ class ResourceType(enum.IntEnum):
|
|
|
33
33
|
A6000 = 1033
|
|
34
34
|
|
|
35
35
|
H100 = 1040
|
|
36
|
+
L40S = 1041
|
|
36
37
|
|
|
37
38
|
|
|
38
39
|
AcceleratorType = set([
|
|
@@ -47,6 +48,7 @@ AcceleratorType = set([
|
|
|
47
48
|
ResourceType.A5000,
|
|
48
49
|
ResourceType.A6000,
|
|
49
50
|
ResourceType.H100,
|
|
51
|
+
ResourceType.L40S,
|
|
50
52
|
])
|
|
51
53
|
|
|
52
54
|
assert AcceleratorType | {
|
xm_slurm/scripts/cli.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import argparse
|
|
2
|
+
import sys
|
|
2
3
|
|
|
3
4
|
from xmanager import xm
|
|
4
5
|
|
|
@@ -8,13 +9,26 @@ from xm_slurm.console import console
|
|
|
8
9
|
|
|
9
10
|
async def logs(
|
|
10
11
|
experiment_id: int,
|
|
11
|
-
wid: int,
|
|
12
12
|
*,
|
|
13
|
+
wid: int | None,
|
|
14
|
+
identity: str | None,
|
|
13
15
|
follow: bool = True,
|
|
14
16
|
num_lines: int = 10,
|
|
15
17
|
block_size: int = 1024,
|
|
16
18
|
):
|
|
17
|
-
|
|
19
|
+
xp = xm_slurm.get_experiment(experiment_id)
|
|
20
|
+
|
|
21
|
+
if wid is not None:
|
|
22
|
+
wu = xp.work_units()[wid]
|
|
23
|
+
elif identity is not None:
|
|
24
|
+
wu = xp._get_work_unit_by_identity(identity)
|
|
25
|
+
if wu is None:
|
|
26
|
+
console.print(f"[red]Work Unit with identity {identity} not found.[/red]")
|
|
27
|
+
sys.exit(1)
|
|
28
|
+
else:
|
|
29
|
+
raise ValueError("Must specify either wid or identity.")
|
|
30
|
+
assert wu is not None
|
|
31
|
+
|
|
18
32
|
async for log in wu.logs(num_lines=num_lines, block_size=block_size, wait=True, follow=follow):
|
|
19
33
|
console.print(log, end="\n")
|
|
20
34
|
|
|
@@ -26,7 +40,12 @@ async def main():
|
|
|
26
40
|
|
|
27
41
|
logs_parser = subparsers.add_parser("logs", help="Display logs for a specific experiment.")
|
|
28
42
|
logs_parser.add_argument("xid", type=int, help="Experiment ID.")
|
|
29
|
-
|
|
43
|
+
|
|
44
|
+
# Create a mutually exclusive group for wid and identity
|
|
45
|
+
group = logs_parser.add_mutually_exclusive_group()
|
|
46
|
+
group.add_argument("--wid", type=int, help="Work Unit ID.")
|
|
47
|
+
group.add_argument("--identity", type=str, help="Work Unit identity.")
|
|
48
|
+
|
|
30
49
|
logs_parser.add_argument(
|
|
31
50
|
"-n",
|
|
32
51
|
"--n-lines",
|
|
@@ -45,7 +64,13 @@ async def main():
|
|
|
45
64
|
args = parser.parse_args()
|
|
46
65
|
match args.subcommand:
|
|
47
66
|
case "logs":
|
|
48
|
-
await logs(
|
|
67
|
+
await logs(
|
|
68
|
+
args.xid,
|
|
69
|
+
wid=args.wid,
|
|
70
|
+
identity=args.identity,
|
|
71
|
+
follow=args.follow,
|
|
72
|
+
num_lines=args.n_lines,
|
|
73
|
+
)
|
|
49
74
|
|
|
50
75
|
|
|
51
76
|
if __name__ == "__main__":
|
|
@@ -11,12 +11,12 @@ xm_slurm/executors.py,sha256=fMtxGUCi4vEKmb_p4JEpqPUTh7L_f1LcR_TamMLAWNg,4667
|
|
|
11
11
|
xm_slurm/experiment.py,sha256=trHapcYxPNKofzSqu7KZawML59tZ8FVjoEZYe2Wal7w,44521
|
|
12
12
|
xm_slurm/job_blocks.py,sha256=_F8CKCs5BQFj40a2-mjG71HfacvWoBXBDPDKEaKTbXc,616
|
|
13
13
|
xm_slurm/packageables.py,sha256=YZFTL6UWx9A_zyztTy1THUlj3pW1rA0cBPHJxD1LOJk,12884
|
|
14
|
-
xm_slurm/resources.py,sha256=
|
|
14
|
+
xm_slurm/resources.py,sha256=tET3TPOQ8nXYE_SxAs2fiHt9UKJsCLW1vFktJTH0xG4,5722
|
|
15
15
|
xm_slurm/status.py,sha256=WTWiDHi-ZHtwHRnDP0cGa-27zTSm6LkA-GCKsN-zBgg,6916
|
|
16
16
|
xm_slurm/types.py,sha256=TsVykDm-LazVkrjeJrTwCMs4Q8APKhy7BTk0yKIhFNg,805
|
|
17
17
|
xm_slurm/utils.py,sha256=ESjOkGT7bRSzIeZrUtZplSHP4oaH6VZ92y2woYdcyKM,2239
|
|
18
18
|
xm_slurm/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
-
xm_slurm/contrib/clusters/__init__.py,sha256=
|
|
19
|
+
xm_slurm/contrib/clusters/__init__.py,sha256=JI_zTMfozanpfbBcNjPpgGi5Ppc6jyjM05X3rvFODLs,2321
|
|
20
20
|
xm_slurm/contrib/clusters/drac.py,sha256=tJeQFWFIpeZ1gD3j6AAJssNoLSiDkB-3lz1_ObnkRhc,5905
|
|
21
21
|
xm_slurm/experimental/parameter_controller.py,sha256=b5LfglHV307F6QcPrHeZX5GJBtyOK9aQydke_SZ3Wto,8457
|
|
22
22
|
xm_slurm/packaging/__init__.py,sha256=dh307yLpUT9KN7rJ1e9fYC6hegGKfZcGboUq9nGpDVQ,233
|
|
@@ -25,7 +25,7 @@ xm_slurm/packaging/registry.py,sha256=GrdmQg9MgSo38OiqOzMKWSkQyBuyryOfc3zcdgZ4CU
|
|
|
25
25
|
xm_slurm/packaging/router.py,sha256=yPbdA9clrhly97cLgDsSRZG2LZRKE-oz8Hhdb7WtYqk,2070
|
|
26
26
|
xm_slurm/packaging/utils.py,sha256=KI5s32rNTCfgwzY_7Ghck27jHKvKg5sl5_NEEqJbJqI,3999
|
|
27
27
|
xm_slurm/scripts/_cloudpickle.py,sha256=dlJYf2SceOuUn8wi-ozuoYAQg71wqD2MUVOUCyOwWIY,647
|
|
28
|
-
xm_slurm/scripts/cli.py,sha256=
|
|
28
|
+
xm_slurm/scripts/cli.py,sha256=xA4SqcMtX_NXXdUDgJ47qNHw2uGvmn_JA3XiDXk-jFA,2152
|
|
29
29
|
xm_slurm/templates/docker/docker-bake.hcl.j2,sha256=ClsFpj91Mr1VfA8L6eqBG3HQz0Z8VenF6mEfmAhQgUo,1498
|
|
30
30
|
xm_slurm/templates/docker/mamba.Dockerfile,sha256=Sgxr5IA5T-pT1Shumb5k3JngoG4pgCdBXjzqslFJdZI,753
|
|
31
31
|
xm_slurm/templates/docker/python.Dockerfile,sha256=U4b4QVkopckQ0o9jJIE7d_M6TvExEYlYDirNwCoZ7W4,865
|
|
@@ -35,10 +35,10 @@ xm_slurm/templates/slurm/job-group.bash.j2,sha256=UkjfBE7jg9mepcUWaHZEAjkiXsIM1j
|
|
|
35
35
|
xm_slurm/templates/slurm/job.bash.j2,sha256=v0xGYzagDdWW6Tg44qobGJLNSUP1Cf4CcekrPibYdrE,1864
|
|
36
36
|
xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=HYqYhXsTv8TCed5UaGCZVGIYsqxSKHcnPyNNTHWNvxc,1279
|
|
37
37
|
xm_slurm/templates/slurm/fragments/proxy.bash.j2,sha256=VJLglZo-Nvx9R-qe3rHTxr07CylTQ6Z9NwBzvIpAZrA,814
|
|
38
|
-
xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=
|
|
38
|
+
xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=ggSsAxv-2_Ct3hSxFJgwgwa3Wu8xH3JqLxWtJOYYrsA,3253
|
|
39
39
|
xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=xKXYFvQvazMx0PgvmlRXR6eecoiBUl8y52dIzQtWkBE,1469
|
|
40
|
-
xmanager_slurm-0.4.
|
|
41
|
-
xmanager_slurm-0.4.
|
|
42
|
-
xmanager_slurm-0.4.
|
|
43
|
-
xmanager_slurm-0.4.
|
|
44
|
-
xmanager_slurm-0.4.
|
|
40
|
+
xmanager_slurm-0.4.2.dist-info/METADATA,sha256=FqBIDTKXc5mrv_R5WDxlOvs7l87Cxrv2bAmnQuJlq2k,954
|
|
41
|
+
xmanager_slurm-0.4.2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
42
|
+
xmanager_slurm-0.4.2.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
|
|
43
|
+
xmanager_slurm-0.4.2.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
|
|
44
|
+
xmanager_slurm-0.4.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|