xmanager-slurm 0.4.5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xmanager-slurm might be problematic. Click here for more details.
- xm_slurm/__init__.py +0 -2
- xm_slurm/api/__init__.py +33 -0
- xm_slurm/api/abc.py +65 -0
- xm_slurm/api/models.py +70 -0
- xm_slurm/api/sqlite/client.py +358 -0
- xm_slurm/api/web/client.py +173 -0
- xm_slurm/config.py +11 -3
- xm_slurm/contrib/clusters/__init__.py +3 -6
- xm_slurm/contrib/clusters/drac.py +4 -3
- xm_slurm/executables.py +4 -7
- xm_slurm/execution.py +290 -159
- xm_slurm/experiment.py +26 -180
- xm_slurm/filesystem.py +129 -0
- xm_slurm/metadata_context.py +253 -0
- xm_slurm/packageables.py +0 -9
- xm_slurm/packaging/docker.py +72 -22
- xm_slurm/packaging/utils.py +0 -108
- xm_slurm/scripts/cli.py +9 -2
- xm_slurm/templates/docker/uv.Dockerfile +6 -3
- xm_slurm/templates/slurm/entrypoint.bash.j2 +27 -0
- xm_slurm/templates/slurm/job-array.bash.j2 +4 -4
- xm_slurm/templates/slurm/job-group.bash.j2 +2 -2
- xm_slurm/templates/slurm/job.bash.j2 +5 -4
- xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +18 -54
- xm_slurm/templates/slurm/runtimes/podman.bash.j2 +9 -24
- xm_slurm/utils.py +122 -41
- {xmanager_slurm-0.4.5.dist-info → xmanager_slurm-0.4.7.dist-info}/METADATA +7 -3
- xmanager_slurm-0.4.7.dist-info/RECORD +51 -0
- {xmanager_slurm-0.4.5.dist-info → xmanager_slurm-0.4.7.dist-info}/WHEEL +1 -1
- xm_slurm/api.py +0 -528
- xmanager_slurm-0.4.5.dist-info/RECORD +0 -44
- {xmanager_slurm-0.4.5.dist-info → xmanager_slurm-0.4.7.dist-info}/entry_points.txt +0 -0
- {xmanager_slurm-0.4.5.dist-info → xmanager_slurm-0.4.7.dist-info}/licenses/LICENSE.md +0 -0
xm_slurm/experiment.py
CHANGED
|
@@ -18,12 +18,11 @@ from xmanager import xm
|
|
|
18
18
|
from xmanager.xm import async_packager, core, id_predictor, job_operators
|
|
19
19
|
from xmanager.xm import job_blocks as xm_job_blocks
|
|
20
20
|
|
|
21
|
-
from xm_slurm import api, config, dependencies, execution, executors
|
|
21
|
+
from xm_slurm import api, config, dependencies, execution, executors, metadata_context
|
|
22
22
|
from xm_slurm.console import console
|
|
23
23
|
from xm_slurm.job_blocks import JobArgs
|
|
24
24
|
from xm_slurm.packaging import router
|
|
25
25
|
from xm_slurm.status import SlurmWorkUnitStatus
|
|
26
|
-
from xm_slurm.utils import UserSet
|
|
27
26
|
|
|
28
27
|
logger = logging.getLogger(__name__)
|
|
29
28
|
|
|
@@ -62,66 +61,6 @@ def _validate_job(
|
|
|
62
61
|
raise ValueError(f"Only `args` and `env_vars` are supported for args on job {job!r}.")
|
|
63
62
|
|
|
64
63
|
|
|
65
|
-
@dataclasses.dataclass(kw_only=True, frozen=True)
|
|
66
|
-
class Artifact:
|
|
67
|
-
name: str
|
|
68
|
-
uri: str
|
|
69
|
-
|
|
70
|
-
def __hash__(self) -> int:
|
|
71
|
-
return hash((type(self), self.name))
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
class ContextArtifacts(UserSet[Artifact]):
|
|
75
|
-
def __init__(
|
|
76
|
-
self,
|
|
77
|
-
owner: "SlurmExperiment | SlurmExperimentUnit",
|
|
78
|
-
*,
|
|
79
|
-
artifacts: tp.Sequence[Artifact],
|
|
80
|
-
):
|
|
81
|
-
super().__init__(
|
|
82
|
-
artifacts,
|
|
83
|
-
on_add=self._on_add_artifact,
|
|
84
|
-
on_remove=self._on_remove_artifact,
|
|
85
|
-
on_discard=self._on_remove_artifact,
|
|
86
|
-
)
|
|
87
|
-
self._owner = owner
|
|
88
|
-
self._create_task = self._owner._create_task
|
|
89
|
-
|
|
90
|
-
def _on_add_artifact(self, artifact: Artifact) -> None:
|
|
91
|
-
match self._owner:
|
|
92
|
-
case SlurmExperiment():
|
|
93
|
-
api.client().insert_experiment_artifact(
|
|
94
|
-
self._owner.experiment_id,
|
|
95
|
-
api.ArtifactModel(
|
|
96
|
-
name=artifact.name,
|
|
97
|
-
uri=artifact.uri,
|
|
98
|
-
),
|
|
99
|
-
)
|
|
100
|
-
case SlurmWorkUnit():
|
|
101
|
-
api.client().insert_work_unit_artifact(
|
|
102
|
-
self._owner.experiment_id,
|
|
103
|
-
self._owner.work_unit_id,
|
|
104
|
-
api.ArtifactModel(
|
|
105
|
-
name=artifact.name,
|
|
106
|
-
uri=artifact.uri,
|
|
107
|
-
),
|
|
108
|
-
)
|
|
109
|
-
|
|
110
|
-
def _on_remove_artifact(self, artifact: Artifact) -> None:
|
|
111
|
-
match self._owner:
|
|
112
|
-
case SlurmExperiment():
|
|
113
|
-
api.client().delete_experiment_artifact(self._owner.experiment_id, artifact.name)
|
|
114
|
-
case SlurmWorkUnit():
|
|
115
|
-
api.client().delete_work_unit_artifact(
|
|
116
|
-
self._owner.experiment_id, self._owner.work_unit_id, artifact.name
|
|
117
|
-
)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
@dataclasses.dataclass(frozen=True, kw_only=True)
|
|
121
|
-
class SlurmExperimentUnitMetadataContext:
|
|
122
|
-
artifacts: ContextArtifacts
|
|
123
|
-
|
|
124
|
-
|
|
125
64
|
class SlurmExperimentUnit(xm.ExperimentUnit):
|
|
126
65
|
"""ExperimentUnit is a collection of semantically associated `Job`s."""
|
|
127
66
|
|
|
@@ -138,8 +77,9 @@ class SlurmExperimentUnit(xm.ExperimentUnit):
|
|
|
138
77
|
super().__init__(experiment, create_task, args, role, identity=identity)
|
|
139
78
|
self._launched_jobs: list[xm.LaunchedJob] = []
|
|
140
79
|
self._execution_handles: list[execution.SlurmHandle] = []
|
|
141
|
-
self._context = SlurmExperimentUnitMetadataContext(
|
|
142
|
-
|
|
80
|
+
self._context = metadata_context.SlurmExperimentUnitMetadataContext(
|
|
81
|
+
self,
|
|
82
|
+
artifacts=metadata_context.SlurmContextArtifacts(owner=self, artifacts=[]),
|
|
143
83
|
)
|
|
144
84
|
|
|
145
85
|
def add( # type: ignore
|
|
@@ -289,7 +229,7 @@ class SlurmExperimentUnit(xm.ExperimentUnit):
|
|
|
289
229
|
api.client().insert_job(
|
|
290
230
|
self.experiment_id,
|
|
291
231
|
self.work_unit_id,
|
|
292
|
-
api.
|
|
232
|
+
api.models.SlurmJob(
|
|
293
233
|
name=job.name,
|
|
294
234
|
slurm_job_id=handle.slurm_job.job_id,
|
|
295
235
|
slurm_ssh_config=handle.ssh.serialize(),
|
|
@@ -351,7 +291,12 @@ class SlurmExperimentUnit(xm.ExperimentUnit):
|
|
|
351
291
|
wait: bool = True,
|
|
352
292
|
follow: bool = False,
|
|
353
293
|
) -> tp.AsyncGenerator[ConsoleRenderable, None]:
|
|
354
|
-
|
|
294
|
+
if not self._execution_handles:
|
|
295
|
+
raise ValueError(f"No execution handles found for experiment unit {self!r}")
|
|
296
|
+
elif len(self._execution_handles) > 1:
|
|
297
|
+
raise ValueError(f"Multiple execution handles found for experiment unit {self!r}")
|
|
298
|
+
assert len(self._execution_handles) == 1
|
|
299
|
+
|
|
355
300
|
handle = self._execution_handles[0] # TODO(jfarebro): interleave?
|
|
356
301
|
async for log in handle.logs(
|
|
357
302
|
num_lines=num_lines, block_size=block_size, wait=wait, follow=follow
|
|
@@ -363,7 +308,7 @@ class SlurmExperimentUnit(xm.ExperimentUnit):
|
|
|
363
308
|
return self._launched_jobs
|
|
364
309
|
|
|
365
310
|
@property
|
|
366
|
-
def context(self) -> SlurmExperimentUnitMetadataContext: # type: ignore
|
|
311
|
+
def context(self) -> metadata_context.SlurmExperimentUnitMetadataContext: # type: ignore
|
|
367
312
|
return self._context
|
|
368
313
|
|
|
369
314
|
def after_started(
|
|
@@ -427,7 +372,7 @@ class SlurmWorkUnit(xm.WorkUnit, SlurmExperimentUnit):
|
|
|
427
372
|
api.client().update_work_unit(
|
|
428
373
|
self.experiment_id,
|
|
429
374
|
self.work_unit_id,
|
|
430
|
-
api.
|
|
375
|
+
api.models.ExperimentUnitPatch(args=json.dumps(args_view), identity=None),
|
|
431
376
|
)
|
|
432
377
|
|
|
433
378
|
async with self._work_unit_id_predictor.submit_id(self.work_unit_id): # type: ignore
|
|
@@ -488,112 +433,10 @@ class SlurmAuxiliaryUnit(SlurmExperimentUnit):
|
|
|
488
433
|
return f"<SlurmAuxiliaryUnit {self.experiment_unit_name}>"
|
|
489
434
|
|
|
490
435
|
|
|
491
|
-
class SlurmExperimentContextAnnotations:
|
|
492
|
-
def __init__(
|
|
493
|
-
self,
|
|
494
|
-
experiment: "SlurmExperiment",
|
|
495
|
-
*,
|
|
496
|
-
title: str,
|
|
497
|
-
tags: set[str] | None = None,
|
|
498
|
-
description: str | None = None,
|
|
499
|
-
note: str | None = None,
|
|
500
|
-
):
|
|
501
|
-
self._experiment = experiment
|
|
502
|
-
self._create_task = self._experiment._create_task
|
|
503
|
-
self._title = title
|
|
504
|
-
self._tags = UserSet(
|
|
505
|
-
tags or set(),
|
|
506
|
-
on_add=self._on_tag_added,
|
|
507
|
-
on_remove=self._on_tag_removed,
|
|
508
|
-
on_discard=self._on_tag_removed,
|
|
509
|
-
)
|
|
510
|
-
self._description = description or ""
|
|
511
|
-
self._note = note or ""
|
|
512
|
-
|
|
513
|
-
@property
|
|
514
|
-
def title(self) -> str:
|
|
515
|
-
return self._title
|
|
516
|
-
|
|
517
|
-
@title.setter
|
|
518
|
-
def title(self, value: str) -> None:
|
|
519
|
-
self._title = value
|
|
520
|
-
api.client().update_experiment(
|
|
521
|
-
self._experiment.experiment_id,
|
|
522
|
-
api.ExperimentPatchModel(title=value),
|
|
523
|
-
)
|
|
524
|
-
|
|
525
|
-
@property
|
|
526
|
-
def description(self) -> str:
|
|
527
|
-
return self._description
|
|
528
|
-
|
|
529
|
-
@description.setter
|
|
530
|
-
def description(self, value: str) -> None:
|
|
531
|
-
self._description = value
|
|
532
|
-
api.client().update_experiment(
|
|
533
|
-
self._experiment.experiment_id,
|
|
534
|
-
api.ExperimentPatchModel(description=value),
|
|
535
|
-
)
|
|
536
|
-
|
|
537
|
-
@property
|
|
538
|
-
def note(self) -> str:
|
|
539
|
-
return self._note
|
|
540
|
-
|
|
541
|
-
@note.setter
|
|
542
|
-
def note(self, value: str) -> None:
|
|
543
|
-
self._note = value
|
|
544
|
-
api.client().update_experiment(
|
|
545
|
-
self._experiment.experiment_id,
|
|
546
|
-
api.ExperimentPatchModel(note=value),
|
|
547
|
-
)
|
|
548
|
-
|
|
549
|
-
@property
|
|
550
|
-
def tags(self) -> tp.MutableSet[str]:
|
|
551
|
-
return self._tags
|
|
552
|
-
|
|
553
|
-
@tags.setter
|
|
554
|
-
def tags(self, tags: set[str]) -> None:
|
|
555
|
-
# TODO(jfarebro): Create custom tag collection
|
|
556
|
-
# and set it here, we need this so we can hook add and remove
|
|
557
|
-
# to mutate the database transparently
|
|
558
|
-
self._tags = UserSet(tags, on_add=self._on_tag_added, on_remove=self._on_tag_removed)
|
|
559
|
-
api.client().update_experiment(
|
|
560
|
-
self._experiment.experiment_id,
|
|
561
|
-
api.ExperimentPatchModel(tags=list(self._tags)),
|
|
562
|
-
)
|
|
563
|
-
|
|
564
|
-
def _on_tag_added(self, tag: str) -> None:
|
|
565
|
-
del tag
|
|
566
|
-
api.client().update_experiment(
|
|
567
|
-
self._experiment.experiment_id,
|
|
568
|
-
api.ExperimentPatchModel(tags=list(self._tags)),
|
|
569
|
-
)
|
|
570
|
-
|
|
571
|
-
def _on_tag_removed(self, tag: str) -> None:
|
|
572
|
-
del tag
|
|
573
|
-
api.client().update_experiment(
|
|
574
|
-
self._experiment.experiment_id,
|
|
575
|
-
api.ExperimentPatchModel(tags=list(self._tags)),
|
|
576
|
-
)
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
class SlurmExperimentContextArtifacts(ContextArtifacts):
|
|
580
|
-
def add_graphviz_config(self, config: str) -> None:
|
|
581
|
-
self.add(Artifact(name="GRAPHVIZ", uri=f"graphviz://{config}"))
|
|
582
|
-
|
|
583
|
-
def add_python_config(self, config: str) -> None:
|
|
584
|
-
self.add(Artifact(name="PYTHON", uri=config))
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
@dataclasses.dataclass(frozen=True, kw_only=True)
|
|
588
|
-
class SlurmExperimentMetadataContext:
|
|
589
|
-
annotations: SlurmExperimentContextAnnotations
|
|
590
|
-
artifacts: ContextArtifacts
|
|
591
|
-
|
|
592
|
-
|
|
593
436
|
class SlurmExperiment(xm.Experiment):
|
|
594
437
|
_id: int
|
|
595
438
|
_experiment_units: list[SlurmExperimentUnit]
|
|
596
|
-
_experiment_context: SlurmExperimentMetadataContext
|
|
439
|
+
_experiment_context: metadata_context.SlurmExperimentMetadataContext
|
|
597
440
|
_work_unit_count: int
|
|
598
441
|
_async_packager = async_packager.AsyncPackager(router.package)
|
|
599
442
|
|
|
@@ -605,12 +448,13 @@ class SlurmExperiment(xm.Experiment):
|
|
|
605
448
|
super().__init__()
|
|
606
449
|
self._id = experiment_id
|
|
607
450
|
self._experiment_units = []
|
|
608
|
-
self._experiment_context = SlurmExperimentMetadataContext(
|
|
609
|
-
|
|
451
|
+
self._experiment_context = metadata_context.SlurmExperimentMetadataContext(
|
|
452
|
+
self,
|
|
453
|
+
annotations=metadata_context.SlurmExperimentContextAnnotations(
|
|
610
454
|
experiment=self,
|
|
611
455
|
title=experiment_title,
|
|
612
456
|
),
|
|
613
|
-
artifacts=
|
|
457
|
+
artifacts=metadata_context.SlurmContextArtifacts(self, artifacts=[]),
|
|
614
458
|
)
|
|
615
459
|
self._work_unit_count = 0
|
|
616
460
|
|
|
@@ -1005,7 +849,7 @@ class SlurmExperiment(xm.Experiment):
|
|
|
1005
849
|
|
|
1006
850
|
api.client().insert_work_unit(
|
|
1007
851
|
self.experiment_id,
|
|
1008
|
-
api.
|
|
852
|
+
api.models.WorkUnitPatch(
|
|
1009
853
|
wid=work_unit.work_unit_id,
|
|
1010
854
|
identity=work_unit.identity,
|
|
1011
855
|
args=json.dumps(args),
|
|
@@ -1080,7 +924,7 @@ class SlurmExperiment(xm.Experiment):
|
|
|
1080
924
|
return self.context.annotations.title
|
|
1081
925
|
|
|
1082
926
|
@property
|
|
1083
|
-
def context(self) -> SlurmExperimentMetadataContext: # type: ignore
|
|
927
|
+
def context(self) -> metadata_context.SlurmExperimentMetadataContext: # type: ignore
|
|
1084
928
|
return self._experiment_context
|
|
1085
929
|
|
|
1086
930
|
@property
|
|
@@ -1099,7 +943,9 @@ class SlurmExperiment(xm.Experiment):
|
|
|
1099
943
|
|
|
1100
944
|
def create_experiment(experiment_title: str) -> SlurmExperiment:
|
|
1101
945
|
"""Create Experiment."""
|
|
1102
|
-
experiment_id = api.client().insert_experiment(
|
|
946
|
+
experiment_id = api.client().insert_experiment(
|
|
947
|
+
api.models.ExperimentPatch(title=experiment_title)
|
|
948
|
+
)
|
|
1103
949
|
return SlurmExperiment(experiment_title=experiment_title, experiment_id=experiment_id)
|
|
1104
950
|
|
|
1105
951
|
|
|
@@ -1114,11 +960,11 @@ def get_experiment(experiment_id: int) -> SlurmExperiment:
|
|
|
1114
960
|
# Populate annotations
|
|
1115
961
|
experiment.context.annotations.description = experiment_model.description or ""
|
|
1116
962
|
experiment.context.annotations.note = experiment_model.note or ""
|
|
1117
|
-
experiment.context.annotations.tags =
|
|
963
|
+
experiment.context.annotations.tags = experiment_model.tags or []
|
|
1118
964
|
|
|
1119
965
|
# Populate artifacts
|
|
1120
966
|
for artifact in experiment_model.artifacts:
|
|
1121
|
-
experiment.context.artifacts
|
|
967
|
+
experiment.context.artifacts[artifact.name] = artifact.uri
|
|
1122
968
|
|
|
1123
969
|
# Populate work units
|
|
1124
970
|
for wu_model in experiment_model.work_units:
|
|
@@ -1145,7 +991,7 @@ def get_experiment(experiment_id: int) -> SlurmExperiment:
|
|
|
1145
991
|
|
|
1146
992
|
# Populate artifacts for each work unit
|
|
1147
993
|
for artifact in wu_model.artifacts:
|
|
1148
|
-
work_unit.context.artifacts
|
|
994
|
+
work_unit.context.artifacts[artifact.name] = artifact.uri
|
|
1149
995
|
|
|
1150
996
|
experiment._experiment_units.append(work_unit)
|
|
1151
997
|
experiment._work_unit_count += 1
|
xm_slurm/filesystem.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
import asyncio
|
|
3
|
+
import os
|
|
4
|
+
import typing as tp
|
|
5
|
+
|
|
6
|
+
import aiofile
|
|
7
|
+
import asyncssh
|
|
8
|
+
import typing_extensions as tpe
|
|
9
|
+
import wrapt
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class AsyncFileIO(tp.Protocol):
    """Structural interface for a file handle whose operations are awaitable.

    Any object providing these coroutine methods, including the async
    context-manager pair, satisfies the protocol.
    """

    async def write(self, buffer: str | bytes, /) -> int:
        """Write ``buffer`` (text or bytes) and return an integer result."""

    async def read(self, size: int = -1, /) -> bytes:
        """Read up to ``size`` units from the handle; ``-1`` is the default."""

    async def seek(self, offset: int, /) -> int:
        """Move the handle to ``offset`` and return an integer position."""

    async def tell(self) -> int:
        """Return the current position of the handle."""

    async def __aenter__(self) -> tpe.Self:
        """Enter the async context and return the handle."""

    async def __aexit__(self, *args: tp.Any, **kwargs: tp.Any) -> None:
        """Leave the async context."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class AsyncFileSystem(tp.Protocol):
    """Structural interface for an asynchronous file system.

    Paths may be given either as ``str`` or as ``os.PathLike[str]``.
    """

    async def open(
        self,
        path: os.PathLike[str] | str,
        mode: tp.Literal["r", "w", "rb", "wb"],
        *,
        encoding: str = "utf-8",
    ) -> AsyncFileIO:
        """Open ``path`` in ``mode`` and return an async file handle."""

    async def read(
        self,
        path: os.PathLike[str] | str,
        *,
        size: int = -1,
        offset: int = 0,
    ) -> bytes:
        """Return bytes read from ``path``, starting at ``offset``."""

    async def write(
        self,
        path: os.PathLike[str] | str,
        data: str | bytes,
        *,
        offset: int = 0,
    ) -> None:
        """Write ``data`` to ``path`` at ``offset``."""

    async def exists(self, path: os.PathLike[str] | str) -> bool:
        """Report whether ``path`` exists."""

    async def size(self, path: os.PathLike[str] | str) -> int | None:
        """Return the size of ``path``, or ``None``."""

    async def makedirs(
        self,
        path: os.PathLike[str] | str,
        mode: int = 0o777,  # same value as decimal 511
        exist_ok: bool = False,
    ) -> None:
        """Create the directory ``path`` with permission bits ``mode``."""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class AbstractAsyncFileSystem(AsyncFileSystem, abc.ABC):
    """Base class that builds ``read`` and ``write`` on top of ``open``.

    Subclasses must implement ``open``; the whole-file helpers below are
    expressed purely in terms of the handle it returns.
    """

    @abc.abstractmethod
    async def open(
        self,
        path: os.PathLike[str] | str,
        mode: tp.Literal["r", "w", "rb", "wb"],
        *,
        encoding: str = "utf-8",
    ) -> AsyncFileIO:
        """Open ``path`` in ``mode`` and return an async file handle."""
        ...

    async def read(self, path: os.PathLike[str] | str, *, size: int = -1, offset: int = 0) -> bytes:
        """Read from ``path`` in binary mode.

        Args:
            path: File to read.
            size: Passed straight to the handle's ``read``; defaults to ``-1``.
            offset: Position the handle is moved to before reading.

        Returns:
            Whatever the handle's ``read(size)`` returns.
        """
        async with await self.open(path, "rb") as f:
            await f.seek(offset)
            return await f.read(size)

    async def write(
        self, path: os.PathLike[str] | str, data: str | bytes, *, offset: int = 0
    ) -> None:
        """Write ``data`` to ``path`` in binary mode.

        Args:
            path: File to write.
            data: Payload. Text is encoded as UTF-8 (the default ``encoding``
                of ``open``) because the handle is always opened with ``"wb"``.
            offset: Position the handle is moved to before writing.
        """
        # The handle is opened in binary mode, so a ``str`` payload must be
        # converted to bytes before it reaches ``f.write``.
        payload = data.encode("utf-8") if isinstance(data, str) else data
        # NOTE(review): on backends where "wb" truncates the file, a non-zero
        # ``offset`` presumably leaves a gap before the data -- confirm intent.
        async with await self.open(path, "wb") as f:
            await f.seek(offset)
            await f.write(payload)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class AsyncLocalFileIO(wrapt.ObjectProxy):
    """Object proxy that gives a wrapped file awaitable ``seek`` and ``tell``.

    ``seek`` and ``tell`` of the wrapped object are invoked through
    ``asyncio.to_thread``; everything else is forwarded by the proxy.
    """

    async def seek(self, offset: int, /) -> int:
        """Seek the wrapped file to ``offset`` and return its ``tell()``."""
        wrapped = self.__wrapped__
        await asyncio.to_thread(wrapped.seek, offset)
        position = await asyncio.to_thread(wrapped.tell)
        return position

    async def tell(self) -> int:
        """Return the wrapped file's ``tell()``."""
        position = await asyncio.to_thread(self.__wrapped__.tell)
        return position

    async def __aenter__(self) -> tpe.Self:
        """Enter the wrapped context and re-wrap what it yields in a proxy."""
        entered = await self.__wrapped__.__aenter__()
        return AsyncLocalFileIO(entered)

    async def __aexit__(self, *args: tp.Any, **kwargs: tp.Any) -> None:
        """Forward context exit to the wrapped object."""
        outcome = await self.__wrapped__.__aexit__(*args, **kwargs)
        return outcome
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class AsyncLocalFileSystem(AbstractAsyncFileSystem):
    """File system implementation backed by ``aiofile`` and ``os``."""

    def __init__(self):
        """Create the file system; there is no state to initialise."""

    async def open(
        self,
        path: os.PathLike[str] | str,
        mode: tp.Literal["r", "w", "rb", "wb"],
        *,
        encoding: str = "utf-8",
    ) -> AsyncFileIO:
        """Open ``path`` with ``aiofile.async_open`` wrapped in ``AsyncLocalFileIO``."""
        location = os.fspath(path)
        handle = aiofile.async_open(location, mode=mode, encoding=encoding)
        return AsyncLocalFileIO(handle)  # type: ignore

    async def exists(self, path: os.PathLike[str] | str) -> bool:
        """Return ``os.path.exists(path)``, evaluated in a worker thread."""
        location = os.fspath(path)
        return await asyncio.to_thread(os.path.exists, location)

    async def size(self, path: os.PathLike[str] | str) -> int | None:
        """Return ``os.path.getsize(path)``, evaluated in a worker thread."""
        location = os.fspath(path)
        return await asyncio.to_thread(os.path.getsize, location)

    async def makedirs(
        self, path: os.PathLike[str] | str, mode: int = 0o777, exist_ok: bool = False
    ) -> None:
        """Run ``os.makedirs(path, mode=mode, exist_ok=exist_ok)`` in a worker thread."""
        location = os.fspath(path)
        return await asyncio.to_thread(os.makedirs, location, mode=mode, exist_ok=exist_ok)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class AsyncSSHFileSystem(AbstractAsyncFileSystem):
    """File system implementation that delegates to an ``asyncssh.SFTPClient``."""

    def __init__(self, client: asyncssh.SFTPClient):
        """Store the SFTP client used for every operation."""
        self._client = client

    async def open(
        self,
        path: os.PathLike[str] | str,
        mode: tp.Literal["r", "w", "rb", "wb"],
        *,
        encoding: str = "utf-8",
    ) -> AsyncFileIO:
        """Open ``path`` through the SFTP client's ``open``."""
        remote_path = os.fspath(path)
        handle = await self._client.open(remote_path, mode, encoding=encoding)  # type: ignore
        return handle

    async def exists(self, path: os.PathLike[str] | str) -> bool:
        """Return the SFTP client's ``exists`` result for ``path``."""
        remote_path = os.fspath(path)
        return await self._client.exists(remote_path)

    async def size(self, path: os.PathLike[str] | str) -> int | None:
        """Return the ``size`` attribute of the SFTP ``stat`` result for ``path``."""
        attributes = await self._client.stat(os.fspath(path))
        return attributes.size

    async def makedirs(
        self, path: os.PathLike[str] | str, mode: int = 0o777, exist_ok: bool = False
    ) -> None:
        """Create ``path`` via the SFTP client, passing ``mode`` as permissions."""
        remote_path = os.fspath(path)
        permissions = asyncssh.SFTPAttrs(permissions=mode)
        return await self._client.makedirs(remote_path, attrs=permissions, exist_ok=exist_ok)
|
xm_slurm/metadata_context.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
import collections.abc
|
|
2
|
+
import typing as tp
|
|
3
|
+
|
|
4
|
+
from xmanager import xm
|
|
5
|
+
|
|
6
|
+
from xm_slurm import api
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SlurmContextArtifacts(collections.abc.MutableMapping[str, str]):
|
|
10
|
+
def __init__(
|
|
11
|
+
self,
|
|
12
|
+
owner: xm.Experiment | xm.ExperimentUnit,
|
|
13
|
+
*,
|
|
14
|
+
artifacts: tp.Sequence[api.models.Artifact],
|
|
15
|
+
):
|
|
16
|
+
self._data = {artifact.name: artifact.uri for artifact in artifacts}
|
|
17
|
+
self._owner = owner
|
|
18
|
+
self._create_task = self._owner._create_task
|
|
19
|
+
|
|
20
|
+
def add(self, name: str, uri: str) -> None:
|
|
21
|
+
artifact = api.models.Artifact(name=name, uri=uri)
|
|
22
|
+
match self._owner:
|
|
23
|
+
case xm.Experiment():
|
|
24
|
+
api.client().insert_experiment_artifact(self._owner.experiment_id, artifact)
|
|
25
|
+
case xm.WorkUnit():
|
|
26
|
+
api.client().insert_work_unit_artifact(
|
|
27
|
+
self._owner.experiment_id, self._owner.work_unit_id, artifact
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
self._data[name] = uri
|
|
31
|
+
|
|
32
|
+
def remove(self, name: str) -> None:
|
|
33
|
+
match self._owner:
|
|
34
|
+
case xm.Experiment():
|
|
35
|
+
api.client().delete_experiment_artifact(self._owner.experiment_id, name)
|
|
36
|
+
case xm.WorkUnit():
|
|
37
|
+
api.client().delete_work_unit_artifact(
|
|
38
|
+
self._owner.experiment_id, self._owner.work_unit_id, name
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
def __setitem__(self, name: str, uri: str) -> None:
|
|
42
|
+
self.add(name, uri)
|
|
43
|
+
|
|
44
|
+
def __delitem__(self, name: str) -> None:
|
|
45
|
+
self.remove(name)
|
|
46
|
+
|
|
47
|
+
def __getitem__(self, name: str) -> str:
|
|
48
|
+
return self._data[name]
|
|
49
|
+
|
|
50
|
+
def __iter__(self) -> tp.Iterator[str]:
|
|
51
|
+
return iter(self._data)
|
|
52
|
+
|
|
53
|
+
def __len__(self) -> int:
|
|
54
|
+
return len(self._data)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class SlurmExperimentAnnotationTags(collections.abc.MutableSet[str]):
    """Ordered set of experiment tags that persists every change.

    Each mutation sends the full tag list to
    ``api.client().update_experiment`` for the owning experiment.
    """

    def __init__(self, experiment: xm.Experiment, *, tags: tp.Iterable[str]):
        """Initialise the set.

        Args:
            experiment: Experiment whose tags are managed.
            tags: Initial tags; duplicates collapse and first-seen order is
                kept. Construction itself does not call the API client.
        """
        self._experiment = experiment
        # Use a dict to ensure order is preserved
        self._tags = dict.fromkeys(tags)

    def _sync(self) -> None:
        """Send the current tag list to the API client."""
        api.client().update_experiment(
            self._experiment.experiment_id,
            api.models.ExperimentPatch(tags=list(self._tags)),
        )

    def add(self, tag: str) -> None:
        """Add ``tag`` and persist the tag list."""
        self._tags[tag] = None
        self._sync()

    def remove(self, tag: str) -> None:
        """Remove ``tag`` and persist.

        Raises:
            KeyError: If ``tag`` is not present.
        """
        if tag not in self._tags:
            raise KeyError(tag)
        self.discard(tag)

    def discard(self, tag: str) -> None:
        """Remove ``tag`` if present and persist; do nothing if it is absent.

        ``MutableSet.discard`` must not raise for a missing element, and the
        inherited in-place operators (e.g. ``-=``) rely on that.
        """
        if tag not in self._tags:
            return
        del self._tags[tag]
        self._sync()

    def __contains__(self, tag: str) -> bool:
        """Return whether ``tag`` is in the set."""
        return tag in self._tags

    def __iter__(self) -> tp.Iterator[str]:
        """Iterate over tags in insertion order."""
        return iter(self._tags)

    def __len__(self) -> int:
        """Return the number of tags."""
        return len(self._tags)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class SlurmExperimentUnitMetadataContext:
    """Metadata context of an experiment unit; exposes its artifacts mapping."""

    def __init__(
        self,
        experiment_unit: xm.ExperimentUnit,
        *,
        artifacts: SlurmContextArtifacts,
    ):
        """Store the owning experiment unit and its artifacts mapping."""
        self._artifacts = artifacts
        self._experiment_unit = experiment_unit

    @property
    def artifacts(self) -> SlurmContextArtifacts:
        """Artifacts mapping of this experiment unit."""
        return self._artifacts

    @artifacts.setter
    def artifacts(self, artifacts: SlurmContextArtifacts) -> None:
        # Rebinding is rejected; the existing mapping must be mutated in place.
        raise ValueError("The artifacts object is immutable.")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class SlurmExperimentContextAnnotations:
    """Title, description, note and tags of an experiment.

    Assigning to any of the properties updates the cached value and then
    sends an ``ExperimentPatch`` with that single field to
    ``api.client().update_experiment``.
    """

    def __init__(
        self,
        experiment: xm.Experiment,
        *,
        title: str,
        tags: set[str] | None = None,
        description: str | None = None,
        note: str | None = None,
    ):
        """Cache the initial annotation values without calling the API client.

        Args:
            experiment: Experiment the annotations belong to.
            title: Experiment title.
            tags: Initial tags; a falsy value yields an empty tag set.
            description: Initial description; a falsy value becomes ``""``.
            note: Initial note; a falsy value becomes ``""``.
        """
        self._experiment = experiment
        self._create_task = self._experiment._create_task
        self._title = title
        self._tags = SlurmExperimentAnnotationTags(experiment, tags=tags or [])
        self._description = description if description else ""
        self._note = note if note else ""

    def _push(self, patch: tp.Any) -> None:
        """Send ``patch`` for this experiment to the API client."""
        api.client().update_experiment(self._experiment.experiment_id, patch)

    @property
    def title(self) -> str:
        """Experiment title."""
        return self._title

    @title.setter
    def title(self, value: str) -> None:
        self._title = value
        self._push(api.models.ExperimentPatch(title=value))

    @property
    def description(self) -> str:
        """Experiment description."""
        return self._description

    @description.setter
    def description(self, value: str) -> None:
        self._description = value
        self._push(api.models.ExperimentPatch(description=value))

    @property
    def note(self) -> str:
        """Experiment note."""
        return self._note

    @note.setter
    def note(self, value: str) -> None:
        self._note = value
        self._push(api.models.ExperimentPatch(note=value))

    @property
    def tags(self) -> SlurmExperimentAnnotationTags:
        """Tag set of the experiment."""
        return self._tags

    @tags.setter
    def tags(self, tags: tp.Iterable[str]) -> None:
        # Replace the whole tag set, then send the resulting list.
        replacement = SlurmExperimentAnnotationTags(self._experiment, tags=tags)
        self._tags = replacement
        self._push(api.models.ExperimentPatch(tags=list(self._tags)))
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class SlurmExperimentMetadataContext:
    """Metadata context of an experiment.

    Holds the annotations and artifacts objects (neither can be rebound) and
    two optional config strings, ``graphviz_config`` and ``python_config``.
    Changing a config string inserts or deletes the matching config artifact
    (``"GRAPHVIZ"`` / ``"PYTHON"``) through the API client.
    """

    def __init__(
        self,
        experiment: xm.Experiment,
        *,
        annotations: SlurmExperimentContextAnnotations,
        artifacts: SlurmContextArtifacts,
    ):
        """Store the experiment and its annotations/artifacts; configs start as ``None``."""
        self._experiment = experiment
        self._annotations = annotations
        self._artifacts = artifacts

        self._graphviz_config = None
        self._python_config = None

    @property
    def annotations(self) -> SlurmExperimentContextAnnotations:
        """Annotations object of the experiment."""
        return self._annotations

    @annotations.setter
    def annotations(self, annotations: SlurmExperimentContextAnnotations) -> None:
        # Rebinding is rejected; the existing object must be mutated in place.
        raise ValueError("The annotations object is immutable.")

    @property
    def artifacts(self) -> SlurmContextArtifacts:
        """Artifacts mapping of the experiment."""
        return self._artifacts

    @artifacts.setter
    def artifacts(self, artifacts: SlurmContextArtifacts) -> None:
        # Rebinding is rejected; the existing mapping must be mutated in place.
        raise ValueError("The artifacts object is immutable.")

    @property
    def graphviz_config(self) -> str | None:
        """Cached graphviz config string, or ``None``."""
        return self._graphviz_config

    @graphviz_config.setter
    def graphviz_config(self, config: str | None) -> None:
        self._graphviz_config = config
        if config is None:
            # Assigning None removes the stored artifact.
            api.client().delete_experiment_config_artifact(
                self._experiment.experiment_id, "GRAPHVIZ"
            )
        elif isinstance(config, str):
            api.client().insert_experiment_config_artifact(
                self._experiment.experiment_id,
                api.models.ConfigArtifact(name="GRAPHVIZ", uri=f"graphviz://{config}"),
            )

    @graphviz_config.deleter
    def graphviz_config(self) -> None:
        self._graphviz_config = None
        api.client().delete_experiment_config_artifact(self._experiment.experiment_id, "GRAPHVIZ")

    @property
    def python_config(self) -> str | None:
        """Cached python config string, or ``None``."""
        return self._python_config

    @python_config.setter
    def python_config(self, config: str | None) -> None:
        self._python_config = config
        if config is None:
            # Assigning None removes the stored artifact.
            api.client().delete_experiment_config_artifact(
                self._experiment.experiment_id, "PYTHON"
            )
        elif isinstance(config, str):
            api.client().insert_experiment_config_artifact(
                self._experiment.experiment_id,
                api.models.ConfigArtifact(name="PYTHON", uri=config),
            )

    @python_config.deleter
    def python_config(self) -> None:
        self._python_config = None
        api.client().delete_experiment_config_artifact(self._experiment.experiment_id, "PYTHON")
|