PyPI - xmanager-slurm - Versions diffs - 0.4.5__py3-none-any.whl → 0.4.6__py3-none-any.whl - Mend

xmanager-slurm 0.4.5__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xmanager-slurm might be problematic. Click here for more details.

Files changed (33) hide show

xm_slurm/__init__.py +0 -2
xm_slurm/api/__init__.py +33 -0
xm_slurm/api/abc.py +65 -0
xm_slurm/api/models.py +70 -0
xm_slurm/api/sqlite/client.py +358 -0
xm_slurm/api/web/client.py +173 -0
xm_slurm/config.py +11 -3
xm_slurm/contrib/clusters/__init__.py +3 -6
xm_slurm/contrib/clusters/drac.py +4 -3
xm_slurm/executables.py +4 -7
xm_slurm/execution.py +273 -159
xm_slurm/experiment.py +26 -180
xm_slurm/filesystem.py +129 -0
xm_slurm/metadata_context.py +253 -0
xm_slurm/packageables.py +0 -9
xm_slurm/packaging/docker.py +72 -22
xm_slurm/packaging/utils.py +0 -108
xm_slurm/scripts/cli.py +9 -2
xm_slurm/templates/docker/uv.Dockerfile +6 -3
xm_slurm/templates/slurm/entrypoint.bash.j2 +27 -0
xm_slurm/templates/slurm/job-array.bash.j2 +4 -4
xm_slurm/templates/slurm/job-group.bash.j2 +2 -2
xm_slurm/templates/slurm/job.bash.j2 +5 -4
xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +18 -54
xm_slurm/templates/slurm/runtimes/podman.bash.j2 +10 -24
xm_slurm/utils.py +122 -41
{xmanager_slurm-0.4.5.dist-info → xmanager_slurm-0.4.6.dist-info}/METADATA +7 -3
xmanager_slurm-0.4.6.dist-info/RECORD +51 -0
{xmanager_slurm-0.4.5.dist-info → xmanager_slurm-0.4.6.dist-info}/WHEEL +1 -1
xm_slurm/api.py +0 -528
xmanager_slurm-0.4.5.dist-info/RECORD +0 -44
{xmanager_slurm-0.4.5.dist-info → xmanager_slurm-0.4.6.dist-info}/entry_points.txt +0 -0
{xmanager_slurm-0.4.5.dist-info → xmanager_slurm-0.4.6.dist-info}/licenses/LICENSE.md +0 -0

xm_slurm/api/web/client.py ADDED Viewed

@@ -0,0 +1,173 @@
+import dataclasses
+import backoff
+import httpx
+from xm_slurm.api import models
+from xm_slurm.api.abc import XManagerAPI
+# Define which exceptions should trigger a retry
+RETRY_EXCEPTIONS = (
+    httpx.ConnectError,
+    httpx.ConnectTimeout,
+    httpx.ReadTimeout,
+    httpx.WriteTimeout,
+    httpx.NetworkError,
+)
+# Common backoff decorator for all API calls
+def with_backoff(f):
+    return backoff.on_exception(
+        backoff.expo,
+        RETRY_EXCEPTIONS,
+        max_tries=3,  # Maximum number of attempts
+        max_time=30,  # Maximum total time to try in seconds
+        jitter=backoff.full_jitter,  # Add jitter to prevent thundering herd
+    )(f)
+class XManagerWebAPI(XManagerAPI):
+    def __init__(self, base_url: str, token: str):
+        self.base_url = base_url.rstrip("/")
+        self.client = httpx.Client(headers={"Authorization": f"Bearer {token}"}, verify=False)
+    def _make_url(self, path: str) -> str:
+        return f"{self.base_url}/api{path}"
+    @with_backoff
+    def get_experiment(self, xid: int) -> models.Experiment:
+        response = self.client.get(self._make_url(f"/experiment/{xid}"))
+        response.raise_for_status()
+        data = response.json()
+        # Construct work units with nested jobs and artifacts
+        work_units = []
+        for wu_data in data.pop("work_units", []):
+            # Build jobs for this work unit
+            jobs = [
+                models.SlurmJob(
+                    name=job["name"],
+                    slurm_job_id=job["slurm_job_id"],
+                    slurm_ssh_config=job["slurm_ssh_config"],
+                )
+                for job in wu_data.pop("jobs", [])
+            ]
+            # Build artifacts for this work unit
+            artifacts = [
+                models.Artifact(name=artifact["name"], uri=artifact["uri"])
+                for artifact in wu_data.pop("artifacts", [])
+            ]
+            # Create work unit with its jobs and artifacts
+            wu_data["jobs"] = jobs
+            wu_data["artifacts"] = artifacts
+            work_units.append(models.WorkUnit(**wu_data))
+        # Build experiment artifacts
+        artifacts = [
+            models.Artifact(name=artifact["name"], uri=artifact["uri"])
+            for artifact in data.pop("artifacts", [])
+        ]
+        return models.Experiment(**data, work_units=work_units, artifacts=artifacts)
+    @with_backoff
+    def delete_experiment(self, experiment_id: int) -> None:
+        response = self.client.delete(self._make_url(f"/experiment/{experiment_id}"))
+        response.raise_for_status()
+    @with_backoff
+    def insert_experiment(self, experiment: models.ExperimentPatch) -> int:
+        assert experiment.title is not None, "Title must be set in the experiment model."
+        assert (
+            experiment.description is None and experiment.note is None and experiment.tags is None
+        ), "Only title should be set in the experiment model."
+        response = self.client.put(
+            self._make_url("/experiment"), json=dataclasses.asdict(experiment)
+        )
+        response.raise_for_status()
+        return int(response.json()["xid"])
+    @with_backoff
+    def update_experiment(
+        self, experiment_id: int, experiment_patch: models.ExperimentPatch
+    ) -> None:
+        response = self.client.patch(
+            self._make_url(f"/experiment/{experiment_id}"),
+            json=dataclasses.asdict(experiment_patch),
+        )
+        response.raise_for_status()
+    @with_backoff
+    def insert_work_unit(self, experiment_id: int, work_unit: models.WorkUnitPatch) -> None:
+        response = self.client.put(
+            self._make_url(f"/experiment/{experiment_id}/wu"),
+            json=dataclasses.asdict(work_unit),
+        )
+        response.raise_for_status()
+    @with_backoff
+    def insert_job(self, experiment_id: int, work_unit_id: int, job: models.SlurmJob) -> None:
+        response = self.client.put(
+            self._make_url(f"/experiment/{experiment_id}/wu/{work_unit_id}/job"),
+            json=dataclasses.asdict(job),
+        )
+        response.raise_for_status()
+    @with_backoff
+    def insert_work_unit_artifact(
+        self, experiment_id: int, work_unit_id: int, artifact: models.Artifact
+    ) -> None:
+        response = self.client.put(
+            self._make_url(f"/experiment/{experiment_id}/wu/{work_unit_id}/artifact"),
+            json=dataclasses.asdict(artifact),
+        )
+        response.raise_for_status()
+    @with_backoff
+    def delete_work_unit_artifact(self, experiment_id: int, work_unit_id: int, name: str) -> None:
+        response = self.client.delete(
+            self._make_url(f"/experiment/{experiment_id}/wu/{work_unit_id}/artifact/{name}")
+        )
+        response.raise_for_status()
+    @with_backoff
+    def delete_experiment_artifact(self, experiment_id: int, name: str) -> None:
+        response = self.client.delete(
+            self._make_url(f"/experiment/{experiment_id}/artifact/{name}")
+        )
+        response.raise_for_status()
+    @with_backoff
+    def insert_experiment_artifact(self, experiment_id: int, artifact: models.Artifact) -> None:
+        response = self.client.put(
+            self._make_url(f"/experiment/{experiment_id}/artifact"),
+            json=dataclasses.asdict(artifact),
+        )
+        response.raise_for_status()
+    @with_backoff
+    def insert_experiment_config_artifact(
+        self, experiment_id: int, artifact: models.ConfigArtifact
+    ) -> None:
+        response = self.client.put(
+            self._make_url(f"/experiment/{experiment_id}/config"), json=dataclasses.asdict(artifact)
+        )
+        response.raise_for_status()
+    @with_backoff
+    def delete_experiment_config_artifact(self, experiment_id: int, name: str) -> None:
+        response = self.client.delete(self._make_url(f"/experiment/{experiment_id}/config/{name}"))
+        response.raise_for_status()
+    @with_backoff
+    def update_work_unit(
+        self, experiment_id: int, work_unit_id: int, patch: models.ExperimentUnitPatch
+    ) -> None:
+        response = self.client.patch(
+            self._make_url(f"/experiment/{experiment_id}/wu/{work_unit_id}"),
+            json=dataclasses.asdict(patch),
+        )
+        response.raise_for_status()

xm_slurm/config.py CHANGED Viewed

@@ -80,6 +80,8 @@ class SlurmSSHConfig:
             None,
             ssh_config_paths,
             False,
+            True,
+            True,
             getpass.getuser(),
             self.user or (),
             self.host,
@@ -113,7 +115,11 @@ class SlurmSSHConfig:
     @functools.cached_property
     def connection_options(self) -> asyncssh.SSHClientConnectionOptions:
-        options = asyncssh.SSHClientConnectionOptions(config=None)
+        options = asyncssh.SSHClientConnectionOptions(
+            config=None,
+            kbdint_auth=False,
+            disable_trivial_auth=True,
+        )
         options.prepare(last_config=self.config, known_hosts=self.known_hosts)
         return options
@@ -165,7 +171,8 @@ class SlurmClusterConfig:
     runtime: ContainerRuntime
     # Environment variables
-    environment: Mapping[str, str] = dataclasses.field(default_factory=dict)
+    host_environment: Mapping[str, str] = dataclasses.field(default_factory=dict)
+    container_environment: Mapping[str, str] = dataclasses.field(default_factory=dict)
     # Mounts
     mounts: Mapping[os.PathLike[str] | str, os.PathLike[str] | str] = dataclasses.field(
@@ -208,5 +215,6 @@ class SlurmClusterConfig:
             self.qos,
             self.proxy,
             self.runtime,
-            frozenset(self.environment.items()),
+            frozenset(self.host_environment.items()),
+            frozenset(self.container_environment.items()),
         ))

xm_slurm/contrib/clusters/__init__.py CHANGED Viewed

@@ -1,12 +1,8 @@
-import datetime as dt
 import logging
 import os
-from xmanager import xm
 from xm_slurm import config, resources
 from xm_slurm.contrib.clusters import drac
-from xm_slurm.executors import Slurm
 # ComputeCanada alias
 cc = drac
@@ -45,12 +41,13 @@ def mila(
         runtime=config.ContainerRuntime.SINGULARITY,
         partition=partition,
         prolog="module load singularity",
-        environment={
+        host_environment={
             "SINGULARITY_CACHEDIR": "$SCRATCH/.apptainer",
             "SINGULARITY_TMPDIR": "$SLURM_TMPDIR",
             "SINGULARITY_LOCALCACHEDIR": "$SLURM_TMPDIR",
+        },
+        container_environment={
             "SCRATCH": "/scratch",
-            # TODO: move this somewhere common to all cluster configs.
             "XM_SLURM_STATE_DIR": "/xm-slurm-state",
         },
         mounts=mounts,

xm_slurm/contrib/clusters/drac.py CHANGED Viewed

@@ -42,13 +42,14 @@ def _drac_cluster(
         proxy=proxy,
         runtime=config.ContainerRuntime.APPTAINER,
         prolog=f"module load apptainer {' '.join(modules) if modules else ''}".rstrip(),
-        environment={
+        host_environment={
+            "XDG_DATA_HOME": "$SLURM_TMPDIR/.local",
             "APPTAINER_CACHEDIR": "$SCRATCH/.apptainer",
             "APPTAINER_TMPDIR": "$SLURM_TMPDIR",
             "APPTAINER_LOCALCACHEDIR": "$SLURM_TMPDIR",
-            "_XDG_DATA_HOME": "$SLURM_TMPDIR/.local",
+        },
+        container_environment={
             "SCRATCH": "/scratch",
-            # TODO: move this somewhere common to all cluster configs.
             "XM_SLURM_STATE_DIR": "/xm-slurm-state",
         },
         mounts=mounts,

xm_slurm/executables.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import dataclasses
+import os
 import pathlib
 import typing as tp
@@ -19,7 +20,6 @@ class Dockerfile(xm.ExecutableSpec):
         ssh: A list of docker SSH sockets/keys.
         build_args: Build arguments to docker.
         cache_from: Where to pull the BuildKit cache from. See `--cache-from` in `docker build`.
-        workdir: The working directory in container.
         labels: The container labels.
         platforms: The target platform.
     """
@@ -41,9 +41,6 @@ class Dockerfile(xm.ExecutableSpec):
     # --cache-from field in BuildKit
     cache_from: tp.Sequence[str] = dataclasses.field(default_factory=list)
-    # Working directory in container
-    workdir: pathlib.Path | None = None
     # Container labels
     labels: tp.Mapping[str, str] = dataclasses.field(default_factory=dict)
@@ -66,7 +63,6 @@ class Dockerfile(xm.ExecutableSpec):
             tuple(sorted(self.ssh)),
             tuple(sorted(self.build_args.items())),
             tuple(sorted(self.cache_from)),
-            self.workdir,
             tuple(sorted(self.labels.items())),
             tuple(sorted(self.platforms)),
         ))
@@ -190,8 +186,8 @@ class RemoteImage(xm.Executable):
     # Remote base image
     image: Descriptor[ImageURI, str | ImageURI] = ImageDescriptor()
-    # Working directory in container
-    workdir: pathlib.Path | None = None
+    workdir: os.PathLike[str] | str
+    entrypoint: xm.SequentialArgs
     # Container arguments
     args: xm.SequentialArgs = dataclasses.field(default_factory=xm.SequentialArgs)
@@ -211,6 +207,7 @@ class RemoteImage(xm.Executable):
                 type(self),
                 self.image,
                 self.workdir,
+                tuple(sorted(self.entrypoint.to_list())),
                 tuple(sorted(self.args.to_list())),
                 tuple(sorted(self.env_vars.items())),
                 self.credentials,

xmanager-slurm 0.4.5__py3-none-any.whl → 0.4.6__py3-none-any.whl

Potentially problematic release.

xmanager-slurm 0.4.5__py3-none-any.whl → 0.4.6__py3-none-any.whl