xmanager-slurm 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xm_slurm/__init__.py +4 -2
- xm_slurm/api.py +1 -1
- xm_slurm/config.py +7 -2
- xm_slurm/constants.py +4 -0
- xm_slurm/contrib/clusters/__init__.py +9 -0
- xm_slurm/dependencies.py +171 -0
- xm_slurm/executables.py +20 -15
- xm_slurm/execution.py +246 -96
- xm_slurm/executors.py +8 -12
- xm_slurm/experiment.py +374 -83
- xm_slurm/experimental/parameter_controller.py +12 -10
- xm_slurm/packaging/{docker/local.py → docker.py} +126 -32
- xm_slurm/packaging/router.py +3 -1
- xm_slurm/packaging/utils.py +4 -28
- xm_slurm/resources.py +2 -0
- xm_slurm/scripts/cli.py +77 -0
- xm_slurm/templates/docker/mamba.Dockerfile +1 -1
- xm_slurm/templates/slurm/fragments/monitor.bash.j2 +5 -0
- xm_slurm/templates/slurm/job-array.bash.j2 +1 -2
- xm_slurm/templates/slurm/job.bash.j2 +4 -3
- xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +1 -0
- xm_slurm/types.py +23 -0
- {xmanager_slurm-0.4.0.dist-info → xmanager_slurm-0.4.2.dist-info}/METADATA +1 -1
- xmanager_slurm-0.4.2.dist-info/RECORD +44 -0
- xmanager_slurm-0.4.2.dist-info/entry_points.txt +2 -0
- xm_slurm/packaging/docker/__init__.py +0 -69
- xm_slurm/packaging/docker/abc.py +0 -112
- xmanager_slurm-0.4.0.dist-info/RECORD +0 -42
- {xmanager_slurm-0.4.0.dist-info → xmanager_slurm-0.4.2.dist-info}/WHEEL +0 -0
- {xmanager_slurm-0.4.0.dist-info → xmanager_slurm-0.4.2.dist-info}/licenses/LICENSE.md +0 -0
xm_slurm/execution.py
CHANGED
@@ -5,10 +5,8 @@ import functools
 import hashlib
 import logging
 import operator
-import re
 import shlex
-import typing
-from typing import Any, Mapping, Sequence
+import typing as tp

 import asyncssh
 import backoff
@@ -16,15 +14,20 @@ import jinja2 as j2
 import more_itertools as mit
 from asyncssh.auth import KbdIntPrompts, KbdIntResponse
 from asyncssh.misc import MaybeAwait
+from rich.console import ConsoleRenderable
+from rich.rule import Rule
 from xmanager import xm

-from xm_slurm import batching, config, executors, status
+from xm_slurm import batching, config, constants, dependencies, executors, status
 from xm_slurm.console import console
 from xm_slurm.job_blocks import JobArgs
+from xm_slurm.types import Descriptor

 SlurmClusterConfig = config.SlurmClusterConfig
 ContainerRuntime = config.ContainerRuntime

+logger = logging.getLogger(__name__)
+
 """
 === Runtime Configurations ===
 With RunC:
@@ -43,11 +46,6 @@ With Singularity / Apptainer:
     apptainer run --compat <digest>
 """

-"""
-#SBATCH --error=/dev/null
-#SBATCH --output=/dev/null
-"""
-
 _POLL_INTERVAL = 30.0
 _BATCHED_BATCH_SIZE = 16
 _BATCHED_TIMEOUT = 0.2
@@ -69,12 +67,79 @@ class NoKBAuthSSHClient(asyncssh.SSHClient):
         return []


-
-
-
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class SlurmJob:
+    job_id: str
+
+    @property
+    def is_array_job(self) -> bool:
+        return isinstance(self, SlurmArrayJob)
+
+    @property
+    def is_heterogeneous_job(self) -> bool:
+        return isinstance(self, SlurmHeterogeneousJob)
+
+    def __hash__(self) -> int:
+        return hash((type(self), self.job_id))
+
+
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class SlurmArrayJob(SlurmJob):
+    array_job_id: str
+    array_task_id: str
+
+
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class SlurmHeterogeneousJob(SlurmJob):
+    het_job_id: str
+    het_component_id: str
+
+
+SlurmJobT = tp.TypeVar("SlurmJobT", bound=SlurmJob, covariant=True)
+
+
+class SlurmJobDescriptor(Descriptor[SlurmJobT, str]):
+    def __set_name__(self, owner: type, name: str):
+        del owner
+        self.job = f"_{name}"
+
+    def __get__(self, instance: object | None, owner: tp.Type[object] | None = None) -> SlurmJobT:
+        del owner
+        return getattr(instance, self.job)
+
+    def __set__(self, instance: object, value: str):
+        _setattr = object.__setattr__ if not hasattr(instance, self.job) else setattr
+
+        match = constants.SLURM_JOB_ID_REGEX.match(value)
+        if match is None:
+            raise ValueError(f"Invalid Slurm job ID: {value}")
+        groups = match.groupdict()
+
+        job_id = groups["jobid"]
+        if array_task_id := groups.get("arraytaskid", None):
+            _setattr(
+                instance,
+                self.job,
+                SlurmArrayJob(job_id=value, array_job_id=job_id, array_task_id=array_task_id),
+            )
+        elif het_component_id := groups.get("componentid", None):
+            _setattr(
+                instance,
+                self.job,
+                SlurmHeterogeneousJob(
+                    job_id=value, het_job_id=job_id, het_component_id=het_component_id
+                ),
+            )
+        else:
+            _setattr(instance, self.job, SlurmJob(job_id=value))
+
+
+def _group_by_ssh_configs(
+    ssh_configs: tp.Sequence[config.SlurmSSHConfig], slurm_jobs: tp.Sequence[SlurmJob]
+) -> dict[config.SlurmSSHConfig, list[SlurmJob]]:
     jobs_by_cluster = collections.defaultdict(list)
-    for
-    jobs_by_cluster[
+    for ssh_config, slurm_job in zip(ssh_configs, slurm_jobs):
+        jobs_by_cluster[ssh_config].append(slurm_job)
     return jobs_by_cluster


@@ -85,18 +150,20 @@ class _BatchedSlurmHandle:
         batch_timeout=_BATCHED_TIMEOUT,
     )
     @staticmethod
+    @backoff.on_exception(backoff.expo, SlurmExecutionError, max_tries=5, max_time=60.0)
     async def _batched_get_state(
-
-
+        ssh_configs: tp.Sequence[config.SlurmSSHConfig],
+        slurm_jobs: tp.Sequence[SlurmJob],
+    ) -> tp.Sequence[status.SlurmJobState]:
         async def _get_state(
-            options:
-        ) -> Sequence[status.SlurmJobState]:
+            options: config.SlurmSSHConfig, slurm_jobs: tp.Sequence[SlurmJob]
+        ) -> tp.Sequence[status.SlurmJobState]:
             result = await get_client().run(
                 options,
                 [
                     "sacct",
                     "--jobs",
-                    ",".join(
+                    ",".join([slurm_job.job_id for slurm_job in slurm_jobs]),
                     "--format",
                     "JobID,State",
                     "--allocations",
@@ -113,32 +180,35 @@ class _BatchedSlurmHandle:
                 states_by_job_id[job_id] = status.SlurmJobState.from_slurm_str(state)

             job_states = []
-            for
-                if job_id in states_by_job_id:
-                    job_states.append(states_by_job_id[job_id])
+            for slurm_job in slurm_jobs:
+                if slurm_job.job_id in states_by_job_id:
+                    job_states.append(states_by_job_id[slurm_job.job_id])
                 # This is a stupid hack around sacct's inability to display state information for
                 # array job elements that haven't begun. We'll assume that if the job ID is not found,
                 # and it's an array job, then it's pending.
-                elif
+                elif slurm_job.is_array_job:
                     job_states.append(status.SlurmJobState.PENDING)
                 else:
-                    raise SlurmExecutionError(f"Failed to find job state info for {
+                    raise SlurmExecutionError(f"Failed to find job state info for {slurm_job!r}")
             return job_states

-
+        # Group Slurm jobs by their cluster so we can batch requests
+        jobs_by_cluster = _group_by_ssh_configs(ssh_configs, slurm_jobs)

+        # Async get state for each cluster
         job_states_per_cluster = await asyncio.gather(*[
-            _get_state(options,
+            _get_state(options, jobs) for options, jobs in jobs_by_cluster.items()
         ])
-        job_states_by_cluster: dict[
-            asyncssh.SSHClientConnectionOptions, dict[str, status.SlurmJobState]
-        ] = {}
-        for options, job_states in zip(ssh_options, job_states_per_cluster):
-            job_states_by_cluster[options] = dict(zip(jobs_by_cluster[options], job_states))

+        # Reconstruct the job states by cluster
+        job_states_by_cluster = {}
+        for ssh_config, job_states in zip(ssh_configs, job_states_per_cluster):
+            job_states_by_cluster[ssh_config] = dict(zip(jobs_by_cluster[ssh_config], job_states))
+
+        # Reconstruct the job states in the original order
         job_states = []
-        for
-            job_states.append(job_states_by_cluster[
+        for ssh_config, slurm_job in zip(ssh_configs, slurm_jobs):
+            job_states.append(job_states_by_cluster[ssh_config][slurm_job])
         return job_states

     @functools.partial(
@@ -148,31 +218,33 @@ class _BatchedSlurmHandle:
     )
     @staticmethod
     async def _batched_cancel(
-
-
+        ssh_configs: tp.Sequence[config.SlurmSSHConfig],
+        slurm_jobs: tp.Sequence[SlurmJob],
+    ) -> tp.Sequence[None]:
         async def _cancel(
-            options:
+            options: config.SlurmSSHConfig, slurm_jobs: tp.Sequence[SlurmJob]
         ) -> None:
-            await get_client().run(
+            await get_client().run(
+                options,
+                ["scancel", " ".join([slurm_job.job_id for slurm_job in slurm_jobs])],
+                check=True,
+            )

-        jobs_by_cluster =
+        jobs_by_cluster = _group_by_ssh_configs(ssh_configs, slurm_jobs)
         return await asyncio.gather(*[
             _cancel(options, job_ids) for options, job_ids in jobs_by_cluster.items()
         ])


 @dataclasses.dataclass(frozen=True, kw_only=True)
-class SlurmHandle(_BatchedSlurmHandle):
+class SlurmHandle(_BatchedSlurmHandle, tp.Generic[SlurmJobT]):
     """A handle for referring to the launched container."""

+    experiment_id: int
     ssh: config.SlurmSSHConfig
-
+    slurm_job: Descriptor[SlurmJobT, str] = SlurmJobDescriptor[SlurmJobT]()
     job_name: str  # XManager job name associated with this handle

-    def __post_init__(self):
-        if re.match(r"^\d+(_\d+|\+\d+)?$", self.job_id) is None:
-            raise ValueError(f"Invalid job ID: {self.job_id}")
-
     @backoff.on_predicate(
         backoff.constant,
         lambda state: state in status.SlurmActiveJobStates,
@@ -183,15 +255,56 @@ class SlurmHandle(_BatchedSlurmHandle):
         return await self.get_state()

     async def stop(self) -> None:
-        await self._batched_cancel(self.ssh
+        await self._batched_cancel(self.ssh, self.slurm_job)

     async def get_state(self) -> status.SlurmJobState:
-        return await self._batched_get_state(self.ssh
-
+        return await self._batched_get_state(self.ssh, self.slurm_job)

-
-
-
+    async def logs(
+        self, *, num_lines: int, block_size: int, wait: bool, follow: bool
+    ) -> tp.AsyncGenerator[ConsoleRenderable, None]:
+        file = f".local/state/xm-slurm/{self.experiment_id}/slurm-{self.slurm_job.job_id}.out"
+        conn = await get_client().connection(self.ssh)
+        async with conn.start_sftp_client() as sftp:
+            if wait:
+                while not (await sftp.exists(file)):
+                    await asyncio.sleep(5)
+
+            async with sftp.open(file, "rb") as remote_file:
+                file_stat = await remote_file.stat()
+                file_size = file_stat.size
+                assert file_size is not None
+
+                data = b""
+                lines = []
+                position = file_size
+
+                while len(lines) <= num_lines and position > 0:
+                    read_size = min(block_size, position)
+                    position -= read_size
+                    await remote_file.seek(position)
+                    chunk = await remote_file.read(read_size)
+                    data = chunk + data
+                    lines = data.splitlines()
+
+                if position <= 0:
+                    yield Rule("[bold red]BEGINNING OF FILE[/bold red]")
+                for line in lines[-num_lines:]:
+                    yield line.decode("utf-8", errors="replace")
+
+                if (await self.get_state()) not in status.SlurmActiveJobStates:
+                    yield Rule("[bold red]END OF FILE[/bold red]")
+                    return
+
+                if not follow:
+                    return
+
+                await remote_file.seek(file_size)
+                while True:
+                    if new_data := (await remote_file.read(block_size)):
+                        yield new_data.decode("utf-8", errors="replace")
+                    else:
+                        await asyncio.sleep(0.25)


 @functools.cache
@@ -218,11 +331,14 @@ def get_template_env(container_runtime: ContainerRuntime) -> j2.Environment:
     return template_env


+@functools.cache
+def get_client() -> "Client":
+    return Client()
+
+
 class Client:
     def __init__(self) -> None:
-        self._connections = dict[
-            asyncssh.SSHClientConnectionOptions, asyncssh.SSHClientConnection
-        ]()
+        self._connections = dict[config.SlurmSSHConfig, asyncssh.SSHClientConnection]()
         self._connection_lock = asyncio.Lock()

     @backoff.on_exception(backoff.expo, asyncssh.Error, max_tries=5, max_time=60.0)
@@ -231,53 +347,52 @@ class Client:
         async with conn.start_sftp_client() as sftp_client:
             await sftp_client.makedirs(".local/state/xm-slurm", exist_ok=True)

-    async def connection(
-        self
-        options: asyncssh.SSHClientConnectionOptions,
-    ) -> asyncssh.SSHClientConnection:
-        if options not in self._connections:
+    async def connection(self, ssh_config: config.SlurmSSHConfig) -> asyncssh.SSHClientConnection:
+        if ssh_config not in self._connections:
             async with self._connection_lock:
                 try:
-                    conn, _ = await asyncssh.create_connection(
+                    conn, _ = await asyncssh.create_connection(
+                        NoKBAuthSSHClient, options=ssh_config.connection_options
+                    )
                     await self._setup_remote_connection(conn)
-                    self._connections[
+                    self._connections[ssh_config] = conn
                 except asyncssh.misc.PermissionDenied as ex:
                     raise SlurmExecutionError(
-                        f"Permission denied connecting to {
+                        f"Permission denied connecting to {ssh_config.host}"
                     ) from ex
                 except asyncssh.misc.ConnectionLost as ex:
-                    raise SlurmExecutionError(f"Connection lost to host {
+                    raise SlurmExecutionError(f"Connection lost to host {ssh_config.host}") from ex
                 except asyncssh.misc.HostKeyNotVerifiable as ex:
                     raise SlurmExecutionError(
-                        f"Cannot verify the public key for host {
+                        f"Cannot verify the public key for host {ssh_config.host}"
                     ) from ex
                 except asyncssh.misc.KeyExchangeFailed as ex:
                     raise SlurmExecutionError(
-                        f"Failed to exchange keys with host {
+                        f"Failed to exchange keys with host {ssh_config.host}"
                     ) from ex
                 except asyncssh.Error as ex:
                     raise SlurmExecutionError(
-                        f"SSH connection error when connecting to {
+                        f"SSH connection error when connecting to {ssh_config.host}"
                     ) from ex

-        return self._connections[
+        return self._connections[ssh_config]

     @backoff.on_exception(backoff.expo, asyncssh.Error, max_tries=5, max_time=60.0)
     async def run(
         self,
-
-        command: xm.SequentialArgs | str | Sequence[str],
+        ssh_config: config.SlurmSSHConfig,
+        command: xm.SequentialArgs | str | tp.Sequence[str],
         *,
         check: bool = False,
         timeout: float | None = None,
     ) -> asyncssh.SSHCompletedProcess:
-        client = await self.connection(
+        client = await self.connection(ssh_config)
         if isinstance(command, xm.SequentialArgs):
             command = command.to_list()
         if not isinstance(command, str) and isinstance(command, collections.abc.Sequence):
             command = shlex.join(command)
         assert isinstance(command, str)
-
+        logger.debug("Running command on %s: %s", ssh_config.host, command)

         return await client.run(command, check=check, timeout=timeout)

@@ -285,8 +400,9 @@ class Client:
         self,
         *,
         job: xm.Job | xm.JobGroup,
+        dependency: dependencies.SlurmJobDependency | None = None,
         cluster: SlurmClusterConfig,
-        args: Mapping[str, Any] | Sequence[Mapping[str, Any]] | None,
+        args: tp.Mapping[str, tp.Any] | tp.Sequence[tp.Mapping[str, tp.Any]] | None,
         experiment_id: int,
         identity: str | None,
     ) -> str:
@@ -297,7 +413,7 @@ class Client:

         # Sanitize job groups
         if isinstance(job, xm.JobGroup) and len(job.jobs) == 1:
-            job =
+            job = tp.cast(xm.Job, list(job.jobs.values())[0])
         elif isinstance(job, xm.JobGroup) and len(job.jobs) == 0:
             raise ValueError("Job group must have at least one job")

@@ -315,6 +431,7 @@ class Client:

                 return template.render(
                     job=job_array,
+                    dependency=dependency,
                     cluster=cluster,
                     args=sequential_args,
                     env_vars=env_vars,
@@ -327,6 +444,7 @@ class Client:
                 env_vars = args.get("env_vars", None)
                 return template.render(
                     job=job,
+                    dependency=dependency,
                     cluster=cluster,
                     args=sequential_args,
                     env_vars=env_vars,
@@ -347,6 +465,7 @@ class Client:
                 }
                 return template.render(
                     job_group=job_group,
+                    dependency=dependency,
                     cluster=cluster,
                     args=sequential_args,
                     env_vars=env_vars,
@@ -356,54 +475,66 @@ class Client:
             case _:
                 raise ValueError(f"Unsupported job type: {type(job)}")

-    @
+    @tp.overload
     async def launch(
         self,
         *,
         cluster: SlurmClusterConfig,
         job: xm.JobGroup,
-
+        dependency: dependencies.SlurmJobDependency | None = None,
+        args: tp.Mapping[str, JobArgs] | None,
         experiment_id: int,
         identity: str | None = ...,
     ) -> SlurmHandle: ...

-    @
+    @tp.overload
     async def launch(
         self,
         *,
         cluster: SlurmClusterConfig,
         job: xm.Job,
-
+        dependency: dependencies.SlurmJobDependency | None = None,
+        args: tp.Sequence[JobArgs],
         experiment_id: int,
         identity: str | None = ...,
     ) -> list[SlurmHandle]: ...

-    @
+    @tp.overload
     async def launch(
         self,
         *,
         cluster: SlurmClusterConfig,
         job: xm.Job,
+        dependency: dependencies.SlurmJobDependency | None = None,
         args: JobArgs,
         experiment_id: int,
         identity: str | None = ...,
     ) -> SlurmHandle: ...

-    async def launch(
-
+    async def launch(
+        self,
+        *,
+        cluster: SlurmClusterConfig,
+        job: xm.Job | xm.JobGroup,
+        dependency: dependencies.SlurmJobDependency | None = None,
+        args: tp.Mapping[str, JobArgs] | tp.Sequence[JobArgs] | JobArgs | None,
+        experiment_id: int,
+        identity: str | None = None,
+    ):
         template = await self.template(
             job=job,
+            dependency=dependency,
             cluster=cluster,
             args=args,
             experiment_id=experiment_id,
             identity=identity,
         )
-
+        logger.debug("Slurm submission script:\n%s", template)

         # Hash submission script
         template_hash = hashlib.blake2s(template.encode()).hexdigest()[:8]

-        conn = await self.connection(cluster.ssh
+        conn = await self.connection(cluster.ssh)
         async with conn.start_sftp_client() as sftp:
             # Write the submission script to the cluster
             # TODO(jfarebro): SHOULD FIND A WAY TO GET THE HOME DIRECTORY
@@ -416,7 +547,7 @@ class Client:

             # Construct and run command on the cluster
             command = f"sbatch --chdir .local/state/xm-slurm/{experiment_id} --parsable submission-script-{template_hash}.sh"
-            result = await self.run(cluster.ssh
+            result = await self.run(cluster.ssh, command)
             if result.returncode != 0:
                 raise RuntimeError(f"Failed to schedule job on {cluster.ssh.host}: {result.stderr}")

@@ -435,21 +566,32 @@ class Client:
             assert job.name is not None
             return [
                 SlurmHandle(
+                    experiment_id=experiment_id,
                     ssh=cluster.ssh,
-
+                    slurm_job=f"{slurm_job_id}_{array_index}",
                     job_name=job.name,
                 )
                 for array_index in range(len(args))
             ]
         elif isinstance(job, xm.Job):
             assert job.name is not None
-            return SlurmHandle(
+            return SlurmHandle(
+                experiment_id=experiment_id,
+                ssh=cluster.ssh,
+                slurm_job=slurm_job_id,
+                job_name=job.name,
+            )
         elif isinstance(job, xm.JobGroup):
             # TODO: make this work for actual job groups.
-            job = mit.one(job.jobs.values())
+            job = tp.cast(xm.Job, mit.one(job.jobs.values()))
             assert isinstance(job, xm.Job)
             assert job.name is not None
-            return SlurmHandle(
+            return SlurmHandle(
+                experiment_id=experiment_id,
+                ssh=cluster.ssh,
+                slurm_job=slurm_job_id,
+                job_name=job.name,
+            )
         else:
             raise ValueError(f"Unsupported job type: {type(job)}")

@@ -458,30 +600,33 @@ class Client:
             conn.close()


-@
+@tp.overload
 async def launch(
     *,
     job: xm.JobGroup,
-
+    dependency: dependencies.SlurmJobDependency | None = None,
+    args: tp.Mapping[str, JobArgs],
     experiment_id: int,
     identity: str | None = ...,
 ) -> SlurmHandle: ...


-@
+@tp.overload
 async def launch(
     *,
     job: xm.Job,
-
+    dependency: dependencies.SlurmJobDependency | None = None,
+    args: tp.Sequence[JobArgs],
     experiment_id: int,
     identity: str | None = ...,
 ) -> list[SlurmHandle]: ...


-@
+@tp.overload
 async def launch(
     *,
     job: xm.Job,
+    dependency: dependencies.SlurmJobDependency | None = None,
     args: JobArgs,
     experiment_id: int,
     identity: str | None = ...,
@@ -491,7 +636,8 @@ async def launch(
 async def launch(
     *,
     job: xm.Job | xm.JobGroup,
-
+    dependency: dependencies.SlurmJobDependency | None = None,
+    args: tp.Mapping[str, JobArgs] | tp.Sequence[JobArgs] | JobArgs,
     experiment_id: int,
     identity: str | None = None,
 ) -> SlurmHandle | list[SlurmHandle]:
@@ -503,11 +649,14 @@ async def launch(
             cluster = job_requirements.cluster
             if cluster is None:
                 raise ValueError("Job must have a cluster requirement")
+            if cluster.validate is not None:
+                cluster.validate(job)

             return await get_client().launch(
                 cluster=cluster,
                 job=job,
-
+                dependency=dependency,
+                args=tp.cast(JobArgs | tp.Sequence[JobArgs], args),
                 experiment_id=experiment_id,
                 identity=identity,
             )
@@ -521,6 +670,8 @@ async def launch(
                     raise ValueError("Job must have a Slurm executor")
                 if job_item.executor.requirements.cluster is None:
                     raise ValueError("Job must have a cluster requirement")
+                if job_item.executor.requirements.cluster.validate is not None:
+                    job_item.executor.requirements.cluster.validate(job_item)
                 job_group_clusters.add(job_item.executor.requirements.cluster)
                 job_group_executors.add(id(job_item.executor))
             if len(job_group_executors) != 1:
@@ -531,9 +682,8 @@ async def launch(
             return await get_client().launch(
                 cluster=job_group_clusters.pop(),
                 job=job_group,
-
+                dependency=dependency,
+                args=tp.cast(tp.Mapping[str, JobArgs], args),
                 experiment_id=experiment_id,
                 identity=identity,
             )
-        case _:
-            raise ValueError("Unsupported job type")
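Note: the new SlurmJobDescriptor.__set__ dispatches on named groups from constants.SLURM_JOB_ID_REGEX, which lives in xm_slurm/constants.py and is not shown in this diff. Below is a minimal sketch of a pattern with the group names the descriptor expects, mirroring the ^\d+(_\d+|\+\d+)?$ check that was removed from SlurmHandle.__post_init__; the exact pattern and group names are assumptions for illustration, not the package's actual constant.

import re

# Assumed shape of constants.SLURM_JOB_ID_REGEX: "123" (plain job),
# "123_4" (array task), "123+0" (heterogeneous component).
SLURM_JOB_ID_REGEX = re.compile(
    r"^(?P<jobid>\d+)(?:_(?P<arraytaskid>\d+)|\+(?P<componentid>\d+))?$"
)

for value in ("12345", "12345_7", "12345+1"):
    groups = SLURM_JOB_ID_REGEX.match(value).groupdict()
    if groups["arraytaskid"]:
        kind = "SlurmArrayJob"          # descriptor would build SlurmArrayJob
    elif groups["componentid"]:
        kind = "SlurmHeterogeneousJob"  # descriptor would build SlurmHeterogeneousJob
    else:
        kind = "SlurmJob"               # plain job
    print(value, "->", kind)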
xm_slurm/executors.py
CHANGED
@@ -48,6 +48,9 @@ class Slurm(xm.Executor):
     qos: str | None = None
     priority: int | None = None

+    # Job dependency handling
+    kill_on_invalid_dependencies: bool = True
+
     # Job rescheduling
     timeout_signal: signal.Signals = signal.SIGUSR2
     timeout_signal_grace_period: dt.timedelta = dt.timedelta(seconds=90)
@@ -93,6 +96,11 @@ class Slurm(xm.Executor):
         minutes, seconds = divmod(remainder, 60)
         directives.append(f"--time={days}-{hours:02}:{minutes:02}:{seconds:02}")

+        # Job dependency handling
+        directives.append(
+            f"--kill-on-invalid-dep={'yes' if self.kill_on_invalid_dependencies else 'no'}"
+        )
+
         # Placement
         if self.account:
             directives.append(f"--account={self.account}")
@@ -113,15 +121,3 @@ class Slurm(xm.Executor):
             directives.append("--no-requeue")

         return directives
-
-
-class DockerSpec(xm.ExecutorSpec):
-    """Local Docker executor specification that describes the container runtime."""
-
-
-class Docker(xm.Executor):
-    """Local Docker executor describing the runtime environment."""
-
-    @classmethod
-    def Spec(cls) -> DockerSpec:
-        return DockerSpec()
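For reference, the new kill_on_invalid_dependencies field maps directly onto Slurm's --kill-on-invalid-dep sbatch option. A standalone sketch of the directive the added code emits (reproduced outside the Slurm dataclass purely for illustration):

# Standalone reproduction of the new directive logic added to Slurm in 0.4.2.
kill_on_invalid_dependencies = True  # the field's default
directive = f"--kill-on-invalid-dep={'yes' if kill_on_invalid_dependencies else 'no'}"
assert directive == "--kill-on-invalid-dep=yes"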