xmanager-slurm 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xmanager-slurm might be problematic.
- xm_slurm/__init__.py +6 -2
- xm_slurm/api.py +301 -34
- xm_slurm/batching.py +4 -4
- xm_slurm/config.py +105 -55
- xm_slurm/constants.py +19 -0
- xm_slurm/contrib/__init__.py +0 -0
- xm_slurm/contrib/clusters/__init__.py +47 -13
- xm_slurm/contrib/clusters/drac.py +34 -16
- xm_slurm/dependencies.py +171 -0
- xm_slurm/executables.py +34 -22
- xm_slurm/execution.py +305 -107
- xm_slurm/executors.py +8 -12
- xm_slurm/experiment.py +601 -168
- xm_slurm/experimental/parameter_controller.py +202 -0
- xm_slurm/job_blocks.py +7 -0
- xm_slurm/packageables.py +42 -20
- xm_slurm/packaging/{docker/local.py → docker.py} +135 -40
- xm_slurm/packaging/router.py +3 -1
- xm_slurm/packaging/utils.py +9 -81
- xm_slurm/resources.py +28 -4
- xm_slurm/scripts/_cloudpickle.py +28 -0
- xm_slurm/scripts/cli.py +52 -0
- xm_slurm/status.py +9 -0
- xm_slurm/templates/docker/mamba.Dockerfile +4 -2
- xm_slurm/templates/docker/python.Dockerfile +18 -10
- xm_slurm/templates/docker/uv.Dockerfile +35 -0
- xm_slurm/templates/slurm/fragments/monitor.bash.j2 +5 -0
- xm_slurm/templates/slurm/job-array.bash.j2 +1 -2
- xm_slurm/templates/slurm/job.bash.j2 +4 -3
- xm_slurm/types.py +23 -0
- xm_slurm/utils.py +18 -10
- xmanager_slurm-0.4.1.dist-info/METADATA +26 -0
- xmanager_slurm-0.4.1.dist-info/RECORD +44 -0
- {xmanager_slurm-0.3.2.dist-info → xmanager_slurm-0.4.1.dist-info}/WHEEL +1 -1
- xmanager_slurm-0.4.1.dist-info/entry_points.txt +2 -0
- xmanager_slurm-0.4.1.dist-info/licenses/LICENSE.md +227 -0
- xm_slurm/packaging/docker/__init__.py +0 -75
- xm_slurm/packaging/docker/abc.py +0 -112
- xm_slurm/packaging/docker/cloud.py +0 -503
- xm_slurm/templates/docker/pdm.Dockerfile +0 -31
- xmanager_slurm-0.3.2.dist-info/METADATA +0 -25
- xmanager_slurm-0.3.2.dist-info/RECORD +0 -38
xm_slurm/execution.py
CHANGED
@@ -5,24 +5,29 @@ import functools
 import hashlib
 import logging
 import operator
-import re
 import shlex
-import typing
-from typing import Any, Mapping, Sequence
+import typing as tp

 import asyncssh
 import backoff
 import jinja2 as j2
+import more_itertools as mit
 from asyncssh.auth import KbdIntPrompts, KbdIntResponse
 from asyncssh.misc import MaybeAwait
+from rich.console import ConsoleRenderable
+from rich.rule import Rule
 from xmanager import xm

-from xm_slurm import batching, config, executors, status
+from xm_slurm import batching, config, constants, dependencies, executors, status
 from xm_slurm.console import console
+from xm_slurm.job_blocks import JobArgs
+from xm_slurm.types import Descriptor

 SlurmClusterConfig = config.SlurmClusterConfig
 ContainerRuntime = config.ContainerRuntime

+logger = logging.getLogger(__name__)
+
 """
 === Runtime Configurations ===
 With RunC:
@@ -41,11 +46,6 @@ With Singularity / Apptainer:
     apptainer run --compat <digest>
 """

-"""
-#SBATCH --error=/dev/null
-#SBATCH --output=/dev/null
-"""
-
 _POLL_INTERVAL = 30.0
 _BATCHED_BATCH_SIZE = 16
 _BATCHED_TIMEOUT = 0.2
@@ -67,12 +67,79 @@ class NoKBAuthSSHClient(asyncssh.SSHClient):
         return []


-
-
-
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class SlurmJob:
+    job_id: str
+
+    @property
+    def is_array_job(self) -> bool:
+        return isinstance(self, SlurmArrayJob)
+
+    @property
+    def is_heterogeneous_job(self) -> bool:
+        return isinstance(self, SlurmHeterogeneousJob)
+
+    def __hash__(self) -> int:
+        return hash((type(self), self.job_id))
+
+
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class SlurmArrayJob(SlurmJob):
+    array_job_id: str
+    array_task_id: str
+
+
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class SlurmHeterogeneousJob(SlurmJob):
+    het_job_id: str
+    het_component_id: str
+
+
+SlurmJobT = tp.TypeVar("SlurmJobT", bound=SlurmJob, covariant=True)
+
+
+class SlurmJobDescriptor(Descriptor[SlurmJobT, str]):
+    def __set_name__(self, owner: type, name: str):
+        del owner
+        self.job = f"_{name}"
+
+    def __get__(self, instance: object | None, owner: tp.Type[object] | None = None) -> SlurmJobT:
+        del owner
+        return getattr(instance, self.job)
+
+    def __set__(self, instance: object, value: str):
+        _setattr = object.__setattr__ if not hasattr(instance, self.job) else setattr
+
+        match = constants.SLURM_JOB_ID_REGEX.match(value)
+        if match is None:
+            raise ValueError(f"Invalid Slurm job ID: {value}")
+        groups = match.groupdict()
+
+        job_id = groups["jobid"]
+        if array_task_id := groups.get("arraytaskid", None):
+            _setattr(
+                instance,
+                self.job,
+                SlurmArrayJob(job_id=value, array_job_id=job_id, array_task_id=array_task_id),
+            )
+        elif het_component_id := groups.get("componentid", None):
+            _setattr(
+                instance,
+                self.job,
+                SlurmHeterogeneousJob(
+                    job_id=value, het_job_id=job_id, het_component_id=het_component_id
+                ),
+            )
+        else:
+            _setattr(instance, self.job, SlurmJob(job_id=value))
+
+
+def _group_by_ssh_configs(
+    ssh_configs: tp.Sequence[config.SlurmSSHConfig], slurm_jobs: tp.Sequence[SlurmJob]
+) -> dict[config.SlurmSSHConfig, list[SlurmJob]]:
     jobs_by_cluster = collections.defaultdict(list)
-    for
-        jobs_by_cluster[
+    for ssh_config, slurm_job in zip(ssh_configs, slurm_jobs):
+        jobs_by_cluster[ssh_config].append(slurm_job)
     return jobs_by_cluster


@@ -83,18 +150,20 @@ class _BatchedSlurmHandle:
         batch_timeout=_BATCHED_TIMEOUT,
     )
     @staticmethod
+    @backoff.on_exception(backoff.expo, SlurmExecutionError, max_tries=5, max_time=60.0)
     async def _batched_get_state(
-
-
+        ssh_configs: tp.Sequence[config.SlurmSSHConfig],
+        slurm_jobs: tp.Sequence[SlurmJob],
+    ) -> tp.Sequence[status.SlurmJobState]:
         async def _get_state(
-            options:
-        ) -> Sequence[status.SlurmJobState]:
+            options: config.SlurmSSHConfig, slurm_jobs: tp.Sequence[SlurmJob]
+        ) -> tp.Sequence[status.SlurmJobState]:
            result = await get_client().run(
                options,
                [
                    "sacct",
                    "--jobs",
-                   ",".join(
+                   ",".join([slurm_job.job_id for slurm_job in slurm_jobs]),
                    "--format",
                    "JobID,State",
                    "--allocations",
@@ -111,32 +180,35 @@ class _BatchedSlurmHandle:
                 states_by_job_id[job_id] = status.SlurmJobState.from_slurm_str(state)

             job_states = []
-            for
-                if job_id in states_by_job_id:
-                    job_states.append(states_by_job_id[job_id])
+            for slurm_job in slurm_jobs:
+                if slurm_job.job_id in states_by_job_id:
+                    job_states.append(states_by_job_id[slurm_job.job_id])
                 # This is a stupid hack around sacct's inability to display state information for
                 # array job elements that haven't begun. We'll assume that if the job ID is not found,
                 # and it's an array job, then it's pending.
-                elif
+                elif slurm_job.is_array_job:
                     job_states.append(status.SlurmJobState.PENDING)
                 else:
-                    raise SlurmExecutionError(f"Failed to find job state info for {
+                    raise SlurmExecutionError(f"Failed to find job state info for {slurm_job!r}")
             return job_states

-
+        # Group Slurm jobs by their cluster so we can batch requests
+        jobs_by_cluster = _group_by_ssh_configs(ssh_configs, slurm_jobs)

+        # Async get state for each cluster
         job_states_per_cluster = await asyncio.gather(*[
-            _get_state(options,
+            _get_state(options, jobs) for options, jobs in jobs_by_cluster.items()
         ])
-        job_states_by_cluster: dict[
-            asyncssh.SSHClientConnectionOptions, dict[str, status.SlurmJobState]
-        ] = {}
-        for options, job_states in zip(ssh_options, job_states_per_cluster):
-            job_states_by_cluster[options] = dict(zip(jobs_by_cluster[options], job_states))

+        # Reconstruct the job states by cluster
+        job_states_by_cluster = {}
+        for ssh_config, job_states in zip(ssh_configs, job_states_per_cluster):
+            job_states_by_cluster[ssh_config] = dict(zip(jobs_by_cluster[ssh_config], job_states))
+
+        # Reconstruct the job states in the original order
         job_states = []
-        for
-            job_states.append(job_states_by_cluster[
+        for ssh_config, slurm_job in zip(ssh_configs, slurm_jobs):
+            job_states.append(job_states_by_cluster[ssh_config][slurm_job])
         return job_states

     @functools.partial(
@@ -146,29 +218,32 @@ class _BatchedSlurmHandle:
     )
     @staticmethod
     async def _batched_cancel(
-
-
+        ssh_configs: tp.Sequence[config.SlurmSSHConfig],
+        slurm_jobs: tp.Sequence[SlurmJob],
+    ) -> tp.Sequence[None]:
         async def _cancel(
-            options:
+            options: config.SlurmSSHConfig, slurm_jobs: tp.Sequence[SlurmJob]
         ) -> None:
-            await get_client().run(
+            await get_client().run(
+                options,
+                ["scancel", " ".join([slurm_job.job_id for slurm_job in slurm_jobs])],
+                check=True,
+            )

-        jobs_by_cluster =
+        jobs_by_cluster = _group_by_ssh_configs(ssh_configs, slurm_jobs)
         return await asyncio.gather(*[
             _cancel(options, job_ids) for options, job_ids in jobs_by_cluster.items()
         ])


 @dataclasses.dataclass(frozen=True, kw_only=True)
-class SlurmHandle(_BatchedSlurmHandle):
+class SlurmHandle(_BatchedSlurmHandle, tp.Generic[SlurmJobT]):
     """A handle for referring to the launched container."""

-
-
-
-
-        if re.match(r"^\d+(_\d+|\+\d+)?$", self.job_id) is None:
-            raise ValueError(f"Invalid job ID: {self.job_id}")
+    experiment_id: int
+    ssh: config.SlurmSSHConfig
+    slurm_job: Descriptor[SlurmJobT, str] = SlurmJobDescriptor[SlurmJobT]()
+    job_name: str  # XManager job name associated with this handle

     @backoff.on_predicate(
         backoff.constant,
@@ -180,15 +255,56 @@ class SlurmHandle(_BatchedSlurmHandle):
         return await self.get_state()

     async def stop(self) -> None:
-        await self._batched_cancel(self.
+        await self._batched_cancel(self.ssh, self.slurm_job)

     async def get_state(self) -> status.SlurmJobState:
-        return await self._batched_get_state(self.
-
+        return await self._batched_get_state(self.ssh, self.slurm_job)

-
-
-
+    async def logs(
+        self, *, num_lines: int, block_size: int, wait: bool, follow: bool
+    ) -> tp.AsyncGenerator[ConsoleRenderable, None]:
+        file = f".local/state/xm-slurm/{self.experiment_id}/slurm-{self.slurm_job.job_id}.out"
+        conn = await get_client().connection(self.ssh)
+        async with conn.start_sftp_client() as sftp:
+            if wait:
+                while not (await sftp.exists(file)):
+                    await asyncio.sleep(5)
+
+            async with sftp.open(file, "rb") as remote_file:
+                file_stat = await remote_file.stat()
+                file_size = file_stat.size
+                assert file_size is not None
+
+                data = b""
+                lines = []
+                position = file_size
+
+                while len(lines) <= num_lines and position > 0:
+                    read_size = min(block_size, position)
+                    position -= read_size
+                    await remote_file.seek(position)
+                    chunk = await remote_file.read(read_size)
+                    data = chunk + data
+                    lines = data.splitlines()
+
+                if position <= 0:
+                    yield Rule("[bold red]BEGINNING OF FILE[/bold red]")
+                for line in lines[-num_lines:]:
+                    yield line.decode("utf-8", errors="replace")
+
+                if (await self.get_state()) not in status.SlurmActiveJobStates:
+                    yield Rule("[bold red]END OF FILE[/bold red]")
+                    return
+
+                if not follow:
+                    return
+
+                await remote_file.seek(file_size)
+                while True:
+                    if new_data := (await remote_file.read(block_size)):
+                        yield new_data.decode("utf-8", errors="replace")
+                    else:
+                        await asyncio.sleep(0.25)


 @functools.cache
@@ -208,55 +324,75 @@ def get_template_env(container_runtime: ContainerRuntime) -> j2.Environment:
         case ContainerRuntime.PODMAN:
             runtime_template = template_env.get_template("runtimes/podman.bash.j2")
         case _:
-            raise NotImplementedError
+            raise NotImplementedError(f"Container runtime {container_runtime} is not implemented.")
     # Update our global env with the runtime template's exported globals
     template_env.globals.update(runtime_template.module.__dict__)

     return template_env


+@functools.cache
+def get_client() -> "Client":
+    return Client()
+
+
 class Client:
-    def __init__(self):
-        self._connections
-            asyncssh.SSHClientConnectionOptions, asyncssh.SSHClientConnection
-        ] = {}
+    def __init__(self) -> None:
+        self._connections = dict[config.SlurmSSHConfig, asyncssh.SSHClientConnection]()
         self._connection_lock = asyncio.Lock()

     @backoff.on_exception(backoff.expo, asyncssh.Error, max_tries=5, max_time=60.0)
     async def _setup_remote_connection(self, conn: asyncssh.SSHClientConnection) -> None:
         # Make sure the xm-slurm state directory exists
-
+        async with conn.start_sftp_client() as sftp_client:
+            await sftp_client.makedirs(".local/state/xm-slurm", exist_ok=True)

-    async def connection(
-        self
-        options: asyncssh.SSHClientConnectionOptions,
-    ) -> asyncssh.SSHClientConnection:
-        if options not in self._connections:
+    async def connection(self, ssh_config: config.SlurmSSHConfig) -> asyncssh.SSHClientConnection:
+        if ssh_config not in self._connections:
             async with self._connection_lock:
                 try:
-                    conn, _ = await asyncssh.create_connection(
+                    conn, _ = await asyncssh.create_connection(
+                        NoKBAuthSSHClient, options=ssh_config.connection_options
+                    )
                     await self._setup_remote_connection(conn)
-                    self._connections[
+                    self._connections[ssh_config] = conn
                 except asyncssh.misc.PermissionDenied as ex:
-                    raise
-
+                    raise SlurmExecutionError(
+                        f"Permission denied connecting to {ssh_config.host}"
+                    ) from ex
+                except asyncssh.misc.ConnectionLost as ex:
+                    raise SlurmExecutionError(f"Connection lost to host {ssh_config.host}") from ex
+                except asyncssh.misc.HostKeyNotVerifiable as ex:
+                    raise SlurmExecutionError(
+                        f"Cannot verify the public key for host {ssh_config.host}"
+                    ) from ex
+                except asyncssh.misc.KeyExchangeFailed as ex:
+                    raise SlurmExecutionError(
+                        f"Failed to exchange keys with host {ssh_config.host}"
+                    ) from ex
+                except asyncssh.Error as ex:
+                    raise SlurmExecutionError(
+                        f"SSH connection error when connecting to {ssh_config.host}"
+                    ) from ex
+
+        return self._connections[ssh_config]

     @backoff.on_exception(backoff.expo, asyncssh.Error, max_tries=5, max_time=60.0)
     async def run(
         self,
-
-        command: xm.SequentialArgs | str | Sequence[str],
+        ssh_config: config.SlurmSSHConfig,
+        command: xm.SequentialArgs | str | tp.Sequence[str],
         *,
         check: bool = False,
         timeout: float | None = None,
     ) -> asyncssh.SSHCompletedProcess:
-        client = await self.connection(
+        client = await self.connection(ssh_config)
         if isinstance(command, xm.SequentialArgs):
             command = command.to_list()
         if not isinstance(command, str) and isinstance(command, collections.abc.Sequence):
             command = shlex.join(command)
         assert isinstance(command, str)
-
+        logger.debug("Running command on %s: %s", ssh_config.host, command)

         return await client.run(command, check=check, timeout=timeout)

@@ -264,8 +400,9 @@ class Client:
         self,
         *,
         job: xm.Job | xm.JobGroup,
+        dependency: dependencies.SlurmJobDependency | None = None,
         cluster: SlurmClusterConfig,
-        args: Mapping[str, Any] | Sequence[Mapping[str, Any]] | None,
+        args: tp.Mapping[str, tp.Any] | tp.Sequence[tp.Mapping[str, tp.Any]] | None,
         experiment_id: int,
         identity: str | None,
     ) -> str:
@@ -276,7 +413,7 @@ class Client:

         # Sanitize job groups
         if isinstance(job, xm.JobGroup) and len(job.jobs) == 1:
-            job =
+            job = tp.cast(xm.Job, list(job.jobs.values())[0])
         elif isinstance(job, xm.JobGroup) and len(job.jobs) == 0:
             raise ValueError("Job group must have at least one job")

@@ -294,6 +431,7 @@ class Client:

                 return template.render(
                     job=job_array,
+                    dependency=dependency,
                     cluster=cluster,
                     args=sequential_args,
                     env_vars=env_vars,
@@ -306,6 +444,7 @@ class Client:
                 env_vars = args.get("env_vars", None)
                 return template.render(
                     job=job,
+                    dependency=dependency,
                     cluster=cluster,
                     args=sequential_args,
                     env_vars=env_vars,
@@ -326,6 +465,7 @@ class Client:
                 }
                 return template.render(
                     job_group=job_group,
+                    dependency=dependency,
                     cluster=cluster,
                     args=sequential_args,
                     env_vars=env_vars,
@@ -335,51 +475,66 @@ class Client:
             case _:
                 raise ValueError(f"Unsupported job type: {type(job)}")

-    @
+    @tp.overload
     async def launch(
         self,
         *,
         cluster: SlurmClusterConfig,
-        job: xm.
-
+        job: xm.JobGroup,
+        dependency: dependencies.SlurmJobDependency | None = None,
+        args: tp.Mapping[str, JobArgs] | None,
         experiment_id: int,
         identity: str | None = ...,
     ) -> SlurmHandle: ...

-    @
+    @tp.overload
     async def launch(
         self,
         *,
         cluster: SlurmClusterConfig,
-        job: xm.Job
-
+        job: xm.Job,
+        dependency: dependencies.SlurmJobDependency | None = None,
+        args: tp.Sequence[JobArgs],
+        experiment_id: int,
+        identity: str | None = ...,
+    ) -> list[SlurmHandle]: ...
+
+    @tp.overload
+    async def launch(
+        self,
+        *,
+        cluster: SlurmClusterConfig,
+        job: xm.Job,
+        dependency: dependencies.SlurmJobDependency | None = None,
+        args: JobArgs,
         experiment_id: int,
         identity: str | None = ...,
-    ) ->
+    ) -> SlurmHandle: ...

     async def launch(
         self,
         *,
         cluster: SlurmClusterConfig,
         job: xm.Job | xm.JobGroup,
-
+        dependency: dependencies.SlurmJobDependency | None = None,
+        args: tp.Mapping[str, JobArgs] | tp.Sequence[JobArgs] | JobArgs | None,
         experiment_id: int,
         identity: str | None = None,
-    )
-        # Construct template
+    ):
         template = await self.template(
             job=job,
+            dependency=dependency,
             cluster=cluster,
             args=args,
             experiment_id=experiment_id,
             identity=identity,
         )
-
+        logger.debug("Slurm submission script:\n%s", template)

         # Hash submission script
         template_hash = hashlib.blake2s(template.encode()).hexdigest()[:8]

-        conn = await self.connection(cluster.
+        conn = await self.connection(cluster.ssh)
         async with conn.start_sftp_client() as sftp:
             # Write the submission script to the cluster
             # TODO(jfarebro): SHOULD FIND A WAY TO GET THE HOME DIRECTORY
@@ -392,9 +547,9 @@ class Client:

             # Construct and run command on the cluster
             command = f"sbatch --chdir .local/state/xm-slurm/{experiment_id} --parsable submission-script-{template_hash}.sh"
-            result = await self.run(cluster.
+            result = await self.run(cluster.ssh, command)
             if result.returncode != 0:
-                raise RuntimeError(f"Failed to schedule job on {cluster.host}: {result.stderr}")
+                raise RuntimeError(f"Failed to schedule job on {cluster.ssh.host}: {result.stderr}")

         assert isinstance(result.stdout, str)
         slurm_job_id, *_ = result.stdout.split(",")
@@ -405,61 +560,103 @@ class Client:
             f"[cyan]{cluster.name}[/cyan] "
         )

+        # If we scheduled an array job make sure to return a list of handles
+        # The indexing is always sequential in 0, 1, ..., n - 1
         if isinstance(job, xm.Job) and isinstance(args, collections.abc.Sequence):
+            assert job.name is not None
             return [
                 SlurmHandle(
-
-
+                    experiment_id=experiment_id,
+                    ssh=cluster.ssh,
+                    slurm_job=f"{slurm_job_id}_{array_index}",
+                    job_name=job.name,
                 )
                 for array_index in range(len(args))
             ]
+        elif isinstance(job, xm.Job):
+            assert job.name is not None
+            return SlurmHandle(
+                experiment_id=experiment_id,
+                ssh=cluster.ssh,
+                slurm_job=slurm_job_id,
+                job_name=job.name,
+            )
+        elif isinstance(job, xm.JobGroup):
+            # TODO: make this work for actual job groups.
+            job = tp.cast(xm.Job, mit.one(job.jobs.values()))
+            assert isinstance(job, xm.Job)
+            assert job.name is not None
+            return SlurmHandle(
+                experiment_id=experiment_id,
+                ssh=cluster.ssh,
+                slurm_job=slurm_job_id,
+                job_name=job.name,
+            )
+        else:
+            raise ValueError(f"Unsupported job type: {type(job)}")

-
-
-
-        )
+    def __del__(self):
+        for conn in self._connections.values():
+            conn.close()


-@
+@tp.overload
 async def launch(
     *,
-    job: xm.
-
+    job: xm.JobGroup,
+    dependency: dependencies.SlurmJobDependency | None = None,
+    args: tp.Mapping[str, JobArgs],
     experiment_id: int,
     identity: str | None = ...,
 ) -> SlurmHandle: ...


-@
+@tp.overload
 async def launch(
     *,
-    job: xm.Job
-
+    job: xm.Job,
+    dependency: dependencies.SlurmJobDependency | None = None,
+    args: tp.Sequence[JobArgs],
     experiment_id: int,
     identity: str | None = ...,
-) ->
+) -> list[SlurmHandle]: ...
+
+
+@tp.overload
+async def launch(
+    *,
+    job: xm.Job,
+    dependency: dependencies.SlurmJobDependency | None = None,
+    args: JobArgs,
+    experiment_id: int,
+    identity: str | None = ...,
+) -> SlurmHandle: ...


 async def launch(
     *,
     job: xm.Job | xm.JobGroup,
-
+    dependency: dependencies.SlurmJobDependency | None = None,
+    args: tp.Mapping[str, JobArgs] | tp.Sequence[JobArgs] | JobArgs,
     experiment_id: int,
     identity: str | None = None,
-) -> SlurmHandle |
+) -> SlurmHandle | list[SlurmHandle]:
     match job:
-        case xm.Job():
+        case xm.Job() as job:
             if not isinstance(job.executor, executors.Slurm):
                 raise ValueError("Job must have a Slurm executor")
             job_requirements = job.executor.requirements
             cluster = job_requirements.cluster
             if cluster is None:
                 raise ValueError("Job must have a cluster requirement")
+            if cluster.validate is not None:
+                cluster.validate(job)

             return await get_client().launch(
                 cluster=cluster,
                 job=job,
-
+                dependency=dependency,
+                args=tp.cast(JobArgs | tp.Sequence[JobArgs], args),
                 experiment_id=experiment_id,
                 identity=identity,
             )
@@ -473,6 +670,8 @@ async def launch(
                 raise ValueError("Job must have a Slurm executor")
             if job_item.executor.requirements.cluster is None:
                 raise ValueError("Job must have a cluster requirement")
+            if job_item.executor.requirements.cluster.validate is not None:
+                job_item.executor.requirements.cluster.validate(job_item)
             job_group_clusters.add(job_item.executor.requirements.cluster)
             job_group_executors.add(id(job_item.executor))
             if len(job_group_executors) != 1:
@@ -482,10 +681,9 @@ async def launch(

             return await get_client().launch(
                 cluster=job_group_clusters.pop(),
-                job=
-
+                job=job_group,
+                dependency=dependency,
+                args=tp.cast(tp.Mapping[str, JobArgs], args),
                 experiment_id=experiment_id,
                 identity=identity,
             )
-        case _:
-            raise ValueError("Unsupported job type")