skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/provision/slurm/instance.py
@@ -0,0 +1,618 @@
+ """Slurm instance provisioning."""
+
+ import tempfile
+ import textwrap
+ import time
+ from typing import Any, cast, Dict, List, Optional, Tuple
+
+ from sky import sky_logging
+ from sky import skypilot_config
+ from sky.adaptors import slurm
+ from sky.provision import common
+ from sky.provision import constants
+ from sky.provision.slurm import utils as slurm_utils
+ from sky.utils import command_runner
+ from sky.utils import common_utils
+ from sky.utils import status_lib
+ from sky.utils import subprocess_utils
+ from sky.utils import timeline
+
+ logger = sky_logging.init_logger(__name__)
+
+ # TODO(kevin): This assumes $HOME is in a shared filesystem.
+ # We should probably make it configurable, and add a check
+ # during sky check.
+ SHARED_ROOT_SKY_DIRECTORY = '~/.sky_clusters'
+ PROVISION_SCRIPTS_DIRECTORY_NAME = '.sky_provision'
+ PROVISION_SCRIPTS_DIRECTORY = f'~/{PROVISION_SCRIPTS_DIRECTORY_NAME}'
+
+ POLL_INTERVAL_SECONDS = 2
+ # Default KillWait is 30 seconds, so we add some buffer time here.
+ _JOB_TERMINATION_TIMEOUT_SECONDS = 60
+ _SKY_DIR_CREATION_TIMEOUT_SECONDS = 30
+
+
+ def _sky_cluster_home_dir(cluster_name_on_cloud: str) -> str:
+     """Returns the SkyPilot cluster's home directory path on the Slurm cluster.
+
+     This path is assumed to be on a shared NFS mount accessible by all nodes.
+     To support clusters with non-NFS home directories, we would need to let
+     users specify an NFS-backed "working directory" or use a different
+     coordination mechanism.
+     """
+     return f'{SHARED_ROOT_SKY_DIRECTORY}/{cluster_name_on_cloud}'
+
+
+ def _sbatch_provision_script_path(filename: str) -> str:
+     """Returns the path to the sbatch provision script on the login node."""
+     # Put sbatch script in $HOME instead of /tmp as there can be
+     # multiple login nodes, and different SSH connections
+     # can land on different login nodes.
+     return f'{PROVISION_SCRIPTS_DIRECTORY}/{filename}'
+
+
+ def _skypilot_runtime_dir(cluster_name_on_cloud: str) -> str:
+     """Returns the SkyPilot runtime directory path on the Slurm cluster."""
+     return f'/tmp/{cluster_name_on_cloud}'
+
+
+ @timeline.event
+ def _create_virtual_instance(
+         region: str, cluster_name_on_cloud: str,
+         config: common.ProvisionConfig) -> common.ProvisionRecord:
+     """Creates a Slurm virtual instance from the config.
+
+     A Slurm virtual instance is created by submitting a long-running
+     job with sbatch, to mimic a cloud VM.
+     """
+     provider_config = config.provider_config
+     ssh_config_dict = provider_config['ssh']
+     ssh_host = ssh_config_dict['hostname']
+     ssh_port = int(ssh_config_dict['port'])
+     ssh_user = ssh_config_dict['user']
+     ssh_key = ssh_config_dict['private_key']
+     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+     ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
+     partition = slurm_utils.get_partition_from_config(provider_config)
+
+     client = slurm.SlurmClient(
+         ssh_host,
+         ssh_port,
+         ssh_user,
+         ssh_key,
+         ssh_proxy_command=ssh_proxy_command,
+         ssh_proxy_jump=ssh_proxy_jump,
+     )
+
+     # COMPLETING state occurs when a job is being terminated - during this
+     # phase, slurmd sends SIGTERM to tasks, waits for KillWait period, sends
+     # SIGKILL if needed, runs epilog scripts, and notifies slurmctld. This
+     # typically happens when a previous job with the same name is being
+     # cancelled or has finished. Jobs can get stuck in COMPLETING if epilog
+     # scripts hang or tasks don't respond to signals, so we wait with a
+     # timeout.
+     completing_jobs = client.query_jobs(
+         cluster_name_on_cloud,
+         ['completing'],
+     )
+     start_time = time.time()
+     while (completing_jobs and
+            time.time() - start_time < _JOB_TERMINATION_TIMEOUT_SECONDS):
+         logger.debug(f'Found {len(completing_jobs)} completing jobs. '
+                      f'Waiting for them to finish: {completing_jobs}')
+         time.sleep(POLL_INTERVAL_SECONDS)
+         completing_jobs = client.query_jobs(
+             cluster_name_on_cloud,
+             ['completing'],
+         )
+     if completing_jobs:
+         # TODO(kevin): Automatically handle this, following the suggestions in
+         # https://slurm.schedmd.com/troubleshoot.html#completing
+         raise RuntimeError(f'Found {len(completing_jobs)} jobs still in '
+                            'completing state after '
+                            f'{_JOB_TERMINATION_TIMEOUT_SECONDS}s. '
+                            'This is typically due to non-killable processes '
+                            'associated with the job.')
+
+     # Check if job already exists
+     existing_jobs = client.query_jobs(
+         cluster_name_on_cloud,
+         ['pending', 'running'],
+     )
+
+     # Get provision_timeout from config. If not specified, use None,
+     # which will use the default timeout specified in the Slurm adaptor.
+     provision_timeout = skypilot_config.get_effective_region_config(
+         cloud='slurm',
+         region=region,
+         keys=('provision_timeout',),
+         default_value=None)
+
+     if existing_jobs:
+         assert len(existing_jobs) == 1, (
+             f'Multiple jobs found with name {cluster_name_on_cloud}: '
+             f'{existing_jobs}')
+
+         job_id = existing_jobs[0]
+         logger.debug(f'Job with name {cluster_name_on_cloud} already exists '
+                      f'(JOBID: {job_id})')
+
+         # Wait for nodes to be allocated (job might be in PENDING state)
+         nodes, _ = client.get_job_nodes(job_id,
+                                         wait=True,
+                                         timeout=provision_timeout)
+         return common.ProvisionRecord(provider_name='slurm',
+                                       region=region,
+                                       zone=partition,
+                                       cluster_name=cluster_name_on_cloud,
+                                       head_instance_id=slurm_utils.instance_id(
+                                           job_id, nodes[0]),
+                                       resumed_instance_ids=[],
+                                       created_instance_ids=[])
+
+     resources = config.node_config
+
+     # Note: By default Slurm terminates the entire job allocation if any node
+     # fails in its range of allocated nodes.
+     # In the future we can consider running sbatch with --no-kill to not
+     # automatically terminate a job if one of the nodes it has been
+     # allocated fails.
+     num_nodes = config.count
+
+     accelerator_type = resources.get('accelerator_type')
+     accelerator_count_raw = resources.get('accelerator_count')
+     try:
+         accelerator_count = int(
+             accelerator_count_raw) if accelerator_count_raw is not None else 0
+     except (TypeError, ValueError):
+         accelerator_count = 0
+
+     skypilot_runtime_dir = _skypilot_runtime_dir(cluster_name_on_cloud)
+     sky_home_dir = _sky_cluster_home_dir(cluster_name_on_cloud)
+     ready_signal = f'{sky_home_dir}/.sky_sbatch_ready'
+     slurm_marker_file = f'{sky_home_dir}/{slurm_utils.SLURM_MARKER_FILE}'
+
+     # Build the sbatch script
+     gpu_directive = ''
+     if (accelerator_type is not None and accelerator_type.upper() != 'NONE' and
+             accelerator_count > 0):
+         gpu_directive = (f'#SBATCH --gres=gpu:{accelerator_type}:'
+                          f'{accelerator_count}')
+
+     # By default stdout and stderr will be written to $HOME/slurm-%j.out
+     # (because we invoke sbatch from $HOME). Redirect elsewhere to not pollute
+     # the home directory.
+     provision_script = textwrap.dedent(f"""\
+         #!/bin/bash
+         #SBATCH --job-name={cluster_name_on_cloud}
+         #SBATCH --output={PROVISION_SCRIPTS_DIRECTORY_NAME}/slurm-%j.out
+         #SBATCH --error={PROVISION_SCRIPTS_DIRECTORY_NAME}/slurm-%j.out
+         #SBATCH --nodes={num_nodes}
+         #SBATCH --wait-all-nodes=1
+         # Let the job be terminated rather than requeued implicitly.
+         #SBATCH --no-requeue
+         #SBATCH --cpus-per-task={int(resources["cpus"])}
+         #SBATCH --mem={int(resources["memory"])}G
+         {gpu_directive}
+
+         # Cleanup function to remove cluster dirs on job termination.
+         cleanup() {{
+             # The Skylet is daemonized, so it is not automatically terminated when
+             # the Slurm job is terminated, we need to kill it manually.
+             echo "Terminating Skylet..."
+             if [ -f "{skypilot_runtime_dir}/.sky/skylet_pid" ]; then
+                 kill $(cat "{skypilot_runtime_dir}/.sky/skylet_pid") 2>/dev/null || true
+             fi
+             echo "Cleaning up sky directories..."
+             # Clean up sky runtime directory on each node.
+             # NOTE: We can do this because --nodes for both this srun and the
+             # sbatch is the same number. Otherwise, there are no guarantees
+             # that this srun will run on the same subset of nodes as the srun
+             # that created the sky directories.
+             srun --nodes={num_nodes} rm -rf {skypilot_runtime_dir}
+             rm -rf {sky_home_dir}
+         }}
+         trap cleanup TERM
+
+         # Create sky home directory for the cluster.
+         mkdir -p {sky_home_dir}
+         # Create sky runtime directory on each node.
+         srun --nodes={num_nodes} mkdir -p {skypilot_runtime_dir}
+         # Marker file to indicate we're in a Slurm cluster.
+         touch {slurm_marker_file}
+         # Suppress login messages.
+         touch {sky_home_dir}/.hushlogin
+         # Signal that the sbatch script has completed setup.
+         touch {ready_signal}
+         sleep infinity
+         """)
+
+     # To bootstrap things, we need to do it with SSHCommandRunner first.
+     # SlurmCommandRunner is for after the virtual instances are created.
+     login_node_runner = command_runner.SSHCommandRunner(
+         (ssh_host, ssh_port),
+         ssh_user,
+         ssh_key,
+         ssh_proxy_command=ssh_proxy_command,
+         ssh_proxy_jump=ssh_proxy_jump,
+     )
+
+     cmd = f'mkdir -p {PROVISION_SCRIPTS_DIRECTORY}'
+     rc, stdout, stderr = login_node_runner.run(cmd,
+                                                require_outputs=True,
+                                                stream_logs=False)
+     subprocess_utils.handle_returncode(
+         rc,
+         cmd,
+         'Failed to create provision scripts directory on login node.',
+         stderr=f'{stdout}\n{stderr}')
+     # Rsync the provision script to the login node
+     with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=True) as f:
+         f.write(provision_script)
+         f.flush()
+         src_path = f.name
+         tgt_path = _sbatch_provision_script_path(f'{cluster_name_on_cloud}.sh')
+         login_node_runner.rsync(src_path, tgt_path, up=True, stream_logs=False)
+
+     job_id = client.submit_job(partition, cluster_name_on_cloud, tgt_path)
+     logger.debug(f'Successfully submitted Slurm job {job_id} to partition '
+                  f'{partition} for cluster {cluster_name_on_cloud} '
+                  f'with {num_nodes} nodes')
+
+     nodes, _ = client.get_job_nodes(job_id,
+                                     wait=True,
+                                     timeout=provision_timeout)
+     created_instance_ids = [
+         slurm_utils.instance_id(job_id, node) for node in nodes
+     ]
+
+     # Wait for the sbatch script to create the cluster's sky directories,
+     # to avoid a race condition where post-provision commands try to
+     # access the directories before they are created.
+     ready_check_cmd = (f'end=$((SECONDS+{_SKY_DIR_CREATION_TIMEOUT_SECONDS})); '
+                        f'while [ ! -f {ready_signal} ]; do '
+                        'if (( SECONDS >= end )); then '
+                        'exit 1; fi; '
+                        'sleep 0.5; '
+                        'done')
+     rc, stdout, stderr = login_node_runner.run(ready_check_cmd,
+                                                require_outputs=True,
+                                                stream_logs=False)
+     subprocess_utils.handle_returncode(
+         rc,
+         ready_check_cmd,
+         'Failed to verify sky directories creation.',
+         stderr=f'{stdout}\n{stderr}')
+
+     return common.ProvisionRecord(provider_name='slurm',
+                                   region=region,
+                                   zone=partition,
+                                   cluster_name=cluster_name_on_cloud,
+                                   head_instance_id=created_instance_ids[0],
+                                   resumed_instance_ids=[],
+                                   created_instance_ids=created_instance_ids)
+
+
+ @common_utils.retry
+ def query_instances(
+     cluster_name: str,
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     non_terminated_only: bool = True,
+     retry_if_missing: bool = False,
+ ) -> Dict[str, Tuple[Optional[status_lib.ClusterStatus], Optional[str]]]:
+     """See sky/provision/__init__.py"""
+     del cluster_name, retry_if_missing  # Unused for Slurm
+     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+
+     ssh_config_dict = provider_config['ssh']
+     ssh_host = ssh_config_dict['hostname']
+     ssh_port = int(ssh_config_dict['port'])
+     ssh_user = ssh_config_dict['user']
+     ssh_key = ssh_config_dict['private_key']
+     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+     ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
+
+     client = slurm.SlurmClient(
+         ssh_host,
+         ssh_port,
+         ssh_user,
+         ssh_key,
+         ssh_proxy_command=ssh_proxy_command,
+         ssh_proxy_jump=ssh_proxy_jump,
+     )
+
+     # Map Slurm job states to SkyPilot ClusterStatus
+     # Slurm states:
+     # https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES
+     # TODO(kevin): Include more states here.
+     status_map = {
+         'pending': status_lib.ClusterStatus.INIT,
+         'running': status_lib.ClusterStatus.UP,
+         'completing': status_lib.ClusterStatus.UP,
+         'completed': None,
+         'cancelled': None,
+         # NOTE: Jobs that get cancelled (from sky down) will go to failed state
+         # with the reason 'NonZeroExitCode' and remain in the squeue output for
+         # a while.
+         'failed': None,
+         'node_fail': None,
+     }
+
+     statuses: Dict[str, Tuple[Optional[status_lib.ClusterStatus],
+                               Optional[str]]] = {}
+     for state, sky_status in status_map.items():
+         jobs = client.query_jobs(
+             cluster_name_on_cloud,
+             [state],
+         )
+
+         for job_id in jobs:
+             if state in ('pending', 'failed', 'node_fail', 'cancelled',
+                          'completed'):
+                 reason = client.get_job_reason(job_id)
+                 if non_terminated_only and sky_status is None:
+                     # TODO(kevin): For better UX, we should also find out
+                     # which node(s) exactly that failed if it's a node_fail
+                     # state.
+                     logger.debug(f'Job {job_id} is terminated, but '
+                                  'query_instances is called with '
+                                  f'non_terminated_only=True. State: {state}, '
+                                  f'Reason: {reason}')
+                     continue
+                 statuses[job_id] = (sky_status, reason)
+             else:
+                 nodes, _ = client.get_job_nodes(job_id, wait=False)
+                 for node in nodes:
+                     instance_id = slurm_utils.instance_id(job_id, node)
+                     statuses[instance_id] = (sky_status, None)
+
+     # TODO(kevin): Query sacct too to get more historical job info.
+     # squeue only includes completed jobs that finished in the last
+     # MinJobAge seconds (default 300s). Or could be earlier if it
+     # reaches MaxJobCount first (default 10_000).
+
+     return statuses
+
+
+ def run_instances(
+         region: str,
+         cluster_name: str,  # pylint: disable=unused-argument
+         cluster_name_on_cloud: str,
+         config: common.ProvisionConfig) -> common.ProvisionRecord:
+     """Run instances for the given cluster (Slurm in this case)."""
+     return _create_virtual_instance(region, cluster_name_on_cloud, config)
+
+
+ def wait_instances(region: str, cluster_name_on_cloud: str,
+                    state: Optional[status_lib.ClusterStatus]) -> None:
+     """See sky/provision/__init__.py"""
+     del region, cluster_name_on_cloud, state
+     # We already wait for the instances to be running in run_instances.
+     # So we don't need to wait here.
+
+
+ def get_cluster_info(
+         region: str,
+         cluster_name_on_cloud: str,
+         provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
+     del region
+     assert provider_config is not None, cluster_name_on_cloud
+
+     # The SSH host is the remote machine running slurmctld daemon.
+     # Cross-cluster operations are supported by interacting with
+     # the current controller. For details, please refer to
+     # https://slurm.schedmd.com/multi_cluster.html.
+     ssh_config_dict = provider_config['ssh']
+     ssh_host = ssh_config_dict['hostname']
+     ssh_port = int(ssh_config_dict['port'])
+     ssh_user = ssh_config_dict['user']
+     ssh_key = ssh_config_dict['private_key']
+     ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+     ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
+
+     client = slurm.SlurmClient(
+         ssh_host,
+         ssh_port,
+         ssh_user,
+         ssh_key,
+         ssh_proxy_command=ssh_proxy_command,
+         ssh_proxy_jump=ssh_proxy_jump,
+     )
+
+     # Find running job for this cluster
+     running_jobs = client.query_jobs(
+         cluster_name_on_cloud,
+         ['running'],
+     )
+
+     if not running_jobs:
+         # No running jobs found - cluster may be in pending or terminated state
+         return common.ClusterInfo(
+             instances={},
+             head_instance_id=None,
+             ssh_user=ssh_user,
+             provider_name='slurm',
+             provider_config=provider_config,
+         )
+     assert len(running_jobs) == 1, (
+         f'Multiple running jobs found for cluster {cluster_name_on_cloud}: '
+         f'{running_jobs}')
+
+     job_id = running_jobs[0]
+     # Running jobs should already have nodes allocated, so don't wait
+     nodes, node_ips = client.get_job_nodes(job_id, wait=False)
+
+     instances = {
+         f'{slurm_utils.instance_id(job_id, node)}': [
+             common.InstanceInfo(
+                 instance_id=slurm_utils.instance_id(job_id, node),
+                 internal_ip=node_ip,
+                 external_ip=ssh_host,
+                 ssh_port=ssh_port,
+                 tags={
+                     constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud,
+                     'job_id': job_id,
+                     'node': node,
+                 },
+             )
+         ] for node, node_ip in zip(nodes, node_ips)
+     }
+
+     return common.ClusterInfo(
+         instances=instances,
+         head_instance_id=slurm_utils.instance_id(job_id, nodes[0]),
+         ssh_user=ssh_user,
+         provider_name='slurm',
+         provider_config=provider_config,
+     )
+
+
+ def stop_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     worker_only: bool = False,
+ ) -> None:
+     """Keep the Slurm virtual instances running."""
+     raise NotImplementedError()
+
+
+ def terminate_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     worker_only: bool = False,
+ ) -> None:
+     """See sky/provision/__init__.py"""
+     assert provider_config is not None, cluster_name_on_cloud
+
+     if worker_only:
+         logger.warning(
+             'worker_only=True is not supported for Slurm, this is a no-op.')
+         return
+
+     # Check if we are running inside a Slurm cluster (only happens with
+     # autodown, where the Skylet invokes terminate_instances on the remote
+     # cluster). In this case, use local execution instead of SSH.
+     # This assumes that the compute node is able to run scancel.
+     # TODO(kevin): Validate this assumption.
+     if slurm_utils.is_inside_slurm_cluster():
+         logger.debug('Running inside a Slurm cluster, using local execution')
+         client = slurm.SlurmClient(is_inside_slurm_cluster=True)
+     else:
+         ssh_config_dict = provider_config['ssh']
+         ssh_host = ssh_config_dict['hostname']
+         ssh_port = int(ssh_config_dict['port'])
+         ssh_user = ssh_config_dict['user']
+         ssh_private_key = ssh_config_dict['private_key']
+         ssh_proxy_command = ssh_config_dict.get('proxycommand', None)
+         ssh_proxy_jump = ssh_config_dict.get('proxyjump', None)
+
+         client = slurm.SlurmClient(
+             ssh_host,
+             ssh_port,
+             ssh_user,
+             ssh_private_key,
+             ssh_proxy_command=ssh_proxy_command,
+             ssh_proxy_jump=ssh_proxy_jump,
+         )
+     jobs_state = client.get_jobs_state_by_name(cluster_name_on_cloud)
+     if not jobs_state:
+         logger.debug(f'Job for cluster {cluster_name_on_cloud} not found, '
+                      'it may have been terminated.')
+         return
+     assert len(jobs_state) == 1, (
+         f'Multiple jobs found for cluster {cluster_name_on_cloud}: {jobs_state}'
+     )
+
+     job_state = jobs_state[0].strip()
+     # Terminal states where scancel is not needed or will fail.
+     terminal_states = {
+         'COMPLETED', 'CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL', 'PREEMPTED',
+         'SPECIAL_EXIT'
+     }
+     if job_state in terminal_states:
+         logger.debug(
+             f'Job for cluster {cluster_name_on_cloud} is already in a terminal '
+             f'state {job_state}. No action needed.')
+         return
+
+     if job_state in ('PENDING', 'CONFIGURING'):
+         # For pending/configuring jobs, cancel without signal to avoid hangs.
+         client.cancel_jobs_by_name(cluster_name_on_cloud, signal=None)
+     elif job_state == 'COMPLETING':
+         # Job is already being terminated. No action needed.
+         logger.debug(
+             f'Job for cluster {cluster_name_on_cloud} is already completing. '
+             'No action needed.')
+     else:
+         # For other states (e.g., RUNNING, SUSPENDED), send a TERM signal.
+         client.cancel_jobs_by_name(cluster_name_on_cloud,
+                                    signal='TERM',
+                                    full=True)
+
+
+ def open_ports(
+     cluster_name_on_cloud: str,
+     ports: List[str],
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> None:
+     """See sky/provision/__init__.py"""
+     del cluster_name_on_cloud, ports, provider_config
+     pass
+
+
+ def cleanup_ports(
+     cluster_name_on_cloud: str,
+     ports: List[str],
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> None:
+     """See sky/provision/__init__.py"""
+     del cluster_name_on_cloud, ports, provider_config
+     pass
+
+
+ def get_command_runners(
+     cluster_info: common.ClusterInfo,
+     **credentials: Dict[str, Any],
+ ) -> List[command_runner.SlurmCommandRunner]:
+     """Get a command runner for the given cluster."""
+     assert cluster_info.provider_config is not None, cluster_info
+
+     if cluster_info.head_instance_id is None:
+         # No running job found
+         return []
+
+     head_instance = cluster_info.get_head_instance()
+     assert head_instance is not None, 'Head instance not found'
+     cluster_name_on_cloud = head_instance.tags.get(
+         constants.TAG_SKYPILOT_CLUSTER_NAME, None)
+     assert cluster_name_on_cloud is not None, cluster_info
+
+     # There can only be one InstanceInfo per instance_id.
+     instances = [
+         instance_infos[0] for instance_infos in cluster_info.instances.values()
+     ]
+
+     # Note: For Slurm, the external IP for all instances is the same,
+     # it is the login node's. The internal IP is the private IP of the node.
+     ssh_user = cast(str, credentials.pop('ssh_user'))
+     ssh_private_key = cast(str, credentials.pop('ssh_private_key'))
+     # ssh_proxy_jump is Slurm-specific, it does not exist in the auth section
+     # of the cluster yaml.
+     ssh_proxy_jump = cluster_info.provider_config.get('ssh', {}).get(
+         'proxyjump', None)
+     runners = [
+         command_runner.SlurmCommandRunner(
+             (instance_info.external_ip or '', instance_info.ssh_port),
+             ssh_user,
+             ssh_private_key,
+             sky_dir=_sky_cluster_home_dir(cluster_name_on_cloud),
+             skypilot_runtime_dir=_skypilot_runtime_dir(cluster_name_on_cloud),
+             job_id=instance_info.tags['job_id'],
+             slurm_node=instance_info.tags['node'],
+             ssh_proxy_jump=ssh_proxy_jump,
+             enable_interactive_auth=True,
+             **credentials) for instance_info in instances
+     ]
+
+     return runners
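
Illustrative aside (not part of the packaged file above): the provisioner models a SkyPilot node allocation as one long-running sbatch job that creates the shared and per-node directories, touches a readiness marker, and then holds its nodes with `sleep infinity` until cancellation delivers SIGTERM to the `cleanup` trap. A minimal, self-contained sketch of the kind of script `_create_virtual_instance()` renders, with made-up values for the cluster name and node shape:

# Sketch only: renders an sbatch script comparable to the template embedded
# in _create_virtual_instance(). All concrete values below are hypothetical.
import textwrap

cluster = 'sky-demo-cluster'                 # hypothetical cluster_name_on_cloud
num_nodes, cpus, mem_gb = 2, 8, 32           # hypothetical node_config values
runtime_dir = f'/tmp/{cluster}'              # per-node runtime dir
home_dir = f'~/.sky_clusters/{cluster}'      # shared (NFS) cluster dir

sbatch_script = textwrap.dedent(f"""\
    #!/bin/bash
    #SBATCH --job-name={cluster}
    #SBATCH --output=.sky_provision/slurm-%j.out
    #SBATCH --error=.sky_provision/slurm-%j.out
    #SBATCH --nodes={num_nodes}
    #SBATCH --wait-all-nodes=1
    #SBATCH --no-requeue
    #SBATCH --cpus-per-task={cpus}
    #SBATCH --mem={mem_gb}G

    cleanup() {{
        # Remove the per-node and shared cluster directories on cancellation.
        srun --nodes={num_nodes} rm -rf {runtime_dir}
        rm -rf {home_dir}
    }}
    trap cleanup TERM

    mkdir -p {home_dir}
    srun --nodes={num_nodes} mkdir -p {runtime_dir}
    touch {home_dir}/.sky_sbatch_ready
    # Hold the allocation open so the job behaves like a persistent VM.
    sleep infinity
    """)
print(sbatch_script)

Tearing the cluster down then reduces to cancelling that single job: terminate_instances() asks the Slurm client to cancel it with a TERM signal (or a plain cancel while it is still pending), and the trap above removes the per-node and shared directories.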