skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/server/server_utils.py
ADDED
@@ -0,0 +1,30 @@
+"""Utilities for the API server."""
+
+from typing import Optional, Type, TypeVar
+
+import fastapi
+
+from sky.server.requests import payloads
+from sky.skylet import constants
+
+_BodyT = TypeVar('_BodyT', bound=payloads.RequestBody)
+
+
+# TODO(aylei): remove this and disable request body construction at server-side
+def build_body_at_server(request: Optional[fastapi.Request],
+                         body_type: Type[_BodyT], **data) -> _BodyT:
+    """Builds the request body at the server.
+
+    For historical reasons, some handlers mimic a client request body
+    at server-side in order to coordinate with the interface of executor.
+    This will cause issues where the client info like user identity is not
+    respected in these handlers. This function is a helper to build the request
+    body at server-side with the auth user overridden.
+    """
+    request_body = body_type(**data)
+    if request is not None:
+        auth_user = getattr(request.state, 'auth_user', None)
+        if auth_user:
+            request_body.env_vars[constants.USER_ID_ENV_VAR] = auth_user.id
+            request_body.env_vars[constants.USER_ENV_VAR] = auth_user.name
+    return request_body
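
For orientation, a minimal sketch of how a handler might call this helper; the route and the bare RequestBody construction are hypothetical, not taken from this diff:

    import fastapi

    from sky.server import server_utils
    from sky.server.requests import payloads

    app = fastapi.FastAPI()

    @app.post('/example')  # hypothetical route
    async def example(request: fastapi.Request) -> dict:
        # Without the helper, a body constructed at the server would carry
        # the server process's identity instead of the caller's; the helper
        # copies the authenticated user into the body's env_vars.
        body = server_utils.build_body_at_server(request,
                                                 payloads.RequestBody)
        return body.env_vars
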
sky/server/uvicorn.py
CHANGED
@@ -20,6 +20,7 @@ from uvicorn.supervisors import multiprocess
 from sky import sky_logging
 from sky.server import daemons
 from sky.server import metrics as metrics_lib
+from sky.server import plugins
 from sky.server import state
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
@@ -237,6 +238,10 @@ def run(config: uvicorn.Config, max_db_connections: Optional[int] = None):
     server = Server(config=config, max_db_connections=max_db_connections)
     try:
         if config.workers is not None and config.workers > 1:
+            # When workers > 1, uvicorn does not run server app in the main
+            # process. In this case, plugins are not loaded at this point, so
+            # load plugins here without uvicorn app.
+            plugins.load_plugins(plugins.ExtensionContext())
             sock = config.bind_socket()
             SlowStartMultiprocess(config, target=server.run,
                                   sockets=[sock]).run()

sky/setup_files/MANIFEST.in
CHANGED
@@ -15,6 +15,7 @@ include sky/jobs/dashboard/templates/*
 include sky/jobs/dashboard/static/*
 include sky/templates/*
 include sky/utils/kubernetes/*
+include sky/ssh_node_pools/deploy/tunnel/*
 include sky/server/html/*
 recursive-include sky/dashboard/out *
 include sky/users/*.conf
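
The new include line ships the tunnel assets with the distribution. A small sketch of resolving the installed script at runtime, assuming the package layout shown in the file list above:

    import importlib.resources as resources

    # sky/ssh_node_pools/deploy is a package (it gains an __init__.py in
    # this release), so its bundled tunnel script can be resolved portably.
    script = (resources.files('sky.ssh_node_pools.deploy') / 'tunnel' /
              'ssh-tunnel.sh')
    print(script)
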
sky/setup_files/dependencies.py
CHANGED
@@ -84,6 +84,7 @@ install_requires = [
     'bcrypt==4.0.1',
     'pyjwt',
     'gitpython',
+    'paramiko',
     'types-paramiko',
     'alembic',
     'aiohttp',
@@ -143,9 +144,11 @@ aws_dependencies = [
     'awscli>=1.27.10',
     'botocore>=1.29.10',
     'boto3>=1.26.1',
-    # NOTE:
-    #
-
+    # NOTE: colorama is a dependency of awscli. We pin it to match the
+    # version constraint in awscli (<0.4.7) to prevent potential conflicts
+    # with other packages like ray, which might otherwise install a newer
+    # version.
+    'colorama<0.4.7',
 ]

 # Kubernetes 32.0.0 has an authentication bug:
@@ -203,12 +206,21 @@ cloud_dependencies: Dict[str, List[str]] = {
     'ssh': kubernetes_dependencies,
     # For the container registry auth api. Reference:
     # https://github.com/runpod/runpod-python/releases/tag/1.6.1
-
-
-
-
-
-
+    'runpod': [
+        # For the container registry auth api. Reference:
+        # https://github.com/runpod/runpod-python/releases/tag/1.6.1
+        'runpod>=1.6.1',
+        # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python
+        # 3.11+ stdlib provides tomllib; on lower versions we depend on tomli
+        # explicitly. Instead of installing tomli conditionally, we install it
+        # explicitly. This is because the conditional installation of tomli does
+        # not work with controller package installation code.
+        'tomli',
+        # runpod installs aiodns (via aiohttp[speedups]), which is incompatible
+        # with pycares 5.0.0 due to deprecations.
+        # See https://github.com/aio-libs/aiodns/issues/214
+        'pycares<5',
+    ],
     'fluidstack': [],  # No dependencies needed for fluidstack
     'cudo': ['cudo-compute>=0.1.10'],
     'paperspace': [],  # No dependencies needed for paperspace
@@ -234,6 +246,7 @@ cloud_dependencies: Dict[str, List[str]] = {
     'hyperbolic': [],  # No dependencies needed for hyperbolic
     'seeweb': ['ecsapi==0.4.0'],
     'shadeform': [],  # No dependencies needed for shadeform
+    'slurm': ['python-hostlist'],
 }

 # Calculate which clouds should be included in the [all] installation.
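
The tomllib/tomli note above follows the standard compatibility pattern; a generic sketch (not code from this diff):

    import sys

    # On Python 3.11+ the stdlib parser is available; older interpreters
    # fall back to the API-compatible third-party package pinned above.
    if sys.version_info >= (3, 11):
        import tomllib
    else:
        import tomli as tomllib

    with open('config.toml', 'rb') as f:  # both parsers require binary mode
        config = tomllib.load(f)
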
sky/sky_logging.py
CHANGED
@@ -15,7 +15,8 @@ from sky.utils import env_options
 from sky.utils import rich_utils

 # UX: Should we show logging prefixes and some extra information in optimizer?
-_FORMAT = '%(levelname).1s %(asctime)s
+_FORMAT = ('%(levelname).1s %(asctime)s.%(msecs)03d PID=%(process)d '
+           '%(filename)s:%(lineno)d] %(message)s')
 _DATE_FORMAT = '%m-%d %H:%M:%S'
 _SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer']
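
The widened format adds sub-second timestamps and the emitting process's PID. A quick way to see the resulting line shape (standalone sketch; the sample output is illustrative):

    import logging

    _FORMAT = ('%(levelname).1s %(asctime)s.%(msecs)03d PID=%(process)d '
               '%(filename)s:%(lineno)d] %(message)s')
    _DATE_FORMAT = '%m-%d %H:%M:%S'

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(_FORMAT, datefmt=_DATE_FORMAT))
    logging.getLogger('demo').addHandler(handler)
    logging.getLogger('demo').warning('hello')
    # e.g. -> W 01-12 10:30:00.123 PID=4242 demo.py:12] hello
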
sky/skylet/attempt_skylet.py
CHANGED
@@ -9,6 +9,7 @@ import psutil

 from sky.skylet import constants
 from sky.skylet import runtime_utils
+from sky.utils import common_utils

 VERSION_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_VERSION_FILE)
 SKYLET_LOG_FILE = runtime_utils.get_runtime_dir_path(constants.SKYLET_LOG_FILE)
@@ -97,8 +98,13 @@ def restart_skylet():
     for pid in _find_running_skylet_pids():
         try:
             os.kill(pid, signal.SIGKILL)
-
-            #
+            # Wait until process fully terminates so its socket gets released.
+            # Without this, find_free_port may race with the kernel closing the
+            # socket and fail to bind to the port that's supposed to be free.
+            psutil.Process(pid).wait(timeout=5)
+        except (OSError, ProcessLookupError, psutil.NoSuchProcess,
+                psutil.TimeoutExpired):
+            # Process died between detection and kill, or timeout waiting
             pass
     # Clean up the PID file
     try:
@@ -106,7 +112,11 @@ def restart_skylet():
     except OSError:
         pass  # Best effort cleanup

-
+    # TODO(kevin): Handle race conditions here. Race conditions can only
+    # happen on Slurm, where there could be multiple clusters running in
+    # one network namespace. For other clouds, the behaviour will be that
+    # it always gets port 46590 (default port).
+    port = common_utils.find_free_port(constants.SKYLET_GRPC_PORT)
     subprocess.run(
         # We have made sure that `attempt_skylet.py` is executed with the
         # skypilot runtime env activated, so that skylet can access the cloud
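
The new call references find_free_port from sky/utils/common_utils.py, whose diff is not shown here. A plausible minimal sketch of the behaviour described in the TODO above, preferring the default port and scanning upward, under that assumption (not the actual SkyPilot implementation):

    import socket

    def find_free_port(start_port: int) -> int:
        # Try start_port first (e.g., 46590); walk upward until a port can
        # be bound. When nothing else runs in the network namespace, the
        # default port is returned, matching the TODO's description.
        for port in range(start_port, 65536):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                try:
                    s.bind(('', port))
                    return port
                except OSError:
                    continue
        raise OSError('No free port found.')
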
sky/skylet/constants.py
CHANGED
@@ -20,11 +20,13 @@ SKY_RUNTIME_DIR = '${SKY_RUNTIME_DIR:-$HOME}'
 # os.path.expanduser(os.environ.get(SKY_RUNTIME_DIR_ENV_VAR_KEY, '~')),
 # '.sky/jobs.db')
 SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'
+SKY_CLUSTER_NAME_ENV_VAR_KEY = 'SKY_CLUSTER_NAME'
 # We keep sky_logs and sky_workdir in $HOME, because
 # these are artifacts that users can access, and having
 # them be in $HOME makes it more convenient.
 SKY_LOGS_DIRECTORY = '~/sky_logs'
 SKY_REMOTE_WORKDIR = '~/sky_workdir'
+SKY_TEMPLATES_DIRECTORY = '~/sky_templates'
 SKY_IGNORE_FILE = '.skyignore'
 GIT_IGNORE_FILE = '.gitignore'
@@ -45,7 +47,19 @@ SKY_REMOTE_RAY_PORT_FILE = '.sky/ray_port.json'
 SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
 SKY_REMOTE_RAY_VERSION = '2.9.3'

-
+# To avoid user image causing issue with the SkyPilot runtime, we run SkyPilot
+# commands the following prefix:
+# 1. env -u PYTHONPATH: unset PYTHONPATH to avoid any package specified in
+#    PYTHONPATH interfering with the SkyPilot runtime.
+# 2. env -C $HOME: set the execution directory to $HOME to avoid the case when
+#    a user's WORKDIR in Dockerfile is a Python site-packages directory. Python
+#    adds CWD to the beginning of sys.path, so if WORKDIR contains packages
+#    (e.g., compiled for a different Python version), imports will fail with
+#    errors like "ModuleNotFoundError: No module named 'rpds.rpds'".
+#
+# TODO(zhwu): Switch -C $HOME to PYTHONSAFEPATH=1, once we moved our runtime to
+# Python 3.11 for a more robust setup.
+SKY_UNSET_PYTHONPATH_AND_SET_CWD = 'env -u PYTHONPATH -C $HOME'
 # We store the absolute path of the python executable (/opt/conda/bin/python3)
 # in this file, so that any future internal commands that need to use python
 # can use this path. This is useful for the case where the user has a custom
@@ -57,7 +71,8 @@ SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
                            f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
                            'which python3')
 # Python executable, e.g., /opt/conda/bin/python3
-SKY_PYTHON_CMD = f'{
+SKY_PYTHON_CMD = (f'{SKY_UNSET_PYTHONPATH_AND_SET_CWD} '
+                  f'$({SKY_GET_PYTHON_PATH_CMD})')
 # Prefer SKY_UV_PIP_CMD, which is faster.
 # TODO(cooperc): remove remaining usage (GCP TPU setup).
 SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
@@ -67,17 +82,30 @@ SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
 # #!/opt/conda/bin/python3
 SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
                f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
+
+# Use $(which env) to find env, falling back to /usr/bin/env if which is
+# unavailable. This works around a Slurm quirk where srun's execvp() doesn't
+# check execute permissions, failing when $HOME/.local/bin/env (non-executable,
+# from uv installation) shadows /usr/bin/env.
+SKY_SLURM_UNSET_PYTHONPATH = ('$(which env 2>/dev/null || echo /usr/bin/env) '
+                              '-u PYTHONPATH')
+SKY_SLURM_PYTHON_CMD = (f'{SKY_SLURM_UNSET_PYTHONPATH} '
+                        f'$({SKY_GET_PYTHON_PATH_CMD})')
+
 # Separate env for SkyPilot runtime dependencies.
 SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
 SKY_REMOTE_PYTHON_ENV: str = f'{SKY_RUNTIME_DIR}/{SKY_REMOTE_PYTHON_ENV_NAME}'
 ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
+# Place the conda root in the runtime directory, as installing to $HOME
+# on an NFS takes too long (1-2m slower).
+SKY_CONDA_ROOT = f'{SKY_RUNTIME_DIR}/miniconda3'
 # uv is used for venv and pip, much faster than python implementations.
 SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
 # set UV_SYSTEM_PYTHON to false in case the
 # user provided docker image set it to true.
 # unset PYTHONPATH in case the user provided docker image set it.
 SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
-              f'{
+              f'{SKY_UNSET_PYTHONPATH_AND_SET_CWD} {SKY_UV_INSTALL_DIR}/uv')
 # This won't reinstall uv if it's already installed, so it's safe to re-run.
 SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
                       'curl -LsSf https://astral.sh/uv/install.sh '
@@ -116,7 +144,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '
+SKYLET_VERSION = '29'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
@@ -162,6 +190,10 @@ DISABLE_GPU_ECC_COMMAND = (
     '{ sudo reboot || echo "Failed to reboot. ECC mode may not be disabled"; } '
     '|| true; ')

+SETUP_SKY_DIRS_COMMANDS = (f'mkdir -p ~/sky_workdir && '
+                           f'mkdir -p ~/.sky/sky_app && '
+                           f'mkdir -p {SKY_RUNTIME_DIR}/.sky;')
+
 # Install conda on the remote cluster if it is not already installed.
 # We use conda with python 3.10 to be consistent across multiple clouds with
 # best effort.
@@ -178,8 +210,9 @@ CONDA_INSTALLATION_COMMANDS = (
     # because for some images, conda is already installed, but not initialized.
     # In this case, we need to initialize conda and set auto_activate_base to
     # true.
-    '{
-    '
+    '{ '
+    f'bash Miniconda3-Linux.sh -b -p "{SKY_CONDA_ROOT}" || true; '
+    f'eval "$({SKY_CONDA_ROOT}/bin/conda shell.bash hook)" && conda init && '
     # Caller should replace {conda_auto_activate} with either true or false.
     'conda config --set auto_activate_base {conda_auto_activate} && '
     'conda activate base; }; '
@@ -222,7 +255,7 @@ _sky_version = str(version.parse(sky.__version__))
 RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status'
 RAY_INSTALLATION_COMMANDS = (
     f'{SKY_UV_INSTALL_CMD};'
-    '
+    f'{SETUP_SKY_DIRS_COMMANDS}'
     # Print the PATH in provision.log to help debug PATH issues.
     'echo PATH=$PATH; '
     # Install setuptools<=69.5.1 to avoid the issue with the latest setuptools
@@ -256,7 +289,7 @@ RAY_INSTALLATION_COMMANDS = (
     #
     # Here, we add ~/.local/bin to the end of the PATH to make sure the issues
     # mentioned above are resolved.
-    'export PATH=$PATH
+    f'export PATH=$PATH:{SKY_RUNTIME_DIR}/.local/bin; '
     # Writes ray path to file if it does not exist or the file is empty.
     f'[ -s {SKY_RAY_PATH_FILE} ] || '
     f'{{ {SKY_UV_RUN_CMD} '
@@ -264,18 +297,23 @@ RAY_INSTALLATION_COMMANDS = (

 # Copy SkyPilot templates from the installed wheel to ~/sky_templates.
 # This must run after the skypilot wheel is installed.
+# Note: We remove ~/sky_templates first to avoid import conflicts where Python
+# would import from ~/sky_templates instead of site-packages (because
+# sky_templates itself is a package), leading to src == dst error when
+# launching on an existing cluster.
 COPY_SKYPILOT_TEMPLATES_COMMANDS = (
+    f'rm -rf {SKY_TEMPLATES_DIRECTORY}; '
     f'{ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
     f'{SKY_PYTHON_CMD} -c \''
     'import sky_templates, shutil, os; '
     'src = os.path.dirname(sky_templates.__file__); '
-    'dst = os.path.expanduser(\"
+    f'dst = os.path.expanduser(\"{SKY_TEMPLATES_DIRECTORY}\"); '
     'print(f\"Copying templates from {src} to {dst}...\"); '
-    'shutil.copytree(src, dst
+    'shutil.copytree(src, dst); '
     'print(f\"Templates copied successfully\")\'; '
     # Make scripts executable.
-    'find
-    '-exec chmod +x {}
+    f'find {SKY_TEMPLATES_DIRECTORY} -type f ! -name "*.py" ! -name "*.md" '
+    '-exec chmod +x {} + ; ')

 SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
     f'{SKY_UV_INSTALL_CMD};'
@@ -438,6 +476,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
     ('gcp', 'enable_gvnic'),
     ('gcp', 'enable_gpu_direct'),
     ('gcp', 'placement_policy'),
+    ('vast', 'datacenter_only'),
     ('active_workspace',),
 ]
 # When overriding the SkyPilot configs on the API server with the client one,
@@ -498,6 +537,9 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
 ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
 OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
 IS_SKYPILOT_SERVE_CONTROLLER = 'IS_SKYPILOT_SERVE_CONTROLLER'
+# Environment variable that is set to 'true' if rolling update strategy is
+# enabled for the API server deployment.
+SKYPILOT_ROLLING_UPDATE_ENABLED = 'SKYPILOT_ROLLING_UPDATE_ENABLED'

 SERVE_OVERRIDE_CONCURRENT_LAUNCHES = (
     f'{SKYPILOT_ENV_VAR_PREFIX}SERVE_OVERRIDE_CONCURRENT_LAUNCHES')
@@ -532,7 +574,7 @@ CATALOG_SCHEMA_VERSION = 'v8'
 CATALOG_DIR = '~/.sky/catalogs'
 ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
               'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
-              'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
+              'paperspace', 'primeintellect', 'do', 'nebius', 'ssh', 'slurm',
               'hyperbolic', 'seeweb', 'shadeform')
 # END constants used for service catalog.
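
The sys.path hazard that `env -u PYTHONPATH -C $HOME` guards against is easy to demonstrate (generic sketch):

    import subprocess

    # Python puts the script directory (or '' / CWD for -c and -m) at the
    # front of sys.path, so packages lying in the working directory shadow
    # site-packages. Prefixing commands with 'env -u PYTHONPATH -C $HOME'
    # clears both injection vectors (env -C requires GNU coreutils >= 8.28).
    out = subprocess.run(['python3', '-c', 'import sys; print(sys.path[:2])'],
                         capture_output=True, text=True, check=True)
    print(out.stdout)
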
sky/skylet/events.py
CHANGED
@@ -236,7 +236,7 @@ class AutostopEvent(SkyletEvent):
                 RAY_PROVISIONER_SKYPILOT_TERMINATOR):
             logger.info('Using new provisioner to stop the cluster.')
             self._stop_cluster_with_new_provisioner(autostop_config, config,
-                                                    provider_name)
+                                                    provider_name, cloud)
             return
         logger.info('Not using new provisioner to stop the cluster. '
                     f'Cloud of this cluster: {provider_name}')
@@ -314,7 +314,8 @@ class AutostopEvent(SkyletEvent):
         raise NotImplementedError

     def _stop_cluster_with_new_provisioner(self, autostop_config,
-                                           cluster_config, provider_name
+                                           cluster_config, provider_name,
+                                           cloud):
         # pylint: disable=import-outside-toplevel
         from sky import provision as provision_lib
         autostop_lib.set_autostopping_started()
@@ -334,8 +335,13 @@ class AutostopEvent(SkyletEvent):

         # Stop the ray autoscaler to avoid scaling up, during
         # stopping/terminating of the cluster.
-
-
+        if not cloud.uses_ray():
+            logger.info('Skipping ray stop as cloud does not use Ray.')
+        else:
+            logger.info('Stopping the ray cluster.')
+            subprocess.run(f'{constants.SKY_RAY_CMD} stop',
+                           shell=True,
+                           check=True)

         operation_fn = provision_lib.stop_instances
         if autostop_config.down:

sky/skylet/executor/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Task Executors"""

sky/skylet/executor/slurm.py
ADDED
@@ -0,0 +1,187 @@
+"""Slurm distributed task executor for SkyPilot.
+
+This module is invoked on each Slurm compute node via:
+    srun python -m sky.skylet.executor.slurm --script=... --log-dir=...
+"""
+import argparse
+import json
+import os
+import pathlib
+import socket
+import subprocess
+import sys
+import time
+
+import colorama
+
+from sky.skylet.log_lib import run_bash_command_with_log
+
+
+def _get_ip_address() -> str:
+    """Get the IP address of the current node."""
+    # Use socket.gethostbyname to be consistent with _get_job_node_ips(),
+    # which resolves hostnames the same way. Using `hostname -I` can return
+    # Docker bridge IPs (172.17.x.x) first, causing IP mismatch errors.
+    return socket.gethostbyname(socket.gethostname())
+
+
+def _get_job_node_ips() -> str:
+    """Get IPs of all nodes in the current Slurm job."""
+    nodelist = os.environ.get('SLURM_JOB_NODELIST', '')
+    assert nodelist, 'SLURM_JOB_NODELIST is not set'
+
+    # Expand compressed nodelist (e.g., "node[1-3,5]"
+    # -> "node1\nnode2\nnode3\nnode5")
+    result = subprocess.run(['scontrol', 'show', 'hostnames', nodelist],
+                            capture_output=True,
+                            text=True,
+                            check=False)
+    if result.returncode != 0:
+        raise RuntimeError(f'Failed to get hostnames for: {nodelist}')
+
+    hostnames = result.stdout.strip().split('\n')
+    ips = []
+    for hostname in hostnames:
+        try:
+            ip = socket.gethostbyname(hostname)
+            ips.append(ip)
+        except socket.gaierror as e:
+            raise RuntimeError('Failed to get IP for hostname: '
+                               f'{hostname}') from e
+
+    return '\n'.join(ips)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='SkyPilot Slurm task runner for distributed execution')
+    parser.add_argument('--script', help='User script (inline, shell-quoted)')
+    parser.add_argument('--script-path',
+                        help='Path to script file (if too long for inline)')
+    parser.add_argument('--env-vars',
+                        default='{}',
+                        help='JSON-encoded environment variables')
+    parser.add_argument('--log-dir',
+                        required=True,
+                        help='Directory for log files')
+    parser.add_argument('--cluster-num-nodes',
+                        type=int,
+                        required=True,
+                        help='Total number of nodes in the cluster')
+    parser.add_argument('--cluster-ips',
+                        required=True,
+                        help='Comma-separated list of cluster node IPs')
+    parser.add_argument('--task-name',
+                        default=None,
+                        help='Task name for single-node log prefix')
+    parser.add_argument(
+        '--is-setup',
+        action='store_true',
+        help=
+        'Whether this is a setup command (affects logging prefix and filename)')
+    parser.add_argument('--alloc-signal-file',
+                        help='Path to allocation signal file')
+    parser.add_argument('--setup-done-signal-file',
+                        help='Path to setup-done signal file')
+    args = parser.parse_args()
+
+    assert args.script is not None or args.script_path is not None, (
+        'Either '
+        '--script or --script-path must be provided')
+
+    # Task rank, different from index of the node in the cluster.
+    rank = int(os.environ['SLURM_PROCID'])
+    num_nodes = int(os.environ.get('SLURM_NNODES', 1))
+    is_single_node_cluster = (args.cluster_num_nodes == 1)
+
+    # Determine node index from IP (like Ray's cluster_ips_to_node_id)
+    cluster_ips = args.cluster_ips.split(',')
+    ip_addr = _get_ip_address()
+    try:
+        node_idx = cluster_ips.index(ip_addr)
+    except ValueError as e:
+        raise RuntimeError(f'IP address {ip_addr} not found in '
+                           f'cluster IPs: {cluster_ips}') from e
+    node_name = 'head' if node_idx == 0 else f'worker{node_idx}'
+
+    # Log files are written to a shared filesystem, so each node must use a
+    # unique filename to avoid collisions.
+    if args.is_setup:
+        # TODO(kevin): This is inconsistent with other clouds, where it is
+        # simply called 'setup.log'. On Slurm that is obviously not possible,
+        # since the ~/sky_logs directory is shared by all nodes, so
+        # 'setup.log' will be overwritten by other nodes.
+        # Perhaps we should apply this naming convention to other clouds.
+        log_filename = f'setup-{node_name}.log'
+    elif is_single_node_cluster:
+        log_filename = 'run.log'
+    else:
+        log_filename = f'{rank}-{node_name}.log'
+    log_path = os.path.join(args.log_dir, log_filename)
+
+    if args.script_path:
+        with open(args.script_path, 'r', encoding='utf-8') as f:
+            script = f.read()
+    else:
+        script = args.script
+
+    # Parse env vars and add SKYPILOT environment variables
+    env_vars = json.loads(args.env_vars)
+    if not args.is_setup:
+        # For setup, env vars are set in CloudVmRayBackend._setup.
+        env_vars['SKYPILOT_NODE_RANK'] = str(rank)
+        env_vars['SKYPILOT_NUM_NODES'] = str(num_nodes)
+        env_vars['SKYPILOT_NODE_IPS'] = _get_job_node_ips()
+
+    # Signal file coordination for setup/run synchronization
+    # Rank 0 touches the allocation signal to indicate resources acquired
+    if args.alloc_signal_file is not None and rank == 0:
+        pathlib.Path(args.alloc_signal_file).touch()
+
+    # Wait for setup to complete.
+    while args.setup_done_signal_file is not None and not os.path.exists(
+            args.setup_done_signal_file):
+        time.sleep(0.1)
+
+    # Build log prefix
+    # For setup on head: (setup pid={pid})
+    # For setup on workers: (setup pid={pid}, ip=1.2.3.4)
+    # For single-node cluster: (task_name, pid={pid})
+    # For multi-node on head: (head, rank=0, pid={pid})
+    # For multi-node on workers: (worker1, rank=1, pid={pid}, ip=1.2.3.4)
+    # The {pid} placeholder will be replaced by run_with_log
+    if args.is_setup:
+        # Setup prefix: head (node_idx=0) shows no IP, workers show IP
+        if node_idx == 0:
+            prefix = (f'{colorama.Fore.CYAN}(setup pid={{pid}})'
+                      f'{colorama.Style.RESET_ALL} ')
+        else:
+            prefix = (f'{colorama.Fore.CYAN}(setup pid={{pid}}, ip={ip_addr})'
+                      f'{colorama.Style.RESET_ALL} ')
+    elif is_single_node_cluster:
+        # Single-node cluster: use task name
+        name_str = args.task_name if args.task_name else 'task'
+        prefix = (f'{colorama.Fore.CYAN}({name_str}, pid={{pid}})'
+                  f'{colorama.Style.RESET_ALL} ')
+    else:
+        # Multi-node cluster: head (node_idx=0) shows no IP, workers show IP
+        if node_idx == 0:
+            prefix = (
+                f'{colorama.Fore.CYAN}({node_name}, rank={rank}, pid={{pid}})'
+                f'{colorama.Style.RESET_ALL} ')
+        else:
+            prefix = (f'{colorama.Fore.CYAN}'
+                      f'({node_name}, rank={rank}, pid={{pid}}, ip={ip_addr})'
+                      f'{colorama.Style.RESET_ALL} ')
+
+    returncode = run_bash_command_with_log(script,
+                                           log_path,
+                                           env_vars=env_vars,
+                                           stream_logs=True,
+                                           streaming_prefix=prefix)
+
+    sys.exit(returncode)
+
+
+if __name__ == '__main__':
+    main()