skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/skylet/constants.py
CHANGED
@@ -1,10 +1,15 @@
|
|
1
1
|
"""Constants for SkyPilot."""
|
2
|
+
from typing import List, Tuple
|
3
|
+
|
2
4
|
from packaging import version
|
3
5
|
|
4
6
|
import sky
|
7
|
+
from sky.setup_files import dependencies
|
5
8
|
|
6
9
|
SKY_LOGS_DIRECTORY = '~/sky_logs'
|
7
10
|
SKY_REMOTE_WORKDIR = '~/sky_workdir'
|
11
|
+
SKY_IGNORE_FILE = '.skyignore'
|
12
|
+
GIT_IGNORE_FILE = '.gitignore'
|
8
13
|
|
9
14
|
# Default Ray port is 6379. Default Ray dashboard port is 8265.
|
10
15
|
# Default Ray tempdir is /tmp/ray.
|
@@ -35,32 +40,47 @@ SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
|
|
35
40
|
'which python3')
|
36
41
|
# Python executable, e.g., /opt/conda/bin/python3
|
37
42
|
SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
|
43
|
+
# Prefer SKY_UV_PIP_CMD, which is faster.
|
44
|
+
# TODO(cooperc): remove remaining usage (GCP TPU setup).
|
38
45
|
SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
|
39
46
|
# Ray executable, e.g., /opt/conda/bin/ray
|
40
47
|
# We need to add SKY_PYTHON_CMD before ray executable because:
|
41
48
|
# The ray executable is a python script with a header like:
|
42
49
|
# #!/opt/conda/bin/python3
|
43
|
-
# When we create the skypilot-runtime venv, the previously installed ray
|
44
|
-
# executable will be reused (due to --system-site-packages), and that will cause
|
45
|
-
# running ray CLI commands to use the wrong python executable.
|
46
50
|
SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
|
47
51
|
f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
|
48
52
|
# Separate env for SkyPilot runtime dependencies.
|
49
53
|
SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
|
50
54
|
SKY_REMOTE_PYTHON_ENV = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
|
51
55
|
ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
|
56
|
+
# uv is used for venv and pip, much faster than python implementations.
|
57
|
+
SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
|
58
|
+
SKY_UV_CMD = f'UV_SYSTEM_PYTHON=false {SKY_UV_INSTALL_DIR}/uv'
|
59
|
+
# This won't reinstall uv if it's already installed, so it's safe to re-run.
|
60
|
+
SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
|
61
|
+
'curl -LsSf https://astral.sh/uv/install.sh '
|
62
|
+
f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
|
63
|
+
SKY_UV_PIP_CMD = f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip'
|
64
|
+
# Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH to deactivate the
|
65
|
+
# environment. `deactivate` command does not work when conda is used.
|
66
|
+
DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
|
67
|
+
'export PATH='
|
68
|
+
f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||")')
|
69
|
+
|
70
|
+
# Prefix for SkyPilot environment variables
|
71
|
+
SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
|
52
72
|
|
53
73
|
# The name for the environment variable that stores the unique ID of the
|
54
74
|
# current task. This will stay the same across multiple recoveries of the
|
55
75
|
# same managed task.
|
56
|
-
TASK_ID_ENV_VAR = '
|
76
|
+
TASK_ID_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_ID'
|
57
77
|
# This environment variable stores a '\n'-separated list of task IDs that
|
58
78
|
# are within the same managed job (DAG). This can be used by the user to
|
59
79
|
# retrieve the task IDs of any tasks that are within the same managed job.
|
60
80
|
# This environment variable is pre-assigned before any task starts
|
61
81
|
# running within the same job, and will remain constant throughout the
|
62
82
|
# lifetime of the job.
|
63
|
-
TASK_ID_LIST_ENV_VAR = '
|
83
|
+
TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
64
84
|
|
65
85
|
# The version of skylet. MUST bump this version whenever we need the skylet to
|
66
86
|
# be restarted on existing clusters updated with the new version of SkyPilot,
|
@@ -69,11 +89,11 @@ TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'
|
|
69
89
|
# cluster yaml is updated.
|
70
90
|
#
|
71
91
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
72
|
-
SKYLET_VERSION = '
|
92
|
+
SKYLET_VERSION = '12'
|
73
93
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
74
94
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
75
95
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
76
|
-
SKYLET_LIB_VERSION =
|
96
|
+
SKYLET_LIB_VERSION = 2
|
77
97
|
SKYLET_VERSION_FILE = '~/.sky/skylet_version'
|
78
98
|
|
79
99
|
# `sky jobs dashboard`-related
|
@@ -84,15 +104,37 @@ SPOT_DASHBOARD_REMOTE_PORT = 5000
|
|
84
104
|
# Docker default options
|
85
105
|
DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
|
86
106
|
DEFAULT_DOCKER_PORT = 10022
|
87
|
-
DOCKER_USERNAME_ENV_VAR = '
|
88
|
-
DOCKER_PASSWORD_ENV_VAR = '
|
89
|
-
DOCKER_SERVER_ENV_VAR = '
|
107
|
+
DOCKER_USERNAME_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}DOCKER_USERNAME'
|
108
|
+
DOCKER_PASSWORD_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}DOCKER_PASSWORD'
|
109
|
+
DOCKER_SERVER_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}DOCKER_SERVER'
|
90
110
|
DOCKER_LOGIN_ENV_VARS = {
|
91
111
|
DOCKER_USERNAME_ENV_VAR,
|
92
112
|
DOCKER_PASSWORD_ENV_VAR,
|
93
113
|
DOCKER_SERVER_ENV_VAR,
|
94
114
|
}
|
95
115
|
|
116
|
+
RUNPOD_DOCKER_USERNAME_ENV_VAR = 'SKYPILOT_RUNPOD_DOCKER_USERNAME'
|
117
|
+
|
118
|
+
# Commands for disabling GPU ECC, which can improve the performance of the GPU
|
119
|
+
# for some workloads by 30%. This will only be applied when a user specifies
|
120
|
+
# `nvidia_gpus.disable_ecc: true` in ~/.sky/config.yaml.
|
121
|
+
# Running this command will reboot the machine, introducing overhead for
|
122
|
+
# provisioning the machine.
|
123
|
+
# https://portal.nutanix.com/page/documents/kbs/details?targetId=kA00e000000LKjOCAW
|
124
|
+
DISABLE_GPU_ECC_COMMAND = (
|
125
|
+
# Check if the GPU ECC is enabled. We use `sudo which` to check nvidia-smi
|
126
|
+
# because in some environments, nvidia-smi is not in path for sudo and we
|
127
|
+
# should skip disabling ECC in this case.
|
128
|
+
'sudo which nvidia-smi && echo "Checking Nvidia ECC Mode" && '
|
129
|
+
'out=$(nvidia-smi -q | grep "ECC Mode" -A2) && '
|
130
|
+
'echo "$out" && echo "$out" | grep Current | grep Enabled && '
|
131
|
+
'echo "Disabling Nvidia ECC" && '
|
132
|
+
# Disable the GPU ECC.
|
133
|
+
'sudo nvidia-smi -e 0 && '
|
134
|
+
# Reboot the machine to apply the changes.
|
135
|
+
'{ sudo reboot || echo "Failed to reboot. ECC mode may not be disabled"; } '
|
136
|
+
'|| true; ')
|
137
|
+
|
96
138
|
# Install conda on the remote cluster if it is not already installed.
|
97
139
|
# We use conda with python 3.10 to be consistent across multiple clouds with
|
98
140
|
# best effort.
|
@@ -101,40 +143,51 @@ DOCKER_LOGIN_ENV_VARS = {
|
|
101
143
|
# AWS's Deep Learning AMI's default conda environment.
|
102
144
|
CONDA_INSTALLATION_COMMANDS = (
|
103
145
|
'which conda > /dev/null 2>&1 || '
|
104
|
-
'{
|
105
|
-
'
|
146
|
+
'{ '
|
147
|
+
'curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long
|
148
|
+
# We do not use && for installation of conda and the following init commands
|
149
|
+
# because for some images, conda is already installed, but not initialized.
|
150
|
+
# In this case, we need to initialize conda and set auto_activate_base to
|
151
|
+
# true.
|
152
|
+
'{ bash Miniconda3-Linux-x86_64.sh -b; '
|
106
153
|
'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
|
107
|
-
|
108
|
-
|
154
|
+
# Caller should replace {conda_auto_activate} with either true or false.
|
155
|
+
'conda config --set auto_activate_base {conda_auto_activate} && '
|
156
|
+
'conda activate base; }; '
|
157
|
+
'}; '
|
109
158
|
'grep "# >>> conda initialize >>>" ~/.bashrc || '
|
110
159
|
'{ conda init && source ~/.bashrc; };'
|
111
|
-
#
|
112
|
-
|
113
|
-
# We don't use a separate conda env for SkyPilot dependencies because it is
|
114
|
-
# costly to create a new conda env, and venv should be a lightweight and
|
115
|
-
# faster alternative when the python version satisfies the requirement.
|
116
|
-
'[[ $(python3 --version | cut -d " " -f 2 | cut -d "." -f 2) -ge 12 ]] && '
|
117
|
-
f'echo "Creating conda env with Python 3.10" && '
|
118
|
-
f'conda create -y -n {SKY_REMOTE_PYTHON_ENV_NAME} python=3.10 && '
|
119
|
-
f'conda activate {SKY_REMOTE_PYTHON_ENV_NAME};'
|
160
|
+
# Install uv for venv management and pip installation.
|
161
|
+
f'{SKY_UV_INSTALL_CMD};'
|
120
162
|
# Create a separate conda environment for SkyPilot dependencies.
|
121
163
|
f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
|
122
|
-
|
123
|
-
|
164
|
+
# Do NOT use --system-site-packages here, because if users upgrade any
|
165
|
+
# packages in the base env, they interfere with skypilot dependencies.
|
166
|
+
# Reference: https://github.com/skypilot-org/skypilot/issues/4097
|
167
|
+
# --seed will include pip and setuptools, which are present in venvs created
|
168
|
+
# with python -m venv.
|
169
|
+
# --python 3.10 will ensure the specific python version is downloaded
|
170
|
+
# and installed in the venv. SkyPilot requires Python<3.12, and 3.10 is
|
171
|
+
# preferred. We have to always pass in `--python` to avoid the issue when a
|
172
|
+
# user has `.python_version` file in their home directory, which will cause
|
173
|
+
# uv to use the python version specified in the `.python_version` file.
|
174
|
+
# TODO(zhwu): consider adding --python-preference only-managed to avoid
|
175
|
+
# using the system python, if a user reports such an issue.
|
176
|
+
f'{SKY_UV_CMD} venv --seed {SKY_REMOTE_PYTHON_ENV} --python 3.10;'
|
177
|
+
f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE};'
|
124
178
|
)
|
125
179
|
|
126
180
|
_sky_version = str(version.parse(sky.__version__))
|
127
181
|
RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status'
|
128
|
-
|
129
|
-
|
130
|
-
# backend_utils.write_cluster_config.
|
131
|
-
RAY_SKYPILOT_INSTALLATION_COMMANDS = (
|
182
|
+
RAY_INSTALLATION_COMMANDS = (
|
183
|
+
f'{SKY_UV_INSTALL_CMD};'
|
132
184
|
'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;'
|
133
|
-
# Disable the pip version check to avoid the warning message, which makes
|
134
|
-
# the output hard to read.
|
135
|
-
'export PIP_DISABLE_PIP_VERSION_CHECK=1;'
|
136
185
|
# Print the PATH in provision.log to help debug PATH issues.
|
137
186
|
'echo PATH=$PATH; '
|
187
|
+
# Install setuptools<=69.5.1 to avoid the issue with the latest setuptools
|
188
|
+
# causing the error:
|
189
|
+
# ImportError: cannot import name 'packaging' from 'pkg_resources'
|
190
|
+
f'{SKY_UV_PIP_CMD} install "setuptools<70"; '
|
138
191
|
# Backward compatibility for ray upgrade (#3248): do not upgrade ray if the
|
139
192
|
# ray cluster is already running, to avoid the ray cluster being restarted.
|
140
193
|
#
|
@@ -148,10 +201,10 @@ RAY_SKYPILOT_INSTALLATION_COMMANDS = (
|
|
148
201
|
# latest ray port 6380, but those existing cluster launched before #1790
|
149
202
|
# that has ray cluster on the default port 6379 will be upgraded and
|
150
203
|
# restarted.
|
151
|
-
f'{
|
204
|
+
f'{SKY_UV_PIP_CMD} list | grep "ray " | '
|
152
205
|
f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
|
153
206
|
f'|| {RAY_STATUS} || '
|
154
|
-
f'{
|
207
|
+
f'{SKY_UV_PIP_CMD} install -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long
|
155
208
|
# In some envs, e.g. pip does not have permission to write under /opt/conda
|
156
209
|
# ray package will be installed under ~/.local/bin. If the user's PATH does
|
157
210
|
# not include ~/.local/bin (the pip install will have the output: `WARNING:
|
@@ -164,35 +217,54 @@ RAY_SKYPILOT_INSTALLATION_COMMANDS = (
|
|
164
217
|
# Writes ray path to file if it does not exist or the file is empty.
|
165
218
|
f'[ -s {SKY_RAY_PATH_FILE} ] || '
|
166
219
|
f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && '
|
167
|
-
f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; '
|
168
|
-
|
169
|
-
|
220
|
+
f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
|
221
|
+
|
222
|
+
SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
|
223
|
+
f'{SKY_UV_INSTALL_CMD};'
|
224
|
+
f'{{ {SKY_UV_PIP_CMD} list | grep "skypilot " && '
|
170
225
|
'[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long
|
171
|
-
f'{{ {
|
172
|
-
|
226
|
+
f'{{ {SKY_UV_PIP_CMD} uninstall skypilot; '
|
227
|
+
# uv cannot install azure-cli normally, since it depends on pre-release
|
228
|
+
# packages. Manually install azure-cli with the --prerelease=allow flag
|
229
|
+
# first. This will allow skypilot to successfully install. See
|
230
|
+
# https://docs.astral.sh/uv/pip/compatibility/#pre-release-compatibility.
|
231
|
+
# We don't want to use --prerelease=allow for all packages, because it will
|
232
|
+
# cause uv to use pre-releases for some other packages that have sufficient
|
233
|
+
# stable releases.
|
234
|
+
'if [ "{cloud}" = "azure" ]; then '
|
235
|
+
f'{SKY_UV_PIP_CMD} install --prerelease=allow "{dependencies.AZURE_CLI}";'
|
236
|
+
'fi;'
|
237
|
+
# Install skypilot from wheel
|
238
|
+
f'{SKY_UV_PIP_CMD} install "$(echo ~/.sky/wheels/{{sky_wheel_hash}}/'
|
173
239
|
f'skypilot-{_sky_version}*.whl)[{{cloud}}, remote]" && '
|
174
240
|
'echo "{sky_wheel_hash}" > ~/.sky/wheels/current_sky_wheel_hash || '
|
175
|
-
'exit 1; }; '
|
176
|
-
# END SkyPilot package check and installation
|
241
|
+
'exit 1; }; ')
|
177
242
|
|
243
|
+
# Install ray and skypilot on the remote cluster if they are not already
|
244
|
+
# installed. {var} will be replaced with the actual value in
|
245
|
+
# backend_utils.write_cluster_config.
|
246
|
+
RAY_SKYPILOT_INSTALLATION_COMMANDS = (
|
247
|
+
f'{RAY_INSTALLATION_COMMANDS} '
|
248
|
+
f'{SKYPILOT_WHEEL_INSTALLATION_COMMANDS} '
|
178
249
|
# Only patch ray when the ray version is the same as the expected version.
|
179
250
|
# The ray installation above can be skipped due to the existing ray cluster
|
180
251
|
# for backward compatibility. In this case, we should not patch the ray
|
181
252
|
# files.
|
182
|
-
f'{
|
183
|
-
f'
|
184
|
-
'
|
253
|
+
f'{SKY_UV_PIP_CMD} list | grep "ray " | '
|
254
|
+
f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null && '
|
255
|
+
f'{{ {SKY_PYTHON_CMD} -c '
|
256
|
+
'"from sky.skylet.ray_patches import patch; patch()" || exit 1; }; ')
|
185
257
|
|
186
258
|
# The name for the environment variable that stores SkyPilot user hash, which
|
187
259
|
# is mainly used to make sure sky commands runs on a VM launched by SkyPilot
|
188
260
|
# will be recognized as the same user (e.g., jobs controller or sky serve
|
189
261
|
# controller).
|
190
|
-
USER_ID_ENV_VAR = '
|
262
|
+
USER_ID_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}USER_ID'
|
191
263
|
|
192
264
|
# The name for the environment variable that stores SkyPilot user name.
|
193
265
|
# Similar to USER_ID_ENV_VAR, this is mainly used to make sure sky commands
|
194
266
|
# runs on a VM launched by SkyPilot will be recognized as the same user.
|
195
|
-
USER_ENV_VAR = '
|
267
|
+
USER_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}USER'
|
196
268
|
|
197
269
|
# In most clouds, cluster names can only contain lowercase letters, numbers
|
198
270
|
# and hyphens. We use this regex to validate the cluster name.
|
@@ -201,13 +273,25 @@ CLUSTER_NAME_VALID_REGEX = '[a-zA-Z]([-_.a-zA-Z0-9]*[a-zA-Z0-9])?'
|
|
201
273
|
# Used for translate local file mounts to cloud storage. Please refer to
|
202
274
|
# sky/execution.py::_maybe_translate_local_file_mounts_and_sync_up for
|
203
275
|
# more details.
|
204
|
-
|
205
|
-
FILE_MOUNTS_BUCKET_NAME = 'skypilot-filemounts-folder-{username}-{id}'
|
206
|
-
FILE_MOUNTS_FILE_ONLY_BUCKET_NAME = 'skypilot-filemounts-files-{username}-{id}'
|
276
|
+
FILE_MOUNTS_BUCKET_NAME = 'skypilot-filemounts-{username}-{user_hash}-{id}'
|
207
277
|
FILE_MOUNTS_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}'
|
208
278
|
FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/sky-{}-filemounts-files'
|
279
|
+
# For the API server, use a temporary directory in the same path as the upload
|
280
|
+
# directory to avoid using a different block device, which may not allow hard
|
281
|
+
# linking. E.g., in our API server deployment on k8s, ~/.sky/ is mounted from a
|
282
|
+
# persistent volume, so any contents in ~/.sky/ cannot be hard linked elsewhere.
|
283
|
+
FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
|
284
|
+
# Base path for two-hop file mounts translation. See
|
285
|
+
# controller_utils.translate_local_file_mounts_to_two_hop().
|
286
|
+
FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
|
287
|
+
|
288
|
+
# Used when managed jobs are created and
|
289
|
+
# files are synced up to the cloud.
|
290
|
+
FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
|
291
|
+
FILE_MOUNTS_SUBPATH = 'job-{run_id}/local-file-mounts/{i}'
|
292
|
+
FILE_MOUNTS_TMP_SUBPATH = 'job-{run_id}/tmp-files'
|
209
293
|
|
210
|
-
# The default idle timeout for SkyPilot controllers. This include
|
294
|
+
# The default idle timeout for SkyPilot controllers. This includes jobs
|
211
295
|
# controller and sky serve controller.
|
212
296
|
# TODO(tian): Refactor to controller_utils. Current blocker: circular import.
|
213
297
|
CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10
|
@@ -220,3 +304,52 @@ CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10
|
|
220
304
|
# Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16
|
221
305
|
# services.
|
222
306
|
CONTROLLER_PROCESS_CPU_DEMAND = 0.25
|
307
|
+
# The log for SkyPilot API server.
|
308
|
+
API_SERVER_LOGS = '~/.sky/api_server/server.log'
|
309
|
+
# The lock for creating the SkyPilot API server.
|
310
|
+
API_SERVER_CREATION_LOCK_PATH = '~/.sky/api_server/.creation.lock'
|
311
|
+
|
312
|
+
# The name for the environment variable that stores the URL of the SkyPilot
|
313
|
+
# API server.
|
314
|
+
SKY_API_SERVER_URL_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}API_SERVER_ENDPOINT'
|
315
|
+
|
316
|
+
# SkyPilot environment variables
|
317
|
+
SKYPILOT_NUM_NODES = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_NODES'
|
318
|
+
SKYPILOT_NODE_IPS = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_IPS'
|
319
|
+
SKYPILOT_NUM_GPUS_PER_NODE = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_GPUS_PER_NODE'
|
320
|
+
SKYPILOT_NODE_RANK = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_RANK'
|
321
|
+
|
322
|
+
# Placeholder for the SSH user in proxy command, replaced when the ssh_user is
|
323
|
+
# known after provisioning.
|
324
|
+
SKY_SSH_USER_PLACEHOLDER = 'skypilot:ssh_user'
|
325
|
+
|
326
|
+
# The keys that can be overridden in the `~/.sky/config.yaml` file. The
|
327
|
+
# overrides are specified in task YAMLs.
|
328
|
+
OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
329
|
+
('docker', 'run_options'),
|
330
|
+
('nvidia_gpus', 'disable_ecc'),
|
331
|
+
('kubernetes', 'pod_config'),
|
332
|
+
('kubernetes', 'provision_timeout'),
|
333
|
+
('gcp', 'managed_instance_group'),
|
334
|
+
]
|
335
|
+
# When overriding the SkyPilot configs on the API server with the client one,
|
336
|
+
# we skip the following keys because they are meant to be client-side configs.
|
337
|
+
SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('admin_policy',),
|
338
|
+
('api_server',),
|
339
|
+
('allowed_clouds',)]
|
340
|
+
|
341
|
+
# Constants for Azure blob storage
|
342
|
+
WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
|
343
|
+
# Observed time for new role assignment to propagate was ~45s
|
344
|
+
WAIT_FOR_STORAGE_ACCOUNT_ROLE_ASSIGNMENT = 180
|
345
|
+
RETRY_INTERVAL_AFTER_ROLE_ASSIGNMENT = 10
|
346
|
+
ROLE_ASSIGNMENT_FAILURE_ERROR_MSG = (
|
347
|
+
'Failed to assign Storage Blob Data Owner role to the '
|
348
|
+
'storage account {storage_account_name}.')
|
349
|
+
|
350
|
+
# The placeholder for the local skypilot config path in file mounts for
|
351
|
+
# controllers.
|
352
|
+
LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
|
353
|
+
|
354
|
+
# Path to the generated cluster config yamls and ssh configs.
|
355
|
+
SKY_USER_FILE_PATH = '~/.sky/generated'
|
sky/skylet/events.py
CHANGED
@@ -12,14 +12,17 @@ import yaml
|
|
12
12
|
from sky import clouds
|
13
13
|
from sky import sky_logging
|
14
14
|
from sky.backends import cloud_vm_ray_backend
|
15
|
-
from sky.
|
15
|
+
from sky.jobs import scheduler as managed_job_scheduler
|
16
|
+
from sky.jobs import state as managed_job_state
|
16
17
|
from sky.jobs import utils as managed_job_utils
|
17
18
|
from sky.serve import serve_utils
|
18
19
|
from sky.skylet import autostop_lib
|
19
20
|
from sky.skylet import constants
|
20
21
|
from sky.skylet import job_lib
|
21
|
-
from sky.
|
22
|
+
from sky.usage import usage_lib
|
23
|
+
from sky.utils import cluster_utils
|
22
24
|
from sky.utils import common_utils
|
25
|
+
from sky.utils import registry
|
23
26
|
from sky.utils import ux_utils
|
24
27
|
|
25
28
|
# Seconds of sleep between the processing of skylet events.
|
@@ -67,12 +70,13 @@ class JobSchedulerEvent(SkyletEvent):
|
|
67
70
|
job_lib.scheduler.schedule_step(force_update_jobs=True)
|
68
71
|
|
69
72
|
|
70
|
-
class
|
71
|
-
"""Skylet event for updating managed
|
73
|
+
class ManagedJobEvent(SkyletEvent):
|
74
|
+
"""Skylet event for updating and scheduling managed jobs."""
|
72
75
|
EVENT_INTERVAL_SECONDS = 300
|
73
76
|
|
74
77
|
def _run(self):
|
75
|
-
managed_job_utils.
|
78
|
+
managed_job_utils.update_managed_jobs_statuses()
|
79
|
+
managed_job_scheduler.maybe_schedule_next_jobs()
|
76
80
|
|
77
81
|
|
78
82
|
class ServiceUpdateEvent(SkyletEvent):
|
@@ -87,6 +91,14 @@ class ServiceUpdateEvent(SkyletEvent):
|
|
87
91
|
serve_utils.update_service_status()
|
88
92
|
|
89
93
|
|
94
|
+
class UsageHeartbeatReportEvent(SkyletEvent):
|
95
|
+
"""Skylet event for reporting usage."""
|
96
|
+
EVENT_INTERVAL_SECONDS = 600
|
97
|
+
|
98
|
+
def _run(self):
|
99
|
+
usage_lib.send_heartbeat(interval_seconds=self.EVENT_INTERVAL_SECONDS)
|
100
|
+
|
101
|
+
|
90
102
|
class AutostopEvent(SkyletEvent):
|
91
103
|
"""Skylet event for autostop.
|
92
104
|
|
@@ -116,7 +128,8 @@ class AutostopEvent(SkyletEvent):
|
|
116
128
|
logger.debug('autostop_config not set. Skipped.')
|
117
129
|
return
|
118
130
|
|
119
|
-
if job_lib.is_cluster_idle()
|
131
|
+
if (job_lib.is_cluster_idle() and
|
132
|
+
not managed_job_state.get_num_alive_jobs()):
|
120
133
|
idle_minutes = (time.time() -
|
121
134
|
autostop_lib.get_last_active_time()) // 60
|
122
135
|
logger.debug(
|
@@ -140,11 +153,10 @@ class AutostopEvent(SkyletEvent):
|
|
140
153
|
autostop_lib.set_autostopping_started()
|
141
154
|
|
142
155
|
config_path = os.path.abspath(
|
143
|
-
os.path.expanduser(
|
144
|
-
cluster_yaml_utils.SKY_CLUSTER_YAML_REMOTE_PATH))
|
156
|
+
os.path.expanduser(cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH))
|
145
157
|
config = common_utils.read_yaml(config_path)
|
146
|
-
provider_name =
|
147
|
-
cloud =
|
158
|
+
provider_name = cluster_utils.get_provider_name(config)
|
159
|
+
cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
|
148
160
|
assert cloud is not None, f'Unknown cloud: {provider_name}'
|
149
161
|
|
150
162
|
if (cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.
|