skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -14,6 +14,7 @@ from sky.backends import backend_utils
|
|
14
14
|
from sky.backends import docker_utils
|
15
15
|
from sky.data import storage as storage_lib
|
16
16
|
from sky.utils import rich_utils
|
17
|
+
from sky.utils import ux_utils
|
17
18
|
|
18
19
|
if typing.TYPE_CHECKING:
|
19
20
|
from sky import resources
|
@@ -130,13 +131,14 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
|
|
130
131
|
pass
|
131
132
|
|
132
133
|
def _provision(
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
134
|
+
self,
|
135
|
+
task: 'task_lib.Task',
|
136
|
+
to_provision: Optional['resources.Resources'],
|
137
|
+
dryrun: bool,
|
138
|
+
stream_logs: bool,
|
139
|
+
cluster_name: str,
|
140
|
+
retry_until_up: bool = False,
|
141
|
+
skip_unnecessary_provisioning: bool = False,
|
140
142
|
) -> Optional[LocalDockerResourceHandle]:
|
141
143
|
"""Builds docker image for the task and returns cluster name as handle.
|
142
144
|
|
@@ -152,6 +154,9 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
|
|
152
154
|
logger.warning(
|
153
155
|
f'Retrying until up is not supported in backend: {self.NAME}. '
|
154
156
|
'Ignored the flag.')
|
157
|
+
if skip_unnecessary_provisioning:
|
158
|
+
logger.warning(f'skip_unnecessary_provisioning is not supported in '
|
159
|
+
f'backend: {self.NAME}. Ignored the flag.')
|
155
160
|
if stream_logs:
|
156
161
|
logger.info(
|
157
162
|
'Streaming build logs is not supported in LocalDockerBackend. '
|
@@ -159,7 +164,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
|
|
159
164
|
handle = LocalDockerResourceHandle(cluster_name)
|
160
165
|
logger.info(f'Building docker image for task {task.name}. '
|
161
166
|
'This might take some time.')
|
162
|
-
with rich_utils.safe_status(
|
167
|
+
with rich_utils.safe_status(
|
168
|
+
ux_utils.spinner_message('Building Docker image')):
|
163
169
|
image_tag, metadata = docker_utils.build_dockerimage_from_task(task)
|
164
170
|
self.images[handle] = (image_tag, metadata)
|
165
171
|
logger.info(f'Image {image_tag} built.')
|
sky/backends/wheel_utils.py
CHANGED
@@ -39,29 +39,30 @@ _WHEEL_PATTERN = (f'{_PACKAGE_WHEEL_NAME}-'
|
|
39
39
|
f'{version.parse(sky.__version__)}-*.whl')
|
40
40
|
|
41
41
|
|
42
|
-
def
|
43
|
-
|
42
|
+
def _remove_stale_wheels(latest_wheel_dir: pathlib.Path) -> None:
|
43
|
+
"""Remove all wheels except the latest one."""
|
44
|
+
for f in WHEEL_DIR.iterdir():
|
45
|
+
if f != latest_wheel_dir:
|
46
|
+
if f.is_dir() and not f.is_symlink():
|
47
|
+
shutil.rmtree(f, ignore_errors=True)
|
48
|
+
|
49
|
+
|
50
|
+
def _get_latest_wheel() -> pathlib.Path:
|
51
|
+
wheel_name = f'**/{_WHEEL_PATTERN}'
|
44
52
|
try:
|
45
53
|
latest_wheel = max(WHEEL_DIR.glob(wheel_name), key=os.path.getctime)
|
46
54
|
except ValueError:
|
47
55
|
raise FileNotFoundError(
|
48
56
|
'Could not find built SkyPilot wheels with glob pattern '
|
49
57
|
f'{wheel_name} under {WHEEL_DIR!r}') from None
|
50
|
-
|
51
|
-
latest_wheel_dir_name = latest_wheel.parent
|
52
|
-
# Cleanup older wheels.
|
53
|
-
for f in WHEEL_DIR.iterdir():
|
54
|
-
if f != latest_wheel_dir_name:
|
55
|
-
if f.is_dir() and not f.is_symlink():
|
56
|
-
shutil.rmtree(f, ignore_errors=True)
|
57
58
|
return latest_wheel
|
58
59
|
|
59
60
|
|
60
|
-
def _build_sky_wheel():
|
61
|
-
"""Build a wheel for SkyPilot."""
|
62
|
-
with tempfile.TemporaryDirectory() as
|
61
|
+
def _build_sky_wheel() -> pathlib.Path:
|
62
|
+
"""Build a wheel for SkyPilot and return the path to the wheel."""
|
63
|
+
with tempfile.TemporaryDirectory() as tmp_dir_str:
|
63
64
|
# prepare files
|
64
|
-
tmp_dir = pathlib.Path(
|
65
|
+
tmp_dir = pathlib.Path(tmp_dir_str)
|
65
66
|
sky_tmp_dir = tmp_dir / 'sky'
|
66
67
|
sky_tmp_dir.mkdir()
|
67
68
|
for item in SKY_PACKAGE_PATH.iterdir():
|
@@ -128,7 +129,12 @@ def _build_sky_wheel():
|
|
128
129
|
|
129
130
|
wheel_dir = WHEEL_DIR / hash_of_latest_wheel
|
130
131
|
wheel_dir.mkdir(parents=True, exist_ok=True)
|
131
|
-
shutil.move
|
132
|
+
# shutil.move will fail when the file already exists and is being
|
133
|
+
# moved across filesystems.
|
134
|
+
if not os.path.exists(
|
135
|
+
os.path.join(wheel_dir, os.path.basename(wheel_path))):
|
136
|
+
shutil.move(str(wheel_path), wheel_dir)
|
137
|
+
return wheel_dir / wheel_path.name
|
132
138
|
|
133
139
|
|
134
140
|
def build_sky_wheel() -> Tuple[pathlib.Path, str]:
|
@@ -147,7 +153,10 @@ def build_sky_wheel() -> Tuple[pathlib.Path, str]:
|
|
147
153
|
if not path.exists():
|
148
154
|
return -1.
|
149
155
|
try:
|
150
|
-
return max(
|
156
|
+
return max(
|
157
|
+
os.path.getmtime(os.path.join(root, f))
|
158
|
+
for root, dirs, files in os.walk(path)
|
159
|
+
for f in (*dirs, *files))
|
151
160
|
except ValueError:
|
152
161
|
return -1.
|
153
162
|
|
@@ -161,13 +170,22 @@ def build_sky_wheel() -> Tuple[pathlib.Path, str]:
|
|
161
170
|
last_modification_time = _get_latest_modification_time(SKY_PACKAGE_PATH)
|
162
171
|
last_wheel_modification_time = _get_latest_modification_time(WHEEL_DIR)
|
163
172
|
|
164
|
-
#
|
165
|
-
|
173
|
+
# Only build wheels if the wheel is outdated or wheel does not exist
|
174
|
+
# for the requested version.
|
175
|
+
if (last_wheel_modification_time < last_modification_time) or not any(
|
176
|
+
WHEEL_DIR.glob(f'**/{_WHEEL_PATTERN}')):
|
166
177
|
if not WHEEL_DIR.exists():
|
167
178
|
WHEEL_DIR.mkdir(parents=True, exist_ok=True)
|
168
|
-
_build_sky_wheel()
|
169
|
-
|
170
|
-
|
179
|
+
latest_wheel = _build_sky_wheel()
|
180
|
+
else:
|
181
|
+
latest_wheel = _get_latest_wheel()
|
182
|
+
|
183
|
+
# We remove all wheels except the latest one for garbage collection.
|
184
|
+
# Otherwise stale wheels will accumulate over time.
|
185
|
+
# TODO(romilb): If the user switches versions every alternate launch,
|
186
|
+
# the wheel will be rebuilt every time. At the risk of adding
|
187
|
+
# complexity, we can consider TTL caching wheels by version here.
|
188
|
+
_remove_stale_wheels(latest_wheel.parent)
|
171
189
|
|
172
190
|
wheel_hash = latest_wheel.parent.name
|
173
191
|
|
sky/benchmark/benchmark_utils.py
CHANGED
@@ -20,10 +20,11 @@ from rich import progress as rich_progress
|
|
20
20
|
|
21
21
|
import sky
|
22
22
|
from sky import backends
|
23
|
+
from sky import clouds
|
23
24
|
from sky import data
|
24
25
|
from sky import global_user_state
|
26
|
+
from sky import optimizer
|
25
27
|
from sky import sky_logging
|
26
|
-
from sky import status_lib
|
27
28
|
from sky.backends import backend_utils
|
28
29
|
from sky.benchmark import benchmark_state
|
29
30
|
from sky.data import storage as storage_lib
|
@@ -33,6 +34,7 @@ from sky.skylet import log_lib
|
|
33
34
|
from sky.utils import common_utils
|
34
35
|
from sky.utils import log_utils
|
35
36
|
from sky.utils import rich_utils
|
37
|
+
from sky.utils import status_lib
|
36
38
|
from sky.utils import subprocess_utils
|
37
39
|
from sky.utils import ux_utils
|
38
40
|
|
@@ -99,7 +101,9 @@ def _get_optimized_resources(
|
|
99
101
|
task = sky.Task()
|
100
102
|
task.set_resources(resources)
|
101
103
|
|
102
|
-
|
104
|
+
# Do not use `sky.optimize` here, as this should be called on the API
|
105
|
+
# server side.
|
106
|
+
dag = optimizer.Optimizer.optimize(dag, quiet=True)
|
103
107
|
task = dag.tasks[0]
|
104
108
|
optimized_resources.append(task.best_resources)
|
105
109
|
return optimized_resources
|
@@ -170,13 +174,19 @@ def _create_benchmark_bucket() -> Tuple[str, str]:
|
|
170
174
|
# Select the bucket type.
|
171
175
|
enabled_clouds = storage_lib.get_cached_enabled_storage_clouds_or_refresh(
|
172
176
|
raise_if_no_cloud_access=True)
|
173
|
-
#
|
174
|
-
|
177
|
+
# Sky Benchmark only supports S3 (see _download_remote_dir and
|
178
|
+
# _delete_remote_dir).
|
179
|
+
enabled_clouds = [
|
180
|
+
cloud for cloud in enabled_clouds if cloud in [str(clouds.AWS())]
|
181
|
+
]
|
182
|
+
assert enabled_clouds, ('No enabled cloud storage found. Sky Benchmark '
|
183
|
+
'requires GCP or AWS to store logs.')
|
175
184
|
bucket_type = data.StoreType.from_cloud(enabled_clouds[0]).value
|
176
185
|
|
177
186
|
# Create a benchmark bucket.
|
178
187
|
logger.info(f'Creating a bucket {bucket_name} to save the benchmark logs.')
|
179
188
|
storage = data.Storage(bucket_name, source=None, persistent=True)
|
189
|
+
storage.construct()
|
180
190
|
storage.add_store(bucket_type)
|
181
191
|
|
182
192
|
# Save the bucket name and type to the config.
|
@@ -242,14 +252,8 @@ def _download_remote_dir(remote_dir: str, local_dir: str,
|
|
242
252
|
stdout=subprocess.DEVNULL,
|
243
253
|
stderr=subprocess.DEVNULL,
|
244
254
|
check=True)
|
245
|
-
elif bucket_type == data.StoreType.GCS:
|
246
|
-
remote_dir = f'gs://{remote_dir}'
|
247
|
-
subprocess.run(['gsutil', '-m', 'cp', '-r', remote_dir, local_dir],
|
248
|
-
stdout=subprocess.DEVNULL,
|
249
|
-
stderr=subprocess.DEVNULL,
|
250
|
-
check=True)
|
251
255
|
else:
|
252
|
-
raise RuntimeError('
|
256
|
+
raise RuntimeError(f'{bucket_type} is not supported yet.')
|
253
257
|
|
254
258
|
|
255
259
|
def _delete_remote_dir(remote_dir: str, bucket_type: data.StoreType) -> None:
|
@@ -260,14 +264,8 @@ def _delete_remote_dir(remote_dir: str, bucket_type: data.StoreType) -> None:
|
|
260
264
|
stdout=subprocess.DEVNULL,
|
261
265
|
stderr=subprocess.DEVNULL,
|
262
266
|
check=True)
|
263
|
-
elif bucket_type == data.StoreType.GCS:
|
264
|
-
remote_dir = f'gs://{remote_dir}'
|
265
|
-
subprocess.run(['gsutil', '-m', 'rm', '-r', remote_dir],
|
266
|
-
stdout=subprocess.DEVNULL,
|
267
|
-
stderr=subprocess.DEVNULL,
|
268
|
-
check=True)
|
269
267
|
else:
|
270
|
-
raise RuntimeError('
|
268
|
+
raise RuntimeError(f'{bucket_type} is not supported yet.')
|
271
269
|
|
272
270
|
|
273
271
|
def _read_timestamp(path: str) -> float:
|
@@ -541,7 +539,7 @@ def launch_benchmark_clusters(benchmark: str, clusters: List[str],
|
|
541
539
|
for yaml_fd, cluster in zip(yaml_fds, clusters)]
|
542
540
|
|
543
541
|
# Save stdout/stderr from cluster launches.
|
544
|
-
run_timestamp =
|
542
|
+
run_timestamp = sky_logging.get_run_timestamp()
|
545
543
|
log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp)
|
546
544
|
log_dir = os.path.expanduser(log_dir)
|
547
545
|
logger.info(
|
@@ -601,7 +599,8 @@ def update_benchmark_state(benchmark: str) -> None:
|
|
601
599
|
remote_dir = os.path.join(bucket_name, benchmark)
|
602
600
|
local_dir = os.path.join(_SKY_LOCAL_BENCHMARK_DIR, benchmark)
|
603
601
|
os.makedirs(local_dir, exist_ok=True)
|
604
|
-
with rich_utils.safe_status(
|
602
|
+
with rich_utils.safe_status(
|
603
|
+
ux_utils.spinner_message('Downloading benchmark logs')):
|
605
604
|
_download_remote_dir(remote_dir, local_dir, bucket_type)
|
606
605
|
|
607
606
|
# Update the benchmark results in parallel.
|
@@ -610,9 +609,9 @@ def update_benchmark_state(benchmark: str) -> None:
|
|
610
609
|
progress = rich_progress.Progress(transient=True,
|
611
610
|
redirect_stdout=False,
|
612
611
|
redirect_stderr=False)
|
613
|
-
task = progress.add_task(
|
614
|
-
f'
|
615
|
-
|
612
|
+
task = progress.add_task(ux_utils.spinner_message(
|
613
|
+
f'Processing {num_candidates} benchmark result{plural}'),
|
614
|
+
total=num_candidates)
|
616
615
|
|
617
616
|
def _update_with_progress_bar(arg: Any) -> None:
|
618
617
|
message = _update_benchmark_result(arg)
|
sky/check.py
CHANGED
@@ -1,26 +1,33 @@
|
|
1
1
|
"""Credential checks: check cloud credentials and enable clouds."""
|
2
|
+
import os
|
2
3
|
import traceback
|
3
4
|
from types import ModuleType
|
4
5
|
from typing import Dict, Iterable, List, Optional, Tuple, Union
|
5
6
|
|
6
7
|
import click
|
7
8
|
import colorama
|
8
|
-
import rich
|
9
9
|
|
10
10
|
from sky import clouds as sky_clouds
|
11
11
|
from sky import exceptions
|
12
12
|
from sky import global_user_state
|
13
13
|
from sky import skypilot_config
|
14
14
|
from sky.adaptors import cloudflare
|
15
|
+
from sky.utils import registry
|
16
|
+
from sky.utils import rich_utils
|
15
17
|
from sky.utils import ux_utils
|
16
18
|
|
19
|
+
CHECK_MARK_EMOJI = '\U00002714' # Heavy check mark unicode
|
20
|
+
PARTY_POPPER_EMOJI = '\U0001F389' # Party popper unicode
|
21
|
+
|
17
22
|
|
18
23
|
def check(
|
19
24
|
quiet: bool = False,
|
20
25
|
verbose: bool = False,
|
21
26
|
clouds: Optional[Iterable[str]] = None,
|
22
|
-
) ->
|
23
|
-
echo = (lambda *_args, **_kwargs: None
|
27
|
+
) -> List[str]:
|
28
|
+
echo = (lambda *_args, **_kwargs: None
|
29
|
+
) if quiet else lambda *args, **kwargs: click.echo(
|
30
|
+
*args, **kwargs, color=True)
|
24
31
|
echo('Checking credentials to enable clouds for SkyPilot.')
|
25
32
|
enabled_clouds = []
|
26
33
|
disabled_clouds = []
|
@@ -29,14 +36,13 @@ def check(
|
|
29
36
|
cloud_tuple: Tuple[str, Union[sky_clouds.Cloud,
|
30
37
|
ModuleType]]) -> None:
|
31
38
|
cloud_repr, cloud = cloud_tuple
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
echo('\r', nl=False)
|
39
|
+
with rich_utils.safe_status(f'Checking {cloud_repr}...'):
|
40
|
+
try:
|
41
|
+
ok, reason = cloud.check_credentials()
|
42
|
+
except Exception: # pylint: disable=broad-except
|
43
|
+
# Catch all exceptions to prevent a single cloud from blocking
|
44
|
+
# the check for other clouds.
|
45
|
+
ok, reason = False, traceback.format_exc()
|
40
46
|
status_msg = 'enabled' if ok else 'disabled'
|
41
47
|
styles = {'fg': 'green', 'bold': False} if ok else {'dim': True}
|
42
48
|
echo(' ' + click.style(f'{cloud_repr}: {status_msg}', **styles) +
|
@@ -44,7 +50,7 @@ def check(
|
|
44
50
|
if ok:
|
45
51
|
enabled_clouds.append(cloud_repr)
|
46
52
|
if verbose and cloud is not cloudflare:
|
47
|
-
activated_account = cloud.
|
53
|
+
activated_account = cloud.get_active_user_identity_str()
|
48
54
|
if activated_account is not None:
|
49
55
|
echo(f' Activated account: {activated_account}')
|
50
56
|
if reason is not None:
|
@@ -60,12 +66,12 @@ def check(
|
|
60
66
|
if cloud_name.lower().startswith('cloudflare'):
|
61
67
|
return cloudflare.SKY_CHECK_NAME, cloudflare
|
62
68
|
else:
|
63
|
-
cloud_obj =
|
69
|
+
cloud_obj = registry.CLOUD_REGISTRY.from_str(cloud_name)
|
64
70
|
assert cloud_obj is not None, f'Cloud {cloud_name!r} not found'
|
65
71
|
return repr(cloud_obj), cloud_obj
|
66
72
|
|
67
73
|
def get_all_clouds():
|
68
|
-
return tuple([repr(c) for c in
|
74
|
+
return tuple([repr(c) for c in registry.CLOUD_REGISTRY.values()] +
|
69
75
|
[cloudflare.SKY_CHECK_NAME])
|
70
76
|
|
71
77
|
if clouds is not None:
|
@@ -77,8 +83,8 @@ def check(
|
|
77
83
|
# Use allowed_clouds from config if it exists, otherwise check all clouds.
|
78
84
|
# Also validate names with get_cloud_tuple.
|
79
85
|
config_allowed_cloud_names = [
|
80
|
-
get_cloud_tuple(c)[0] for c in skypilot_config.get_nested(
|
81
|
-
|
86
|
+
get_cloud_tuple(c)[0] for c in skypilot_config.get_nested((
|
87
|
+
'allowed_clouds',), get_all_clouds())
|
82
88
|
]
|
83
89
|
# Use disallowed_cloud_names for logging the clouds that will be disabled
|
84
90
|
# because they are not included in allowed_clouds in config.yaml.
|
@@ -93,7 +99,7 @@ def check(
|
|
93
99
|
for cloud_tuple in sorted(clouds_to_check):
|
94
100
|
check_one_cloud(cloud_tuple)
|
95
101
|
|
96
|
-
# Cloudflare is not a real cloud in
|
102
|
+
# Cloudflare is not a real cloud in registry.CLOUD_REGISTRY, and should
|
97
103
|
# not be inserted into the DB (otherwise `sky launch` and other code would
|
98
104
|
# error out when it's trying to look it up in the registry).
|
99
105
|
enabled_clouds_set = {
|
@@ -126,7 +132,7 @@ def check(
|
|
126
132
|
'\nNote: The following clouds were disabled because they were not '
|
127
133
|
'included in allowed_clouds in ~/.sky/config.yaml: '
|
128
134
|
f'{", ".join([c for c in disallowed_cloud_names])}')
|
129
|
-
if
|
135
|
+
if not all_enabled_clouds:
|
130
136
|
echo(
|
131
137
|
click.style(
|
132
138
|
'No cloud is enabled. SkyPilot will not be able to run any '
|
@@ -145,7 +151,7 @@ def check(
|
|
145
151
|
dim=True) + click.style(f'sky check{clouds_arg}', bold=True) +
|
146
152
|
'\n' + click.style(
|
147
153
|
'If any problems remain, refer to detailed docs at: '
|
148
|
-
'https://skypilot.
|
154
|
+
'https://docs.skypilot.co/en/latest/getting-started/installation.html', # pylint: disable=line-too-long
|
149
155
|
dim=True))
|
150
156
|
|
151
157
|
if disallowed_clouds_hint:
|
@@ -153,10 +159,14 @@ def check(
|
|
153
159
|
|
154
160
|
# Pretty print for UX.
|
155
161
|
if not quiet:
|
156
|
-
enabled_clouds_str = '\n
|
157
|
-
|
158
|
-
|
159
|
-
|
162
|
+
enabled_clouds_str = '\n ' + '\n '.join([
|
163
|
+
_format_enabled_cloud(cloud)
|
164
|
+
for cloud in sorted(all_enabled_clouds)
|
165
|
+
])
|
166
|
+
echo(f'\n{colorama.Fore.GREEN}{PARTY_POPPER_EMOJI} '
|
167
|
+
f'Enabled clouds {PARTY_POPPER_EMOJI}'
|
168
|
+
f'{colorama.Style.RESET_ALL}{enabled_clouds_str}')
|
169
|
+
return enabled_clouds
|
160
170
|
|
161
171
|
|
162
172
|
def get_cached_enabled_clouds_or_refresh(
|
@@ -194,19 +204,25 @@ def get_cached_enabled_clouds_or_refresh(
|
|
194
204
|
def get_cloud_credential_file_mounts(
|
195
205
|
excluded_clouds: Optional[Iterable[sky_clouds.Cloud]]
|
196
206
|
) -> Dict[str, str]:
|
197
|
-
"""Returns the files necessary to access all
|
207
|
+
"""Returns the files necessary to access all clouds.
|
198
208
|
|
199
209
|
Returns a dictionary that will be added to a task's file mounts
|
200
210
|
and a list of patterns that will be excluded (used as rsync_exclude).
|
201
211
|
"""
|
202
|
-
|
212
|
+
# Uploading credentials for all clouds instead of only sky check
|
213
|
+
# enabled clouds because users may have partial credentials for some
|
214
|
+
# clouds to access their specific resources (e.g. cloud storage) but
|
215
|
+
# not have the complete credentials to pass sky check.
|
216
|
+
clouds = registry.CLOUD_REGISTRY.values()
|
203
217
|
file_mounts = {}
|
204
|
-
for cloud in
|
218
|
+
for cloud in clouds:
|
205
219
|
if (excluded_clouds is not None and
|
206
220
|
sky_clouds.cloud_in_iterable(cloud, excluded_clouds)):
|
207
221
|
continue
|
208
222
|
cloud_file_mounts = cloud.get_credential_file_mounts()
|
209
|
-
|
223
|
+
for remote_path, local_path in cloud_file_mounts.items():
|
224
|
+
if os.path.exists(os.path.expanduser(local_path)):
|
225
|
+
file_mounts[remote_path] = local_path
|
210
226
|
# Currently, get_cached_enabled_clouds_or_refresh() does not support r2 as
|
211
227
|
# only clouds with computing instances are marked as enabled by skypilot.
|
212
228
|
# This will be removed when cloudflare/r2 is added as a 'cloud'.
|
@@ -215,3 +231,36 @@ def get_cloud_credential_file_mounts(
|
|
215
231
|
r2_credential_mounts = cloudflare.get_credential_file_mounts()
|
216
232
|
file_mounts.update(r2_credential_mounts)
|
217
233
|
return file_mounts
|
234
|
+
|
235
|
+
|
236
|
+
def _format_enabled_cloud(cloud_name: str) -> str:
|
237
|
+
|
238
|
+
def _green_color(cloud_name: str) -> str:
|
239
|
+
return f'{colorama.Fore.GREEN}{cloud_name}{colorama.Style.RESET_ALL}'
|
240
|
+
|
241
|
+
if cloud_name == repr(sky_clouds.Kubernetes()):
|
242
|
+
# Get enabled contexts for Kubernetes
|
243
|
+
existing_contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
|
244
|
+
if not existing_contexts:
|
245
|
+
return _green_color(cloud_name)
|
246
|
+
|
247
|
+
# Check if allowed_contexts is explicitly set in config
|
248
|
+
allowed_contexts = skypilot_config.get_nested(
|
249
|
+
('kubernetes', 'allowed_contexts'), None)
|
250
|
+
|
251
|
+
# Format the context info with consistent styling
|
252
|
+
if allowed_contexts is not None:
|
253
|
+
contexts_formatted = []
|
254
|
+
for i, context in enumerate(existing_contexts):
|
255
|
+
symbol = (ux_utils.INDENT_LAST_SYMBOL
|
256
|
+
if i == len(existing_contexts) -
|
257
|
+
1 else ux_utils.INDENT_SYMBOL)
|
258
|
+
contexts_formatted.append(f'\n {symbol}{context}')
|
259
|
+
context_info = f'Allowed contexts:{"".join(contexts_formatted)}'
|
260
|
+
else:
|
261
|
+
context_info = f'Active context: {existing_contexts[0]}'
|
262
|
+
|
263
|
+
return (f'{_green_color(cloud_name)}\n'
|
264
|
+
f' {colorama.Style.DIM}{context_info}'
|
265
|
+
f'{colorama.Style.RESET_ALL}')
|
266
|
+
return _green_color(cloud_name)
|