skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/utils/controller_utils.py
CHANGED
@@ -7,6 +7,7 @@ import os
|
|
7
7
|
import tempfile
|
8
8
|
import typing
|
9
9
|
from typing import Any, Dict, Iterable, List, Optional, Set
|
10
|
+
import uuid
|
10
11
|
|
11
12
|
import colorama
|
12
13
|
|
@@ -22,12 +23,16 @@ from sky.clouds import gcp
|
|
22
23
|
from sky.data import data_utils
|
23
24
|
from sky.data import storage as storage_lib
|
24
25
|
from sky.jobs import constants as managed_job_constants
|
25
|
-
from sky.jobs import utils as managed_job_utils
|
26
26
|
from sky.serve import constants as serve_constants
|
27
|
-
from sky.
|
27
|
+
from sky.setup_files import dependencies
|
28
28
|
from sky.skylet import constants
|
29
|
+
from sky.skylet import log_lib
|
30
|
+
from sky.utils import common
|
29
31
|
from sky.utils import common_utils
|
32
|
+
from sky.utils import config_utils
|
30
33
|
from sky.utils import env_options
|
34
|
+
from sky.utils import registry
|
35
|
+
from sky.utils import rich_utils
|
31
36
|
from sky.utils import ux_utils
|
32
37
|
|
33
38
|
if typing.TYPE_CHECKING:
|
@@ -44,8 +49,12 @@ CONTROLLER_RESOURCES_NOT_VALID_MESSAGE = (
|
|
44
49
|
'{controller_type}.controller.resources is a valid resources spec. '
|
45
50
|
'Details:\n {err}')
|
46
51
|
|
47
|
-
# The
|
48
|
-
|
52
|
+
# The suffix for local skypilot config path for a job/service in file mounts
|
53
|
+
# that tells the controller logic to update the config with specific settings,
|
54
|
+
# e.g., removing the ssh_proxy_command when a job/service is launched in a same
|
55
|
+
# cloud as controller.
|
56
|
+
_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX = (
|
57
|
+
'__skypilot:local_skypilot_config_path.yaml')
|
49
58
|
|
50
59
|
|
51
60
|
@dataclasses.dataclass
|
@@ -53,9 +62,7 @@ class _ControllerSpec:
|
|
53
62
|
"""Spec for skypilot controllers."""
|
54
63
|
controller_type: str
|
55
64
|
name: str
|
56
|
-
|
57
|
-
# fallback order.
|
58
|
-
candidate_cluster_names: List[str]
|
65
|
+
cluster_name: str
|
59
66
|
in_progress_hint: str
|
60
67
|
decline_cancel_hint: str
|
61
68
|
_decline_down_when_failed_to_fetch_status_hint: str
|
@@ -65,15 +72,6 @@ class _ControllerSpec:
|
|
65
72
|
connection_error_hint: str
|
66
73
|
default_resources_config: Dict[str, Any]
|
67
74
|
|
68
|
-
@property
|
69
|
-
def cluster_name(self) -> str:
|
70
|
-
"""The name in candidate_cluster_names that exists, else the first."""
|
71
|
-
for candidate_name in self.candidate_cluster_names:
|
72
|
-
record = global_user_state.get_cluster_from_name(candidate_name)
|
73
|
-
if record is not None:
|
74
|
-
return candidate_name
|
75
|
-
return self.candidate_cluster_names[0]
|
76
|
-
|
77
75
|
@property
|
78
76
|
def decline_down_when_failed_to_fetch_status_hint(self) -> str:
|
79
77
|
return self._decline_down_when_failed_to_fetch_status_hint.format(
|
@@ -85,6 +83,7 @@ class _ControllerSpec:
|
|
85
83
|
cluster_name=self.cluster_name)
|
86
84
|
|
87
85
|
|
86
|
+
# TODO: refactor controller class to not be an enum.
|
88
87
|
class Controllers(enum.Enum):
|
89
88
|
"""Skypilot controllers."""
|
90
89
|
# NOTE(dev): Keep this align with
|
@@ -92,10 +91,7 @@ class Controllers(enum.Enum):
|
|
92
91
|
JOBS_CONTROLLER = _ControllerSpec(
|
93
92
|
controller_type='jobs',
|
94
93
|
name='managed jobs controller',
|
95
|
-
|
96
|
-
managed_job_utils.JOB_CONTROLLER_NAME,
|
97
|
-
managed_job_utils.LEGACY_JOB_CONTROLLER_NAME
|
98
|
-
],
|
94
|
+
cluster_name=common.JOB_CONTROLLER_NAME,
|
99
95
|
in_progress_hint=(
|
100
96
|
'* {job_info}To see all managed jobs: '
|
101
97
|
f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
|
@@ -125,7 +121,7 @@ class Controllers(enum.Enum):
|
|
125
121
|
SKY_SERVE_CONTROLLER = _ControllerSpec(
|
126
122
|
controller_type='serve',
|
127
123
|
name='serve controller',
|
128
|
-
|
124
|
+
cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
129
125
|
in_progress_hint=(
|
130
126
|
f'* To see detailed service status: {colorama.Style.BRIGHT}'
|
131
127
|
f'sky serve status -a{colorama.Style.RESET_ALL}'),
|
@@ -161,10 +157,23 @@ class Controllers(enum.Enum):
|
|
161
157
|
The controller if the cluster name is a controller name.
|
162
158
|
Otherwise, returns None.
|
163
159
|
"""
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
160
|
+
if name is None:
|
161
|
+
return None
|
162
|
+
controller = None
|
163
|
+
# The controller name is always the same. However, on the client-side,
|
164
|
+
# we may not know the exact name, because we are missing the server-side
|
165
|
+
# common.SERVER_ID. So, we will assume anything that matches the prefix
|
166
|
+
# is a controller.
|
167
|
+
if name.startswith(common.SKY_SERVE_CONTROLLER_PREFIX):
|
168
|
+
controller = cls.SKY_SERVE_CONTROLLER
|
169
|
+
elif name.startswith(common.JOB_CONTROLLER_PREFIX):
|
170
|
+
controller = cls.JOBS_CONTROLLER
|
171
|
+
if controller is not None and name != controller.value.cluster_name:
|
172
|
+
# The client-side cluster_name is not accurate. Assume that `name`
|
173
|
+
# is the actual cluster name, so need to set the controller's
|
174
|
+
# cluster name to the input name.
|
175
|
+
controller.value.cluster_name = name
|
176
|
+
return controller
|
168
177
|
|
169
178
|
@classmethod
|
170
179
|
def from_type(cls, controller_type: str) -> Optional['Controllers']:
|
@@ -182,63 +191,59 @@ class Controllers(enum.Enum):
|
|
182
191
|
|
183
192
|
# Install cli dependencies. Not using SkyPilot wheels because the wheel
|
184
193
|
# can be cleaned up by another process.
|
185
|
-
# TODO(zhwu): Keep the dependencies align with the ones in setup.py
|
186
194
|
def _get_cloud_dependencies_installation_commands(
|
187
195
|
controller: Controllers) -> List[str]:
|
188
|
-
#
|
189
|
-
#
|
190
|
-
|
191
|
-
prefix_str = 'Check & install cloud dependencies
|
196
|
+
# We use <step>/<total> instead of strong formatting, as we need to update
|
197
|
+
# the <total> at the end of the for loop, and python does not support
|
198
|
+
# partial string formatting.
|
199
|
+
prefix_str = ('[<step>/<total>] Check & install cloud dependencies '
|
200
|
+
'on controller: ')
|
201
|
+
commands: List[str] = []
|
192
202
|
# This is to make sure the shorter checking message does not have junk
|
193
203
|
# characters from the previous message.
|
194
|
-
empty_str = ' ' *
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
204
|
+
empty_str = ' ' * 20
|
205
|
+
|
206
|
+
# All python dependencies will be accumulated and then installed in one
|
207
|
+
# command at the end. This is very fast if the packages are already
|
208
|
+
# installed, so we don't check that.
|
209
|
+
python_packages: Set[str] = set()
|
210
|
+
|
211
|
+
# add flask to the controller dependencies for dashboard
|
212
|
+
python_packages.add('flask')
|
213
|
+
|
214
|
+
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
215
|
+
commands.append(f'echo -en "\\r{step_prefix}uv{empty_str}" &&'
|
216
|
+
f'{constants.SKY_UV_INSTALL_CMD} >/dev/null 2>&1')
|
217
|
+
|
202
218
|
for cloud in sky_check.get_cached_enabled_clouds_or_refresh():
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
#
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
elif isinstance(cloud, clouds.Azure):
|
219
|
+
cloud_python_dependencies: List[str] = copy.deepcopy(
|
220
|
+
dependencies.extras_require[cloud.canonical_name()])
|
221
|
+
|
222
|
+
if isinstance(cloud, clouds.Azure):
|
223
|
+
# azure-cli cannot be normally installed by uv.
|
224
|
+
# See comments in sky/skylet/constants.py.
|
225
|
+
cloud_python_dependencies.remove(dependencies.AZURE_CLI)
|
226
|
+
|
227
|
+
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
213
228
|
commands.append(
|
214
|
-
f'echo -en "\\r{
|
215
|
-
'
|
216
|
-
'
|
217
|
-
'"azure-identity>=1.13.0" azure-mgmt-network > /dev/null 2>&1')
|
229
|
+
f'echo -en "\\r{step_prefix}azure-cli{empty_str}" &&'
|
230
|
+
f'{constants.SKY_UV_PIP_CMD} install --prerelease=allow '
|
231
|
+
f'"{dependencies.AZURE_CLI}" > /dev/null 2>&1')
|
218
232
|
elif isinstance(cloud, clouds.GCP):
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
'pip install "google-api-python-client>=2.69.0" '
|
223
|
-
'> /dev/null 2>&1')
|
224
|
-
# Have to separate the installation of google-cloud-storage from
|
225
|
-
# above because for a VM launched on GCP, the VM may have
|
226
|
-
# google-api-python-client installed alone.
|
227
|
-
commands.append(
|
228
|
-
'pip list | grep google-cloud-storage > /dev/null 2>&1 || '
|
229
|
-
'pip install google-cloud-storage > /dev/null 2>&1')
|
230
|
-
commands.append(f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}')
|
233
|
+
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
234
|
+
commands.append(f'echo -en "\\r{step_prefix}GCP SDK{empty_str}" &&'
|
235
|
+
f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}')
|
231
236
|
elif isinstance(cloud, clouds.Kubernetes):
|
237
|
+
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
232
238
|
commands.append(
|
233
|
-
f'echo -en "\\r{
|
234
|
-
'pip list | grep kubernetes > /dev/null 2>&1 || '
|
235
|
-
'pip install "kubernetes>=20.0.0" > /dev/null 2>&1 &&'
|
239
|
+
f'echo -en "\\r{step_prefix}Kubernetes{empty_str}" && '
|
236
240
|
# Install k8s + skypilot dependencies
|
237
241
|
'sudo bash -c "if '
|
238
242
|
'! command -v curl &> /dev/null || '
|
239
243
|
'! command -v socat &> /dev/null || '
|
240
244
|
'! command -v netcat &> /dev/null; '
|
241
|
-
'then apt update
|
245
|
+
'then apt update &> /dev/null && '
|
246
|
+
'apt install curl socat netcat -y &> /dev/null; '
|
242
247
|
'fi" && '
|
243
248
|
# Install kubectl
|
244
249
|
'(command -v kubectl &>/dev/null || '
|
@@ -247,34 +252,42 @@ def _get_cloud_dependencies_installation_commands(
|
|
247
252
|
'/bin/linux/amd64/kubectl" && '
|
248
253
|
'sudo install -o root -g root -m 0755 '
|
249
254
|
'kubectl /usr/local/bin/kubectl))')
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
commands.append(
|
269
|
-
f'echo -en "\\r{prefix_str}Cudo{empty_str}" && '
|
270
|
-
'pip list | grep cudo-compute > /dev/null 2>&1 || '
|
271
|
-
'pip install "cudo-compute>=0.1.8" > /dev/null 2>&1')
|
255
|
+
elif isinstance(cloud, clouds.Cudo):
|
256
|
+
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
257
|
+
commands.append(
|
258
|
+
f'echo -en "\\r{step_prefix}cudoctl{empty_str}" && '
|
259
|
+
'wget https://download.cudo.org/compute/cudoctl-0.3.2-amd64.deb -O ~/cudoctl.deb > /dev/null 2>&1 && ' # pylint: disable=line-too-long
|
260
|
+
'sudo dpkg -i ~/cudoctl.deb > /dev/null 2>&1')
|
261
|
+
elif isinstance(cloud, clouds.IBM):
|
262
|
+
if controller != Controllers.JOBS_CONTROLLER:
|
263
|
+
# We only need IBM deps on the jobs controller.
|
264
|
+
cloud_python_dependencies = []
|
265
|
+
elif isinstance(cloud, clouds.Vast):
|
266
|
+
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
267
|
+
commands.append(f'echo -en "\\r{step_prefix}Vast{empty_str}" && '
|
268
|
+
'pip list | grep vastai_sdk > /dev/null 2>&1 || '
|
269
|
+
'pip install "vastai_sdk>=0.1.12" > /dev/null 2>&1')
|
270
|
+
|
271
|
+
python_packages.update(cloud_python_dependencies)
|
272
|
+
|
272
273
|
if (cloudflare.NAME
|
273
274
|
in storage_lib.get_cached_enabled_storage_clouds_or_refresh()):
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
275
|
+
python_packages.update(dependencies.extras_require['cloudflare'])
|
276
|
+
|
277
|
+
packages_string = ' '.join([f'"{package}"' for package in python_packages])
|
278
|
+
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
279
|
+
commands.append(
|
280
|
+
f'echo -en "\\r{step_prefix}cloud python packages{empty_str}" && '
|
281
|
+
f'{constants.SKY_UV_PIP_CMD} install {packages_string} > /dev/null 2>&1'
|
282
|
+
)
|
283
|
+
|
284
|
+
total_commands = len(commands)
|
285
|
+
finish_prefix = prefix_str.replace('[<step>/<total>] ', ' ')
|
286
|
+
commands.append(f'echo -e "\\r{finish_prefix}done.{empty_str}"')
|
287
|
+
|
288
|
+
commands = [
|
289
|
+
command.replace('<total>', str(total_commands)) for command in commands
|
290
|
+
]
|
278
291
|
return commands
|
279
292
|
|
280
293
|
|
@@ -308,8 +321,10 @@ def download_and_stream_latest_job_log(
|
|
308
321
|
"""Downloads and streams the latest job log.
|
309
322
|
|
310
323
|
This function is only used by jobs controller and sky serve controller.
|
324
|
+
|
325
|
+
If the log cannot be fetched for any reason, return None.
|
311
326
|
"""
|
312
|
-
os.makedirs(local_dir, exist_ok=True)
|
327
|
+
os.makedirs(os.path.expanduser(local_dir), exist_ok=True)
|
313
328
|
log_file = None
|
314
329
|
try:
|
315
330
|
log_dirs = backend.sync_down_logs(
|
@@ -322,29 +337,74 @@ def download_and_stream_latest_job_log(
|
|
322
337
|
# job_ids all represent the same logical managed job.
|
323
338
|
job_ids=None,
|
324
339
|
local_dir=local_dir)
|
325
|
-
except
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
340
|
+
except Exception as e: # pylint: disable=broad-except
|
341
|
+
# We want to avoid crashing the controller. sync_down_logs() is pretty
|
342
|
+
# complicated and could crash in various places (creating remote
|
343
|
+
# runners, executing remote code, decoding the payload, etc.). So, we
|
344
|
+
# use a broad except and just return None.
|
345
|
+
logger.info(
|
346
|
+
f'Failed to download the logs: '
|
347
|
+
f'{common_utils.format_exception(e)}',
|
348
|
+
exc_info=True)
|
349
|
+
return None
|
350
|
+
|
351
|
+
if not log_dirs:
|
352
|
+
logger.error('Failed to find the logs for the user program.')
|
353
|
+
return None
|
354
|
+
|
355
|
+
log_dir = list(log_dirs.values())[0]
|
356
|
+
log_file = os.path.join(log_dir, 'run.log')
|
357
|
+
|
358
|
+
# Print the logs to the console.
|
359
|
+
# TODO(zhwu): refactor this into log_utils, along with the refactoring for
|
360
|
+
# the log_lib.tail_logs.
|
361
|
+
try:
|
362
|
+
with open(log_file, 'r', encoding='utf-8') as f:
|
363
|
+
# Stream the logs to the console without reading the whole file into
|
364
|
+
# memory.
|
365
|
+
start_streaming = False
|
366
|
+
for line in f:
|
367
|
+
if log_lib.LOG_FILE_START_STREAMING_AT in line:
|
368
|
+
start_streaming = True
|
369
|
+
if start_streaming:
|
370
|
+
print(line, end='', flush=True)
|
371
|
+
except FileNotFoundError:
|
372
|
+
logger.error('Failed to find the logs for the user '
|
373
|
+
f'program at {log_file}.')
|
374
|
+
except Exception as e: # pylint: disable=broad-except
|
375
|
+
logger.error(
|
376
|
+
f'Failed to stream the logs for the user program at '
|
377
|
+
f'{log_file}: {common_utils.format_exception(e)}',
|
378
|
+
exc_info=True)
|
379
|
+
# Return the log_file anyway.
|
380
|
+
|
342
381
|
return log_file
|
343
382
|
|
344
383
|
|
345
384
|
def shared_controller_vars_to_fill(
|
346
|
-
controller: Controllers,
|
347
|
-
|
385
|
+
controller: Controllers, remote_user_config_path: str,
|
386
|
+
local_user_config: Dict[str, Any]) -> Dict[str, str]:
|
387
|
+
if not local_user_config:
|
388
|
+
local_user_config_path = None
|
389
|
+
else:
|
390
|
+
# Remove admin_policy from local_user_config so that it is not applied
|
391
|
+
# again on the controller. This is required since admin_policy is not
|
392
|
+
# installed on the controller.
|
393
|
+
local_user_config.pop('admin_policy', None)
|
394
|
+
# Remove allowed_contexts from local_user_config since the controller
|
395
|
+
# may be running in a Kubernetes cluster with in-cluster auth and may
|
396
|
+
# not have kubeconfig available to it. This is the typical case since
|
397
|
+
# remote_identity default for Kubernetes is SERVICE_ACCOUNT.
|
398
|
+
# TODO(romilb): We should check the cloud the controller is running on
|
399
|
+
# before popping allowed_contexts. If it is not on Kubernetes,
|
400
|
+
# we may be able to use allowed_contexts.
|
401
|
+
local_user_config.pop('allowed_contexts', None)
|
402
|
+
with tempfile.NamedTemporaryFile(
|
403
|
+
delete=False,
|
404
|
+
suffix=_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX) as temp_file:
|
405
|
+
common_utils.dump_yaml(temp_file.name, dict(**local_user_config))
|
406
|
+
local_user_config_path = temp_file.name
|
407
|
+
|
348
408
|
vars_to_fill: Dict[str, Any] = {
|
349
409
|
'cloud_dependencies_installation_commands':
|
350
410
|
_get_cloud_dependencies_installation_commands(controller),
|
@@ -352,9 +412,11 @@ def shared_controller_vars_to_fill(
|
|
352
412
|
# cloud SDKs are installed in SkyPilot runtime environment and can be
|
353
413
|
# accessed.
|
354
414
|
'sky_activate_python_env': constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV,
|
415
|
+
'sky_python_cmd': constants.SKY_PYTHON_CMD,
|
416
|
+
'local_user_config_path': local_user_config_path,
|
355
417
|
}
|
356
418
|
env_vars: Dict[str, str] = {
|
357
|
-
env.
|
419
|
+
env.env_key: str(int(env.get())) for env in env_options.Options
|
358
420
|
}
|
359
421
|
env_vars.update({
|
360
422
|
# Should not use $USER here, as that env var can be empty when
|
@@ -362,7 +424,9 @@ def shared_controller_vars_to_fill(
|
|
362
424
|
constants.USER_ENV_VAR: getpass.getuser(),
|
363
425
|
constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
|
364
426
|
# Skip cloud identity check to avoid the overhead.
|
365
|
-
env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.
|
427
|
+
env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.env_key: '1',
|
428
|
+
# Disable minimize logging to get more details on the controller.
|
429
|
+
env_options.Options.MINIMIZE_LOGGING.env_key: '0',
|
366
430
|
})
|
367
431
|
if skypilot_config.loaded():
|
368
432
|
# Only set the SKYPILOT_CONFIG env var if the user has a config file.
|
@@ -395,10 +459,6 @@ def get_controller_resources(
|
|
395
459
|
if custom_controller_resources_config is not None:
|
396
460
|
controller_resources_config_copied.update(
|
397
461
|
custom_controller_resources_config)
|
398
|
-
elif controller == Controllers.JOBS_CONTROLLER:
|
399
|
-
controller_resources_config_copied.update(
|
400
|
-
skypilot_config.get_nested(('spot', 'controller', 'resources'),
|
401
|
-
{}))
|
402
462
|
|
403
463
|
try:
|
404
464
|
controller_resources = resources.Resources.from_yaml_config(
|
@@ -431,20 +491,17 @@ def get_controller_resources(
|
|
431
491
|
if handle is not None:
|
432
492
|
controller_resources_to_use = handle.launched_resources
|
433
493
|
|
434
|
-
|
435
|
-
|
494
|
+
# If the controller and replicas are from the same cloud (and region/zone),
|
495
|
+
# it should provide better connectivity. We will let the controller choose
|
496
|
+
# from the clouds (and regions/zones) of the resources if the user does not
|
497
|
+
# specify the cloud (and region/zone) for the controller.
|
436
498
|
|
437
|
-
|
438
|
-
|
439
|
-
# the clouds of the resources if the controller does not exist.
|
440
|
-
# TODO(tian): Consider respecting the regions/zones specified for the
|
441
|
-
# resources as well.
|
442
|
-
requested_clouds: Set['clouds.Cloud'] = set()
|
499
|
+
requested_clouds_with_region_zone: Dict[str, Dict[Optional[str],
|
500
|
+
Set[Optional[str]]]] = {}
|
443
501
|
for resource in task_resources:
|
444
|
-
# cloud is an object and will not be able to be distinguished by set.
|
445
|
-
# Here we manually check if the cloud is in the set.
|
446
502
|
if resource.cloud is not None:
|
447
|
-
|
503
|
+
cloud_name = str(resource.cloud)
|
504
|
+
if cloud_name not in requested_clouds_with_region_zone:
|
448
505
|
try:
|
449
506
|
resource.cloud.check_features_are_supported(
|
450
507
|
resources.Resources(),
|
@@ -452,7 +509,26 @@ def get_controller_resources(
|
|
452
509
|
except exceptions.NotSupportedError:
|
453
510
|
# Skip the cloud if it does not support hosting controllers.
|
454
511
|
continue
|
455
|
-
|
512
|
+
requested_clouds_with_region_zone[cloud_name] = {}
|
513
|
+
if resource.region is None:
|
514
|
+
# If one of the resource.region is None, this could represent
|
515
|
+
# that the user is unsure about which region the resource is
|
516
|
+
# hosted in. In this case, we allow any region for this cloud.
|
517
|
+
requested_clouds_with_region_zone[cloud_name] = {None: {None}}
|
518
|
+
elif None not in requested_clouds_with_region_zone[cloud_name]:
|
519
|
+
if resource.region not in requested_clouds_with_region_zone[
|
520
|
+
cloud_name]:
|
521
|
+
requested_clouds_with_region_zone[cloud_name][
|
522
|
+
resource.region] = set()
|
523
|
+
# If one of the resource.zone is None, allow any zone in the
|
524
|
+
# region.
|
525
|
+
if resource.zone is None:
|
526
|
+
requested_clouds_with_region_zone[cloud_name][
|
527
|
+
resource.region] = {None}
|
528
|
+
elif None not in requested_clouds_with_region_zone[cloud_name][
|
529
|
+
resource.region]:
|
530
|
+
requested_clouds_with_region_zone[cloud_name][
|
531
|
+
resource.region].add(resource.zone)
|
456
532
|
else:
|
457
533
|
# if one of the resource.cloud is None, this could represent user
|
458
534
|
# does not know which cloud is best for the specified resources.
|
@@ -462,18 +538,54 @@ def get_controller_resources(
|
|
462
538
|
# - cloud: runpod
|
463
539
|
# accelerators: A40
|
464
540
|
# In this case, we allow the controller to be launched on any cloud.
|
465
|
-
|
541
|
+
requested_clouds_with_region_zone.clear()
|
466
542
|
break
|
467
|
-
|
543
|
+
|
544
|
+
# Extract filtering criteria from the controller resources specified by the
|
545
|
+
# user.
|
546
|
+
controller_cloud = str(
|
547
|
+
controller_resources_to_use.cloud
|
548
|
+
) if controller_resources_to_use.cloud is not None else None
|
549
|
+
controller_region = controller_resources_to_use.region
|
550
|
+
controller_zone = controller_resources_to_use.zone
|
551
|
+
|
552
|
+
# Filter clouds if controller_resources_to_use.cloud is specified.
|
553
|
+
filtered_clouds = ({controller_cloud} if controller_cloud is not None else
|
554
|
+
requested_clouds_with_region_zone.keys())
|
555
|
+
|
556
|
+
# Filter regions and zones and construct the result.
|
557
|
+
result: Set[resources.Resources] = set()
|
558
|
+
for cloud_name in filtered_clouds:
|
559
|
+
regions = requested_clouds_with_region_zone.get(cloud_name,
|
560
|
+
{None: {None}})
|
561
|
+
|
562
|
+
# Filter regions if controller_resources_to_use.region is specified.
|
563
|
+
filtered_regions = ({controller_region} if controller_region is not None
|
564
|
+
else regions.keys())
|
565
|
+
|
566
|
+
for region in filtered_regions:
|
567
|
+
zones = regions.get(region, {None})
|
568
|
+
|
569
|
+
# Filter zones if controller_resources_to_use.zone is specified.
|
570
|
+
filtered_zones = ({controller_zone}
|
571
|
+
if controller_zone is not None else zones)
|
572
|
+
|
573
|
+
# Create combinations of cloud, region, and zone.
|
574
|
+
for zone in filtered_zones:
|
575
|
+
resource_copy = controller_resources_to_use.copy(
|
576
|
+
cloud=registry.CLOUD_REGISTRY.from_str(cloud_name),
|
577
|
+
region=region,
|
578
|
+
zone=zone)
|
579
|
+
result.add(resource_copy)
|
580
|
+
|
581
|
+
if not result:
|
468
582
|
return {controller_resources_to_use}
|
469
|
-
return
|
470
|
-
controller_resources_to_use.copy(cloud=controller_cloud)
|
471
|
-
for controller_cloud in requested_clouds
|
472
|
-
}
|
583
|
+
return result
|
473
584
|
|
474
585
|
|
475
586
|
def _setup_proxy_command_on_controller(
|
476
|
-
controller_launched_cloud: 'clouds.Cloud'
|
587
|
+
controller_launched_cloud: 'clouds.Cloud',
|
588
|
+
user_config: Dict[str, Any]) -> config_utils.Config:
|
477
589
|
"""Sets up proxy command on the controller.
|
478
590
|
|
479
591
|
This function should be called on the controller (remote cluster), which
|
@@ -507,21 +619,20 @@ def _setup_proxy_command_on_controller(
|
|
507
619
|
# (or name). It may not be a sufficient check (as it's always
|
508
620
|
# possible that peering is not set up), but it may catch some
|
509
621
|
# obvious errors.
|
622
|
+
config = config_utils.Config.from_dict(user_config)
|
510
623
|
proxy_command_key = (str(controller_launched_cloud).lower(),
|
511
624
|
'ssh_proxy_command')
|
512
|
-
ssh_proxy_command =
|
513
|
-
config_dict = skypilot_config.to_dict()
|
625
|
+
ssh_proxy_command = config.get_nested(proxy_command_key, None)
|
514
626
|
if isinstance(ssh_proxy_command, str):
|
515
|
-
|
627
|
+
config.set_nested(proxy_command_key, None)
|
516
628
|
elif isinstance(ssh_proxy_command, dict):
|
517
629
|
# Instead of removing the key, we set the value to empty string
|
518
630
|
# so that the controller will only try the regions specified by
|
519
631
|
# the keys.
|
520
632
|
ssh_proxy_command = {k: None for k in ssh_proxy_command}
|
521
|
-
|
522
|
-
ssh_proxy_command)
|
633
|
+
config.set_nested(proxy_command_key, ssh_proxy_command)
|
523
634
|
|
524
|
-
return
|
635
|
+
return config
|
525
636
|
|
526
637
|
|
527
638
|
def replace_skypilot_config_path_in_file_mounts(
|
@@ -535,29 +646,84 @@ def replace_skypilot_config_path_in_file_mounts(
|
|
535
646
|
if file_mounts is None:
|
536
647
|
return
|
537
648
|
replaced = False
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
if to_replace:
|
550
|
-
file_mounts[remote_path] = f.name
|
551
|
-
replaced = True
|
552
|
-
else:
|
553
|
-
del file_mounts[remote_path]
|
649
|
+
for remote_path, local_path in list(file_mounts.items()):
|
650
|
+
if local_path is None:
|
651
|
+
del file_mounts[remote_path]
|
652
|
+
continue
|
653
|
+
if local_path.endswith(_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX):
|
654
|
+
with tempfile.NamedTemporaryFile('w', delete=False) as f:
|
655
|
+
user_config = common_utils.read_yaml(local_path)
|
656
|
+
config = _setup_proxy_command_on_controller(cloud, user_config)
|
657
|
+
common_utils.dump_yaml(f.name, dict(**config))
|
658
|
+
file_mounts[remote_path] = f.name
|
659
|
+
replaced = True
|
554
660
|
if replaced:
|
555
|
-
logger.debug(f'Replaced {
|
556
|
-
f'the real path in file mounts: {file_mounts}')
|
661
|
+
logger.debug(f'Replaced {_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX} '
|
662
|
+
f'with the real path in file mounts: {file_mounts}')
|
557
663
|
|
558
664
|
|
665
|
+
def _generate_run_uuid() -> str:
|
666
|
+
"""Generates a unique run id for the job."""
|
667
|
+
return common_utils.base36_encode(uuid.uuid4().hex)[:8]
|
668
|
+
|
669
|
+
|
670
|
+
def translate_local_file_mounts_to_two_hop(
|
671
|
+
task: 'task_lib.Task') -> Dict[str, str]:
|
672
|
+
"""Translates local->VM mounts into two-hop file mounts.
|
673
|
+
|
674
|
+
This strategy will upload the local files to the controller first, using a
|
675
|
+
normal rsync as part of sky.launch() for the controller. Then, when the
|
676
|
+
controller launches the task, it will also use local file_mounts from the
|
677
|
+
destination path of the first hop.
|
678
|
+
|
679
|
+
Local machine/API server Controller Job cluster
|
680
|
+
------------------------ ----------------------- --------------------
|
681
|
+
| local path ----|--|-> controller path --|--|-> job dst path |
|
682
|
+
------------------------ ----------------------- --------------------
|
683
|
+
|
684
|
+
Returns:
|
685
|
+
A dict mapping from controller file mount path to local file mount path
|
686
|
+
for the first hop. The task is updated in-place to do the second hop.
|
687
|
+
"""
|
688
|
+
first_hop_file_mounts = {}
|
689
|
+
second_hop_file_mounts = {}
|
690
|
+
|
691
|
+
run_id = _generate_run_uuid()
|
692
|
+
base_tmp_dir = os.path.join(constants.FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH,
|
693
|
+
run_id)
|
694
|
+
|
695
|
+
# Use a simple counter to create unique paths within the base_tmp_dir for
|
696
|
+
# each mount.
|
697
|
+
file_mount_id = 0
|
698
|
+
|
699
|
+
file_mounts_to_translate = task.file_mounts or {}
|
700
|
+
if task.workdir is not None:
|
701
|
+
file_mounts_to_translate[constants.SKY_REMOTE_WORKDIR] = task.workdir
|
702
|
+
task.workdir = None
|
703
|
+
|
704
|
+
for job_cluster_path, local_path in file_mounts_to_translate.items():
|
705
|
+
if data_utils.is_cloud_store_url(
|
706
|
+
local_path) or data_utils.is_cloud_store_url(job_cluster_path):
|
707
|
+
raise exceptions.NotSupportedError(
|
708
|
+
'Cloud-based file_mounts are specified, but no cloud storage '
|
709
|
+
'is available. Please specify local file_mounts only.')
|
710
|
+
|
711
|
+
controller_path = os.path.join(base_tmp_dir, f'{file_mount_id}')
|
712
|
+
file_mount_id += 1
|
713
|
+
first_hop_file_mounts[controller_path] = local_path
|
714
|
+
second_hop_file_mounts[job_cluster_path] = controller_path
|
715
|
+
|
716
|
+
# Use set_file_mounts to override existing file mounts, if they exist.
|
717
|
+
task.set_file_mounts(second_hop_file_mounts)
|
718
|
+
|
719
|
+
# Return the first hop info so that it can be added to the jobs-controller
|
720
|
+
# YAML.
|
721
|
+
return first_hop_file_mounts
|
722
|
+
|
723
|
+
|
724
|
+
# (maybe translate local file mounts) and (sync up)
|
559
725
|
def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
560
|
-
|
726
|
+
task_type: str) -> None:
|
561
727
|
"""Translates local->VM mounts into Storage->VM, then syncs up any Storage.
|
562
728
|
|
563
729
|
Eagerly syncing up local->Storage ensures Storage->VM would work at task
|
@@ -566,11 +732,31 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
566
732
|
If there are no local source paths to be translated, this function would
|
567
733
|
still sync up any storage mounts with local source paths (which do not
|
568
734
|
undergo translation).
|
735
|
+
|
736
|
+
When jobs.bucket or serve.bucket is not specified, an intermediate storage
|
737
|
+
dedicated for the job is created for the workdir and local file mounts and
|
738
|
+
the storage is deleted when the job finishes. We don't share the storage
|
739
|
+
between jobs, because jobs might have different resources requirements, and
|
740
|
+
sharing storage between jobs may cause egress costs or slower transfer
|
741
|
+
speeds.
|
569
742
|
"""
|
743
|
+
|
570
744
|
# ================================================================
|
571
745
|
# Translate the workdir and local file mounts to cloud file mounts.
|
572
746
|
# ================================================================
|
573
|
-
|
747
|
+
|
748
|
+
def _sub_path_join(sub_path: Optional[str], path: str) -> str:
|
749
|
+
if sub_path is None:
|
750
|
+
return path
|
751
|
+
return os.path.join(sub_path, path).strip('/')
|
752
|
+
|
753
|
+
# We use uuid to generate a unique run id for the job, so that the bucket/
|
754
|
+
# subdirectory name is unique across different jobs/services.
|
755
|
+
# We should not use common_utils.get_usage_run_id() here, because when
|
756
|
+
# Python API is used, the run id will be the same across multiple
|
757
|
+
# jobs.launch/serve.up calls after the sky is imported.
|
758
|
+
run_id = _generate_run_uuid()
|
759
|
+
user_hash = common_utils.get_user_hash()
|
574
760
|
original_file_mounts = task.file_mounts if task.file_mounts else {}
|
575
761
|
original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
|
576
762
|
|
@@ -589,14 +775,35 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
589
775
|
elif has_local_source_paths_workdir:
|
590
776
|
msg = 'workdir'
|
591
777
|
if msg:
|
592
|
-
logger.info(
|
593
|
-
|
778
|
+
logger.info(
|
779
|
+
ux_utils.starting_message(f'Translating {msg} to '
|
780
|
+
'SkyPilot Storage...'))
|
781
|
+
rich_utils.force_update_status(
|
782
|
+
ux_utils.spinner_message(
|
783
|
+
f'Translating {msg} to SkyPilot Storage...'))
|
784
|
+
|
785
|
+
# Get the bucket name for the workdir and file mounts,
|
786
|
+
# we store all these files in same bucket from config.
|
787
|
+
bucket_wth_prefix = skypilot_config.get_nested((task_type, 'bucket'), None)
|
788
|
+
store_kwargs: Dict[str, Any] = {}
|
789
|
+
if bucket_wth_prefix is None:
|
790
|
+
store_type = sub_path = None
|
791
|
+
storage_account_name = region = None
|
792
|
+
bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format(
|
793
|
+
username=common_utils.get_cleaned_username(),
|
794
|
+
user_hash=user_hash,
|
795
|
+
id=run_id)
|
796
|
+
else:
|
797
|
+
(store_type, bucket_name, sub_path, storage_account_name, region) = (
|
798
|
+
storage_lib.StoreType.get_fields_from_store_url(bucket_wth_prefix))
|
799
|
+
if storage_account_name is not None:
|
800
|
+
store_kwargs['storage_account_name'] = storage_account_name
|
801
|
+
if region is not None:
|
802
|
+
store_kwargs['region'] = region
|
594
803
|
|
595
804
|
# Step 1: Translate the workdir to SkyPilot storage.
|
596
805
|
new_storage_mounts = {}
|
597
806
|
if task.workdir is not None:
|
598
|
-
bucket_name = constants.WORKDIR_BUCKET_NAME.format(
|
599
|
-
username=common_utils.get_cleaned_username(), id=run_id)
|
600
807
|
workdir = task.workdir
|
601
808
|
task.workdir = None
|
602
809
|
if (constants.SKY_REMOTE_WORKDIR in original_file_mounts or
|
@@ -604,18 +811,29 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
604
811
|
raise ValueError(
|
605
812
|
f'Cannot mount {constants.SKY_REMOTE_WORKDIR} as both the '
|
606
813
|
'workdir and file_mounts contains it as the target.')
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
814
|
+
bucket_sub_path = _sub_path_join(
|
815
|
+
sub_path,
|
816
|
+
constants.FILE_MOUNTS_WORKDIR_SUBPATH.format(run_id=run_id))
|
817
|
+
stores = None
|
818
|
+
if store_type is not None:
|
819
|
+
stores = [store_type]
|
820
|
+
|
821
|
+
storage_obj = storage_lib.Storage(
|
822
|
+
name=bucket_name,
|
823
|
+
source=workdir,
|
824
|
+
persistent=False,
|
825
|
+
mode=storage_lib.StorageMode.COPY,
|
826
|
+
stores=stores,
|
827
|
+
# Set `_is_sky_managed` to False when `bucket_with_prefix` is
|
828
|
+
# specified, so that the storage is not deleted when job finishes,
|
829
|
+
# but only the sub path is deleted.
|
830
|
+
_is_sky_managed=bucket_wth_prefix is None,
|
831
|
+
_bucket_sub_path=bucket_sub_path)
|
832
|
+
new_storage_mounts[constants.SKY_REMOTE_WORKDIR] = storage_obj
|
615
833
|
# Check of the existence of the workdir in file_mounts is done in
|
616
834
|
# the task construction.
|
617
|
-
logger.info(f'Workdir {workdir!r}
|
618
|
-
f'{bucket_name!r}.')
|
835
|
+
logger.info(f' {colorama.Style.DIM}Workdir: {workdir!r} '
|
836
|
+
f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}')
|
619
837
|
|
620
838
|
# Step 2: Translate the local file mounts with folder in src to SkyPilot
|
621
839
|
# storage.
|
@@ -629,88 +847,111 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
629
847
|
if os.path.isfile(os.path.abspath(os.path.expanduser(src))):
|
630
848
|
copy_mounts_with_file_in_src[dst] = src
|
631
849
|
continue
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
850
|
+
bucket_sub_path = _sub_path_join(
|
851
|
+
sub_path, constants.FILE_MOUNTS_SUBPATH.format(i=i, run_id=run_id))
|
852
|
+
stores = None
|
853
|
+
if store_type is not None:
|
854
|
+
stores = [store_type]
|
855
|
+
storage_obj = storage_lib.Storage(name=bucket_name,
|
856
|
+
source=src,
|
857
|
+
persistent=False,
|
858
|
+
mode=storage_lib.StorageMode.COPY,
|
859
|
+
stores=stores,
|
860
|
+
_is_sky_managed=not bucket_wth_prefix,
|
861
|
+
_bucket_sub_path=bucket_sub_path)
|
862
|
+
new_storage_mounts[dst] = storage_obj
|
863
|
+
logger.info(f' {colorama.Style.DIM}Folder : {src!r} '
|
864
|
+
f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}')
|
645
865
|
|
646
866
|
# Step 3: Translate local file mounts with file in src to SkyPilot storage.
|
647
867
|
# Hard link the files in src to a temporary directory, and upload folder.
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
os.makedirs(
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
src_to_file_id
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
'
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
|
713
|
-
|
868
|
+
file_mounts_tmp_subpath = _sub_path_join(
|
869
|
+
sub_path, constants.FILE_MOUNTS_TMP_SUBPATH.format(run_id=run_id))
|
870
|
+
base_tmp_dir = os.path.expanduser(constants.FILE_MOUNTS_LOCAL_TMP_BASE_PATH)
|
871
|
+
os.makedirs(base_tmp_dir, exist_ok=True)
|
872
|
+
with tempfile.TemporaryDirectory(dir=base_tmp_dir) as temp_path:
|
873
|
+
local_fm_path = os.path.join(
|
874
|
+
temp_path, constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id))
|
875
|
+
os.makedirs(local_fm_path, exist_ok=True)
|
876
|
+
file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format(
|
877
|
+
task_type)
|
878
|
+
if copy_mounts_with_file_in_src:
|
879
|
+
src_to_file_id = {}
|
880
|
+
for i, src in enumerate(set(copy_mounts_with_file_in_src.values())):
|
881
|
+
src_to_file_id[src] = i
|
882
|
+
os.link(os.path.abspath(os.path.expanduser(src)),
|
883
|
+
os.path.join(local_fm_path, f'file-{i}'))
|
884
|
+
stores = None
|
885
|
+
if store_type is not None:
|
886
|
+
stores = [store_type]
|
887
|
+
storage_obj = storage_lib.Storage(
|
888
|
+
name=bucket_name,
|
889
|
+
source=local_fm_path,
|
890
|
+
persistent=False,
|
891
|
+
mode=storage_lib.StorageMode.MOUNT,
|
892
|
+
stores=stores,
|
893
|
+
_is_sky_managed=not bucket_wth_prefix,
|
894
|
+
_bucket_sub_path=file_mounts_tmp_subpath)
|
895
|
+
|
896
|
+
new_storage_mounts[file_mount_remote_tmp_dir] = storage_obj
|
897
|
+
if file_mount_remote_tmp_dir in original_storage_mounts:
|
898
|
+
with ux_utils.print_exception_no_traceback():
|
899
|
+
raise ValueError(
|
900
|
+
'Failed to translate file mounts, due to the default '
|
901
|
+
f'destination {file_mount_remote_tmp_dir} '
|
902
|
+
'being taken.')
|
903
|
+
sources = list(src_to_file_id.keys())
|
904
|
+
sources_str = '\n '.join(sources)
|
905
|
+
logger.info(f' {colorama.Style.DIM}Files (listed below) '
|
906
|
+
f' -> storage: {bucket_name}:'
|
907
|
+
f'\n {sources_str}{colorama.Style.RESET_ALL}')
|
908
|
+
|
909
|
+
rich_utils.force_update_status(
|
910
|
+
ux_utils.spinner_message(
|
911
|
+
'Uploading translated local files/folders'))
|
912
|
+
task.update_storage_mounts(new_storage_mounts)
|
913
|
+
|
914
|
+
# Step 4: Upload storage from sources
|
915
|
+
# Upload the local source to a bucket. The task will not be executed
|
916
|
+
# locally, so we need to upload the files/folders to the bucket manually
|
917
|
+
# here before sending the task to the remote jobs controller. This will
|
918
|
+
# also upload any storage mounts that are not translated. After
|
919
|
+
# sync_storage_mounts, we will also have file_mounts in the task, but
|
920
|
+
# these aren't used since the storage_mounts for the same paths take
|
921
|
+
# precedence.
|
922
|
+
if task.storage_mounts:
|
923
|
+
# There may be existing (non-translated) storage mounts, so log this
|
924
|
+
# whenever task.storage_mounts is non-empty.
|
925
|
+
rich_utils.force_update_status(
|
926
|
+
ux_utils.spinner_message(
|
927
|
+
'Uploading local sources to storage[/] '
|
928
|
+
'[dim]View storages: sky storage ls'))
|
929
|
+
try:
|
930
|
+
task.sync_storage_mounts()
|
931
|
+
except (ValueError, exceptions.NoCloudAccessError) as e:
|
932
|
+
if 'No enabled cloud for storage' in str(e) or isinstance(
|
933
|
+
e, exceptions.NoCloudAccessError):
|
934
|
+
data_src = None
|
935
|
+
if has_local_source_paths_file_mounts:
|
936
|
+
data_src = 'file_mounts'
|
937
|
+
if has_local_source_paths_workdir:
|
938
|
+
if data_src:
|
939
|
+
data_src += ' and workdir'
|
940
|
+
else:
|
941
|
+
data_src = 'workdir'
|
942
|
+
store_enabled_clouds = ', '.join(
|
943
|
+
storage_lib.STORE_ENABLED_CLOUDS)
|
944
|
+
with ux_utils.print_exception_no_traceback():
|
945
|
+
raise exceptions.NotSupportedError(
|
946
|
+
f'Unable to use {data_src} - no cloud with object '
|
947
|
+
'store support is enabled. Please enable at least one '
|
948
|
+
'cloud with object store support '
|
949
|
+
f'({store_enabled_clouds}) by running `sky check`, or '
|
950
|
+
f'remove {data_src} from your task.'
|
951
|
+
'\nHint: If you do not have any cloud access, you may '
|
952
|
+
'still download data and code over the network using '
|
953
|
+
'curl or other tools in the `setup` section of the '
|
954
|
+
'task.') from None
|
714
955
|
|
715
956
|
# Step 5: Add the file download into the file mounts, such as
|
716
957
|
# /original-dst: s3://spot-fm-file-only-bucket-name/file-0
|
@@ -718,10 +959,15 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
718
959
|
if copy_mounts_with_file_in_src:
|
719
960
|
# file_mount_remote_tmp_dir will only exist when there are files in
|
720
961
|
# the src for copy mounts.
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
962
|
+
storage_obj = task.storage_mounts[file_mount_remote_tmp_dir]
|
963
|
+
assert storage_obj.stores, (storage_obj.__dict__, task.to_yaml_config())
|
964
|
+
curr_store_type = list(storage_obj.stores.keys())[0]
|
965
|
+
store_object = storage_obj.stores[curr_store_type]
|
966
|
+
assert store_object is not None, (storage_obj.__dict__,
|
967
|
+
task.to_yaml_config())
|
968
|
+
bucket_url = storage_lib.StoreType.get_endpoint_url(
|
969
|
+
store_object, bucket_name)
|
970
|
+
bucket_url += f'/{file_mounts_tmp_subpath}'
|
725
971
|
for dst, src in copy_mounts_with_file_in_src.items():
|
726
972
|
file_id = src_to_file_id[src]
|
727
973
|
new_file_mounts[dst] = bucket_url + f'/file-{file_id}'
|
@@ -733,12 +979,48 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
733
979
|
if (storage_obj.source is not None and
|
734
980
|
not data_utils.is_cloud_store_url(storage_obj.source)):
|
735
981
|
# Need to replace the local path with bucket URI, and remove the
|
736
|
-
# name field, so that the storage mount can work on the
|
982
|
+
# name field, so that the storage mount can work on the jobs
|
737
983
|
# controller.
|
738
984
|
store_types = list(storage_obj.stores.keys())
|
739
985
|
assert len(store_types) == 1, (
|
740
986
|
'We only support one store type for now.', storage_obj.stores)
|
741
|
-
|
742
|
-
|
743
|
-
|
987
|
+
curr_store_type = store_types[0]
|
988
|
+
store_object = storage_obj.stores[curr_store_type]
|
989
|
+
assert store_object is not None and storage_obj.name is not None, (
|
990
|
+
store_object, storage_obj.name)
|
991
|
+
storage_obj.source = storage_lib.StoreType.get_endpoint_url(
|
992
|
+
store_object, storage_obj.name)
|
744
993
|
storage_obj.force_delete = True
|
994
|
+
|
995
|
+
# Step 7: Convert all `MOUNT` mode storages which don't specify a source
|
996
|
+
# to specifying a source. If the source is specified with a local path,
|
997
|
+
# it was handled in step 6.
|
998
|
+
updated_mount_storages = {}
|
999
|
+
for storage_path, storage_obj in task.storage_mounts.items():
|
1000
|
+
if (storage_obj.mode == storage_lib.StorageMode.MOUNT and
|
1001
|
+
not storage_obj.source):
|
1002
|
+
# Construct source URL with first store type and storage name
|
1003
|
+
# E.g., s3://my-storage-name
|
1004
|
+
store_types = list(storage_obj.stores.keys())
|
1005
|
+
assert len(store_types) == 1, (
|
1006
|
+
'We only support one store type for now.', storage_obj.stores)
|
1007
|
+
curr_store_type = store_types[0]
|
1008
|
+
store_object = storage_obj.stores[curr_store_type]
|
1009
|
+
assert store_object is not None and storage_obj.name is not None, (
|
1010
|
+
store_object, storage_obj.name)
|
1011
|
+
source = storage_lib.StoreType.get_endpoint_url(
|
1012
|
+
store_object, storage_obj.name)
|
1013
|
+
assert store_object is not None and storage_obj.name is not None, (
|
1014
|
+
store_object, storage_obj.name)
|
1015
|
+
new_storage = storage_lib.Storage.from_yaml_config({
|
1016
|
+
'source': source,
|
1017
|
+
'persistent': storage_obj.persistent,
|
1018
|
+
'mode': storage_lib.StorageMode.MOUNT.value,
|
1019
|
+
# We enable force delete to allow the controller to delete
|
1020
|
+
# the object store in case persistent is set to False.
|
1021
|
+
'_force_delete': True
|
1022
|
+
})
|
1023
|
+
updated_mount_storages[storage_path] = new_storage
|
1024
|
+
task.update_storage_mounts(updated_mount_storages)
|
1025
|
+
if msg:
|
1026
|
+
logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))
|