skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/authentication.py
CHANGED
@@ -12,14 +12,12 @@ in ray yaml config as input,
|
|
12
12
|
2. Setup the `authorized_keys` on the remote VM with the public key content,
|
13
13
|
by cloud-init or directly using cloud provider's API.
|
14
14
|
|
15
|
-
The local machine's public key should not be uploaded to the
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
comments in setup_lambda_authentication)
|
15
|
+
The local machine's public key should not be uploaded to the remote VM, because
|
16
|
+
it will cause private/public key pair mismatch when the user tries to launch new
|
17
|
+
VM from that remote VM using SkyPilot, e.g., the node is used as a jobs
|
18
|
+
controller. (Lambda cloud is an exception, due to the limitation of the cloud
|
19
|
+
provider. See the comments in setup_lambda_authentication)
|
21
20
|
"""
|
22
|
-
import base64
|
23
21
|
import copy
|
24
22
|
import functools
|
25
23
|
import os
|
@@ -44,10 +42,12 @@ from sky.adaptors import gcp
|
|
44
42
|
from sky.adaptors import ibm
|
45
43
|
from sky.adaptors import kubernetes
|
46
44
|
from sky.adaptors import runpod
|
47
|
-
from sky.
|
45
|
+
from sky.adaptors import vast
|
48
46
|
from sky.provision.fluidstack import fluidstack_utils
|
49
47
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
48
|
+
from sky.provision.lambda_cloud import lambda_utils
|
50
49
|
from sky.utils import common_utils
|
50
|
+
from sky.utils import config_utils
|
51
51
|
from sky.utils import kubernetes_enums
|
52
52
|
from sky.utils import subprocess_utils
|
53
53
|
from sky.utils import ux_utils
|
@@ -61,9 +61,24 @@ logger = sky_logging.init_logger(__name__)
|
|
61
61
|
|
62
62
|
MAX_TRIALS = 64
|
63
63
|
# TODO(zhwu): Support user specified key pair.
|
64
|
-
|
65
|
-
|
66
|
-
|
64
|
+
# We intentionally not have the ssh key pair to be stored in
|
65
|
+
# ~/.sky/api_server/clients, i.e. sky.server.common.API_SERVER_CLIENT_DIR,
|
66
|
+
# because ssh key pair need to persist across API server restarts, while
|
67
|
+
# the former dir is empheral.
|
68
|
+
_SSH_KEY_PATH_PREFIX = '~/.sky/clients/{user_hash}/ssh'
|
69
|
+
|
70
|
+
|
71
|
+
def get_ssh_key_and_lock_path() -> Tuple[str, str, str]:
|
72
|
+
user_hash = common_utils.get_user_hash()
|
73
|
+
user_ssh_key_prefix = _SSH_KEY_PATH_PREFIX.format(user_hash=user_hash)
|
74
|
+
|
75
|
+
os.makedirs(os.path.expanduser(user_ssh_key_prefix),
|
76
|
+
exist_ok=True,
|
77
|
+
mode=0o700)
|
78
|
+
private_key_path = os.path.join(user_ssh_key_prefix, 'sky-key')
|
79
|
+
public_key_path = os.path.join(user_ssh_key_prefix, 'sky-key.pub')
|
80
|
+
lock_path = os.path.join(user_ssh_key_prefix, '.__internal-sky-key.lock')
|
81
|
+
return private_key_path, public_key_path, lock_path
|
67
82
|
|
68
83
|
|
69
84
|
def _generate_rsa_key_pair() -> Tuple[str, str]:
|
@@ -106,16 +121,17 @@ def _save_key_pair(private_key_path: str, public_key_path: str,
|
|
106
121
|
|
107
122
|
def get_or_generate_keys() -> Tuple[str, str]:
|
108
123
|
"""Returns the aboslute private and public key paths."""
|
109
|
-
private_key_path =
|
110
|
-
|
124
|
+
private_key_path, public_key_path, lock_path = get_ssh_key_and_lock_path()
|
125
|
+
private_key_path = os.path.expanduser(private_key_path)
|
126
|
+
public_key_path = os.path.expanduser(public_key_path)
|
127
|
+
lock_path = os.path.expanduser(lock_path)
|
111
128
|
|
112
|
-
|
113
|
-
lock_dir = os.path.dirname(key_file_lock)
|
129
|
+
lock_dir = os.path.dirname(lock_path)
|
114
130
|
# We should have the folder ~/.sky/generated/ssh to have 0o700 permission,
|
115
131
|
# as the ssh configs will be written to this folder as well in
|
116
132
|
# backend_utils.SSHConfigHelper
|
117
133
|
os.makedirs(lock_dir, exist_ok=True, mode=0o700)
|
118
|
-
with filelock.FileLock(
|
134
|
+
with filelock.FileLock(lock_path, timeout=10):
|
119
135
|
if not os.path.exists(private_key_path):
|
120
136
|
public_key, private_key = _generate_rsa_key_pair()
|
121
137
|
_save_key_pair(private_key_path, public_key_path, private_key,
|
@@ -270,43 +286,13 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
|
270
286
|
return configure_ssh_info(config)
|
271
287
|
|
272
288
|
|
273
|
-
# In Azure, cloud-init script must be encoded in base64. See
|
274
|
-
# https://learn.microsoft.com/en-us/azure/virtual-machines/custom-data
|
275
|
-
# for more information. Here we decode it and replace the ssh user
|
276
|
-
# and public key content, then encode it back.
|
277
|
-
def setup_azure_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
278
|
-
_, public_key_path = get_or_generate_keys()
|
279
|
-
with open(public_key_path, 'r', encoding='utf-8') as f:
|
280
|
-
public_key = f.read().strip()
|
281
|
-
for node_type in config['available_node_types']:
|
282
|
-
node_config = config['available_node_types'][node_type]['node_config']
|
283
|
-
cloud_init = (
|
284
|
-
node_config['azure_arm_parameters']['cloudInitSetupCommands'])
|
285
|
-
cloud_init = base64.b64decode(cloud_init).decode('utf-8')
|
286
|
-
cloud_init = cloud_init.replace('skypilot:ssh_user',
|
287
|
-
config['auth']['ssh_user'])
|
288
|
-
cloud_init = cloud_init.replace('skypilot:ssh_public_key_content',
|
289
|
-
public_key)
|
290
|
-
cloud_init = base64.b64encode(
|
291
|
-
cloud_init.encode('utf-8')).decode('utf-8')
|
292
|
-
node_config['azure_arm_parameters']['cloudInitSetupCommands'] = (
|
293
|
-
cloud_init)
|
294
|
-
config_str = common_utils.dump_yaml_str(config)
|
295
|
-
config_str = config_str.replace('skypilot:ssh_user',
|
296
|
-
config['auth']['ssh_user'])
|
297
|
-
config_str = config_str.replace('skypilot:ssh_public_key_content',
|
298
|
-
public_key)
|
299
|
-
config = yaml.safe_load(config_str)
|
300
|
-
return config
|
301
|
-
|
302
|
-
|
303
289
|
def setup_lambda_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
304
290
|
|
305
291
|
get_or_generate_keys()
|
306
292
|
|
307
293
|
# Ensure ssh key is registered with Lambda Cloud
|
308
294
|
lambda_client = lambda_utils.LambdaCloudClient()
|
309
|
-
public_key_path =
|
295
|
+
_, public_key_path = get_or_generate_keys()
|
310
296
|
with open(public_key_path, 'r', encoding='utf-8') as f:
|
311
297
|
public_key = f.read().strip()
|
312
298
|
prefix = f'sky-key-{common_utils.get_user_hash()}'
|
@@ -314,26 +300,16 @@ def setup_lambda_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
|
314
300
|
if not exists:
|
315
301
|
lambda_client.register_ssh_key(name, public_key)
|
316
302
|
|
317
|
-
|
318
|
-
# path for finding the public key path on both local and head node.
|
319
|
-
config['auth']['ssh_public_key'] = PUBLIC_SSH_KEY_PATH
|
320
|
-
|
321
|
-
# TODO(zhwu): we need to avoid uploading the public ssh key to the
|
322
|
-
# nodes, as that will cause problem when the node is used as spot
|
323
|
-
# controller, i.e., the public and private key on the node may
|
324
|
-
# not match.
|
325
|
-
file_mounts = config['file_mounts']
|
326
|
-
file_mounts[PUBLIC_SSH_KEY_PATH] = PUBLIC_SSH_KEY_PATH
|
327
|
-
config['file_mounts'] = file_mounts
|
328
|
-
|
303
|
+
config['auth']['remote_key_name'] = name
|
329
304
|
return config
|
330
305
|
|
331
306
|
|
332
|
-
def setup_ibm_authentication(config):
|
307
|
+
def setup_ibm_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
333
308
|
""" registers keys if they do not exist in sky folder
|
334
309
|
and updates config file.
|
335
310
|
keys default location: '~/.ssh/sky-key' and '~/.ssh/sky-key.pub'
|
336
311
|
"""
|
312
|
+
private_key_path, _ = get_or_generate_keys()
|
337
313
|
|
338
314
|
def _get_unique_key_name():
|
339
315
|
suffix_len = 10
|
@@ -373,17 +349,11 @@ def setup_ibm_authentication(config):
|
|
373
349
|
else:
|
374
350
|
raise Exception('Failed to register a key') from e
|
375
351
|
|
376
|
-
config['auth']['ssh_private_key'] =
|
352
|
+
config['auth']['ssh_private_key'] = private_key_path
|
377
353
|
|
378
354
|
for node_type in config['available_node_types']:
|
379
355
|
config['available_node_types'][node_type]['node_config'][
|
380
356
|
'key_id'] = vpc_key_id
|
381
|
-
|
382
|
-
# Add public key path to file mounts
|
383
|
-
file_mounts = config['file_mounts']
|
384
|
-
file_mounts[PUBLIC_SSH_KEY_PATH] = PUBLIC_SSH_KEY_PATH
|
385
|
-
config['file_mounts'] = file_mounts
|
386
|
-
|
387
357
|
return config
|
388
358
|
|
389
359
|
|
@@ -403,13 +373,19 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
|
403
373
|
with ux_utils.print_exception_no_traceback():
|
404
374
|
raise ValueError(str(e) + ' Please check: ~/.sky/config.yaml.') \
|
405
375
|
from None
|
406
|
-
get_or_generate_keys()
|
376
|
+
_, public_key_path = get_or_generate_keys()
|
407
377
|
|
408
378
|
# Add the user's public key to the SkyPilot cluster.
|
409
|
-
public_key_path = os.path.expanduser(PUBLIC_SSH_KEY_PATH)
|
410
379
|
secret_name = clouds.Kubernetes.SKY_SSH_KEY_SECRET_NAME
|
411
380
|
secret_field_name = clouds.Kubernetes().ssh_key_secret_field_name
|
412
|
-
|
381
|
+
context = config['provider'].get(
|
382
|
+
'context', kubernetes_utils.get_current_kube_config_context_name())
|
383
|
+
if context == kubernetes.in_cluster_context_name():
|
384
|
+
# If the context is an in-cluster context name, we are running in a pod
|
385
|
+
# with in-cluster configuration. We need to set the context to None
|
386
|
+
# to use the mounted service account.
|
387
|
+
context = None
|
388
|
+
namespace = kubernetes_utils.get_namespace_from_config(config['provider'])
|
413
389
|
k8s = kubernetes.kubernetes
|
414
390
|
with open(public_key_path, 'r', encoding='utf-8') as f:
|
415
391
|
public_key = f.read()
|
@@ -425,44 +401,71 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
|
425
401
|
}
|
426
402
|
custom_metadata = skypilot_config.get_nested(
|
427
403
|
('kubernetes', 'custom_metadata'), {})
|
428
|
-
|
404
|
+
config_utils.merge_k8s_configs(secret_metadata, custom_metadata)
|
429
405
|
|
430
406
|
secret = k8s.client.V1Secret(
|
431
407
|
metadata=k8s.client.V1ObjectMeta(**secret_metadata),
|
432
408
|
string_data={secret_field_name: public_key})
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
409
|
+
try:
|
410
|
+
if kubernetes_utils.check_secret_exists(secret_name, namespace,
|
411
|
+
context):
|
412
|
+
logger.debug(f'Key {secret_name} exists in the cluster, '
|
413
|
+
'patching it...')
|
414
|
+
kubernetes.core_api(context).patch_namespaced_secret(
|
415
|
+
secret_name, namespace, secret)
|
416
|
+
else:
|
417
|
+
logger.debug(f'Key {secret_name} does not exist in the cluster, '
|
418
|
+
'creating it...')
|
419
|
+
kubernetes.core_api(context).create_namespaced_secret(
|
420
|
+
namespace, secret)
|
421
|
+
except kubernetes.api_exception() as e:
|
422
|
+
if e.status == 409 and e.reason == 'AlreadyExists':
|
423
|
+
logger.debug(f'Key {secret_name} was created concurrently, '
|
424
|
+
'patching it...')
|
425
|
+
kubernetes.core_api(context).patch_namespaced_secret(
|
426
|
+
secret_name, namespace, secret)
|
427
|
+
else:
|
428
|
+
raise e
|
441
429
|
|
442
|
-
|
430
|
+
private_key_path, _ = get_or_generate_keys()
|
443
431
|
if network_mode == nodeport_mode:
|
432
|
+
ssh_jump_name = clouds.Kubernetes.SKY_SSH_JUMP_NAME
|
444
433
|
service_type = kubernetes_enums.KubernetesServiceType.NODEPORT
|
434
|
+
# Setup service for SSH jump pod. We create the SSH jump service here
|
435
|
+
# because we need to know the service IP address and port to set the
|
436
|
+
# ssh_proxy_command in the autoscaler config.
|
437
|
+
kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace, context,
|
438
|
+
service_type)
|
439
|
+
ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
|
440
|
+
ssh_jump_name,
|
441
|
+
nodeport_mode,
|
442
|
+
private_key_path=private_key_path,
|
443
|
+
context=context,
|
444
|
+
namespace=namespace)
|
445
445
|
elif network_mode == port_forward_mode:
|
446
|
+
# Using `kubectl port-forward` creates a direct tunnel to the pod and
|
447
|
+
# does not require a ssh jump pod.
|
446
448
|
kubernetes_utils.check_port_forward_mode_dependencies()
|
447
|
-
#
|
448
|
-
#
|
449
|
-
# the
|
450
|
-
#
|
451
|
-
|
449
|
+
# TODO(romilb): This can be further optimized. Instead of using the
|
450
|
+
# head node as a jump pod for worker nodes, we can also directly
|
451
|
+
# set the ssh_target to the worker node. However, that requires
|
452
|
+
# changes in the downstream code to return a mapping of node IPs to
|
453
|
+
# pod names (to be used as ssh_target) and updating the upstream
|
454
|
+
# SSHConfigHelper to use a different ProxyCommand for each pod.
|
455
|
+
# This optimization can reduce SSH time from ~0.35s to ~0.25s, tested
|
456
|
+
# on GKE.
|
457
|
+
ssh_target = config['cluster_name'] + '-head'
|
458
|
+
ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
|
459
|
+
ssh_target,
|
460
|
+
port_forward_mode,
|
461
|
+
private_key_path=private_key_path,
|
462
|
+
context=context,
|
463
|
+
namespace=namespace)
|
452
464
|
else:
|
453
465
|
# This should never happen because we check for this in from_str above.
|
454
466
|
raise ValueError(f'Unsupported networking mode: {network_mode_str}')
|
455
|
-
# Setup service for SSH jump pod. We create the SSH jump service here
|
456
|
-
# because we need to know the service IP address and port to set the
|
457
|
-
# ssh_proxy_command in the autoscaler config.
|
458
|
-
kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace, service_type)
|
459
|
-
|
460
|
-
ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
|
461
|
-
PRIVATE_SSH_KEY_PATH, ssh_jump_name, network_mode, namespace,
|
462
|
-
clouds.Kubernetes.PORT_FORWARD_PROXY_CMD_PATH,
|
463
|
-
clouds.Kubernetes.PORT_FORWARD_PROXY_CMD_TEMPLATE)
|
464
|
-
|
465
467
|
config['auth']['ssh_proxy_command'] = ssh_proxy_cmd
|
468
|
+
config['auth']['ssh_private_key'] = private_key_path
|
466
469
|
|
467
470
|
return config
|
468
471
|
|
@@ -481,15 +484,31 @@ def setup_runpod_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
|
481
484
|
return configure_ssh_info(config)
|
482
485
|
|
483
486
|
|
487
|
+
def setup_vast_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
488
|
+
"""Sets up SSH authentication for Vast.
|
489
|
+
- Generates a new SSH key pair if one does not exist.
|
490
|
+
- Adds the public SSH key to the user's Vast account.
|
491
|
+
"""
|
492
|
+
_, public_key_path = get_or_generate_keys()
|
493
|
+
with open(public_key_path, 'r', encoding='UTF-8') as pub_key_file:
|
494
|
+
public_key = pub_key_file.read().strip()
|
495
|
+
current_key_list = vast.vast().show_ssh_keys() # pylint: disable=assignment-from-no-return
|
496
|
+
# Only add an ssh key if it hasn't already been added
|
497
|
+
if not any(x['public_key'] == public_key for x in current_key_list):
|
498
|
+
vast.vast().create_ssh_key(ssh_key=public_key)
|
499
|
+
|
500
|
+
config['auth']['ssh_public_key'] = public_key_path
|
501
|
+
return configure_ssh_info(config)
|
502
|
+
|
503
|
+
|
484
504
|
def setup_fluidstack_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
485
505
|
|
486
|
-
get_or_generate_keys()
|
506
|
+
_, public_key_path = get_or_generate_keys()
|
487
507
|
|
488
508
|
client = fluidstack_utils.FluidstackClient()
|
489
|
-
public_key_path = os.path.expanduser(PUBLIC_SSH_KEY_PATH)
|
490
509
|
public_key = None
|
491
510
|
with open(public_key_path, 'r', encoding='utf-8') as f:
|
492
511
|
public_key = f.read()
|
493
512
|
client.get_or_add_ssh_key(public_key)
|
494
|
-
config['auth']['ssh_public_key'] =
|
513
|
+
config['auth']['ssh_public_key'] = public_key_path
|
495
514
|
return configure_ssh_info(config)
|
sky/backends/backend.py
CHANGED
@@ -2,9 +2,11 @@
|
|
2
2
|
import typing
|
3
3
|
from typing import Dict, Generic, Optional
|
4
4
|
|
5
|
-
import sky
|
6
5
|
from sky.usage import usage_lib
|
6
|
+
from sky.utils import cluster_utils
|
7
|
+
from sky.utils import rich_utils
|
7
8
|
from sky.utils import timeline
|
9
|
+
from sky.utils import ux_utils
|
8
10
|
|
9
11
|
if typing.TYPE_CHECKING:
|
10
12
|
from sky import resources
|
@@ -43,19 +45,45 @@ class Backend(Generic[_ResourceHandleType]):
|
|
43
45
|
@timeline.event
|
44
46
|
@usage_lib.messages.usage.update_runtime('provision')
|
45
47
|
def provision(
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
48
|
+
self,
|
49
|
+
task: 'task_lib.Task',
|
50
|
+
to_provision: Optional['resources.Resources'],
|
51
|
+
dryrun: bool,
|
52
|
+
stream_logs: bool,
|
53
|
+
cluster_name: Optional[str] = None,
|
54
|
+
retry_until_up: bool = False,
|
55
|
+
skip_unnecessary_provisioning: bool = False,
|
56
|
+
) -> Optional[_ResourceHandleType]:
|
57
|
+
"""Provisions resources for the given task.
|
58
|
+
|
59
|
+
Args:
|
60
|
+
task: The task to provision resources for.
|
61
|
+
to_provision: Resource config to provision. Should only be None if
|
62
|
+
cluster_name refers to an existing cluster, whose resources will
|
63
|
+
be used.
|
64
|
+
dryrun: If True, don't actually provision anything.
|
65
|
+
stream_logs: If True, stream additional logs to console.
|
66
|
+
cluster_name: Name of the cluster to provision. If None, a name will
|
67
|
+
be auto-generated. If the name refers to an existing cluster,
|
68
|
+
the existing cluster will be reused and re-provisioned.
|
69
|
+
retry_until_up: If True, retry provisioning until resources are
|
70
|
+
successfully launched.
|
71
|
+
skip_if_no_cluster_updates: If True, compare the cluster config to
|
72
|
+
the existing cluster_name's config. Skip provisioning if no
|
73
|
+
updates are needed for the existing cluster.
|
74
|
+
|
75
|
+
Returns:
|
76
|
+
A ResourceHandle object for the provisioned resources, or None if
|
77
|
+
dryrun is True.
|
78
|
+
"""
|
53
79
|
if cluster_name is None:
|
54
|
-
cluster_name =
|
80
|
+
cluster_name = cluster_utils.generate_cluster_name()
|
55
81
|
usage_lib.record_cluster_name_for_current_operation(cluster_name)
|
56
82
|
usage_lib.messages.usage.update_actual_task(task)
|
57
|
-
|
58
|
-
|
83
|
+
with rich_utils.safe_status(ux_utils.spinner_message('Launching')):
|
84
|
+
return self._provision(task, to_provision, dryrun, stream_logs,
|
85
|
+
cluster_name, retry_until_up,
|
86
|
+
skip_unnecessary_provisioning)
|
59
87
|
|
60
88
|
@timeline.event
|
61
89
|
@usage_lib.messages.usage.update_runtime('sync_workdir')
|
@@ -76,7 +104,8 @@ class Backend(Generic[_ResourceHandleType]):
|
|
76
104
|
@usage_lib.messages.usage.update_runtime('setup')
|
77
105
|
def setup(self, handle: _ResourceHandleType, task: 'task_lib.Task',
|
78
106
|
detach_setup: bool) -> None:
|
79
|
-
|
107
|
+
with rich_utils.safe_status(ux_utils.spinner_message('Running setup')):
|
108
|
+
return self._setup(handle, task, detach_setup)
|
80
109
|
|
81
110
|
def add_storage_objects(self, task: 'task_lib.Task') -> None:
|
82
111
|
raise NotImplementedError
|
@@ -96,7 +125,8 @@ class Backend(Generic[_ResourceHandleType]):
|
|
96
125
|
usage_lib.record_cluster_name_for_current_operation(
|
97
126
|
handle.get_cluster_name())
|
98
127
|
usage_lib.messages.usage.update_actual_task(task)
|
99
|
-
|
128
|
+
with rich_utils.safe_status(ux_utils.spinner_message('Submitting job')):
|
129
|
+
return self._execute(handle, task, detach_run, dryrun)
|
100
130
|
|
101
131
|
@timeline.event
|
102
132
|
def post_execute(self, handle: _ResourceHandleType, down: bool) -> None:
|
@@ -121,13 +151,15 @@ class Backend(Generic[_ResourceHandleType]):
|
|
121
151
|
|
122
152
|
# --- Implementations of the APIs ---
|
123
153
|
def _provision(
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
154
|
+
self,
|
155
|
+
task: 'task_lib.Task',
|
156
|
+
to_provision: Optional['resources.Resources'],
|
157
|
+
dryrun: bool,
|
158
|
+
stream_logs: bool,
|
159
|
+
cluster_name: str,
|
160
|
+
retry_until_up: bool = False,
|
161
|
+
skip_unnecessary_provisioning: bool = False,
|
162
|
+
) -> Optional[_ResourceHandleType]:
|
131
163
|
raise NotImplementedError
|
132
164
|
|
133
165
|
def _sync_workdir(self, handle: _ResourceHandleType, workdir: Path) -> None:
|