skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,22 @@
|
|
1
1
|
"""Utilities for sky status."""
|
2
|
+
import typing
|
2
3
|
from typing import Any, Callable, Dict, List, Optional
|
3
4
|
|
4
5
|
import click
|
5
6
|
import colorama
|
6
7
|
|
7
8
|
from sky import backends
|
8
|
-
from sky import status_lib
|
9
9
|
from sky.skylet import constants
|
10
|
+
from sky.utils import common_utils
|
10
11
|
from sky.utils import log_utils
|
11
12
|
from sky.utils import resources_utils
|
13
|
+
from sky.utils import status_lib
|
14
|
+
|
15
|
+
if typing.TYPE_CHECKING:
|
16
|
+
from sky.provision.kubernetes import utils as kubernetes_utils
|
17
|
+
|
18
|
+
if typing.TYPE_CHECKING:
|
19
|
+
from sky.provision.kubernetes import utils as kubernetes_utils
|
12
20
|
|
13
21
|
COMMAND_TRUNC_LENGTH = 25
|
14
22
|
NUM_COST_REPORT_LINES = 5
|
@@ -19,25 +27,6 @@ _ClusterRecord = Dict[str, Any]
|
|
19
27
|
_ClusterCostReportRecord = Dict[str, Any]
|
20
28
|
|
21
29
|
|
22
|
-
def truncate_long_string(s: str, max_length: int = 35) -> str:
|
23
|
-
if len(s) <= max_length:
|
24
|
-
return s
|
25
|
-
splits = s.split(' ')
|
26
|
-
if len(splits[0]) > max_length:
|
27
|
-
return splits[0][:max_length] + '...' # Use '…'?
|
28
|
-
# Truncate on word boundary.
|
29
|
-
i = 0
|
30
|
-
total = 0
|
31
|
-
for i, part in enumerate(splits):
|
32
|
-
total += len(part)
|
33
|
-
if total >= max_length:
|
34
|
-
break
|
35
|
-
prefix = ' '.join(splits[:i])
|
36
|
-
if len(prefix) < max_length:
|
37
|
-
prefix += s[len(prefix):max_length]
|
38
|
-
return prefix + '...'
|
39
|
-
|
40
|
-
|
41
30
|
class StatusColumn:
|
42
31
|
"""One column of the displayed cluster table"""
|
43
32
|
|
@@ -54,12 +43,14 @@ class StatusColumn:
|
|
54
43
|
def calc(self, record):
|
55
44
|
val = self.calc_func(record)
|
56
45
|
if self.trunc_length != 0:
|
57
|
-
val = truncate_long_string(str(val), self.trunc_length)
|
46
|
+
val = common_utils.truncate_long_string(str(val), self.trunc_length)
|
58
47
|
return val
|
59
48
|
|
60
49
|
|
61
50
|
def show_status_table(cluster_records: List[_ClusterRecord],
|
62
|
-
show_all: bool
|
51
|
+
show_all: bool,
|
52
|
+
show_user: bool,
|
53
|
+
query_clusters: Optional[List[str]] = None) -> int:
|
63
54
|
"""Compute cluster table values and display.
|
64
55
|
|
65
56
|
Returns:
|
@@ -70,6 +61,13 @@ def show_status_table(cluster_records: List[_ClusterRecord],
|
|
70
61
|
|
71
62
|
status_columns = [
|
72
63
|
StatusColumn('NAME', _get_name),
|
64
|
+
]
|
65
|
+
if show_user:
|
66
|
+
status_columns.append(StatusColumn('USER', _get_user_name))
|
67
|
+
status_columns.append(
|
68
|
+
StatusColumn('USER_ID', _get_user_hash, show_by_default=False))
|
69
|
+
|
70
|
+
status_columns += [
|
73
71
|
StatusColumn('LAUNCHED', _get_launched),
|
74
72
|
StatusColumn('RESOURCES',
|
75
73
|
_get_resources,
|
@@ -101,7 +99,21 @@ def show_status_table(cluster_records: List[_ClusterRecord],
|
|
101
99
|
|
102
100
|
if cluster_records:
|
103
101
|
click.echo(cluster_table)
|
104
|
-
|
102
|
+
|
103
|
+
if query_clusters:
|
104
|
+
cluster_names = {record['name'] for record in cluster_records}
|
105
|
+
not_found_clusters = [
|
106
|
+
repr(cluster)
|
107
|
+
for cluster in query_clusters
|
108
|
+
if cluster not in cluster_names
|
109
|
+
]
|
110
|
+
cluster_str = 'Cluster'
|
111
|
+
if len(not_found_clusters) > 1:
|
112
|
+
cluster_str += 's'
|
113
|
+
cluster_str += ' '
|
114
|
+
cluster_str += ', '.join(not_found_clusters)
|
115
|
+
click.echo(f'{cluster_str} not found.')
|
116
|
+
elif not cluster_records:
|
105
117
|
click.echo('No existing clusters.')
|
106
118
|
return num_pending_autostop
|
107
119
|
|
@@ -202,6 +214,8 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
|
|
202
214
|
# _ClusterCostReportRecord, which is okay as we guarantee the queried fields
|
203
215
|
# exist in those cases.
|
204
216
|
_get_name = (lambda cluster_record: cluster_record['name'])
|
217
|
+
_get_user_hash = (lambda cluster_record: cluster_record['user_hash'])
|
218
|
+
_get_user_name = (lambda cluster_record: cluster_record.get('user_name', '-'))
|
205
219
|
_get_launched = (lambda cluster_record: log_utils.readable_time_duration(
|
206
220
|
cluster_record['launched_at']))
|
207
221
|
_get_region = (
|
@@ -220,6 +234,8 @@ def _get_status_colored(cluster_record: _ClusterRecord) -> str:
|
|
220
234
|
|
221
235
|
|
222
236
|
def _get_resources(cluster_record: _ClusterRecord) -> str:
|
237
|
+
if 'resources_str' in cluster_record:
|
238
|
+
return cluster_record['resources_str']
|
223
239
|
handle = cluster_record['handle']
|
224
240
|
if isinstance(handle, backends.LocalDockerResourceHandle):
|
225
241
|
resources_str = 'docker'
|
@@ -316,3 +332,45 @@ def _get_estimated_cost_for_cost_report(
|
|
316
332
|
return '-'
|
317
333
|
|
318
334
|
return f'$ {cost:.2f}'
|
335
|
+
|
336
|
+
|
337
|
+
def show_kubernetes_cluster_status_table(
|
338
|
+
clusters: List['kubernetes_utils.KubernetesSkyPilotClusterInfo'],
|
339
|
+
show_all: bool) -> None:
|
340
|
+
"""Compute cluster table values and display for Kubernetes clusters."""
|
341
|
+
status_columns = [
|
342
|
+
StatusColumn('USER', lambda c: c.user),
|
343
|
+
StatusColumn('NAME', lambda c: c.cluster_name),
|
344
|
+
StatusColumn('LAUNCHED',
|
345
|
+
lambda c: log_utils.readable_time_duration(c.launched_at)),
|
346
|
+
StatusColumn('RESOURCES',
|
347
|
+
lambda c: c.resources_str,
|
348
|
+
trunc_length=70 if not show_all else 0),
|
349
|
+
StatusColumn('STATUS', lambda c: c.status.colored_str()),
|
350
|
+
# TODO(romilb): We should consider adding POD_NAME field here when --all
|
351
|
+
# is passed to help users fetch pod name programmatically.
|
352
|
+
]
|
353
|
+
|
354
|
+
columns = [
|
355
|
+
col.name for col in status_columns if col.show_by_default or show_all
|
356
|
+
]
|
357
|
+
cluster_table = log_utils.create_table(columns)
|
358
|
+
|
359
|
+
# Sort table by user, then by cluster name
|
360
|
+
sorted_clusters = sorted(clusters, key=lambda c: (c.user, c.cluster_name))
|
361
|
+
|
362
|
+
for cluster in sorted_clusters:
|
363
|
+
row = []
|
364
|
+
for status_column in status_columns:
|
365
|
+
if status_column.show_by_default or show_all:
|
366
|
+
row.append(status_column.calc(cluster))
|
367
|
+
cluster_table.add_row(row)
|
368
|
+
|
369
|
+
if clusters:
|
370
|
+
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
371
|
+
f'SkyPilot clusters'
|
372
|
+
f'{colorama.Style.RESET_ALL}')
|
373
|
+
click.echo(cluster_table)
|
374
|
+
else:
|
375
|
+
click.echo('No SkyPilot resources found in the '
|
376
|
+
'active Kubernetes context.')
|
@@ -0,0 +1,356 @@
|
|
1
|
+
"""Utility functions for cluster yaml file."""
|
2
|
+
|
3
|
+
import functools
|
4
|
+
import glob
|
5
|
+
import os
|
6
|
+
import re
|
7
|
+
import textwrap
|
8
|
+
from typing import Dict, List, Optional
|
9
|
+
import uuid
|
10
|
+
|
11
|
+
from sky.skylet import constants
|
12
|
+
from sky.utils import command_runner
|
13
|
+
from sky.utils import common_utils
|
14
|
+
from sky.utils import timeline
|
15
|
+
|
16
|
+
# The cluster yaml used to create the current cluster where the module is
|
17
|
+
# called.
|
18
|
+
SKY_CLUSTER_YAML_REMOTE_PATH = '~/.sky/sky_ray.yml'
|
19
|
+
|
20
|
+
|
21
|
+
def get_provider_name(config: dict) -> str:
|
22
|
+
"""Return the name of the provider."""
|
23
|
+
|
24
|
+
provider_module = config['provider']['module']
|
25
|
+
# Examples:
|
26
|
+
# 'sky.skylet.providers.aws.AWSNodeProviderV2' -> 'aws'
|
27
|
+
# 'sky.provision.aws' -> 'aws'
|
28
|
+
provider_search = re.search(r'(?:providers|provision)\.(\w+)\.?',
|
29
|
+
provider_module)
|
30
|
+
assert provider_search is not None, config
|
31
|
+
provider_name = provider_search.group(1).lower()
|
32
|
+
# Special handling for lambda_cloud as Lambda cloud is registered as lambda.
|
33
|
+
if provider_name == 'lambda_cloud':
|
34
|
+
provider_name = 'lambda'
|
35
|
+
return provider_name
|
36
|
+
|
37
|
+
|
38
|
+
class SSHConfigHelper(object):
|
39
|
+
"""Helper for handling local SSH configuration."""
|
40
|
+
|
41
|
+
ssh_conf_path = '~/.ssh/config'
|
42
|
+
ssh_conf_lock_path = os.path.expanduser('~/.sky/locks/.ssh_config.lock')
|
43
|
+
ssh_conf_per_cluster_lock_path = os.path.expanduser(
|
44
|
+
'~/.sky/locks/.ssh_config_{}.lock')
|
45
|
+
ssh_cluster_path = constants.SKY_USER_FILE_PATH + '/ssh/{}'
|
46
|
+
ssh_cluster_key_path = constants.SKY_USER_FILE_PATH + '/ssh-keys/{}.key'
|
47
|
+
|
48
|
+
@classmethod
|
49
|
+
def _get_generated_config(cls, autogen_comment: str, host_name: str,
|
50
|
+
ip: str, username: str, ssh_key_path: str,
|
51
|
+
proxy_command: Optional[str], port: int,
|
52
|
+
docker_proxy_command: Optional[str]):
|
53
|
+
if proxy_command is not None:
|
54
|
+
# Already checked in resources
|
55
|
+
assert docker_proxy_command is None, (
|
56
|
+
'Cannot specify both proxy_command and docker_proxy_command.')
|
57
|
+
proxy = f'ProxyCommand {proxy_command}'
|
58
|
+
elif docker_proxy_command is not None:
|
59
|
+
proxy = f'ProxyCommand {docker_proxy_command}'
|
60
|
+
else:
|
61
|
+
proxy = ''
|
62
|
+
# StrictHostKeyChecking=no skips the host key check for the first
|
63
|
+
# time. UserKnownHostsFile=/dev/null and GlobalKnownHostsFile=/dev/null
|
64
|
+
# prevent the host key from being added to the known_hosts file and
|
65
|
+
# always return an empty file for known hosts, making the ssh think
|
66
|
+
# this is a first-time connection, and thus skipping the host key
|
67
|
+
# check.
|
68
|
+
# Not adding SSH agent forwarding by default here to avoid implicitly
|
69
|
+
# using users' SSH keys in their local agent. Plus on sky launch side we
|
70
|
+
# do not add SSH agent forwarding by default either.
|
71
|
+
codegen = textwrap.dedent(f"""\
|
72
|
+
{autogen_comment}
|
73
|
+
Host {host_name}
|
74
|
+
HostName {ip}
|
75
|
+
User {username}
|
76
|
+
IdentityFile {ssh_key_path}
|
77
|
+
IdentitiesOnly yes
|
78
|
+
StrictHostKeyChecking no
|
79
|
+
UserKnownHostsFile=/dev/null
|
80
|
+
GlobalKnownHostsFile=/dev/null
|
81
|
+
Port {port}
|
82
|
+
{proxy}
|
83
|
+
""".rstrip())
|
84
|
+
codegen = codegen + '\n'
|
85
|
+
return codegen
|
86
|
+
|
87
|
+
@classmethod
|
88
|
+
def generate_local_key_file(cls, cluster_name: str,
|
89
|
+
auth_config: Dict[str, str]) -> str:
|
90
|
+
key_content = auth_config.pop('ssh_private_key_content', None)
|
91
|
+
if key_content is not None:
|
92
|
+
cluster_private_key_path = cls.ssh_cluster_key_path.format(
|
93
|
+
cluster_name)
|
94
|
+
expanded_cluster_private_key_path = os.path.expanduser(
|
95
|
+
cluster_private_key_path)
|
96
|
+
expanded_cluster_private_key_dir = os.path.dirname(
|
97
|
+
expanded_cluster_private_key_path)
|
98
|
+
os.makedirs(expanded_cluster_private_key_dir,
|
99
|
+
exist_ok=True,
|
100
|
+
mode=0o700)
|
101
|
+
with open(expanded_cluster_private_key_path,
|
102
|
+
'w',
|
103
|
+
encoding='utf-8',
|
104
|
+
opener=functools.partial(os.open, mode=0o600)) as f:
|
105
|
+
f.write(key_content)
|
106
|
+
auth_config['ssh_private_key'] = cluster_private_key_path
|
107
|
+
return auth_config['ssh_private_key']
|
108
|
+
|
109
|
+
@classmethod
|
110
|
+
@timeline.FileLockEvent(ssh_conf_lock_path)
|
111
|
+
def add_cluster(
|
112
|
+
cls,
|
113
|
+
cluster_name: str,
|
114
|
+
ips: List[str],
|
115
|
+
auth_config: Dict[str, str],
|
116
|
+
ports: List[int],
|
117
|
+
docker_user: Optional[str] = None,
|
118
|
+
ssh_user: Optional[str] = None,
|
119
|
+
):
|
120
|
+
"""Add authentication information for cluster to local SSH config file.
|
121
|
+
|
122
|
+
If a host with `cluster_name` already exists and the configuration was
|
123
|
+
not added by sky, then `ip` is used to identify the host instead in the
|
124
|
+
file.
|
125
|
+
|
126
|
+
If a host with `cluster_name` already exists and the configuration was
|
127
|
+
added by sky (e.g. a spot instance), then the configuration is
|
128
|
+
overwritten.
|
129
|
+
|
130
|
+
Args:
|
131
|
+
cluster_name: Cluster name (see `sky status`)
|
132
|
+
ips: List of public IP addresses in the cluster. First IP is head
|
133
|
+
node.
|
134
|
+
auth_config: `auth` in cluster yaml.
|
135
|
+
ports: List of port numbers for SSH corresponding to ips
|
136
|
+
docker_user: If not None, use this user to ssh into the docker container.
|
137
|
+
ssh_user: Override the ssh_user in auth_config
|
138
|
+
"""
|
139
|
+
if ssh_user is None:
|
140
|
+
username = auth_config['ssh_user']
|
141
|
+
else:
|
142
|
+
username = ssh_user
|
143
|
+
if docker_user is not None:
|
144
|
+
username = docker_user
|
145
|
+
|
146
|
+
key_path = cls.generate_local_key_file(cluster_name, auth_config)
|
147
|
+
key_path = os.path.expanduser(key_path)
|
148
|
+
sky_autogen_comment = ('# Added by sky (use `sky stop/down '
|
149
|
+
f'{cluster_name}` to remove)')
|
150
|
+
ip = ips[0]
|
151
|
+
if docker_user is not None:
|
152
|
+
ip = 'localhost'
|
153
|
+
|
154
|
+
config_path = os.path.expanduser(cls.ssh_conf_path)
|
155
|
+
os.makedirs(os.path.dirname(config_path), exist_ok=True, mode=0o700)
|
156
|
+
|
157
|
+
if not os.path.exists(config_path):
|
158
|
+
config = ['\n']
|
159
|
+
with open(config_path,
|
160
|
+
'w',
|
161
|
+
encoding='utf-8',
|
162
|
+
opener=functools.partial(os.open, mode=0o644)) as f:
|
163
|
+
f.writelines(config)
|
164
|
+
|
165
|
+
with open(config_path, 'r', encoding='utf-8') as f:
|
166
|
+
config = f.readlines()
|
167
|
+
|
168
|
+
ssh_dir = cls.ssh_cluster_path.format('')
|
169
|
+
os.makedirs(os.path.expanduser(ssh_dir), exist_ok=True, mode=0o700)
|
170
|
+
|
171
|
+
# Handle Include on top of Config file
|
172
|
+
include_str = f'Include {cls.ssh_cluster_path.format("*")}'
|
173
|
+
found = False
|
174
|
+
for i, line in enumerate(config):
|
175
|
+
config_str = line.strip()
|
176
|
+
if config_str == include_str:
|
177
|
+
found = True
|
178
|
+
break
|
179
|
+
if 'Host' in config_str:
|
180
|
+
break
|
181
|
+
if not found:
|
182
|
+
# Did not find Include string. Insert `Include` lines.
|
183
|
+
with open(config_path, 'w', encoding='utf-8') as f:
|
184
|
+
config.insert(
|
185
|
+
0, '# Added by SkyPilot for ssh config of all clusters\n'
|
186
|
+
f'{include_str}\n')
|
187
|
+
f.write(''.join(config).strip())
|
188
|
+
f.write('\n' * 2)
|
189
|
+
|
190
|
+
proxy_command = auth_config.get('ssh_proxy_command', None)
|
191
|
+
|
192
|
+
docker_proxy_command_generator = None
|
193
|
+
if docker_user is not None:
|
194
|
+
docker_proxy_command_generator = lambda ip, port: ' '.join(
|
195
|
+
['ssh'] + command_runner.ssh_options_list(
|
196
|
+
key_path, ssh_control_name=None, port=port) +
|
197
|
+
['-W', '%h:%p', f'{auth_config["ssh_user"]}@{ip}'])
|
198
|
+
|
199
|
+
codegen = ''
|
200
|
+
# Add the nodes to the codegen
|
201
|
+
for i, ip in enumerate(ips):
|
202
|
+
docker_proxy_command = None
|
203
|
+
port = ports[i]
|
204
|
+
if docker_proxy_command_generator is not None:
|
205
|
+
docker_proxy_command = docker_proxy_command_generator(ip, port)
|
206
|
+
ip = 'localhost'
|
207
|
+
port = constants.DEFAULT_DOCKER_PORT
|
208
|
+
node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
|
209
|
+
# TODO(romilb): Update port number when k8s supports multinode
|
210
|
+
codegen += cls._get_generated_config(
|
211
|
+
sky_autogen_comment, node_name, ip, username, key_path,
|
212
|
+
proxy_command, port, docker_proxy_command) + '\n'
|
213
|
+
|
214
|
+
cluster_config_path = os.path.expanduser(
|
215
|
+
cls.ssh_cluster_path.format(cluster_name))
|
216
|
+
|
217
|
+
with open(cluster_config_path,
|
218
|
+
'w',
|
219
|
+
encoding='utf-8',
|
220
|
+
opener=functools.partial(os.open, mode=0o644)) as f:
|
221
|
+
f.write(codegen)
|
222
|
+
|
223
|
+
@classmethod
|
224
|
+
def _remove_stale_cluster_config_for_backward_compatibility(
|
225
|
+
cls,
|
226
|
+
cluster_name: str,
|
227
|
+
ip: str,
|
228
|
+
auth_config: Dict[str, str],
|
229
|
+
docker_user: Optional[str] = None,
|
230
|
+
):
|
231
|
+
"""Remove authentication information for cluster from local SSH config.
|
232
|
+
|
233
|
+
If no existing host matching the provided specification is found, then
|
234
|
+
nothing is removed.
|
235
|
+
|
236
|
+
Args:
|
237
|
+
ip: Head node's IP address.
|
238
|
+
auth_config: `auth` in cluster yaml.
|
239
|
+
docker_user: If not None, use this user to ssh into the docker container.
|
240
|
+
"""
|
241
|
+
username = auth_config['ssh_user']
|
242
|
+
config_path = os.path.expanduser(cls.ssh_conf_path)
|
243
|
+
cluster_config_path = os.path.expanduser(
|
244
|
+
cls.ssh_cluster_path.format(cluster_name))
|
245
|
+
if not os.path.exists(config_path):
|
246
|
+
return
|
247
|
+
|
248
|
+
with open(config_path, 'r', encoding='utf-8') as f:
|
249
|
+
config = f.readlines()
|
250
|
+
|
251
|
+
start_line_idx = None
|
252
|
+
|
253
|
+
# Scan the config for the cluster name.
|
254
|
+
for i, line in enumerate(config):
|
255
|
+
next_line = config[i + 1] if i + 1 < len(config) else ''
|
256
|
+
if docker_user is None:
|
257
|
+
found = (line.strip() == f'HostName {ip}' and
|
258
|
+
next_line.strip() == f'User {username}')
|
259
|
+
else:
|
260
|
+
found = (line.strip() == 'HostName localhost' and
|
261
|
+
next_line.strip() == f'User {docker_user}')
|
262
|
+
if found:
|
263
|
+
# Find the line that starts with ProxyCommand and contains the ip
|
264
|
+
found = False
|
265
|
+
for idx in range(i, len(config)):
|
266
|
+
# Stop if we reach an empty line, which means a new host
|
267
|
+
if not config[idx].strip():
|
268
|
+
break
|
269
|
+
if config[idx].strip().startswith('ProxyCommand'):
|
270
|
+
proxy_command_line = config[idx].strip()
|
271
|
+
if proxy_command_line.endswith(f'@{ip}'):
|
272
|
+
found = True
|
273
|
+
break
|
274
|
+
if found:
|
275
|
+
start_line_idx = i - 1
|
276
|
+
break
|
277
|
+
|
278
|
+
if start_line_idx is not None:
|
279
|
+
# Scan for end of previous config.
|
280
|
+
cursor = start_line_idx
|
281
|
+
while cursor > 0 and len(config[cursor].strip()) > 0:
|
282
|
+
cursor -= 1
|
283
|
+
prev_end_line_idx = cursor
|
284
|
+
|
285
|
+
# Scan for end of the cluster config.
|
286
|
+
end_line_idx = None
|
287
|
+
cursor = start_line_idx + 1
|
288
|
+
start_line_idx -= 1 # remove auto-generated comment
|
289
|
+
while cursor < len(config):
|
290
|
+
if config[cursor].strip().startswith(
|
291
|
+
'# ') or config[cursor].strip().startswith('Host '):
|
292
|
+
end_line_idx = cursor
|
293
|
+
break
|
294
|
+
cursor += 1
|
295
|
+
|
296
|
+
# Remove sky-generated config and update the file.
|
297
|
+
config[prev_end_line_idx:end_line_idx] = [
|
298
|
+
'\n'
|
299
|
+
] if end_line_idx is not None else []
|
300
|
+
with open(config_path, 'w', encoding='utf-8') as f:
|
301
|
+
f.write(''.join(config).strip())
|
302
|
+
f.write('\n' * 2)
|
303
|
+
|
304
|
+
# Delete include statement if it exists in the config.
|
305
|
+
sky_autogen_comment = ('# Added by sky (use `sky stop/down '
|
306
|
+
f'{cluster_name}` to remove)')
|
307
|
+
with open(config_path, 'r', encoding='utf-8') as f:
|
308
|
+
config = f.readlines()
|
309
|
+
|
310
|
+
for i, line in enumerate(config):
|
311
|
+
config_str = line.strip()
|
312
|
+
if f'Include {cluster_config_path}' in config_str:
|
313
|
+
with open(config_path, 'w', encoding='utf-8') as f:
|
314
|
+
if i < len(config) - 1 and config[i + 1] == '\n':
|
315
|
+
del config[i + 1]
|
316
|
+
# Delete Include string
|
317
|
+
del config[i]
|
318
|
+
# Delete Sky Autogen Comment
|
319
|
+
if i > 0 and sky_autogen_comment in config[i - 1].strip():
|
320
|
+
del config[i - 1]
|
321
|
+
f.write(''.join(config))
|
322
|
+
break
|
323
|
+
if 'Host' in config_str:
|
324
|
+
break
|
325
|
+
|
326
|
+
@classmethod
|
327
|
+
def remove_cluster(cls, cluster_name: str):
|
328
|
+
"""Remove auth information for cluster from ~/.sky/ssh/<cluster_name>.
|
329
|
+
|
330
|
+
If no existing host matching the provided specification is found, then
|
331
|
+
nothing is removed.
|
332
|
+
|
333
|
+
Args:
|
334
|
+
cluster_name: Cluster name.
|
335
|
+
"""
|
336
|
+
|
337
|
+
with timeline.FileLockEvent(
|
338
|
+
cls.ssh_conf_per_cluster_lock_path.format(cluster_name)):
|
339
|
+
cluster_config_path = os.path.expanduser(
|
340
|
+
cls.ssh_cluster_path.format(cluster_name))
|
341
|
+
common_utils.remove_file_if_exists(cluster_config_path)
|
342
|
+
|
343
|
+
@classmethod
|
344
|
+
def list_cluster_names(cls) -> List[str]:
|
345
|
+
"""List all names of clusters with SSH config set up."""
|
346
|
+
cluster_config_dir = os.path.expanduser(cls.ssh_cluster_path.format(''))
|
347
|
+
return [
|
348
|
+
os.path.basename(path)
|
349
|
+
for path in glob.glob(os.path.join(cluster_config_dir, '*'))
|
350
|
+
]
|
351
|
+
|
352
|
+
|
353
|
+
def generate_cluster_name():
|
354
|
+
# TODO: change this ID formatting to something more pleasant.
|
355
|
+
# User name is helpful in non-isolated accounts, e.g., GCP, Azure.
|
356
|
+
return f'sky-{uuid.uuid4().hex[:4]}-{common_utils.get_cleaned_username()}'
|