skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,73 @@
|
|
1
|
+
"""Helper script to strip path information from exec auth in a kubeconfig file.
|
2
|
+
|
3
|
+
This script processes a kubeconfig file and removes any path information from
|
4
|
+
the 'command' field in the exec configuration, leaving only the executable name.
|
5
|
+
This is useful when moving between different environments where auth plugin
|
6
|
+
executables might be installed in different locations.
|
7
|
+
|
8
|
+
It assumes the target environment has the auth executable available in PATH.
|
9
|
+
If not, you'll need to update your environment container to include the auth
|
10
|
+
executable in PATH.
|
11
|
+
|
12
|
+
Usage:
|
13
|
+
python -m sky.utils.kubernetes.exec_kubeconfig_converter
|
14
|
+
"""
|
15
|
+
import argparse
|
16
|
+
import os
|
17
|
+
|
18
|
+
import yaml
|
19
|
+
|
20
|
+
|
21
|
+
def strip_auth_plugin_paths(kubeconfig_path: str, output_path: str):
    """Strip path information from exec plugin commands in a kubeconfig file.

    Each ``users[*].user.exec.command`` value that contains a directory
    component is replaced by its basename. The modified config is written to
    ``output_path`` only when at least one command was changed; otherwise no
    file is written and a message is printed.

    An empty kubeconfig, or null ``users`` / ``user`` / ``exec`` entries, are
    treated as "no exec-based auth" instead of raising.

    Args:
        kubeconfig_path (str): Path to the input kubeconfig file
        output_path (str): Path where the modified kubeconfig will be saved
    """
    with open(kubeconfig_path, 'r', encoding='utf-8') as file:
        # safe_load returns None for an empty document.
        config = yaml.safe_load(file) or {}

    updated = False
    for user in config.get('users') or []:
        # A YAML key with no value loads as None, so a `.get(key, {})`
        # default alone does not guarantee a dict at each level.
        user_info = (user or {}).get('user') or {}
        exec_info = user_info.get('exec') or {}
        current_command = exec_info.get('command', '')

        if isinstance(current_command, str) and current_command:
            # Strip the path and keep only the executable name
            executable = os.path.basename(current_command)
            if executable != current_command:
                exec_info['command'] = executable
                updated = True

    if updated:
        with open(output_path, 'w', encoding='utf-8') as file:
            yaml.safe_dump(config, file)
        print('Kubeconfig updated with path-less exec auth. '
              f'Saved to {output_path}')
    else:
        print('No updates made. No exec-based auth commands paths found.')
|
50
|
+
|
51
|
+
|
52
|
+
def main():
    """Command-line entry point.

    Reads the ``--input``/``-i`` and ``--output``/``-o`` options (defaulting
    to ``~/.kube/config`` and ``~/.kube/config.converted``) and passes them to
    ``strip_auth_plugin_paths``.
    """
    description = ('Strip path information from exec plugin commands in a '
                   'kubeconfig file. Used to prepare kubeconfigs for deployment '
                   'with SkyPilot.')
    parser = argparse.ArgumentParser(description=description)

    # (long flag, short flag, default path, help text)
    path_options = (
        ('--input', '-i', '~/.kube/config',
         'Input kubeconfig file path (default: %(default)s)'),
        ('--output', '-o', '~/.kube/config.converted',
         'Output kubeconfig file path (default: %(default)s)'),
    )
    for long_flag, short_flag, default_path, help_text in path_options:
        parser.add_argument(long_flag,
                            short_flag,
                            default=os.path.expanduser(default_path),
                            help=help_text)

    cli_args = parser.parse_args()
    strip_auth_plugin_paths(cli_args.input, cli_args.output)
|
70
|
+
|
71
|
+
|
72
|
+
if __name__ == '__main__':
|
73
|
+
main()
|
@@ -0,0 +1,336 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
# This script creates a new k8s Service Account and generates a kubeconfig with
|
3
|
+
# its credentials. This Service Account has the minimal permissions necessary for
|
4
|
+
# SkyPilot. The kubeconfig is written in the current directory.
|
5
|
+
#
|
6
|
+
# Before running this script, you must configure your local kubectl to point to
|
7
|
+
# the right k8s cluster and have admin-level access.
|
8
|
+
#
|
9
|
+
# By default, this script will create a service account "sky-sa" in "default"
|
10
|
+
# namespace. If you want to use a different namespace or service account name:
|
11
|
+
#
|
12
|
+
# * Specify SKYPILOT_NAMESPACE env var to override the default namespace where the service account is created.
|
13
|
+
# * Specify SKYPILOT_SA_NAME env var to override the default service account name.
|
14
|
+
# * Specify SKIP_SA_CREATION=1 to skip creating the service account and use an existing one
|
15
|
+
# * Specify SUPER_USER=1 to create a service account with cluster-admin permissions
|
16
|
+
#
|
17
|
+
# Usage:
|
18
|
+
# # Create "sky-sa" service account with minimal permissions in "default" namespace and generate kubeconfig
|
19
|
+
# $ ./generate_kubeconfig.sh
|
20
|
+
#
|
21
|
+
# # Create "my-sa" service account with minimal permissions in "my-namespace" namespace and generate kubeconfig
|
22
|
+
# $ SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh
|
23
|
+
#
|
24
|
+
# # Use an existing service account "my-sa" in "my-namespace" namespace and generate kubeconfig
|
25
|
+
# $ SKIP_SA_CREATION=1 SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh
|
26
|
+
#
|
27
|
+
# # Create "sky-sa" service account with cluster-admin permissions in "default" namespace
|
28
|
+
# $ SUPER_USER=1 ./generate_kubeconfig.sh
|
29
|
+
|
30
|
+
set -eu -o pipefail
|
31
|
+
|
32
|
+
# Allow passing in common name and username in environment. If not provided,
|
33
|
+
# use default.
|
34
|
+
SKYPILOT_SA=${SKYPILOT_SA_NAME:-sky-sa}
|
35
|
+
NAMESPACE=${SKYPILOT_NAMESPACE:-default}
|
36
|
+
SUPER_USER=${SUPER_USER:-0}
|
37
|
+
|
38
|
+
echo "Service account: ${SKYPILOT_SA}"
|
39
|
+
echo "Namespace: ${NAMESPACE}"
|
40
|
+
echo "Super user permissions: ${SUPER_USER}"
|
41
|
+
|
42
|
+
# Set OS specific values.
|
43
|
+
if [[ "$OSTYPE" == "linux-gnu" ]]; then
|
44
|
+
BASE64_DECODE_FLAG="-d"
|
45
|
+
elif [[ "$OSTYPE" == "darwin"* ]]; then
|
46
|
+
BASE64_DECODE_FLAG="-D"
|
47
|
+
elif [[ "$OSTYPE" == "linux-musl" ]]; then
|
48
|
+
BASE64_DECODE_FLAG="-d"
|
49
|
+
else
|
50
|
+
echo "Unknown OS ${OSTYPE}"
|
51
|
+
exit 1
|
52
|
+
fi
|
53
|
+
|
54
|
+
# If the user has set SKIP_SA_CREATION=1, skip creating the service account.
|
55
|
+
if [ -z ${SKIP_SA_CREATION+x} ]; then
|
56
|
+
# SUPER_USER always has a value at this point (it defaults to 0 above), so
# choose the label by comparing against "1" instead of relying on
# ${VAR:+...}/${VAR:-...}, which would print "super user0" / "super user1".
if [ "${SUPER_USER}" = "1" ]; then
  RBAC_MODE="super user"
else
  RBAC_MODE="minimal"
fi
echo "Creating the Kubernetes Service Account with ${RBAC_MODE} RBAC permissions."
|
57
|
+
if [ "${SUPER_USER}" = "1" ]; then
|
58
|
+
# Create service account with cluster-admin permissions
|
59
|
+
kubectl apply -f - <<EOF
|
60
|
+
apiVersion: v1
|
61
|
+
kind: Namespace
|
62
|
+
metadata:
|
63
|
+
name: ${NAMESPACE}
|
64
|
+
labels:
|
65
|
+
parent: skypilot
|
66
|
+
---
|
67
|
+
kind: ServiceAccount
|
68
|
+
apiVersion: v1
|
69
|
+
metadata:
|
70
|
+
name: ${SKYPILOT_SA}
|
71
|
+
namespace: ${NAMESPACE}
|
72
|
+
labels:
|
73
|
+
parent: skypilot
|
74
|
+
---
|
75
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
76
|
+
kind: ClusterRoleBinding
|
77
|
+
metadata:
|
78
|
+
name: ${SKYPILOT_SA}-cluster-admin
|
79
|
+
labels:
|
80
|
+
parent: skypilot
|
81
|
+
subjects:
|
82
|
+
- kind: ServiceAccount
|
83
|
+
name: ${SKYPILOT_SA}
|
84
|
+
namespace: ${NAMESPACE}
|
85
|
+
roleRef:
|
86
|
+
kind: ClusterRole
|
87
|
+
name: cluster-admin
|
88
|
+
apiGroup: rbac.authorization.k8s.io
|
89
|
+
EOF
|
90
|
+
else
|
91
|
+
# Original RBAC rules for minimal permissions
|
92
|
+
kubectl apply -f - <<EOF
|
93
|
+
# Create/update namespace specified by the user
|
94
|
+
apiVersion: v1
|
95
|
+
kind: Namespace
|
96
|
+
metadata:
|
97
|
+
name: ${NAMESPACE}
|
98
|
+
labels:
|
99
|
+
parent: skypilot
|
100
|
+
---
|
101
|
+
kind: ServiceAccount
|
102
|
+
apiVersion: v1
|
103
|
+
metadata:
|
104
|
+
name: ${SKYPILOT_SA}
|
105
|
+
namespace: ${NAMESPACE}
|
106
|
+
labels:
|
107
|
+
parent: skypilot
|
108
|
+
---
|
109
|
+
# Role for the service account
|
110
|
+
kind: Role
|
111
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
112
|
+
metadata:
|
113
|
+
name: ${SKYPILOT_SA}-role
|
114
|
+
namespace: ${NAMESPACE}
|
115
|
+
labels:
|
116
|
+
parent: skypilot
|
117
|
+
rules:
|
118
|
+
- apiGroups: ["*"] # Required for creating pods, services, secrets and other necessary resources in the namespace.
|
119
|
+
resources: ["*"]
|
120
|
+
verbs: ["*"]
|
121
|
+
---
|
122
|
+
# RoleBinding for the service account
|
123
|
+
kind: RoleBinding
|
124
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
125
|
+
metadata:
|
126
|
+
name: ${SKYPILOT_SA}-rb
|
127
|
+
namespace: ${NAMESPACE}
|
128
|
+
labels:
|
129
|
+
parent: skypilot
|
130
|
+
subjects:
|
131
|
+
- kind: ServiceAccount
|
132
|
+
name: ${SKYPILOT_SA}
|
133
|
+
roleRef:
|
134
|
+
kind: Role
|
135
|
+
name: ${SKYPILOT_SA}-role
|
136
|
+
apiGroup: rbac.authorization.k8s.io
|
137
|
+
---
|
138
|
+
# ClusterRole for the service account
|
139
|
+
kind: ClusterRole
|
140
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
141
|
+
metadata:
|
142
|
+
name: ${SKYPILOT_SA}-cluster-role
|
143
|
+
namespace: ${NAMESPACE}
|
144
|
+
labels:
|
145
|
+
parent: skypilot
|
146
|
+
rules:
|
147
|
+
- apiGroups: [""]
|
148
|
+
resources: ["nodes"] # Required for getting node resources.
|
149
|
+
verbs: ["get", "list", "watch"]
|
150
|
+
- apiGroups: ["node.k8s.io"]
|
151
|
+
resources: ["runtimeclasses"] # Required for autodetecting the runtime class of the nodes.
|
152
|
+
verbs: ["get", "list", "watch"]
|
153
|
+
- apiGroups: ["networking.k8s.io"] # Required for exposing services through ingresses
|
154
|
+
resources: ["ingressclasses"]
|
155
|
+
verbs: ["get", "list", "watch"]
|
156
|
+
- apiGroups: [""] # Required for sky show-gpus command
|
157
|
+
resources: ["pods"]
|
158
|
+
verbs: ["get", "list"]
|
159
|
+
---
|
160
|
+
# ClusterRoleBinding for the service account
|
161
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
162
|
+
kind: ClusterRoleBinding
|
163
|
+
metadata:
|
164
|
+
name: ${SKYPILOT_SA}-cluster-role-binding
|
165
|
+
namespace: ${NAMESPACE}
|
166
|
+
labels:
|
167
|
+
parent: skypilot
|
168
|
+
subjects:
|
169
|
+
- kind: ServiceAccount
|
170
|
+
name: ${SKYPILOT_SA}
|
171
|
+
namespace: ${NAMESPACE}
|
172
|
+
roleRef:
|
173
|
+
kind: ClusterRole
|
174
|
+
name: ${SKYPILOT_SA}-cluster-role
|
175
|
+
apiGroup: rbac.authorization.k8s.io
|
176
|
+
---
|
177
|
+
# Optional: If using object store mounting, create the skypilot-system namespace
|
178
|
+
apiVersion: v1
|
179
|
+
kind: Namespace
|
180
|
+
metadata:
|
181
|
+
name: skypilot-system
|
182
|
+
labels:
|
183
|
+
parent: skypilot
|
184
|
+
---
|
185
|
+
# Optional: If using object store mounting, create role in the skypilot-system
|
186
|
+
# namespace to create FUSE device manager.
|
187
|
+
kind: Role
|
188
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
189
|
+
metadata:
|
190
|
+
name: skypilot-system-service-account-role
|
191
|
+
namespace: skypilot-system
|
192
|
+
labels:
|
193
|
+
parent: skypilot
|
194
|
+
rules:
|
195
|
+
- apiGroups: ["*"]
|
196
|
+
resources: ["*"]
|
197
|
+
verbs: ["*"]
|
198
|
+
---
|
199
|
+
# Optional: If using object store mounting, create rolebinding in the skypilot-system
|
200
|
+
# namespace to create FUSE device manager.
|
201
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
202
|
+
kind: RoleBinding
|
203
|
+
metadata:
|
204
|
+
name: ${SKYPILOT_SA}-skypilot-system-role-binding-${NAMESPACE}
|
205
|
+
namespace: skypilot-system # Do not change this namespace
|
206
|
+
labels:
|
207
|
+
parent: skypilot
|
208
|
+
subjects:
|
209
|
+
- kind: ServiceAccount
|
210
|
+
name: ${SKYPILOT_SA}
|
211
|
+
namespace: ${NAMESPACE}
|
212
|
+
roleRef:
|
213
|
+
kind: Role
|
214
|
+
name: skypilot-system-service-account-role
|
215
|
+
apiGroup: rbac.authorization.k8s.io
|
216
|
+
EOF
|
217
|
+
fi
|
218
|
+
# Apply optional ingress-related roles, but don't make the script fail if it fails
|
219
|
+
kubectl apply -f - <<EOF || echo "Failed to apply optional ingress-related roles. Nginx ingress is likely not installed. This is not critical and the script will continue."
|
220
|
+
# Optional: Role for accessing ingress resources
|
221
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
222
|
+
kind: Role
|
223
|
+
metadata:
|
224
|
+
name: ${SKYPILOT_SA}-role-ingress-nginx
|
225
|
+
namespace: ingress-nginx # Do not change this namespace
|
226
|
+
labels:
|
227
|
+
parent: skypilot
|
228
|
+
rules:
|
229
|
+
- apiGroups: [""]
|
230
|
+
resources: ["services"]
|
231
|
+
verbs: ["list", "get", "watch"]
|
232
|
+
- apiGroups: ["rbac.authorization.k8s.io"]
|
233
|
+
resources: ["roles", "rolebindings"]
|
234
|
+
verbs: ["list", "get", "watch"]
|
235
|
+
---
|
236
|
+
# Optional: RoleBinding for accessing ingress resources
|
237
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
238
|
+
kind: RoleBinding
|
239
|
+
metadata:
|
240
|
+
name: ${SKYPILOT_SA}-rolebinding-ingress-nginx
|
241
|
+
namespace: ingress-nginx # Do not change this namespace
|
242
|
+
labels:
|
243
|
+
parent: skypilot
|
244
|
+
subjects:
|
245
|
+
- kind: ServiceAccount
|
246
|
+
name: ${SKYPILOT_SA}
|
247
|
+
namespace: ${NAMESPACE}
|
248
|
+
roleRef:
|
249
|
+
kind: Role
|
250
|
+
name: ${SKYPILOT_SA}-role-ingress-nginx # Must match the name of the ingress-nginx Role defined above
|
251
|
+
apiGroup: rbac.authorization.k8s.io
|
252
|
+
EOF
|
253
|
+
fi
|
254
|
+
|
255
|
+
# Checks if secret entry was defined for Service account. If defined it means that Kubernetes server has a
|
256
|
+
# version below 1.24; otherwise one must manually create the secret and bind it to the Service account to have a non-expiring token.
|
257
|
+
# After Kubernetes v1.24 Service accounts no longer generate automatic tokens/secrets.
|
258
|
+
# We could use `kubectl create token`, but that token has an expiration time.
|
259
|
+
# https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG/CHANGELOG-1.24.md#urgent-upgrade-notes
|
260
|
+
SA_SECRET_NAME=$(kubectl get -n ${NAMESPACE} sa/${SKYPILOT_SA} -o "jsonpath={.secrets[0]..name}")
|
261
|
+
if [ -z $SA_SECRET_NAME ]
|
262
|
+
then
|
263
|
+
# Create the secret and bind it to the desired SA
|
264
|
+
kubectl apply -f - <<EOF
|
265
|
+
apiVersion: v1
|
266
|
+
kind: Secret
|
267
|
+
type: kubernetes.io/service-account-token
|
268
|
+
metadata:
|
269
|
+
name: ${SKYPILOT_SA}
|
270
|
+
namespace: ${NAMESPACE}
|
271
|
+
annotations:
|
272
|
+
kubernetes.io/service-account.name: "${SKYPILOT_SA}"
|
273
|
+
labels:
|
274
|
+
parent: skypilot
|
275
|
+
EOF
|
276
|
+
|
277
|
+
SA_SECRET_NAME=${SKYPILOT_SA}
|
278
|
+
fi
|
279
|
+
|
280
|
+
# Sleep for 2 seconds to allow the secret to be created before fetching it.
|
281
|
+
sleep 2
|
282
|
+
|
283
|
+
# Note: service account token is stored base64-encoded in the secret but must
|
284
|
+
# be plaintext in kubeconfig.
|
285
|
+
SA_TOKEN=$(kubectl get -n ${NAMESPACE} secrets/${SA_SECRET_NAME} -o "jsonpath={.data['token']}" | base64 ${BASE64_DECODE_FLAG})
|
286
|
+
CA_CERT=$(kubectl get -n ${NAMESPACE} secrets/${SA_SECRET_NAME} -o "jsonpath={.data['ca\.crt']}")
|
287
|
+
|
288
|
+
# Extract cluster IP from the current context
|
289
|
+
CURRENT_CONTEXT=$(kubectl config current-context)
|
290
|
+
CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name == \"${CURRENT_CONTEXT}\"})].context.cluster}")
|
291
|
+
CURRENT_CLUSTER_ADDR=$(kubectl config view -o jsonpath="{.clusters[?(@.name == \"${CURRENT_CLUSTER}\"})].cluster.server}")
|
292
|
+
|
293
|
+
echo "Writing kubeconfig."
|
294
|
+
cat > kubeconfig <<EOF
|
295
|
+
apiVersion: v1
|
296
|
+
clusters:
|
297
|
+
- cluster:
|
298
|
+
certificate-authority-data: ${CA_CERT}
|
299
|
+
server: ${CURRENT_CLUSTER_ADDR}
|
300
|
+
name: ${CURRENT_CLUSTER}
|
301
|
+
contexts:
|
302
|
+
- context:
|
303
|
+
cluster: ${CURRENT_CLUSTER}
|
304
|
+
user: ${CURRENT_CLUSTER}-${SKYPILOT_SA}
|
305
|
+
namespace: ${NAMESPACE}
|
306
|
+
name: ${CURRENT_CONTEXT}
|
307
|
+
current-context: ${CURRENT_CONTEXT}
|
308
|
+
kind: Config
|
309
|
+
preferences: {}
|
310
|
+
users:
|
311
|
+
- name: ${CURRENT_CLUSTER}-${SKYPILOT_SA}
|
312
|
+
user:
|
313
|
+
token: ${SA_TOKEN}
|
314
|
+
EOF
|
315
|
+
|
316
|
+
# Final summary for the user: where the kubeconfig was written and how to
# start using it with kubectl and SkyPilot.
echo "---
Done!

Kubeconfig using service account '${SKYPILOT_SA}' in namespace '${NAMESPACE}' written at $(pwd)/kubeconfig

Copy the generated kubeconfig file to your ~/.kube/ directory to use it with
kubectl and skypilot:

# Backup your existing kubeconfig file
mv ~/.kube/config ~/.kube/config.bak
cp kubeconfig ~/.kube/config

# Verify that you can access the cluster
kubectl get pods

Also add this to your ~/.sky/config.yaml to use the new service account:

# ~/.sky/config.yaml
kubernetes:
  remote_identity: ${SKYPILOT_SA}
"
|
@@ -101,7 +101,7 @@ def label():
|
|
101
101
|
# Get the list of nodes with GPUs
|
102
102
|
gpu_nodes = []
|
103
103
|
for node in nodes:
|
104
|
-
if
|
104
|
+
if kubernetes_utils.get_gpu_resource_key() in node.status.capacity:
|
105
105
|
gpu_nodes.append(node)
|
106
106
|
|
107
107
|
print(f'Found {len(gpu_nodes)} GPU nodes in the cluster')
|
@@ -115,7 +115,7 @@ def label():
|
|
115
115
|
print('Continuing without using nvidia RuntimeClass. '
|
116
116
|
'This may fail on K3s clusters. '
|
117
117
|
'For more details, refer to K3s deployment notes at: '
|
118
|
-
'https://skypilot.
|
118
|
+
'https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html') # pylint: disable=line-too-long
|
119
119
|
nvidia_exists = False
|
120
120
|
|
121
121
|
if nvidia_exists:
|
@@ -139,10 +139,10 @@ def label():
|
|
139
139
|
# Create the job for this node`
|
140
140
|
batch_v1.create_namespaced_job(namespace, job_manifest)
|
141
141
|
print(f'Created GPU labeler job for node {node_name}')
|
142
|
-
if
|
142
|
+
if not gpu_nodes:
|
143
143
|
print('No GPU nodes found in the cluster. If you have GPU nodes, '
|
144
144
|
'please ensure that they have the label '
|
145
|
-
'`
|
145
|
+
f'`{kubernetes_utils.get_gpu_resource_key()}: <number of GPUs>`')
|
146
146
|
else:
|
147
147
|
print('GPU labeling started - this may take 10 min or more to complete.'
|
148
148
|
'\nTo check the status of GPU labeling jobs, run '
|
@@ -14,9 +14,10 @@ spec:
|
|
14
14
|
containers:
|
15
15
|
- name: gpu-labeler
|
16
16
|
image: us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu:latest # Using this image also serves as a way to "pre-pull" the image onto nodes
|
17
|
-
command:
|
18
|
-
|
19
|
-
-
|
17
|
+
command: ["/bin/bash", "-i", "-c"]
|
18
|
+
args:
|
19
|
+
- |
|
20
|
+
python /label_gpus.py
|
20
21
|
env:
|
21
22
|
- name: MY_NODE_NAME
|
22
23
|
valueFrom:
|
@@ -0,0 +1,228 @@
|
|
1
|
+
"""Utility functions for deploying Kubernetes clusters."""
|
2
|
+
import os
|
3
|
+
import shlex
|
4
|
+
import subprocess
|
5
|
+
import tempfile
|
6
|
+
from typing import List
|
7
|
+
|
8
|
+
from sky import check as sky_check
|
9
|
+
from sky import sky_logging
|
10
|
+
from sky.backends import backend_utils
|
11
|
+
from sky.provision.kubernetes import utils as kubernetes_utils
|
12
|
+
from sky.skylet import constants
|
13
|
+
from sky.skylet import log_lib
|
14
|
+
from sky.utils import log_utils
|
15
|
+
from sky.utils import rich_utils
|
16
|
+
from sky.utils import subprocess_utils
|
17
|
+
from sky.utils import ux_utils
|
18
|
+
|
19
|
+
logger = sky_logging.init_logger(__name__)
|
20
|
+
|
21
|
+
|
22
|
+
def deploy_remote_cluster(ip_list: List[str], ssh_user: str, ssh_key: str,
                          cleanup: bool) -> None:
    """Deploys (or cleans up) a Kubernetes cluster on remote machines.

    Runs the bundled `deploy_remote_cluster.sh` script from its own directory.
    The script receives, in order: a temporary file containing the IPs (one
    per line), the SSH user, and a temporary file containing the SSH key.

    Args:
        ip_list: IP addresses of the target machines.
        ssh_user: SSH user name passed to the script.
        ssh_key: Contents of the SSH key (not a path); it is written to a
            temporary file with mode 0o600.
        cleanup: If True, `--cleanup` is appended to the script arguments.

    Raises:
        RuntimeError: If the script exits with a non-zero return code.
    """
    path_to_package = os.path.dirname(__file__)
    up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.sh')
    # Get directory of script and run it from there
    cwd = os.path.dirname(os.path.abspath(up_script_path))

    # Create temporary files for the IPs and SSH key. Everything that needs
    # these files must stay inside this `with`, since they are removed on exit.
    with tempfile.NamedTemporaryFile(mode='w') as ip_file, \
            tempfile.NamedTemporaryFile(mode='w') as key_file:

        # Write IPs and SSH key to temporary files
        ip_file.write('\n'.join(ip_list))
        ip_file.flush()

        key_file.write(ssh_key)
        key_file.flush()
        os.chmod(key_file.name, 0o600)

        # Build the argv list directly instead of formatting a string and
        # re-splitting it: a script path, temp path or user name containing
        # whitespace or quotes would otherwise be split into wrong arguments.
        deploy_command = [up_script_path, ip_file.name, ssh_user, key_file.name]
        if cleanup:
            deploy_command.append('--cleanup')

        # Setup logging paths
        run_timestamp = sky_logging.get_run_timestamp()
        log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
                                'local_up.log')

        # Check if ~/.kube/config exists:
        if os.path.exists(os.path.expanduser('~/.kube/config')):
            logger.info('Found existing kube config. '
                        'It will be backed up to ~/.kube/config.bak.')
        if cleanup:
            msg_str = 'Cleaning up remote cluster...'
        else:
            msg_str = 'Deploying remote cluster...'
        with rich_utils.safe_status(
                ux_utils.spinner_message(msg_str,
                                         log_path=log_path,
                                         is_local=True)):
            returncode, _, stderr = log_lib.run_with_log(
                cmd=deploy_command,
                log_path=log_path,
                require_outputs=True,
                stream_logs=False,
                line_processor=log_utils.SkyRemoteUpLineProcessor(
                    log_path=log_path, is_local=True),
                cwd=cwd)

        if returncode != 0:
            with ux_utils.print_exception_no_traceback():
                log_hint = ux_utils.log_path_hint(log_path, is_local=True)
                raise RuntimeError('Failed to deploy remote cluster. '
                                   f'Full log: {log_hint}'
                                   f'\nError: {stderr}')

        # Reaching this point means the script succeeded.
        if cleanup:
            logger.info(
                ux_utils.finishing_message(
                    '🎉 Remote cluster cleaned up successfully.',
                    log_path=log_path,
                    is_local=True))
        else:
            logger.info(
                ux_utils.finishing_message(
                    '🎉 Remote cluster deployed successfully.',
                    log_path=log_path,
                    is_local=True))
|
97
|
+
|
98
|
+
|
99
|
+
def deploy_local_cluster(gpus: bool) -> None:
    """Creates a local Kubernetes cluster using the bundled create_cluster.sh.

    After the script finishes, `sky check` is run for Kubernetes. If a new
    cluster was created, a summary with the CPU count (and GPU count/type when
    GPU support is enabled) is logged.

    Args:
        gpus: Whether GPU support is requested. It is only honored if
            `backend_utils.check_local_gpus()` reports GPUs on the host.

    Raises:
        RuntimeError: If the script exits with a return code other than 0
            (created) or 100 (cluster already exists).
    """
    cluster_created = False

    # Check if GPUs are available on the host
    local_gpus_available = backend_utils.check_local_gpus()
    gpus = gpus and local_gpus_available

    # Check if ~/.kube/config exists:
    if os.path.exists(os.path.expanduser('~/.kube/config')):
        curr_context = kubernetes_utils.get_current_kube_config_context_name()
        skypilot_context = 'kind-skypilot'
        if curr_context is not None and curr_context != skypilot_context:
            logger.info(
                f'Current context in kube config: {curr_context}'
                '\nWill automatically switch to kind-skypilot after the local '
                'cluster is created.')
    message_str = 'Creating local cluster{}...'
    message_str = message_str.format((' with GPU support (this may take up '
                                      'to 15 minutes)') if gpus else '')
    path_to_package = os.path.dirname(__file__)
    up_script_path = os.path.join(path_to_package, 'create_cluster.sh')

    # Get directory of script and run it from there
    cwd = os.path.dirname(os.path.abspath(up_script_path))
    # Build the argv list directly; formatting a string and re-splitting it
    # breaks when the install path contains whitespace or quote characters.
    run_command = [up_script_path]
    if gpus:
        run_command.append('--gpus')

    # Setup logging paths
    run_timestamp = sky_logging.get_run_timestamp()
    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
                            'local_up.log')
    logger.info(message_str)

    with rich_utils.safe_status(
            ux_utils.spinner_message(message_str,
                                     log_path=log_path,
                                     is_local=True)):
        returncode, _, stderr = log_lib.run_with_log(
            cmd=run_command,
            log_path=log_path,
            require_outputs=True,
            stream_logs=False,
            line_processor=log_utils.SkyLocalUpLineProcessor(log_path=log_path,
                                                             is_local=True),
            cwd=cwd)

    # Kind always writes to stderr even if it succeeds.
    # If the failure happens after the cluster is created, we need
    # to strip all stderr of "No kind clusters found.", which is
    # printed when querying with kind get clusters.
    stderr = stderr.replace('No kind clusters found.\n', '')

    if returncode == 0:
        cluster_created = True
    elif returncode == 100:
        # The script uses exit code 100 to signal an already existing cluster.
        logger.info(
            ux_utils.finishing_message(
                'Local cluster already exists.\n',
                log_path=log_path,
                is_local=True,
                follow_up_message=
                'If you want to delete it instead, run: sky local down'))
    else:
        with ux_utils.print_exception_no_traceback():
            log_hint = ux_utils.log_path_hint(log_path, is_local=True)
            raise RuntimeError('Failed to create local cluster. '
                               f'Full log: {log_hint}'
                               f'\nError: {stderr}')
    # Run sky check
    with rich_utils.safe_status('[bold cyan]Running sky check...'):
        sky_check.check(clouds=['kubernetes'], quiet=True)
    if cluster_created:
        # Prepare completion message which shows CPU and GPU count
        # Get number of CPUs
        p = subprocess_utils.run(
            'kubectl get nodes -o jsonpath=\'{.items[0].status.capacity.cpu}\'',
            capture_output=True)
        num_cpus = int(p.stdout.decode('utf-8'))

        # GPU count/type parsing
        gpu_message = ''
        gpu_hint = ''
        if gpus:
            # Get GPU model by querying the node labels
            label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
            gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"'  # pylint: disable=line-too-long
            try:
                # Run the command and capture the output
                gpu_type_output = subprocess.check_output(gpu_type_cmd,
                                                          shell=True,
                                                          text=True)
                gpu_type_str = gpu_type_output.strip() + ' '
            except subprocess.CalledProcessError as e:
                # With text=True the captured output is already a str, so it
                # must not be decoded.
                logger.warning(f'Failed to get GPU type: {e.output}')
                gpu_type_str = ''

            # Get number of GPUs (sum of nvidia.com/gpu resources)
            gpu_count_command = 'kubectl get nodes -o=jsonpath=\'{range .items[*]}{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}\' | awk \'{sum += $1} END {print sum}\''  # pylint: disable=line-too-long
            try:
                # Run the command and capture the output
                gpu_count_output = subprocess.check_output(gpu_count_command,
                                                           shell=True,
                                                           text=True)
                # Remove any extra whitespace
                gpu_count = gpu_count_output.strip()
                gpu_message = f' and {gpu_count} {gpu_type_str}GPUs'
            except subprocess.CalledProcessError as e:
                # Same as above: e.output is a str because of text=True.
                logger.warning(f'Failed to get GPU count: {e.output}')
                gpu_message = f' with {gpu_type_str}GPU support'

            gpu_hint = ('\nHint: To see the list of GPUs in the cluster, '
                        'run \'sky show-gpus --cloud kubernetes\'')

        if num_cpus < 2:
            logger.info('Warning: Local cluster has less than 2 CPUs. '
                        'This may cause issues with running tasks.')
        logger.info(
            ux_utils.finishing_message(
                message=(f'Local Kubernetes cluster created successfully with '
                         f'{num_cpus} CPUs{gpu_message}.'),
                log_path=log_path,
                is_local=True,
                follow_up_message=(
                    '\n`sky launch` can now run tasks locally.\n'
                    'Hint: To change the number of CPUs, change your docker '
                    'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.'  # pylint: disable=line-too-long
                    f'{gpu_hint}')))
|