skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/instance.py
@@ -1,78 +1,43 @@
 """Kubernetes instance provisioning."""
 import copy
+import json
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 import uuid
 
 from sky import exceptions
 from sky import sky_logging
 from sky import skypilot_config
-from sky import status_lib
 from sky.adaptors import kubernetes
 from sky.provision import common
+from sky.provision import constants
 from sky.provision import docker_utils
 from sky.provision.kubernetes import config as config_lib
+from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import kubernetes_enums
+from sky.utils import status_lib
+from sky.utils import subprocess_utils
+from sky.utils import timeline
 from sky.utils import ux_utils
 
 POLL_INTERVAL = 2
 _TIMEOUT_FOR_POD_TERMINATION = 60  # 1 minutes
+_MAX_RETRIES = 3
+_NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')
 
 logger = sky_logging.init_logger(__name__)
 TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
 TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
-TAG_RAY_NODE_KIND = 'ray-node-type'  # legacy tag for backward compatibility
 TAG_POD_INITIALIZED = 'skypilot-initialized'
 
-POD_STATUSES = {
-    'Pending', 'Running', 'Succeeded', 'Failed', 'Unknown', 'Terminating'
-}
-
-
-def to_label_selector(tags):
-    label_selector = ''
-    for k, v in tags.items():
-        if label_selector != '':
-            label_selector += ','
-        label_selector += '{}={}'.format(k, v)
-    return label_selector
-
-
-def _get_namespace(provider_config: Dict[str, Any]) -> str:
-    return provider_config.get(
-        'namespace',
-        kubernetes_utils.get_current_kube_config_context_namespace())
-
-
-def _filter_pods(namespace: str, tag_filters: Dict[str, str],
-                 status_filters: Optional[List[str]]) -> Dict[str, Any]:
-    """Filters pods by tags and status."""
-    non_included_pod_statuses = POD_STATUSES.copy()
-
-    field_selector = ''
-    if status_filters is not None:
-        non_included_pod_statuses -= set(status_filters)
-        field_selector = ','.join(
-            [f'status.phase!={status}' for status in non_included_pod_statuses])
-
-    label_selector = to_label_selector(tag_filters)
-    pod_list = kubernetes.core_api().list_namespaced_pod(
-        namespace, field_selector=field_selector, label_selector=label_selector)
-
-    # Don't return pods marked for deletion,
-    # i.e. pods with non-null metadata.DeletionTimestamp.
-    pods = [
-        pod for pod in pod_list.items if pod.metadata.deletion_timestamp is None
-    ]
-    return {pod.metadata.name: pod for pod in pods}
-
 
 def _get_head_pod_name(pods: Dict[str, Any]) -> Optional[str]:
     head_pod_name = None
     for pod_name, pod in pods.items():
-        if pod.metadata.labels[TAG_RAY_NODE_KIND] == 'head':
+        if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head':
            head_pod_name = pod_name
            break
    return head_pod_name
@@ -83,16 +48,85 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]:
     return {'component': f'{cluster_name}-head'}
 
 
-def _raise_pod_scheduling_errors(namespace, new_nodes):
+def _formatted_resource_requirements(pod_or_spec: Union[Any, dict]) -> str:
+    # Returns a formatted string of resource requirements for a pod.
+    resource_requirements = {}
+
+    if isinstance(pod_or_spec, dict):
+        containers = pod_or_spec.get('spec', {}).get('containers', [])
+    else:
+        containers = pod_or_spec.spec.containers
+
+    for container in containers:
+        if isinstance(container, dict):
+            resources = container.get('resources', {})
+            requests = resources.get('requests', {})
+        else:
+            resources = container.resources
+            requests = resources.requests or {}
+
+        for resource, value in requests.items():
+            if resource not in resource_requirements:
+                resource_requirements[resource] = 0
+            if resource == 'memory':
+                int_value = kubernetes_utils.parse_memory_resource(value)
+            else:
+                int_value = kubernetes_utils.parse_cpu_or_gpu_resource(value)
+            resource_requirements[resource] += int(int_value)
+    return ', '.join(f'{resource}={value}'
+                     for resource, value in resource_requirements.items())
+
+
+def _formatted_node_selector(pod_or_spec: Union[Any, dict]) -> Optional[str]:
+    # Returns a formatted string of node selectors for a pod.
+    node_selectors = []
+
+    if isinstance(pod_or_spec, dict):
+        selectors = pod_or_spec.get('spec', {}).get('nodeSelector', {})
+    else:
+        selectors = pod_or_spec.spec.node_selector
+
+    if not selectors:
+        return None
+
+    for label_key, label_value in selectors.items():
+        node_selectors.append(f'{label_key}={label_value}')
+    return ', '.join(node_selectors)
+
+
+def _lack_resource_msg(resource: str,
+                       pod_or_spec: Union[Any, dict],
+                       extra_msg: Optional[str] = None,
+                       details: Optional[str] = None) -> str:
+    resource_requirements = _formatted_resource_requirements(pod_or_spec)
+    node_selectors = _formatted_node_selector(pod_or_spec)
+    node_selector_str = f' and labels ({node_selectors})' if (
+        node_selectors) else ''
+    msg = (f'Insufficient {resource} capacity on the cluster. '
+           f'Required resources ({resource_requirements}){node_selector_str} '
+           'were not found in a single node. Other SkyPilot tasks or pods may '
+           'be using resources. Check resource usage by running '
+           '`kubectl describe nodes`.')
+    if extra_msg:
+        msg += f' {extra_msg}'
+    if details:
+        msg += f'\nFull error: {details}'
+    return msg
+
+
+def _raise_pod_scheduling_errors(namespace, context, new_nodes):
     """Raise pod scheduling failure reason.
 
     When a pod fails to schedule in Kubernetes, the reasons for the failure
     are recorded as events. This function retrieves those events and raises
     descriptive errors for better debugging and user feedback.
     """
+    timeout_err_msg = ('Timed out while waiting for nodes to start. '
+                       'Cluster may be out of resources or '
+                       'may be too slow to autoscale.')
     for new_node in new_nodes:
-        pod = kubernetes.core_api().read_namespaced_pod(
-            new_node.metadata.name, namespace)
+        pod = kubernetes.core_api(context).read_namespaced_pod(
+            new_node.metadata.name, namespace)
         pod_status = pod.status.phase
         # When there are multiple pods involved while launching instance,
         # there may be a single pod causing issue while others are
@@ -101,7 +135,7 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
         if pod_status != 'Pending':
             continue
         pod_name = pod._metadata._name  # pylint: disable=protected-access
-        events = kubernetes.core_api().list_namespaced_event(
+        events = kubernetes.core_api(context).list_namespaced_event(
             namespace,
             field_selector=(f'involvedObject.name={pod_name},'
                             'involvedObject.kind=Pod'))
@@ -118,24 +152,25 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
             if event.reason == 'FailedScheduling':
                 event_message = event.message
                 break
-        timeout_err_msg = ('Timed out while waiting for nodes to start. '
-                           'Cluster may be out of resources or '
-                           'may be too slow to autoscale.')
-        lack_resource_msg = (
-            'Insufficient {resource} capacity on the cluster. '
-            'Other SkyPilot tasks or pods may be using resources. '
-            'Check resource usage by running `kubectl describe nodes`.')
         if event_message is not None:
             if pod_status == 'Pending':
+                logger.info(event_message)
                 if 'Insufficient cpu' in event_message:
                     raise config_lib.KubernetesError(
-                        lack_resource_msg.format(resource='CPU'))
+                        _lack_resource_msg('CPU', pod, details=event_message))
                 if 'Insufficient memory' in event_message:
                     raise config_lib.KubernetesError(
-                        lack_resource_msg.format(resource='memory'))
+                        _lack_resource_msg('memory', pod,
+                                           details=event_message))
+                if 'Insufficient smarter-devices/fuse' in event_message:
+                    raise config_lib.KubernetesError(
+                        'Something went wrong with FUSE device daemonset.'
+                        ' Try restarting your FUSE pods by running '
+                        '`kubectl delete pods -n skypilot-system -l name=smarter-device-manager`.'  # pylint: disable=line-too-long
+                        f' Full error: {event_message}')
                 gpu_lf_keys = [
-                    lf.get_label_key()
-                    for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
+                    key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
+                    for key in lf.get_label_keys()
                 ]
                 if pod.spec.node_selector:
                     for label_key in pod.spec.node_selector.keys():
@@ -143,22 +178,52 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
                             # TODO(romilb): We may have additional node
                             # affinity selectors in the future - in that
                             # case we will need to update this logic.
-                            if ('Insufficient nvidia.com/gpu'
-                                    in event_message or
-                                    'didn\'t match Pod\'s node affinity/selector'
-                                    in event_message):
-                                msg = lack_resource_msg.format(resource='GPU')
-                                raise config_lib.KubernetesError(
-                                    f'{msg} Verify if '
+                            # TODO(Doyoung): Update the error message raised
+                            # with the multi-host TPU support.
+                            gpu_resource_key = kubernetes_utils.get_gpu_resource_key()  # pylint: disable=line-too-long
+                            if 'Insufficient google.com/tpu' in event_message:
+                                extra_msg = (
+                                    f'Verify if '
                                     f'{pod.spec.node_selector[label_key]}'
-                                    ' is available in the cluster.')
+                                    ' is available in the cluster. Note '
+                                    'that multi-host TPU podslices are '
+                                    'currently not unsupported.')
+                                raise config_lib.KubernetesError(
+                                    _lack_resource_msg('TPU',
+                                                       pod,
+                                                       extra_msg,
+                                                       details=event_message))
+                            elif ((f'Insufficient {gpu_resource_key}'
+                                   in event_message) or
+                                  ('didn\'t match Pod\'s node affinity/selector'
+                                   in event_message)):
+                                extra_msg = (
+                                    f'Verify if any node matching label '
+                                    f'{pod.spec.node_selector[label_key]} and '
+                                    f'sufficient resource {gpu_resource_key} '
+                                    f'is available in the cluster.')
+                                raise config_lib.KubernetesError(
+                                    _lack_resource_msg('GPU',
+                                                       pod,
+                                                       extra_msg,
+                                                       details=event_message))
             raise config_lib.KubernetesError(f'{timeout_err_msg} '
                                              f'Pod status: {pod_status}'
                                              f'Details: \'{event_message}\' ')
     raise config_lib.KubernetesError(f'{timeout_err_msg}')
 
 
-def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
+def _raise_command_running_error(message: str, command: str, pod_name: str,
+                                 rc: int, stdout: str) -> None:
+    if rc == 0:
+        return
+    raise config_lib.KubernetesError(
+        f'Failed to {message} for pod {pod_name} with return '
+        f'code {rc}: {command!r}\nOutput: {stdout}.')
+
+
+@timeline.event
+def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
     """Wait for all pods to be scheduled.
 
     Wait for all pods including jump pod to be scheduled, and if it
@@ -168,6 +233,10 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
 
     If timeout is set to a negative value, this method will wait indefinitely.
     """
+    # Create a set of pod names we're waiting for
+    if not new_nodes:
+        return
+    expected_pod_names = {node.metadata.name for node in new_nodes}
     start_time = time.time()
 
     def _evaluate_timeout() -> bool:
@@ -177,25 +246,40 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
         return time.time() - start_time < timeout
 
     while _evaluate_timeout():
-        all_pods_scheduled = True
-        for node in new_nodes:
-            # Iterate over each pod to check their status
-            pod = kubernetes.core_api().read_namespaced_pod(
-                node.metadata.name, namespace)
-            if pod.status.phase == 'Pending':
+        # Get all pods in a single API call using the cluster name label
+        # which all pods in new_nodes should share
+        cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
+        pods = kubernetes.core_api(context).list_namespaced_pod(
+            namespace,
+            label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+
+        # Get the set of found pod names and check if we have all expected pods
+        found_pod_names = {pod.metadata.name for pod in pods}
+        missing_pods = expected_pod_names - found_pod_names
+        if missing_pods:
+            logger.info('Retrying waiting for pods: '
+                        f'Missing pods: {missing_pods}')
+            time.sleep(0.5)
+            continue
+
+        # Check if all pods are scheduled
+        all_scheduled = True
+        for pod in pods:
+            if (pod.metadata.name in expected_pod_names and
+                    pod.status.phase == 'Pending'):
                 # If container_statuses is None, then the pod hasn't
                 # been scheduled yet.
                 if pod.status.container_statuses is None:
-                    all_pods_scheduled = False
+                    all_scheduled = False
                     break
 
-        if all_pods_scheduled:
+        if all_scheduled:
             return
         time.sleep(1)
 
     # Handle pod scheduling errors
     try:
-        _raise_pod_scheduling_errors(namespace, new_nodes)
+        _raise_pod_scheduling_errors(namespace, context, new_nodes)
     except config_lib.KubernetesError:
         raise
     except Exception as e:
@@ -205,19 +289,64 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
             f'Error: {common_utils.format_exception(e)}') from None
 
 
-def _wait_for_pods_to_run(namespace, new_nodes):
+@timeline.event
+def _wait_for_pods_to_run(namespace, context, new_nodes):
     """Wait for pods and their containers to be ready.
 
     Pods may be pulling images or may be in the process of container
     creation.
     """
+    if not new_nodes:
+        return
+
+    # Create a set of pod names we're waiting for
+    expected_pod_names = {node.metadata.name for node in new_nodes}
+
+    def _check_init_containers(pod):
+        # Check if any of the init containers failed
+        # to start. Could be because the init container
+        # command failed or failed to pull image etc.
+        for init_status in pod.status.init_container_statuses:
+            init_terminated = init_status.state.terminated
+            if init_terminated:
+                if init_terminated.exit_code != 0:
+                    msg = init_terminated.message if (
+                        init_terminated.message) else str(init_terminated)
+                    raise config_lib.KubernetesError(
+                        'Failed to run init container for pod '
+                        f'{pod.metadata.name}. Error details: {msg}.')
+                continue
+            init_waiting = init_status.state.waiting
+            if (init_waiting is not None and init_waiting.reason
+                    not in ['ContainerCreating', 'PodInitializing']):
+                # TODO(romilb): There may be more states to check for. Add
+                # them as needed.
+                msg = init_waiting.message if (
+                    init_waiting.message) else str(init_waiting)
+                raise config_lib.KubernetesError(
+                    'Failed to create init container for pod '
+                    f'{pod.metadata.name}. Error details: {msg}.')
+
     while True:
-        all_pods_running = True
-        # Iterate over each pod to check their status
-        for node in new_nodes:
-            pod = kubernetes.core_api().read_namespaced_pod(
-                node.metadata.name, namespace)
+        # Get all pods in a single API call
+        cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
+        all_pods = kubernetes.core_api(context).list_namespaced_pod(
+            namespace,
+            label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+
+        # Get the set of found pod names and check if we have all expected pods
+        found_pod_names = {pod.metadata.name for pod in all_pods}
+        missing_pods = expected_pod_names - found_pod_names
+        if missing_pods:
+            logger.info('Retrying running pods check: '
+                        f'Missing pods: {missing_pods}')
+            time.sleep(0.5)
+            continue
 
+        all_pods_running = True
+        for pod in all_pods:
+            if pod.metadata.name not in expected_pod_names:
+                continue
             # Continue if pod and all the containers within the
             # pod are successfully created and running.
             if pod.status.phase == 'Running' and all(
@@ -235,12 +364,15 @@ def _wait_for_pods_to_run(namespace, new_nodes):
                     # See list of possible reasons for waiting here:
                     # https://stackoverflow.com/a/57886025
                     waiting = container_status.state.waiting
-                    if (waiting is not None and
-                            waiting.reason != 'ContainerCreating'):
-                        raise config_lib.KubernetesError(
-                            'Failed to create container while launching '
-                            'the node. Error details: '
-                            f'{waiting.message}.')
+                    if waiting is not None:
+                        if waiting.reason == 'PodInitializing':
+                            _check_init_containers(pod)
+                        elif waiting.reason != 'ContainerCreating':
+                            msg = waiting.message if waiting.message else str(
+                                waiting)
+                            raise config_lib.KubernetesError(
+                                'Failed to create container while launching '
+                                f'the node. Error details: {msg}.')
             # Reaching this point means that one of the pods had an issue,
             # so break out of the loop, and wait until next second.
             break
@@ -250,145 +382,188 @@ def _wait_for_pods_to_run(namespace, new_nodes):
         time.sleep(1)
 
 
-def _run_command_on_pods(node_name,
-                         node_namespace,
-                         command,
-                         stream_logs=False):
-    """Run command on Kubernetes pods.
+def _run_function_with_retries(func: Callable,
+                               operation_name: str,
+                               max_retries: int = _MAX_RETRIES,
+                               retry_delay: int = 5) -> Any:
+    """Runs a function with retries on Kubernetes errors.
+
+    Args:
+        func: Function to retry
+        operation_name: Name of the operation for logging
+        max_retries: Maximum number of retry attempts
+        retry_delay: Delay between retries in seconds
 
-    [...]
-    When called from the provisioner, this logger.info is written to the
-    provision.log file (see setup_provision_logging()).
+    Raises:
+        The last exception encountered if all retries fail.
     """
-    [...]
+    for attempt in range(max_retries + 1):
+        try:
+            return func()
+        except config_lib.KubernetesError:
+            if attempt < max_retries:
+                logger.warning(f'Failed to {operation_name} - '
+                               f'retrying in {retry_delay} seconds.')
+                time.sleep(retry_delay)
+            else:
+                raise
+
+
+@timeline.event
+def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
+    """Pre-initialization step for SkyPilot pods.
+
+    This step is run in the pod right after it is created and before the
+    SkyPilot runtime is setup.
+
+    This step includes three key steps:
+
+    1. Privilege check: Checks if the default user has sufficient privilege
+        to set up the kubernetes instance pod.
+    2. SSH setup: Sets up SSH for the pod instance.
+    3. Environment variable setup to populate k8s env vars in the pod.
+
+    Make sure commands used in these methods are generic and work
+    on most base images. E.g., do not use Python, since that may not
+    be installed by default.
+
+    If you run any apt commands, be sure to check if the lock is available.
+    It is possible the `apt update` run in the pod container args may still
+    be running.
+
+    Args:
+        namespace (str): Kubernetes namespace.
+        context (Optional[str]): Kubernetes context.
+        new_nodes (List): List of new pod instances.
+
+    Raises:
+        config_lib.KubernetesError: If user privileges are insufficient or
+            setup fails.
     """
-    set_k8s_env_var_cmd = [
-        '/bin/sh',
-        '-c',
-        docker_utils.SETUP_ENV_VARS_CMD,
-    ]
-
-    for new_pod in new_pods:
-        _run_command_on_pods(new_pod.metadata.name, namespace,
-                             set_k8s_env_var_cmd)
-
-
-def _check_user_privilege(namespace: str, new_nodes: List) -> None:
-    # Checks if the default user has sufficient privilege to set up
-    # the kubernetes instance pod.
-    check_k8s_user_sudo_cmd = [
-        '/bin/sh',
-        '-c',
-        (
-            'if [ $(id -u) -eq 0 ]; then'
-            # If user is root, create an alias for sudo used in skypilot setup
-            ' echo \'alias sudo=""\' >> ~/.bashrc; '
-            'else '
-            ' if command -v sudo >/dev/null 2>&1; then '
-            ' timeout 2 sudo -l >/dev/null 2>&1 || '
-            f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
-            ' else '
-            f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
-            ' fi; '
-            'fi')
-    ]
 
-    [...]
+    check_k8s_user_sudo_cmd = (
+        'if [ $(id -u) -eq 0 ]; then'
+        # If user is root, create an alias for sudo used in skypilot setup
+        ' echo \'alias sudo=""\' >> ~/.bashrc; echo succeed;'
+        'else '
+        ' if command -v sudo >/dev/null 2>&1; then '
+        ' timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || '
+        f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
+        f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
+        ' else '
+        f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
+        f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
+        ' fi; '
+        'fi;')
+
+    # Kubernetes automatically populates containers with critical
+    # environment variables, such as those for discovering services running
+    # in the cluster and CUDA/nvidia environment variables. We need to
+    # make sure these env vars are available in every task and ssh session.
+    # This is needed for GPU support and service discovery.
+    # See https://github.com/skypilot-org/skypilot/issues/2287 for more details.
+    # To do so, we capture env vars from the pod's runtime and write them to
+    # /etc/profile.d/, making them available for all users in future
+    # shell sessions.
+    set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
+
+    check_apt_update_complete_cmd = (
+        'echo "Checking if apt update from container init is complete..."; '
+        'timeout_secs=600; '
+        'start_time=$(date +%s); '
+        'while ! grep -q "Fetched" /tmp/apt-update.log 2>/dev/null; do '
+        ' echo "apt update still running. Logs:"; '
+        ' cat /tmp/apt-update.log || true; '
+        ' current_time=$(date +%s); '
+        ' elapsed=$((current_time - start_time)); '
+        ' if [ $elapsed -ge $timeout_secs ]; then '
+        '  echo "Timed out waiting for apt update"; '
+        '  exit 1; '
+        ' fi; '
+        ' sleep 5; '
+        'done; '
+        'echo "apt update complete."; ')
+
+    install_ssh_k8s_cmd = (
+        'prefix_cmd() '
+        '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
+        'export DEBIAN_FRONTEND=noninteractive;'
+        'echo "Installing missing packages..."; '
+        'for i in {1..5}; do '
+        ' output=$($(prefix_cmd) apt install openssh-server rsync -y 2>&1); '
+        ' rc=$?; '
+        ' if [ $rc -eq 0 ]; then '
+        '  break; '
+        ' fi; '
+        ' echo "$output" | grep -qi "could not get lock" || '
+        ' grep -qi "Unable to acquire the dpkg frontend lock"; '
+        ' if [ $? -eq 0 ]; then '
+        '  echo "apt install failed due to lock, retrying. (Attempt $i/5)"; '
+        '  sleep 5; '
+        ' else '
+        '  echo "apt install failed for a non-lock reason: $output"; '
+        '  exit $rc; '
+        ' fi; '
+        'done; '
+        'if [ $rc -ne 0 ]; then '
+        ' echo "apt install failed after 5 attempts due to lock errors."; '
+        ' exit $rc; '
+        'fi; '
+        '$(prefix_cmd) mkdir -p /var/run/sshd; '
+        '$(prefix_cmd) '
+        'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
+        '/etc/ssh/sshd_config; '
+        '$(prefix_cmd) sed '
+        '"s@session\\s*required\\s*pam_loginuid.so@session optional '
+        'pam_loginuid.so@g" -i /etc/pam.d/sshd; '
+        'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; '
+        '$(prefix_cmd) mkdir -p ~/.ssh; '
+        '$(prefix_cmd) chown -R $(whoami) ~/.ssh;'
+        '$(prefix_cmd) chmod 700 ~/.ssh; '
+        '$(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > '
+        '~/.ssh/authorized_keys; '
+        '$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; '
+        '$(prefix_cmd) service ssh restart; '
+        # Eliminate the error
+        # `mesg: ttyname failed: inappropriate ioctl for device`.
+        # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device  # pylint: disable=line-too-long
+        '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
+
+    pre_init_cmd = ('set -ex; ' + check_k8s_user_sudo_cmd +
+                    set_k8s_env_var_cmd + check_apt_update_complete_cmd +
+                    install_ssh_k8s_cmd)
+
+    def _pre_init_thread(new_node):
+        pod_name = new_node.metadata.name
+        logger.info(f'{"-"*20}Start: Pre-init in pod {pod_name!r} {"-"*20}')
+        runner = command_runner.KubernetesCommandRunner(
+            ((namespace, context), pod_name))
+
+        # Run the combined pre-init command
+        rc, stdout, _ = runner.run(pre_init_cmd,
+                                   require_outputs=True,
+                                   stream_logs=False)
+        if rc == exceptions.INSUFFICIENT_PRIVILEGES_CODE:
             raise config_lib.KubernetesError(
                 'Insufficient system privileges detected. '
                 'Ensure the default user has root access or '
                 '"sudo" is installed and the user is added to the sudoers '
                 'from the image.')
 
+        op_name = 'pre-init'
+        _raise_command_running_error(op_name, pre_init_cmd, pod_name, rc,
+                                     stdout)
 
-def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None:
-    # Setting up ssh for the pod instance. This is already setup for
-    # the jump pod so it does not need to be run for it.
-    set_k8s_ssh_cmd = [
-        '/bin/sh',
-        '-c',
-        (
-            'set -x; '
-            'prefix_cmd() '
-            '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
-            'export DEBIAN_FRONTEND=noninteractive;'
-            '$(prefix_cmd) apt-get update;'
-            '$(prefix_cmd) apt install openssh-server rsync -y; '
-            '$(prefix_cmd) mkdir -p /var/run/sshd; '
-            '$(prefix_cmd) '
-            'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
-            '/etc/ssh/sshd_config; '
-            '$(prefix_cmd) sed '
-            '"s@session\\s*required\\s*pam_loginuid.so@session optional '
-            'pam_loginuid.so@g" -i /etc/pam.d/sshd; '
-            'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; '
-            '$(prefix_cmd) mkdir -p ~/.ssh; '
-            '$(prefix_cmd) chown -R $(whoami) ~/.ssh;'
-            '$(prefix_cmd) chmod 700 ~/.ssh; '
-            '$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; '
-            '$(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > '
-            '~/.ssh/authorized_keys; '
-            '$(prefix_cmd) service ssh restart; '
-            # Eliminate the error
-            # `mesg: ttyname failed: inappropriate ioctl for device`.
-            # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device  # pylint: disable=line-too-long
-            '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
-    ]
-    # TODO(romilb): Parallelize the setup of SSH in pods for multi-node clusters
-    for new_node in new_nodes:
-        pod_name = new_node.metadata.name
-        logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
-        _run_command_on_pods(new_node.metadata.name,
-                             namespace,
-                             set_k8s_ssh_cmd,
-                             stream_logs=True)
-        logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
+        logger.info(f'{"-"*20}End: Pre-init in pod {pod_name!r} {"-"*20}')
+
+    # Run pre_init in parallel across all new_nodes
+    subprocess_utils.run_in_parallel(_pre_init_thread, new_nodes, _NUM_THREADS)
 
 
-def _label_pod(namespace: str, pod_name: str, label: Dict[str, str]) -> None:
+def _label_pod(namespace: str, context: Optional[str], pod_name: str,
+               label: Dict[str, str]) -> None:
     """Label a pod."""
-    kubernetes.core_api().patch_namespaced_pod(
+    kubernetes.core_api(context).patch_namespaced_pod(
         pod_name,
         namespace, {'metadata': {
             'labels': label
@@ -396,11 +571,92 @@ def _label_pod(namespace: str, pod_name: str, label: Dict[str, str]) -> None:
         _request_timeout=kubernetes.API_TIMEOUT)
 
 
+@timeline.event
+def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
+                                        context: Optional[str]) -> Any:
+    """Attempts to create a Kubernetes Pod and handle any errors.
+
+    Currently, we handle errors due to the AppArmor annotation and retry if
+    it fails due to the `FieldValueForbidden` error.
+    See https://github.com/skypilot-org/skypilot/issues/4174 for details.
+
+    Returns: The created Pod object.
+    """
+    try:
+        # Attempt to create the Pod with the AppArmor annotation
+        pod = kubernetes.core_api(context).create_namespaced_pod(
+            namespace, pod_spec)
+        return pod
+    except kubernetes.api_exception() as e:
+        try:
+            error_body = json.loads(e.body)
+            error_message = error_body.get('message', '')
+        except json.JSONDecodeError:
+            error_message = str(e.body)
+        # Check if the error is due to the AppArmor annotation and retry.
+        # We add an AppArmor annotation to set it as unconfined in our
+        # base template in kubernetes-ray.yml.j2. This is required for
+        # FUSE to work in the pod on most Kubernetes distributions.
+        # However, some distributions do not support the AppArmor annotation
+        # and will fail to create the pod. In this case, we retry without
+        # the annotation.
+        if (e.status == 422 and 'FieldValueForbidden' in error_message and
+                'AppArmorProfile: nil' in error_message):
+            logger.warning('AppArmor annotation caused pod creation to fail. '
+                           'Retrying without the annotation. '
+                           'Note: this may cause bucket mounting to fail.')
+
+            # Remove the AppArmor annotation
+            annotations = pod_spec.get('metadata', {}).get('annotations', {})
+            if ('container.apparmor.security.beta.kubernetes.io/ray-node'
+                    in annotations):
+                del annotations[
+                    'container.apparmor.security.beta.kubernetes.io/ray-node']
+                pod_spec['metadata']['annotations'] = annotations
+                logger.info('AppArmor annotation removed from Pod spec.')
+            else:
+                logger.warning('AppArmor annotation not found in pod spec, '
+                               'retrying will not help. '
+                               f'Current annotations: {annotations}')
+                raise e
+
+            # Retry Pod creation without the AppArmor annotation
+            try:
+                pod = kubernetes.core_api(context).create_namespaced_pod(
+                    namespace, pod_spec)
+                logger.info(f'Pod {pod.metadata.name} created successfully '
+                            'without AppArmor annotation.')
+                return pod
+            except kubernetes.api_exception() as retry_exception:
+                logger.info('Failed to create Pod without AppArmor annotation: '
+                            f'{retry_exception}')
+                raise retry_exception
+        # Unlike other error from resource lackage on CPU/GPU/Memory, TPU
+        # lackage error is raised when pod is attemtped to be created.
+        # TODO(Doyoung): Update the error message raised with the multi-host
+        # TPU support.
+        elif 'Invalid resource requests for google.com/tpu.' in error_message:
+            extra_message = ('Verify if the cluster has a TPU slice node with '
+                             'a topology matching the number of TPU(s) '
+                             'requested. Note that multi-host TPU podslices '
+                             'are currently not unsupported.')
+            raise config_lib.KubernetesError(
+                _lack_resource_msg('TPU',
+                                   pod_spec,
+                                   details=error_message,
+                                   extra_msg=extra_message))
+        else:
+            # Re-raise the exception if it's a different error
+            raise e
+
+
+@timeline.event
 def _create_pods(region: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Create pods based on the config."""
     provider_config = config.provider_config
-    namespace = _get_namespace(provider_config)
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)
     pod_spec = copy.deepcopy(config.node_config)
     tags = {
         TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
@@ -413,17 +669,19 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     pod_spec['metadata']['labels'].update(
         {TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
 
-    terminating_pods = _filter_pods(namespace, tags, ['Terminating'])
+    terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
+                                                    ['Terminating'])
     start_time = time.time()
-    while (len(terminating_pods) > 0 and
+    while (terminating_pods and
            time.time() - start_time < _TIMEOUT_FOR_POD_TERMINATION):
         logger.debug(f'run_instances: Found {len(terminating_pods)} '
                      'terminating pods. Waiting them to finish: '
                      f'{list(terminating_pods.keys())}')
         time.sleep(POLL_INTERVAL)
-        terminating_pods = _filter_pods(namespace, tags, ['Terminating'])
+        terminating_pods = kubernetes_utils.filter_pods(namespace, context,
+                                                        tags, ['Terminating'])
 
-    if len(terminating_pods) > 0:
+    if terminating_pods:
         # If there are still terminating pods, we force delete them.
         logger.debug(f'run_instances: Found {len(terminating_pods)} '
                      'terminating pods still in terminating state after '
@@ -432,13 +690,14 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
         for pod_name in terminating_pods.keys():
             # grace_period_seconds=0 means force delete the pod.
             # https://github.com/kubernetes-client/python/issues/508#issuecomment-1695759777
-            kubernetes.core_api().delete_namespaced_pod(
+            kubernetes.core_api(context).delete_namespaced_pod(
                 pod_name,
                 namespace,
                 _request_timeout=config_lib.DELETION_TIMEOUT,
                 grace_period_seconds=0)
 
-    running_pods = _filter_pods(namespace, tags, ['Pending', 'Running'])
+    running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
+                                                ['Pending', 'Running'])
     head_pod_name = _get_head_pod_name(running_pods)
     logger.debug(f'Found {len(running_pods)} existing pods: '
                  f'{list(running_pods.keys())}')
@@ -456,7 +715,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     # Add nvidia runtime class if it exists
     nvidia_runtime_exists = False
     try:
-        nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class()
+        nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
+            context)
     except kubernetes.kubernetes.client.ApiException as e:
         logger.warning('run_instances: Error occurred while checking for '
                        f'nvidia RuntimeClass - '
@@ -464,32 +724,45 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                        'Continuing without using nvidia RuntimeClass.\n'
                        'If you are on a K3s cluster, manually '
                        'override runtimeClassName in ~/.sky/config.yaml. '
-                       'For more details, refer to https://skypilot.[...]')
+                       'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html')  # pylint: disable=line-too-long
+
+    needs_gpus = False
+    limits = pod_spec['spec']['containers'][0].get('resources',
+                                                   {}).get('limits')
+    if limits is not None:
+        needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0
 
-    if nvidia_runtime_exists:
+    # TPU pods provisioned on GKE use the default containerd runtime.
+    # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview  # pylint: disable=line-too-long
+    if nvidia_runtime_exists and needs_gpus:
         pod_spec['spec']['runtimeClassName'] = 'nvidia'
 
     created_pods = {}
     logger.debug(f'run_instances: calling create_namespaced_pod '
                  f'(count={to_start_count}).')
-    [...]
+
+    def _create_pod_thread(i: int):
+        pod_spec_copy = copy.deepcopy(pod_spec)
+        if head_pod_name is None and i == 0:
+            # First pod should be head if no head exists
+            pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
             head_selector = head_service_selector(cluster_name_on_cloud)
-            [...]
+            pod_spec_copy['metadata']['labels'].update(head_selector)
+            pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
         else:
-            [...]
+            # Worker pods
+            pod_spec_copy['metadata']['labels'].update(
+                constants.WORKER_NODE_TAGS)
+            pod_uuid = str(uuid.uuid4())[:6]
             pod_name = f'{cluster_name_on_cloud}-{pod_uuid}'
-            [...]
+            pod_spec_copy['metadata']['name'] = f'{pod_name}-worker'
         # For multi-node support, we put a soft-constraint to schedule
         # worker pods on different nodes than the head pod.
         # This is not set as a hard constraint because if different nodes
         # are not available, we still want to be able to schedule worker
         # pods on larger nodes which may be able to fit multiple SkyPilot
         # "nodes".
-        [...]
+        pod_spec_copy['spec']['affinity'] = {
             'podAntiAffinity': {
                 # Set as a soft constraint
                 'preferredDuringSchedulingIgnoredDuringExecution': [{
@@ -510,67 +783,67 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             }
         }

-
+        # TPU slice nodes are given a taint, google.com/tpu=present:NoSchedule.
+        # This is to prevent from non-TPU workloads from being scheduled on TPU
+        # slice nodes. We need this toleration to allow the pod to be scheduled
+        # on TPU nodes.
+        # Reference: https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work # pylint: disable=line-too-long
+        tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY
+        if tpu_label in config.node_config.get('spec',
+                                               {}).get('nodeSelector', {}):
+            tpu_toleration = {
+                'key': kubernetes_utils.TPU_RESOURCE_KEY,
+                'operator': 'Equal',
+                'value': 'present',
+                'effect': 'NoSchedule'
+            }
+            # Preserve existing tolerations if any
+            existing_tolerations = pod_spec_copy['spec'].get('tolerations', [])
+            pod_spec_copy['spec']['tolerations'] = existing_tolerations + [
+                tpu_toleration
+            ]
+
+        return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
+                                                   context)
+
+    # Create pods in parallel
+    pods = subprocess_utils.run_in_parallel(_create_pod_thread,
+                                            list(range(to_start_count)),
+                                            _NUM_THREADS)
+
+    # Process created pods
+    for pod in pods:
         created_pods[pod.metadata.name] = pod
-        if head_pod_name is None
+        if head_pod_name is None and pod.metadata.labels.get(
+                constants.TAG_RAY_NODE_KIND) == 'head':
             head_pod_name = pod.metadata.name

-
-
-
-
-
-
-
-
+    networking_mode = network_utils.get_networking_mode(
+        config.provider_config.get('networking_mode'))
+    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
+        # Adding the jump pod to the new_nodes list as well so it can be
+        # checked if it's scheduled and running along with other pods.
+        ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
+        jump_pod = kubernetes.core_api(context).read_namespaced_pod(
+            ssh_jump_pod_name, namespace)
+        pods.append(jump_pod)
     provision_timeout = provider_config['timeout']

     wait_str = ('indefinitely'
                 if provision_timeout < 0 else f'for {provision_timeout}s')
     logger.debug(f'run_instances: waiting {wait_str} for pods to schedule and '
-                 f'run: {
+                 f'run: {[pod.metadata.name for pod in pods]}')

     # Wait until the pods are scheduled and surface cause for error
     # if there is one
-    _wait_for_pods_to_schedule(namespace,
+    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout)
     # Wait until the pods and their containers are up and running, and
     # fail early if there is an error
     logger.debug(f'run_instances: waiting for pods to be running (pulling '
-                 f'images): {
-    _wait_for_pods_to_run(namespace,
+                 f'images): {[pod.metadata.name for pod in pods]}')
+    _wait_for_pods_to_run(namespace, context, pods)
     logger.debug(f'run_instances: all pods are scheduled and running: '
-                 f'{
-
-    running_pods = _filter_pods(namespace, tags, ['Running'])
-    initialized_pods = _filter_pods(namespace, {
-        TAG_POD_INITIALIZED: 'true',
-        **tags
-    }, ['Running'])
-    uninitialized_pods = {
-        pod_name: pod
-        for pod_name, pod in running_pods.items()
-        if pod_name not in initialized_pods
-    }
-    if len(uninitialized_pods) > 0:
-        logger.debug(f'run_instances: Initializing {len(uninitialized_pods)} '
-                     f'pods: {list(uninitialized_pods.keys())}')
-        uninitialized_pods_list = list(uninitialized_pods.values())
-
-        # Setup SSH and environment variables in pods.
-        # Make sure commands used in these methods are generic and work
-        # on most base images. E.g., do not use Python, since that may not
-        # be installed by default.
-        _check_user_privilege(namespace, uninitialized_pods_list)
-        _setup_ssh_in_pods(namespace, uninitialized_pods_list)
-        _set_env_vars_in_pods(namespace, uninitialized_pods_list)
-
-        for pod in uninitialized_pods.values():
-            _label_pod(namespace,
-                       pod.metadata.name,
-                       label={
-                           TAG_POD_INITIALIZED: 'true',
-                           **pod.metadata.labels
-                       })
+                 f'{[pod.metadata.name for pod in pods]}')

     assert head_pod_name is not None, 'head_instance_id should not be None'
     return common.ProvisionRecord(
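Editor's note: the added TPU branch appends a toleration rather than replacing the tolerations list, so any tolerations already on the pod spec survive. A minimal sketch of the same merge on a hypothetical pod spec dict (the selector label and existing toleration below are illustrative, not SkyPilot's):

    # Merge a TPU toleration into a pod spec while keeping existing ones.
    pod_spec = {
        'spec': {
            'nodeSelector': {'cloud.google.com/gke-tpu-accelerator': 'tpu-v4-podslice'},
            'tolerations': [{'key': 'example.com/foo', 'operator': 'Exists'}],
        }
    }

    tpu_toleration = {
        'key': 'google.com/tpu',  # matches the taint GKE places on TPU slice nodes
        'operator': 'Equal',
        'value': 'present',
        'effect': 'NoSchedule',
    }

    existing = pod_spec['spec'].get('tolerations', [])
    pod_spec['spec']['tolerations'] = existing + [tpu_toleration]
    print(pod_spec['spec']['tolerations'])  # both tolerations present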
@@ -590,7 +863,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     try:
         return _create_pods(region, cluster_name_on_cloud, config)
     except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
-
+        e_msg = common_utils.format_exception(e).replace('\n', ' ')
+        logger.warning('run_instances: Error occurred when creating pods: '
+                       f'{e_msg}')
         raise

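Editor's note: the new error path flattens the exception to a single line before logging so multi-line messages do not break log parsing. A small self-contained sketch of the same idea; format_exception here is a stand-in for SkyPilot's common_utils.format_exception:

    import traceback

    def format_exception(e: Exception) -> str:
        # Render only the exception type and message, like the real helper.
        return ''.join(traceback.format_exception_only(type(e), e))

    try:
        raise ValueError('first line\nsecond line')
    except ValueError as e:
        e_msg = format_exception(e).replace('\n', ' ')
        print(f'run_instances: Error occurred when creating pods: {e_msg}')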
@@ -607,35 +882,66 @@ def stop_instances(
     raise NotImplementedError()


-def _terminate_node(namespace: str,
+def _terminate_node(namespace: str, context: Optional[str],
+                    pod_name: str) -> None:
     """Terminate a pod."""
     logger.debug('terminate_instances: calling delete_namespaced_pod')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    def _delete_k8s_resource_with_retry(delete_func: Callable,
+                                        resource_type: str,
+                                        resource_name: str) -> None:
+        """Helper to delete Kubernetes resources with 404 handling and retries.
+
+        Args:
+            delete_func: Function to call to delete the resource
+            resource_type: Type of resource being deleted (e.g. 'service'),
+                used in logging
+            resource_name: Name of the resource being deleted, used in logging
+        """
+        max_retries = 3
+        retry_delay = 5  # seconds
+
+        for attempt in range(max_retries):
+            try:
+                delete_func()
+                return
+            except kubernetes.api_exception() as e:
+                if e.status == 404:
+                    logger.warning(
+                        f'terminate_instances: Tried to delete {resource_type} '
+                        f'{resource_name}, but the {resource_type} was not '
+                        'found (404).')
+                    return
+                elif attempt < max_retries - 1:
+                    logger.warning(f'terminate_instances: Failed to delete '
+                                   f'{resource_type} {resource_name} (attempt '
+                                   f'{attempt + 1}/{max_retries}). Error: {e}. '
+                                   f'Retrying in {retry_delay} seconds...')
+                    time.sleep(retry_delay)
+                else:
+                    raise
+
+    # Delete services for the pod
+    for service_name in [pod_name, f'{pod_name}-ssh']:
+        _delete_k8s_resource_with_retry(
+            delete_func=lambda name=service_name: kubernetes.core_api(
+                context).delete_namespaced_service(name=name,
+                                                   namespace=namespace,
+                                                   _request_timeout=config_lib.
+                                                   DELETION_TIMEOUT),
+            resource_type='service',
+            resource_name=service_name)
+
     # Note - delete pod after all other resources are deleted.
     # This is to ensure there are no leftover resources if this down is run
     # from within the pod, e.g., for autodown.
-
-    kubernetes.core_api().delete_namespaced_pod(
-        pod_name,
-
-
-
-
-    else:
-        raise
+    _delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
+            name=pod_name,
+            namespace=namespace,
+            _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pod',
+        resource_name=pod_name)


 def terminate_instances(
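Editor's note: the new deletion helper treats a 404 as success (the resource is already gone, which is what a delete wants) and retries other API errors a few times before re-raising. A self-contained sketch of the same pattern; ApiError is a stand-in for the Kubernetes client's ApiException:

    import time
    from typing import Callable

    class ApiError(Exception):
        """Stand-in for kubernetes.client.rest.ApiException."""
        def __init__(self, status: int):
            super().__init__(f'API error {status}')
            self.status = status

    def delete_with_retry(delete_func: Callable[[], None],
                          max_retries: int = 3,
                          retry_delay: float = 5.0) -> None:
        for attempt in range(max_retries):
            try:
                delete_func()
                return
            except ApiError as e:
                if e.status == 404:
                    return  # already deleted: success for a delete
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)  # transient error: back off, retry
                else:
                    raise

    # Usage: one transient 500 is retried; a 404 is swallowed.
    attempts = {'n': 0}
    def flaky_delete():
        attempts['n'] += 1
        if attempts['n'] == 1:
            raise ApiError(500)
    def already_gone():
        raise ApiError(404)

    delete_with_retry(flaky_delete, retry_delay=0.0)
    delete_with_retry(already_gone)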
@@ -644,20 +950,38 @@ def terminate_instances(
     worker_only: bool = False,
 ) -> None:
     """See sky/provision/__init__.py"""
-    namespace =
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)
     tag_filters = {
         TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
     }
-    pods =
+    pods = kubernetes_utils.filter_pods(namespace, context, tag_filters, None)
+
+    # Clean up the SSH jump pod if in use
+    networking_mode = network_utils.get_networking_mode(
+        provider_config.get('networking_mode'))
+    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
+        pod_name = list(pods.keys())[0]
+        try:
+            kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
+                                                       pod_name)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning('terminate_instances: Error occurred when analyzing '
+                           f'SSH Jump pod: {e}')

     def _is_head(pod) -> bool:
-        return pod.metadata.labels[TAG_RAY_NODE_KIND] == 'head'
+        return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'

-
-
+    def _terminate_pod_thread(pod_info):
+        pod_name, pod = pod_info
         if _is_head(pod) and worker_only:
-
-
+            return
+        logger.debug(f'Terminating instance {pod_name}: {pod}')
+        _terminate_node(namespace, context, pod_name)
+
+    # Run pod termination in parallel
+    subprocess_utils.run_in_parallel(_terminate_pod_thread, list(pods.items()),
+                                     _NUM_THREADS)


 def get_cluster_info(
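Editor's note: termination now fans out over a bounded thread pool (subprocess_utils.run_in_parallel) instead of deleting pods one by one. A minimal sketch of the same fan-out using the standard library; the pod objects and thread count here are illustrative:

    from concurrent.futures import ThreadPoolExecutor

    NUM_THREADS = 4  # stand-in for _NUM_THREADS

    def terminate_pod(pod_info):
        pod_name, _pod = pod_info
        print(f'Terminating {pod_name}')

    pods = {'head-pod': object(), 'worker-1': object(), 'worker-2': object()}
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as pool:
        # list() drains the iterator so all tasks finish and exceptions surface.
        list(pool.map(terminate_pod, list(pods.items())))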
@@ -666,12 +990,15 @@ def get_cluster_info(
         provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
     del region  # unused
     assert provider_config is not None
-    namespace =
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)
     tag_filters = {
         TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
     }

-    running_pods =
+    running_pods = kubernetes_utils.filter_pods(namespace, context, tag_filters,
+                                                ['Running'])
+
     pods: Dict[str, List[common.InstanceInfo]] = {}
     head_pod_name = None
@@ -680,11 +1007,11 @@ def get_cluster_info(
         port_forward_mode.value)
     network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
         network_mode_str)
-    external_ip = kubernetes_utils.get_external_ip(network_mode)
+    external_ip = kubernetes_utils.get_external_ip(network_mode, context)
     port = 22
     if not provider_config.get('use_internal_ips', False):
         port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
-                                                  namespace)
+                                                  namespace, context)

     head_pod_name = None
     cpu_request = None
@@ -700,7 +1027,7 @@ def get_cluster_info(
                 tags=pod.metadata.labels,
             )
         ]
-        if pod.metadata.labels[TAG_RAY_NODE_KIND] == 'head':
+        if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head':
             head_pod_name = pod_name
             head_spec = pod.spec
             assert head_spec is not None, pod
@@ -709,11 +1036,17 @@ def get_cluster_info(
     assert cpu_request is not None, 'cpu_request should not be None'

     ssh_user = 'sky'
-    get_k8s_ssh_user_cmd =
+    get_k8s_ssh_user_cmd = 'echo $(whoami)'
     assert head_pod_name is not None
-
-
-
+    runner = command_runner.KubernetesCommandRunner(
+        ((namespace, context), head_pod_name))
+    rc, stdout, stderr = runner.run(get_k8s_ssh_user_cmd,
+                                    require_outputs=True,
+                                    separate_stderr=True,
+                                    stream_logs=False)
+    _raise_command_running_error('get ssh user', get_k8s_ssh_user_cmd,
+                                 head_pod_name, rc, stdout + stderr)
+    ssh_user = stdout.strip()
     logger.debug(
         f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')

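Editor's note: instead of assuming the 'sky' user, the provisioner now runs a command inside the head pod and reads its stdout to learn the actual remote user. A sketch of the same probe done with kubectl for illustration (this is not SkyPilot's KubernetesCommandRunner, and it assumes kubectl is already pointed at the right context):

    import subprocess

    def get_pod_ssh_user(namespace: str, pod_name: str) -> str:
        # Run `whoami` inside the pod and return its trimmed stdout.
        result = subprocess.run(
            ['kubectl', '-n', namespace, 'exec', pod_name, '--', 'whoami'],
            capture_output=True, text=True, check=True)
        return result.stdout.strip()

    # e.g. get_pod_ssh_user('default', 'mycluster-head')  ->  'sky'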
@@ -737,7 +1070,6 @@ def query_instances(
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True
 ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
-    del provider_config  # unused
     status_map = {
         'Pending': status_lib.ClusterStatus.INIT,
         'Running': status_lib.ClusterStatus.UP,
@@ -747,11 +1079,13 @@ def query_instances(
         'Terminating': None,
     }

-
+    assert provider_config is not None
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)

     # Get all the pods with the label skypilot-cluster: <cluster_name>
     try:
-        pods = kubernetes.core_api().list_namespaced_pod(
+        pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
             label_selector=f'skypilot-cluster={cluster_name_on_cloud}',
             _request_timeout=kubernetes.API_TIMEOUT).items
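Editor's note: the query now goes through SkyPilot's kubernetes adaptor with an explicit kubeconfig context. The equivalent label-selector query written directly against the official Kubernetes Python client looks roughly like this (a sketch, not the adaptor's code; the namespace and cluster name are placeholders):

    from typing import Optional
    from kubernetes import client, config

    def list_cluster_pods(namespace: str, cluster_name: str,
                          context: Optional[str] = None):
        # Load kubeconfig for the requested context, then filter pods by the
        # skypilot-cluster label, as the diff does via the adaptor.
        config.load_kube_config(context=context)
        v1 = client.CoreV1Api()
        return v1.list_namespaced_pod(
            namespace,
            label_selector=f'skypilot-cluster={cluster_name}').items

    # for pod in list_cluster_pods('default', 'mycluster'):
    #     print(pod.metadata.name, pod.status.phase)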
@@ -776,3 +1110,24 @@ def query_instances(
             continue
         cluster_status[pod.metadata.name] = pod_status
     return cluster_status
+
+
+def get_command_runners(
+    cluster_info: common.ClusterInfo,
+    **credentials: Dict[str, Any],
+) -> List[command_runner.CommandRunner]:
+    """Get a command runner for the given cluster."""
+    assert cluster_info.provider_config is not None, cluster_info
+    instances = cluster_info.instances
+    namespace = kubernetes_utils.get_namespace_from_config(
+        cluster_info.provider_config)
+    context = kubernetes_utils.get_context_from_config(
+        cluster_info.provider_config)
+    node_list = []
+    if cluster_info.head_instance_id is not None:
+        node_list = [((namespace, context), cluster_info.head_instance_id)]
+    node_list.extend(((namespace, context), pod_name)
+                     for pod_name in instances.keys()
+                     if pod_name != cluster_info.head_instance_id)
+    return command_runner.KubernetesCommandRunner.make_runner_list(
+        node_list=node_list, **credentials)
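Editor's note: get_command_runners orders the head pod first and keys every entry by (namespace, context) so each runner targets the right cluster. A minimal sketch of just that list construction, with placeholder values:

    from typing import List, Optional, Tuple

    NodeKey = Tuple[Tuple[str, Optional[str]], str]

    def build_node_list(namespace: str, context: Optional[str],
                        head: Optional[str],
                        pod_names: List[str]) -> List[NodeKey]:
        node_list: List[NodeKey] = []
        if head is not None:
            node_list.append(((namespace, context), head))  # head pod first
        node_list.extend(((namespace, context), name)
                         for name in pod_names if name != head)
        return node_list

    print(build_node_list('default', None, 'head-pod',
                          ['head-pod', 'worker-1', 'worker-2']))
    # [(('default', None), 'head-pod'), (('default', None), 'worker-1'),
    #  (('default', None), 'worker-2')]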