skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
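The section below is the rendered source diff for sky/provision/kubernetes/instance.py (entry 134 above, +680 -325), which reworks the Kubernetes pod provisioning path: pod filtering moves to kubernetes_utils.filter_pods, SSH and environment-variable setup is consolidated into a parallel pre_init step, and the Kubernetes API calls become context-aware.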
@@ -1,78 +1,43 @@
1
1
  """Kubernetes instance provisioning."""
2
2
  import copy
3
+ import json
3
4
  import time
4
- from typing import Any, Dict, List, Optional
5
+ from typing import Any, Callable, Dict, List, Optional, Union
5
6
  import uuid
6
7
 
7
8
  from sky import exceptions
8
9
  from sky import sky_logging
9
10
  from sky import skypilot_config
10
- from sky import status_lib
11
11
  from sky.adaptors import kubernetes
12
12
  from sky.provision import common
13
+ from sky.provision import constants
13
14
  from sky.provision import docker_utils
14
15
  from sky.provision.kubernetes import config as config_lib
16
+ from sky.provision.kubernetes import network_utils
15
17
  from sky.provision.kubernetes import utils as kubernetes_utils
18
+ from sky.utils import command_runner
16
19
  from sky.utils import common_utils
17
20
  from sky.utils import kubernetes_enums
21
+ from sky.utils import status_lib
22
+ from sky.utils import subprocess_utils
23
+ from sky.utils import timeline
18
24
  from sky.utils import ux_utils
19
25
 
20
26
  POLL_INTERVAL = 2
21
27
  _TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
28
+ _MAX_RETRIES = 3
29
+ _NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')
22
30
 
23
31
  logger = sky_logging.init_logger(__name__)
24
32
  TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
25
33
  TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
26
- TAG_RAY_NODE_KIND = 'ray-node-type' # legacy tag for backward compatibility
27
34
  TAG_POD_INITIALIZED = 'skypilot-initialized'
28
35
 
29
- POD_STATUSES = {
30
- 'Pending', 'Running', 'Succeeded', 'Failed', 'Unknown', 'Terminating'
31
- }
32
-
33
-
34
- def to_label_selector(tags):
35
- label_selector = ''
36
- for k, v in tags.items():
37
- if label_selector != '':
38
- label_selector += ','
39
- label_selector += '{}={}'.format(k, v)
40
- return label_selector
41
-
42
-
43
- def _get_namespace(provider_config: Dict[str, Any]) -> str:
44
- return provider_config.get(
45
- 'namespace',
46
- kubernetes_utils.get_current_kube_config_context_namespace())
47
-
48
-
49
- def _filter_pods(namespace: str, tag_filters: Dict[str, str],
50
- status_filters: Optional[List[str]]) -> Dict[str, Any]:
51
- """Filters pods by tags and status."""
52
- non_included_pod_statuses = POD_STATUSES.copy()
53
-
54
- field_selector = ''
55
- if status_filters is not None:
56
- non_included_pod_statuses -= set(status_filters)
57
- field_selector = ','.join(
58
- [f'status.phase!={status}' for status in non_included_pod_statuses])
59
-
60
- label_selector = to_label_selector(tag_filters)
61
- pod_list = kubernetes.core_api().list_namespaced_pod(
62
- namespace, field_selector=field_selector, label_selector=label_selector)
63
-
64
- # Don't return pods marked for deletion,
65
- # i.e. pods with non-null metadata.DeletionTimestamp.
66
- pods = [
67
- pod for pod in pod_list.items if pod.metadata.deletion_timestamp is None
68
- ]
69
- return {pod.metadata.name: pod for pod in pods}
70
-
71
36
 
72
37
  def _get_head_pod_name(pods: Dict[str, Any]) -> Optional[str]:
73
38
  head_pod_name = None
74
39
  for pod_name, pod in pods.items():
75
- if pod.metadata.labels[TAG_RAY_NODE_KIND] == 'head':
40
+ if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head':
76
41
  head_pod_name = pod_name
77
42
  break
78
43
  return head_pod_name
@@ -83,16 +48,85 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]:
83
48
  return {'component': f'{cluster_name}-head'}
84
49
 
85
50
 
86
- def _raise_pod_scheduling_errors(namespace, new_nodes):
51
+ def _formatted_resource_requirements(pod_or_spec: Union[Any, dict]) -> str:
52
+ # Returns a formatted string of resource requirements for a pod.
53
+ resource_requirements = {}
54
+
55
+ if isinstance(pod_or_spec, dict):
56
+ containers = pod_or_spec.get('spec', {}).get('containers', [])
57
+ else:
58
+ containers = pod_or_spec.spec.containers
59
+
60
+ for container in containers:
61
+ if isinstance(container, dict):
62
+ resources = container.get('resources', {})
63
+ requests = resources.get('requests', {})
64
+ else:
65
+ resources = container.resources
66
+ requests = resources.requests or {}
67
+
68
+ for resource, value in requests.items():
69
+ if resource not in resource_requirements:
70
+ resource_requirements[resource] = 0
71
+ if resource == 'memory':
72
+ int_value = kubernetes_utils.parse_memory_resource(value)
73
+ else:
74
+ int_value = kubernetes_utils.parse_cpu_or_gpu_resource(value)
75
+ resource_requirements[resource] += int(int_value)
76
+ return ', '.join(f'{resource}={value}'
77
+ for resource, value in resource_requirements.items())
78
+
79
+
80
+ def _formatted_node_selector(pod_or_spec: Union[Any, dict]) -> Optional[str]:
81
+ # Returns a formatted string of node selectors for a pod.
82
+ node_selectors = []
83
+
84
+ if isinstance(pod_or_spec, dict):
85
+ selectors = pod_or_spec.get('spec', {}).get('nodeSelector', {})
86
+ else:
87
+ selectors = pod_or_spec.spec.node_selector
88
+
89
+ if not selectors:
90
+ return None
91
+
92
+ for label_key, label_value in selectors.items():
93
+ node_selectors.append(f'{label_key}={label_value}')
94
+ return ', '.join(node_selectors)
95
+
96
+
97
+ def _lack_resource_msg(resource: str,
98
+ pod_or_spec: Union[Any, dict],
99
+ extra_msg: Optional[str] = None,
100
+ details: Optional[str] = None) -> str:
101
+ resource_requirements = _formatted_resource_requirements(pod_or_spec)
102
+ node_selectors = _formatted_node_selector(pod_or_spec)
103
+ node_selector_str = f' and labels ({node_selectors})' if (
104
+ node_selectors) else ''
105
+ msg = (f'Insufficient {resource} capacity on the cluster. '
106
+ f'Required resources ({resource_requirements}){node_selector_str} '
107
+ 'were not found in a single node. Other SkyPilot tasks or pods may '
108
+ 'be using resources. Check resource usage by running '
109
+ '`kubectl describe nodes`.')
110
+ if extra_msg:
111
+ msg += f' {extra_msg}'
112
+ if details:
113
+ msg += f'\nFull error: {details}'
114
+ return msg
115
+
116
+
117
+ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
87
118
  """Raise pod scheduling failure reason.
88
119
 
89
120
  When a pod fails to schedule in Kubernetes, the reasons for the failure
90
121
  are recorded as events. This function retrieves those events and raises
91
122
  descriptive errors for better debugging and user feedback.
92
123
  """
124
+ timeout_err_msg = ('Timed out while waiting for nodes to start. '
125
+ 'Cluster may be out of resources or '
126
+ 'may be too slow to autoscale.')
93
127
  for new_node in new_nodes:
94
- pod = kubernetes.core_api().read_namespaced_pod(new_node.metadata.name,
95
- namespace)
128
+ pod = kubernetes.core_api(context).read_namespaced_pod(
129
+ new_node.metadata.name, namespace)
96
130
  pod_status = pod.status.phase
97
131
  # When there are multiple pods involved while launching instance,
98
132
  # there may be a single pod causing issue while others are
@@ -101,7 +135,7 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
101
135
  if pod_status != 'Pending':
102
136
  continue
103
137
  pod_name = pod._metadata._name # pylint: disable=protected-access
104
- events = kubernetes.core_api().list_namespaced_event(
138
+ events = kubernetes.core_api(context).list_namespaced_event(
105
139
  namespace,
106
140
  field_selector=(f'involvedObject.name={pod_name},'
107
141
  'involvedObject.kind=Pod'))
@@ -118,24 +152,25 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
118
152
  if event.reason == 'FailedScheduling':
119
153
  event_message = event.message
120
154
  break
121
- timeout_err_msg = ('Timed out while waiting for nodes to start. '
122
- 'Cluster may be out of resources or '
123
- 'may be too slow to autoscale.')
124
- lack_resource_msg = (
125
- 'Insufficient {resource} capacity on the cluster. '
126
- 'Other SkyPilot tasks or pods may be using resources. '
127
- 'Check resource usage by running `kubectl describe nodes`.')
128
155
  if event_message is not None:
129
156
  if pod_status == 'Pending':
157
+ logger.info(event_message)
130
158
  if 'Insufficient cpu' in event_message:
131
159
  raise config_lib.KubernetesError(
132
- lack_resource_msg.format(resource='CPU'))
160
+ _lack_resource_msg('CPU', pod, details=event_message))
133
161
  if 'Insufficient memory' in event_message:
134
162
  raise config_lib.KubernetesError(
135
- lack_resource_msg.format(resource='memory'))
163
+ _lack_resource_msg('memory', pod,
164
+ details=event_message))
165
+ if 'Insufficient smarter-devices/fuse' in event_message:
166
+ raise config_lib.KubernetesError(
167
+ 'Something went wrong with FUSE device daemonset.'
168
+ ' Try restarting your FUSE pods by running '
169
+ '`kubectl delete pods -n skypilot-system -l name=smarter-device-manager`.' # pylint: disable=line-too-long
170
+ f' Full error: {event_message}')
136
171
  gpu_lf_keys = [
137
- lf.get_label_key()
138
- for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
172
+ key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
173
+ for key in lf.get_label_keys()
139
174
  ]
140
175
  if pod.spec.node_selector:
141
176
  for label_key in pod.spec.node_selector.keys():
@@ -143,22 +178,52 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
143
178
  # TODO(romilb): We may have additional node
144
179
  # affinity selectors in the future - in that
145
180
  # case we will need to update this logic.
146
- if (('Insufficient nvidia.com/gpu'
147
- in event_message) or
148
- ('didn\'t match Pod\'s node affinity/selector'
149
- in event_message)):
150
- msg = lack_resource_msg.format(resource='GPU')
151
- raise config_lib.KubernetesError(
152
- f'{msg} Verify if '
181
+ # TODO(Doyoung): Update the error message raised
182
+ # with the multi-host TPU support.
183
+ gpu_resource_key = kubernetes_utils.get_gpu_resource_key() # pylint: disable=line-too-long
184
+ if 'Insufficient google.com/tpu' in event_message:
185
+ extra_msg = (
186
+ f'Verify if '
153
187
  f'{pod.spec.node_selector[label_key]}'
154
- ' is available in the cluster.')
188
+ ' is available in the cluster. Note '
189
+ 'that multi-host TPU podslices are '
190
+ 'currently not unsupported.')
191
+ raise config_lib.KubernetesError(
192
+ _lack_resource_msg('TPU',
193
+ pod,
194
+ extra_msg,
195
+ details=event_message))
196
+ elif ((f'Insufficient {gpu_resource_key}'
197
+ in event_message) or
198
+ ('didn\'t match Pod\'s node affinity/selector'
199
+ in event_message)):
200
+ extra_msg = (
201
+ f'Verify if any node matching label '
202
+ f'{pod.spec.node_selector[label_key]} and '
203
+ f'sufficient resource {gpu_resource_key} '
204
+ f'is available in the cluster.')
205
+ raise config_lib.KubernetesError(
206
+ _lack_resource_msg('GPU',
207
+ pod,
208
+ extra_msg,
209
+ details=event_message))
155
210
  raise config_lib.KubernetesError(f'{timeout_err_msg} '
156
211
  f'Pod status: {pod_status}'
157
212
  f'Details: \'{event_message}\' ')
158
213
  raise config_lib.KubernetesError(f'{timeout_err_msg}')
159
214
 
160
215
 
161
- def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
216
+ def _raise_command_running_error(message: str, command: str, pod_name: str,
217
+ rc: int, stdout: str) -> None:
218
+ if rc == 0:
219
+ return
220
+ raise config_lib.KubernetesError(
221
+ f'Failed to {message} for pod {pod_name} with return '
222
+ f'code {rc}: {command!r}\nOutput: {stdout}.')
223
+
224
+
225
+ @timeline.event
226
+ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
162
227
  """Wait for all pods to be scheduled.
163
228
 
164
229
  Wait for all pods including jump pod to be scheduled, and if it
@@ -168,6 +233,10 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
168
233
 
169
234
  If timeout is set to a negative value, this method will wait indefinitely.
170
235
  """
236
+ # Create a set of pod names we're waiting for
237
+ if not new_nodes:
238
+ return
239
+ expected_pod_names = {node.metadata.name for node in new_nodes}
171
240
  start_time = time.time()
172
241
 
173
242
  def _evaluate_timeout() -> bool:
@@ -177,25 +246,40 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
177
246
  return time.time() - start_time < timeout
178
247
 
179
248
  while _evaluate_timeout():
180
- all_pods_scheduled = True
181
- for node in new_nodes:
182
- # Iterate over each pod to check their status
183
- pod = kubernetes.core_api().read_namespaced_pod(
184
- node.metadata.name, namespace)
185
- if pod.status.phase == 'Pending':
249
+ # Get all pods in a single API call using the cluster name label
250
+ # which all pods in new_nodes should share
251
+ cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
252
+ pods = kubernetes.core_api(context).list_namespaced_pod(
253
+ namespace,
254
+ label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
255
+
256
+ # Get the set of found pod names and check if we have all expected pods
257
+ found_pod_names = {pod.metadata.name for pod in pods}
258
+ missing_pods = expected_pod_names - found_pod_names
259
+ if missing_pods:
260
+ logger.info('Retrying waiting for pods: '
261
+ f'Missing pods: {missing_pods}')
262
+ time.sleep(0.5)
263
+ continue
264
+
265
+ # Check if all pods are scheduled
266
+ all_scheduled = True
267
+ for pod in pods:
268
+ if (pod.metadata.name in expected_pod_names and
269
+ pod.status.phase == 'Pending'):
186
270
  # If container_statuses is None, then the pod hasn't
187
271
  # been scheduled yet.
188
272
  if pod.status.container_statuses is None:
189
- all_pods_scheduled = False
273
+ all_scheduled = False
190
274
  break
191
275
 
192
- if all_pods_scheduled:
276
+ if all_scheduled:
193
277
  return
194
278
  time.sleep(1)
195
279
 
196
280
  # Handle pod scheduling errors
197
281
  try:
198
- _raise_pod_scheduling_errors(namespace, new_nodes)
282
+ _raise_pod_scheduling_errors(namespace, context, new_nodes)
199
283
  except config_lib.KubernetesError:
200
284
  raise
201
285
  except Exception as e:
@@ -205,19 +289,64 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
205
289
  f'Error: {common_utils.format_exception(e)}') from None
206
290
 
207
291
 
208
- def _wait_for_pods_to_run(namespace, new_nodes):
292
+ @timeline.event
293
+ def _wait_for_pods_to_run(namespace, context, new_nodes):
209
294
  """Wait for pods and their containers to be ready.
210
295
 
211
296
  Pods may be pulling images or may be in the process of container
212
297
  creation.
213
298
  """
299
+ if not new_nodes:
300
+ return
301
+
302
+ # Create a set of pod names we're waiting for
303
+ expected_pod_names = {node.metadata.name for node in new_nodes}
304
+
305
+ def _check_init_containers(pod):
306
+ # Check if any of the init containers failed
307
+ # to start. Could be because the init container
308
+ # command failed or failed to pull image etc.
309
+ for init_status in pod.status.init_container_statuses:
310
+ init_terminated = init_status.state.terminated
311
+ if init_terminated:
312
+ if init_terminated.exit_code != 0:
313
+ msg = init_terminated.message if (
314
+ init_terminated.message) else str(init_terminated)
315
+ raise config_lib.KubernetesError(
316
+ 'Failed to run init container for pod '
317
+ f'{pod.metadata.name}. Error details: {msg}.')
318
+ continue
319
+ init_waiting = init_status.state.waiting
320
+ if (init_waiting is not None and init_waiting.reason
321
+ not in ['ContainerCreating', 'PodInitializing']):
322
+ # TODO(romilb): There may be more states to check for. Add
323
+ # them as needed.
324
+ msg = init_waiting.message if (
325
+ init_waiting.message) else str(init_waiting)
326
+ raise config_lib.KubernetesError(
327
+ 'Failed to create init container for pod '
328
+ f'{pod.metadata.name}. Error details: {msg}.')
329
+
214
330
  while True:
215
- all_pods_running = True
216
- # Iterate over each pod to check their status
217
- for node in new_nodes:
218
- pod = kubernetes.core_api().read_namespaced_pod(
219
- node.metadata.name, namespace)
331
+ # Get all pods in a single API call
332
+ cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
333
+ all_pods = kubernetes.core_api(context).list_namespaced_pod(
334
+ namespace,
335
+ label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
336
+
337
+ # Get the set of found pod names and check if we have all expected pods
338
+ found_pod_names = {pod.metadata.name for pod in all_pods}
339
+ missing_pods = expected_pod_names - found_pod_names
340
+ if missing_pods:
341
+ logger.info('Retrying running pods check: '
342
+ f'Missing pods: {missing_pods}')
343
+ time.sleep(0.5)
344
+ continue
220
345
 
346
+ all_pods_running = True
347
+ for pod in all_pods:
348
+ if pod.metadata.name not in expected_pod_names:
349
+ continue
221
350
  # Continue if pod and all the containers within the
222
351
  # pod are successfully created and running.
223
352
  if pod.status.phase == 'Running' and all(
@@ -235,12 +364,15 @@ def _wait_for_pods_to_run(namespace, new_nodes):
235
364
  # See list of possible reasons for waiting here:
236
365
  # https://stackoverflow.com/a/57886025
237
366
  waiting = container_status.state.waiting
238
- if (waiting is not None and
239
- waiting.reason != 'ContainerCreating'):
240
- raise config_lib.KubernetesError(
241
- 'Failed to create container while launching '
242
- 'the node. Error details: '
243
- f'{container_status.state.waiting.message}.')
367
+ if waiting is not None:
368
+ if waiting.reason == 'PodInitializing':
369
+ _check_init_containers(pod)
370
+ elif waiting.reason != 'ContainerCreating':
371
+ msg = waiting.message if waiting.message else str(
372
+ waiting)
373
+ raise config_lib.KubernetesError(
374
+ 'Failed to create container while launching '
375
+ f'the node. Error details: {msg}.')
244
376
  # Reaching this point means that one of the pods had an issue,
245
377
  # so break out of the loop, and wait until next second.
246
378
  break
@@ -250,145 +382,188 @@ def _wait_for_pods_to_run(namespace, new_nodes):
250
382
  time.sleep(1)
251
383
 
252
384
 
253
- def _run_command_on_pods(node_name: str,
254
- node_namespace: str,
255
- command: List[str],
256
- stream_logs: bool = False):
257
- """Run command on Kubernetes pods.
385
+ def _run_function_with_retries(func: Callable,
386
+ operation_name: str,
387
+ max_retries: int = _MAX_RETRIES,
388
+ retry_delay: int = 5) -> Any:
389
+ """Runs a function with retries on Kubernetes errors.
390
+
391
+ Args:
392
+ func: Function to retry
393
+ operation_name: Name of the operation for logging
394
+ max_retries: Maximum number of retry attempts
395
+ retry_delay: Delay between retries in seconds
258
396
 
259
- If `stream_logs` is True, we poll for output and error messages while the
260
- command is executing, and the stdout and stderr is written to logger.info.
261
- When called from the provisioner, this logger.info is written to the
262
- provision.log file (see setup_provision_logging()).
397
+ Raises:
398
+ The last exception encountered if all retries fail.
263
399
  """
264
- cmd_output = kubernetes.stream()(
265
- kubernetes.core_api().connect_get_namespaced_pod_exec,
266
- node_name,
267
- node_namespace,
268
- command=command,
269
- stderr=True,
270
- stdin=False,
271
- stdout=True,
272
- tty=False,
273
- _preload_content=(not stream_logs),
274
- _request_timeout=kubernetes.API_TIMEOUT)
275
- if stream_logs:
276
- while cmd_output.is_open():
277
- cmd_output.update(timeout=1)
278
- if cmd_output.peek_stdout():
279
- logger.info(f'{cmd_output.read_stdout().strip()}')
280
- if cmd_output.peek_stderr():
281
- logger.info(f'{cmd_output.read_stderr().strip()}')
282
- cmd_output.close()
283
- return cmd_output
284
-
285
-
286
- def _set_env_vars_in_pods(namespace: str, new_pods: List):
287
- """Setting environment variables in pods.
288
-
289
- Once all containers are ready, we can exec into them and set env vars.
290
- Kubernetes automatically populates containers with critical
291
- environment variables, such as those for discovering services running
292
- in the cluster and CUDA/nvidia environment variables. We need to
293
- make sure these env vars are available in every task and ssh session.
294
- This is needed for GPU support and service discovery.
295
- See https://github.com/skypilot-org/skypilot/issues/2287 for
296
- more details.
297
-
298
- To do so, we capture env vars from the pod's runtime and write them to
299
- /etc/profile.d/, making them available for all users in future
300
- shell sessions.
400
+ for attempt in range(max_retries + 1):
401
+ try:
402
+ return func()
403
+ except config_lib.KubernetesError:
404
+ if attempt < max_retries:
405
+ logger.warning(f'Failed to {operation_name} - '
406
+ f'retrying in {retry_delay} seconds.')
407
+ time.sleep(retry_delay)
408
+ else:
409
+ raise
410
+
411
+
412
+ @timeline.event
413
+ def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
414
+ """Pre-initialization step for SkyPilot pods.
415
+
416
+ This step is run in the pod right after it is created and before the
417
+ SkyPilot runtime is setup.
418
+
419
+ This step includes three key steps:
420
+
421
+ 1. Privilege check: Checks if the default user has sufficient privilege
422
+ to set up the kubernetes instance pod.
423
+ 2. SSH setup: Sets up SSH for the pod instance.
424
+ 3. Environment variable setup to populate k8s env vars in the pod.
425
+
426
+ Make sure commands used in these methods are generic and work
427
+ on most base images. E.g., do not use Python, since that may not
428
+ be installed by default.
429
+
430
+ If you run any apt commands, be sure to check if the lock is available.
431
+ It is possible the `apt update` run in the pod container args may still
432
+ be running.
433
+
434
+ Args:
435
+ namespace (str): Kubernetes namespace.
436
+ context (Optional[str]): Kubernetes context.
437
+ new_nodes (List): List of new pod instances.
438
+
439
+ Raises:
440
+ config_lib.KubernetesError: If user privileges are insufficient or
441
+ setup fails.
301
442
  """
302
- set_k8s_env_var_cmd = [
303
- '/bin/sh',
304
- '-c',
305
- docker_utils.SETUP_ENV_VARS_CMD,
306
- ]
307
-
308
- for new_pod in new_pods:
309
- _run_command_on_pods(new_pod.metadata.name, namespace,
310
- set_k8s_env_var_cmd)
311
-
312
-
313
- def _check_user_privilege(namespace: str, new_nodes: List) -> None:
314
- # Checks if the default user has sufficient privilege to set up
315
- # the kubernetes instance pod.
316
- check_k8s_user_sudo_cmd = [
317
- '/bin/sh',
318
- '-c',
319
- (
320
- 'if [ $(id -u) -eq 0 ]; then'
321
- # If user is root, create an alias for sudo used in skypilot setup
322
- ' echo \'alias sudo=""\' >> ~/.bashrc; '
323
- 'else '
324
- ' if command -v sudo >/dev/null 2>&1; then '
325
- ' timeout 2 sudo -l >/dev/null 2>&1 || '
326
- f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
327
- ' else '
328
- f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
329
- ' fi; '
330
- 'fi')
331
- ]
332
443
 
333
- for new_node in new_nodes:
334
- privilege_check = _run_command_on_pods(new_node.metadata.name,
335
- namespace,
336
- check_k8s_user_sudo_cmd)
337
- if privilege_check == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE):
444
+ check_k8s_user_sudo_cmd = (
445
+ 'if [ $(id -u) -eq 0 ]; then'
446
+ # If user is root, create an alias for sudo used in skypilot setup
447
+ ' echo \'alias sudo=""\' >> ~/.bashrc; echo succeed;'
448
+ 'else '
449
+ ' if command -v sudo >/dev/null 2>&1; then '
450
+ ' timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || '
451
+ f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
452
+ f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
453
+ ' else '
454
+ f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
455
+ f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
456
+ ' fi; '
457
+ 'fi;')
458
+
459
+ # Kubernetes automatically populates containers with critical
460
+ # environment variables, such as those for discovering services running
461
+ # in the cluster and CUDA/nvidia environment variables. We need to
462
+ # make sure these env vars are available in every task and ssh session.
463
+ # This is needed for GPU support and service discovery.
464
+ # See https://github.com/skypilot-org/skypilot/issues/2287 for more details.
465
+ # To do so, we capture env vars from the pod's runtime and write them to
466
+ # /etc/profile.d/, making them available for all users in future
467
+ # shell sessions.
468
+ set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
469
+
470
+ check_apt_update_complete_cmd = (
471
+ 'echo "Checking if apt update from container init is complete..."; '
472
+ 'timeout_secs=600; '
473
+ 'start_time=$(date +%s); '
474
+ 'while ! grep -q "Fetched" /tmp/apt-update.log 2>/dev/null; do '
475
+ ' echo "apt update still running. Logs:"; '
476
+ ' cat /tmp/apt-update.log || true; '
477
+ ' current_time=$(date +%s); '
478
+ ' elapsed=$((current_time - start_time)); '
479
+ ' if [ $elapsed -ge $timeout_secs ]; then '
480
+ ' echo "Timed out waiting for apt update"; '
481
+ ' exit 1; '
482
+ ' fi; '
483
+ ' sleep 5; '
484
+ 'done; '
485
+ 'echo "apt update complete."; ')
486
+
487
+ install_ssh_k8s_cmd = (
488
+ 'prefix_cmd() '
489
+ '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
490
+ 'export DEBIAN_FRONTEND=noninteractive;'
491
+ 'echo "Installing missing packages..."; '
492
+ 'for i in {1..5}; do '
493
+ ' output=$($(prefix_cmd) apt install openssh-server rsync -y 2>&1); '
494
+ ' rc=$?; '
495
+ ' if [ $rc -eq 0 ]; then '
496
+ ' break; '
497
+ ' fi; '
498
+ ' echo "$output" | grep -qi "could not get lock" || '
499
+ ' grep -qi "Unable to acquire the dpkg frontend lock"; '
500
+ ' if [ $? -eq 0 ]; then '
501
+ ' echo "apt install failed due to lock, retrying. (Attempt $i/5)"; '
502
+ ' sleep 5; '
503
+ ' else '
504
+ ' echo "apt install failed for a non-lock reason: $output"; '
505
+ ' exit $rc; '
506
+ ' fi; '
507
+ 'done; '
508
+ 'if [ $rc -ne 0 ]; then '
509
+ ' echo "apt install failed after 5 attempts due to lock errors."; '
510
+ ' exit $rc; '
511
+ 'fi; '
512
+ '$(prefix_cmd) mkdir -p /var/run/sshd; '
513
+ '$(prefix_cmd) '
514
+ 'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
515
+ '/etc/ssh/sshd_config; '
516
+ '$(prefix_cmd) sed '
517
+ '"s@session\\s*required\\s*pam_loginuid.so@session optional '
518
+ 'pam_loginuid.so@g" -i /etc/pam.d/sshd; '
519
+ 'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; '
520
+ '$(prefix_cmd) mkdir -p ~/.ssh; '
521
+ '$(prefix_cmd) chown -R $(whoami) ~/.ssh;'
522
+ '$(prefix_cmd) chmod 700 ~/.ssh; '
523
+ '$(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > '
524
+ '~/.ssh/authorized_keys; '
525
+ '$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; '
526
+ '$(prefix_cmd) service ssh restart; '
527
+ # Eliminate the error
528
+ # `mesg: ttyname failed: inappropriate ioctl for device`.
529
+ # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
530
+ '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
531
+
532
+ pre_init_cmd = ('set -ex; ' + check_k8s_user_sudo_cmd +
533
+ set_k8s_env_var_cmd + check_apt_update_complete_cmd +
534
+ install_ssh_k8s_cmd)
535
+
536
+ def _pre_init_thread(new_node):
537
+ pod_name = new_node.metadata.name
538
+ logger.info(f'{"-"*20}Start: Pre-init in pod {pod_name!r} {"-"*20}')
539
+ runner = command_runner.KubernetesCommandRunner(
540
+ ((namespace, context), pod_name))
541
+
542
+ # Run the combined pre-init command
543
+ rc, stdout, _ = runner.run(pre_init_cmd,
544
+ require_outputs=True,
545
+ stream_logs=False)
546
+ if rc == exceptions.INSUFFICIENT_PRIVILEGES_CODE:
338
547
  raise config_lib.KubernetesError(
339
548
  'Insufficient system privileges detected. '
340
549
  'Ensure the default user has root access or '
341
550
  '"sudo" is installed and the user is added to the sudoers '
342
551
  'from the image.')
343
552
 
553
+ op_name = 'pre-init'
554
+ _raise_command_running_error(op_name, pre_init_cmd, pod_name, rc,
555
+ stdout)
344
556
 
345
- def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None:
346
- # Setting up ssh for the pod instance. This is already setup for
347
- # the jump pod so it does not need to be run for it.
348
- set_k8s_ssh_cmd = [
349
- '/bin/sh',
350
- '-c',
351
- (
352
- 'set -x; '
353
- 'prefix_cmd() '
354
- '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
355
- 'export DEBIAN_FRONTEND=noninteractive;'
356
- '$(prefix_cmd) apt-get update;'
357
- '$(prefix_cmd) apt install openssh-server rsync -y; '
358
- '$(prefix_cmd) mkdir -p /var/run/sshd; '
359
- '$(prefix_cmd) '
360
- 'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
361
- '/etc/ssh/sshd_config; '
362
- '$(prefix_cmd) sed '
363
- '"s@session\\s*required\\s*pam_loginuid.so@session optional '
364
- 'pam_loginuid.so@g" -i /etc/pam.d/sshd; '
365
- 'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; '
366
- '$(prefix_cmd) mkdir -p ~/.ssh; '
367
- '$(prefix_cmd) chown -R $(whoami) ~/.ssh;'
368
- '$(prefix_cmd) chmod 700 ~/.ssh; '
369
- '$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; '
370
- '$(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > '
371
- '~/.ssh/authorized_keys; '
372
- '$(prefix_cmd) service ssh restart; '
373
- # Eliminate the error
374
- # `mesg: ttyname failed: inappropriate ioctl for device`.
375
- # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
376
- '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
377
- ]
378
- # TODO(romilb): Parallelize the setup of SSH in pods for multi-node clusters
379
- for new_node in new_nodes:
380
- pod_name = new_node.metadata.name
381
- logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
382
- _run_command_on_pods(new_node.metadata.name,
383
- namespace,
384
- set_k8s_ssh_cmd,
385
- stream_logs=True)
386
- logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
557
+ logger.info(f'{"-"*20}End: Pre-init in pod {pod_name!r} {"-"*20}')
558
+
559
+ # Run pre_init in parallel across all new_nodes
560
+ subprocess_utils.run_in_parallel(_pre_init_thread, new_nodes, _NUM_THREADS)
387
561
 
388
562
 
389
- def _label_pod(namespace: str, pod_name: str, label: Dict[str, str]) -> None:
563
+ def _label_pod(namespace: str, context: Optional[str], pod_name: str,
564
+ label: Dict[str, str]) -> None:
390
565
  """Label a pod."""
391
- kubernetes.core_api().patch_namespaced_pod(
566
+ kubernetes.core_api(context).patch_namespaced_pod(
392
567
  pod_name,
393
568
  namespace, {'metadata': {
394
569
  'labels': label
@@ -396,11 +571,92 @@ def _label_pod(namespace: str, pod_name: str, label: Dict[str, str]) -> None:
396
571
  _request_timeout=kubernetes.API_TIMEOUT)
397
572
 
398
573
 
574
+ @timeline.event
575
+ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
576
+ context: Optional[str]) -> Any:
577
+ """Attempts to create a Kubernetes Pod and handle any errors.
578
+
579
+ Currently, we handle errors due to the AppArmor annotation and retry if
580
+ it fails due to the `FieldValueForbidden` error.
581
+ See https://github.com/skypilot-org/skypilot/issues/4174 for details.
582
+
583
+ Returns: The created Pod object.
584
+ """
585
+ try:
586
+ # Attempt to create the Pod with the AppArmor annotation
587
+ pod = kubernetes.core_api(context).create_namespaced_pod(
588
+ namespace, pod_spec)
589
+ return pod
590
+ except kubernetes.api_exception() as e:
591
+ try:
592
+ error_body = json.loads(e.body)
593
+ error_message = error_body.get('message', '')
594
+ except json.JSONDecodeError:
595
+ error_message = str(e.body)
596
+ # Check if the error is due to the AppArmor annotation and retry.
597
+ # We add an AppArmor annotation to set it as unconfined in our
598
+ # base template in kubernetes-ray.yml.j2. This is required for
599
+ # FUSE to work in the pod on most Kubernetes distributions.
600
+ # However, some distributions do not support the AppArmor annotation
601
+ # and will fail to create the pod. In this case, we retry without
602
+ # the annotation.
603
+ if (e.status == 422 and 'FieldValueForbidden' in error_message and
604
+ 'AppArmorProfile: nil' in error_message):
605
+ logger.warning('AppArmor annotation caused pod creation to fail. '
606
+ 'Retrying without the annotation. '
607
+ 'Note: this may cause bucket mounting to fail.')
608
+
609
+ # Remove the AppArmor annotation
610
+ annotations = pod_spec.get('metadata', {}).get('annotations', {})
611
+ if ('container.apparmor.security.beta.kubernetes.io/ray-node'
612
+ in annotations):
613
+ del annotations[
614
+ 'container.apparmor.security.beta.kubernetes.io/ray-node']
615
+ pod_spec['metadata']['annotations'] = annotations
616
+ logger.info('AppArmor annotation removed from Pod spec.')
617
+ else:
618
+ logger.warning('AppArmor annotation not found in pod spec, '
619
+ 'retrying will not help. '
620
+ f'Current annotations: {annotations}')
621
+ raise e
622
+
623
+ # Retry Pod creation without the AppArmor annotation
624
+ try:
625
+ pod = kubernetes.core_api(context).create_namespaced_pod(
626
+ namespace, pod_spec)
627
+ logger.info(f'Pod {pod.metadata.name} created successfully '
628
+ 'without AppArmor annotation.')
629
+ return pod
630
+ except kubernetes.api_exception() as retry_exception:
631
+ logger.info('Failed to create Pod without AppArmor annotation: '
632
+ f'{retry_exception}')
633
+ raise retry_exception
634
+ # Unlike other error from resource lackage on CPU/GPU/Memory, TPU
635
+ # lackage error is raised when pod is attemtped to be created.
636
+ # TODO(Doyoung): Update the error message raised with the multi-host
637
+ # TPU support.
638
+ elif 'Invalid resource requests for google.com/tpu.' in error_message:
639
+ extra_message = ('Verify if the cluster has a TPU slice node with '
640
+ 'a topology matching the number of TPU(s) '
641
+ 'requested. Note that multi-host TPU podslices '
642
+ 'are currently not unsupported.')
643
+ raise config_lib.KubernetesError(
644
+ _lack_resource_msg('TPU',
645
+ pod_spec,
646
+ details=error_message,
647
+ extra_msg=extra_message))
648
+ else:
649
+ # Re-raise the exception if it's a different error
650
+ raise e
651
+
652
+
653
+ @timeline.event
399
654
  def _create_pods(region: str, cluster_name_on_cloud: str,
400
655
  config: common.ProvisionConfig) -> common.ProvisionRecord:
401
656
  """Create pods based on the config."""
402
657
  provider_config = config.provider_config
403
- namespace = _get_namespace(provider_config)
658
+ namespace = kubernetes_utils.get_namespace_from_config(provider_config)
659
+ context = kubernetes_utils.get_context_from_config(provider_config)
404
660
  pod_spec = copy.deepcopy(config.node_config)
405
661
  tags = {
406
662
  TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
@@ -413,17 +669,19 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
413
669
  pod_spec['metadata']['labels'].update(
414
670
  {TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
415
671
 
416
- terminating_pods = _filter_pods(namespace, tags, ['Terminating'])
672
+ terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
673
+ ['Terminating'])
417
674
  start_time = time.time()
418
- while (len(terminating_pods) > 0 and
675
+ while (terminating_pods and
419
676
  time.time() - start_time < _TIMEOUT_FOR_POD_TERMINATION):
420
677
  logger.debug(f'run_instances: Found {len(terminating_pods)} '
421
678
  'terminating pods. Waiting them to finish: '
422
679
  f'{list(terminating_pods.keys())}')
423
680
  time.sleep(POLL_INTERVAL)
424
- terminating_pods = _filter_pods(namespace, tags, ['Terminating'])
681
+ terminating_pods = kubernetes_utils.filter_pods(namespace, context,
682
+ tags, ['Terminating'])
425
683
 
426
- if len(terminating_pods) > 0:
684
+ if terminating_pods:
427
685
  # If there are still terminating pods, we force delete them.
428
686
  logger.debug(f'run_instances: Found {len(terminating_pods)} '
429
687
  'terminating pods still in terminating state after '
@@ -432,13 +690,14 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
432
690
  for pod_name in terminating_pods.keys():
433
691
  # grace_period_seconds=0 means force delete the pod.
434
692
  # https://github.com/kubernetes-client/python/issues/508#issuecomment-1695759777
435
- kubernetes.core_api().delete_namespaced_pod(
693
+ kubernetes.core_api(context).delete_namespaced_pod(
436
694
  pod_name,
437
695
  namespace,
438
696
  _request_timeout=config_lib.DELETION_TIMEOUT,
439
697
  grace_period_seconds=0)
440
698
 
441
- running_pods = _filter_pods(namespace, tags, ['Pending', 'Running'])
699
+ running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
700
+ ['Pending', 'Running'])
442
701
  head_pod_name = _get_head_pod_name(running_pods)
443
702
  logger.debug(f'Found {len(running_pods)} existing pods: '
444
703
  f'{list(running_pods.keys())}')
@@ -456,7 +715,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
456
715
  # Add nvidia runtime class if it exists
457
716
  nvidia_runtime_exists = False
458
717
  try:
459
- nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class()
718
+ nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
719
+ context)
460
720
  except kubernetes.kubernetes.client.ApiException as e:
461
721
  logger.warning('run_instances: Error occurred while checking for '
462
722
  f'nvidia RuntimeClass - '
@@ -464,32 +724,45 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
464
724
  'Continuing without using nvidia RuntimeClass.\n'
465
725
  'If you are on a K3s cluster, manually '
466
726
  'override runtimeClassName in ~/.sky/config.yaml. '
467
- 'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html') # pylint: disable=line-too-long
727
+ 'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html') # pylint: disable=line-too-long
728
+
729
+ needs_gpus = False
730
+ limits = pod_spec['spec']['containers'][0].get('resources',
731
+ {}).get('limits')
732
+ if limits is not None:
733
+ needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0
468
734
 
469
- if nvidia_runtime_exists:
735
+ # TPU pods provisioned on GKE use the default containerd runtime.
736
+ # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview # pylint: disable=line-too-long
737
+ if nvidia_runtime_exists and needs_gpus:
470
738
  pod_spec['spec']['runtimeClassName'] = 'nvidia'
471
739
 
472
740
  created_pods = {}
473
741
  logger.debug(f'run_instances: calling create_namespaced_pod '
474
742
  f'(count={to_start_count}).')
475
- for _ in range(to_start_count):
476
- if head_pod_name is None:
477
- pod_spec['metadata']['labels'][TAG_RAY_NODE_KIND] = 'head'
743
+
744
+ def _create_pod_thread(i: int):
745
+ pod_spec_copy = copy.deepcopy(pod_spec)
746
+ if head_pod_name is None and i == 0:
747
+ # First pod should be head if no head exists
748
+ pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
478
749
  head_selector = head_service_selector(cluster_name_on_cloud)
479
- pod_spec['metadata']['labels'].update(head_selector)
480
- pod_spec['metadata']['name'] = f'{cluster_name_on_cloud}-head'
750
+ pod_spec_copy['metadata']['labels'].update(head_selector)
751
+ pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
481
752
  else:
482
- pod_spec['metadata']['labels'][TAG_RAY_NODE_KIND] = 'worker'
483
- pod_uuid = str(uuid.uuid4())[:4]
753
+ # Worker pods
754
+ pod_spec_copy['metadata']['labels'].update(
755
+ constants.WORKER_NODE_TAGS)
756
+ pod_uuid = str(uuid.uuid4())[:6]
484
757
  pod_name = f'{cluster_name_on_cloud}-{pod_uuid}'
485
- pod_spec['metadata']['name'] = f'{pod_name}-worker'
758
+ pod_spec_copy['metadata']['name'] = f'{pod_name}-worker'
486
759
  # For multi-node support, we put a soft-constraint to schedule
487
760
  # worker pods on different nodes than the head pod.
488
761
  # This is not set as a hard constraint because if different nodes
489
762
  # are not available, we still want to be able to schedule worker
490
763
  # pods on larger nodes which may be able to fit multiple SkyPilot
491
764
  # "nodes".
492
- pod_spec['spec']['affinity'] = {
765
+ pod_spec_copy['spec']['affinity'] = {
493
766
  'podAntiAffinity': {
494
767
  # Set as a soft constraint
495
768
  'preferredDuringSchedulingIgnoredDuringExecution': [{
@@ -510,67 +783,67 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 }
             }

-        pod = kubernetes.core_api().create_namespaced_pod(namespace, pod_spec)
+        # TPU slice nodes are given a taint, google.com/tpu=present:NoSchedule.
+        # This is to prevent non-TPU workloads from being scheduled on TPU
+        # slice nodes. We need this toleration to allow the pod to be scheduled
+        # on TPU nodes.
+        # Reference: https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work # pylint: disable=line-too-long
+        tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY
+        if tpu_label in config.node_config.get('spec',
+                                               {}).get('nodeSelector', {}):
+            tpu_toleration = {
+                'key': kubernetes_utils.TPU_RESOURCE_KEY,
+                'operator': 'Equal',
+                'value': 'present',
+                'effect': 'NoSchedule'
+            }
+            # Preserve existing tolerations if any
+            existing_tolerations = pod_spec_copy['spec'].get('tolerations', [])
+            pod_spec_copy['spec']['tolerations'] = existing_tolerations + [
+                tpu_toleration
+            ]
+
+        return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
+                                                   context)
+
+    # Create pods in parallel
+    pods = subprocess_utils.run_in_parallel(_create_pod_thread,
+                                            list(range(to_start_count)),
+                                            _NUM_THREADS)
+
+    # Process created pods
+    for pod in pods:
         created_pods[pod.metadata.name] = pod
-        if head_pod_name is None:
+        if head_pod_name is None and pod.metadata.labels.get(
+                constants.TAG_RAY_NODE_KIND) == 'head':
             head_pod_name = pod.metadata.name

-    # Adding the jump pod to the new_nodes list as well so it can be
-    # checked if it's scheduled and running along with other pods.
-    ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
-    jump_pod = kubernetes.core_api().read_namespaced_pod(
-        ssh_jump_pod_name, namespace)
-    wait_pods_dict = _filter_pods(namespace, tags, ['Pending'])
-    wait_pods = list(wait_pods_dict.values())
-    wait_pods.append(jump_pod)
+    networking_mode = network_utils.get_networking_mode(
+        config.provider_config.get('networking_mode'))
+    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
+        # Adding the jump pod to the new_nodes list as well so it can be
+        # checked if it's scheduled and running along with other pods.
+        ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
+        jump_pod = kubernetes.core_api(context).read_namespaced_pod(
+            ssh_jump_pod_name, namespace)
+        pods.append(jump_pod)
     provision_timeout = provider_config['timeout']

     wait_str = ('indefinitely'
                 if provision_timeout < 0 else f'for {provision_timeout}s')
     logger.debug(f'run_instances: waiting {wait_str} for pods to schedule and '
-                 f'run: {list(wait_pods_dict.keys())}')
+                 f'run: {[pod.metadata.name for pod in pods]}')

     # Wait until the pods are scheduled and surface cause for error
     # if there is one
-    _wait_for_pods_to_schedule(namespace, wait_pods, provision_timeout)
+    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout)
     # Wait until the pods and their containers are up and running, and
     # fail early if there is an error
     logger.debug(f'run_instances: waiting for pods to be running (pulling '
-                 f'images): {list(wait_pods_dict.keys())}')
-    _wait_for_pods_to_run(namespace, wait_pods)
+                 f'images): {[pod.metadata.name for pod in pods]}')
+    _wait_for_pods_to_run(namespace, context, pods)
     logger.debug(f'run_instances: all pods are scheduled and running: '
-                 f'{list(wait_pods_dict.keys())}')
-
-    running_pods = _filter_pods(namespace, tags, ['Running'])
-    initialized_pods = _filter_pods(namespace, {
-        TAG_POD_INITIALIZED: 'true',
-        **tags
-    }, ['Running'])
-    uninitialized_pods = {
-        pod_name: pod
-        for pod_name, pod in running_pods.items()
-        if pod_name not in initialized_pods
-    }
-    if len(uninitialized_pods) > 0:
-        logger.debug(f'run_instances: Initializing {len(uninitialized_pods)} '
-                     f'pods: {list(uninitialized_pods.keys())}')
-        uninitialized_pods_list = list(uninitialized_pods.values())
-
-        # Setup SSH and environment variables in pods.
-        # Make sure commands used in these methods are generic and work
-        # on most base images. E.g., do not use Python, since that may not
-        # be installed by default.
-        _check_user_privilege(namespace, uninitialized_pods_list)
-        _setup_ssh_in_pods(namespace, uninitialized_pods_list)
-        _set_env_vars_in_pods(namespace, uninitialized_pods_list)
-
-        for pod in uninitialized_pods.values():
-            _label_pod(namespace,
-                       pod.metadata.name,
-                       label={
-                           TAG_POD_INITIALIZED: 'true',
-                           **pod.metadata.labels
-                       })
+                 f'{[pod.metadata.name for pod in pods]}')

     assert head_pod_name is not None, 'head_instance_id should not be None'
     return common.ProvisionRecord(
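# ---------------------------------------------------------------------------
# The hunk above replaces a serial create loop with
# subprocess_utils.run_in_parallel(_create_pod_thread, ..., _NUM_THREADS).
# A minimal sketch of that fan-out pattern using only the standard library;
# the real helper's semantics (thread reuse, error handling) may differ.
from concurrent.futures import ThreadPoolExecutor

def run_in_parallel(func, args, num_threads):
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        # map() keeps input order and re-raises any worker exception here.
        return list(pool.map(func, args))

# By analogy with the diff: results come back in index order, so the i == 0
# head pod is still identifiable after the parallel fan-out.
results = run_in_parallel(lambda i: f'pod-{i}', range(4), num_threads=2)
assert results == ['pod-0', 'pod-1', 'pod-2', 'pod-3']
# ---------------------------------------------------------------------------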
@@ -590,7 +863,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     try:
         return _create_pods(region, cluster_name_on_cloud, config)
     except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
-        logger.warning(f'run_instances: Error occurred when creating pods: {e}')
+        e_msg = common_utils.format_exception(e).replace('\n', ' ')
+        logger.warning('run_instances: Error occurred when creating pods: '
+                       f'{e_msg}')
         raise

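# ---------------------------------------------------------------------------
# Sketch of the log-flattening change above: exception text can span several
# lines, which splits a single log record. common_utils.format_exception is
# SkyPilot's formatter; plain str() stands in here as an assumption.
err = ValueError('pod rejected:\nadmission webhook denied the request')
e_msg = str(err).replace('\n', ' ')
assert '\n' not in e_msg  # one greppable log line
# ---------------------------------------------------------------------------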
@@ -607,35 +882,66 @@ def stop_instances(
     raise NotImplementedError()


-def _terminate_node(namespace: str, pod_name: str) -> None:
+def _terminate_node(namespace: str, context: Optional[str],
+                    pod_name: str) -> None:
     """Terminate a pod."""
     logger.debug('terminate_instances: calling delete_namespaced_pod')
-    try:
-        kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, pod_name)
-    except Exception as e:  # pylint: disable=broad-except
-        logger.warning('terminate_instances: Error occurred when analyzing '
-                       f'SSH Jump pod: {e}')
-    try:
-        kubernetes.core_api().delete_namespaced_service(
-            pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
-        kubernetes.core_api().delete_namespaced_service(
-            f'{pod_name}-ssh',
-            namespace,
-            _request_timeout=config_lib.DELETION_TIMEOUT)
-    except kubernetes.api_exception():
-        pass
+
+    def _delete_k8s_resource_with_retry(delete_func: Callable,
+                                        resource_type: str,
+                                        resource_name: str) -> None:
+        """Helper to delete Kubernetes resources with 404 handling and retries.
+
+        Args:
+            delete_func: Function to call to delete the resource
+            resource_type: Type of resource being deleted (e.g. 'service'),
+                used in logging
+            resource_name: Name of the resource being deleted, used in logging
+        """
+        max_retries = 3
+        retry_delay = 5  # seconds
+
+        for attempt in range(max_retries):
+            try:
+                delete_func()
+                return
+            except kubernetes.api_exception() as e:
+                if e.status == 404:
+                    logger.warning(
+                        f'terminate_instances: Tried to delete {resource_type} '
+                        f'{resource_name}, but the {resource_type} was not '
+                        'found (404).')
+                    return
+                elif attempt < max_retries - 1:
+                    logger.warning(f'terminate_instances: Failed to delete '
+                                   f'{resource_type} {resource_name} (attempt '
+                                   f'{attempt + 1}/{max_retries}). Error: {e}. '
+                                   f'Retrying in {retry_delay} seconds...')
+                    time.sleep(retry_delay)
+                else:
+                    raise
+
+    # Delete services for the pod
+    for service_name in [pod_name, f'{pod_name}-ssh']:
+        _delete_k8s_resource_with_retry(
+            delete_func=lambda name=service_name: kubernetes.core_api(
+                context).delete_namespaced_service(name=name,
+                                                   namespace=namespace,
+                                                   _request_timeout=config_lib.
+                                                   DELETION_TIMEOUT),
+            resource_type='service',
+            resource_name=service_name)
+
     # Note - delete pod after all other resources are deleted.
     # This is to ensure there are no leftover resources if this down is run
     # from within the pod, e.g., for autodown.
-    try:
-        kubernetes.core_api().delete_namespaced_pod(
-            pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
-    except kubernetes.api_exception() as e:
-        if e.status == 404:
-            logger.warning('terminate_instances: Tried to delete pod '
-                           f'{pod_name}, but the pod was not found (404).')
-        else:
-            raise
+    _delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
+            name=pod_name,
+            namespace=namespace,
+            _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pod',
+        resource_name=pod_name)


 def terminate_instances(
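# ---------------------------------------------------------------------------
# The service-deletion loop above binds the loop variable through a lambda
# default (name=service_name) on purpose: Python closures capture variables
# late, so without the default every callback built in the loop would target
# the final service name. A minimal demonstration:
names = ['pod-svc', 'pod-svc-ssh']
late = [lambda: n for n in names]
assert [f() for f in late] == ['pod-svc-ssh', 'pod-svc-ssh']  # late binding
bound = [lambda n=n: n for n in names]  # default is evaluated at definition
assert [f() for f in bound] == ['pod-svc', 'pod-svc-ssh']
# ---------------------------------------------------------------------------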
@@ -644,20 +950,38 @@ def terminate_instances(
     worker_only: bool = False,
 ) -> None:
     """See sky/provision/__init__.py"""
-    namespace = _get_namespace(provider_config)
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)
     tag_filters = {
         TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
     }
-    pods = _filter_pods(namespace, tag_filters, None)
+    pods = kubernetes_utils.filter_pods(namespace, context, tag_filters, None)
+
+    # Clean up the SSH jump pod if in use
+    networking_mode = network_utils.get_networking_mode(
+        provider_config.get('networking_mode'))
+    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
+        pod_name = list(pods.keys())[0]
+        try:
+            kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
+                                                       pod_name)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning('terminate_instances: Error occurred when analyzing '
+                           f'SSH Jump pod: {e}')

     def _is_head(pod) -> bool:
-        return pod.metadata.labels[TAG_RAY_NODE_KIND] == 'head'
+        return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'

-    for pod_name, pod in pods.items():
-        logger.debug(f'Terminating instance {pod_name}: {pod}')
+    def _terminate_pod_thread(pod_info):
+        pod_name, pod = pod_info
         if _is_head(pod) and worker_only:
-            continue
-        _terminate_node(namespace, pod_name)
+            return
+        logger.debug(f'Terminating instance {pod_name}: {pod}')
+        _terminate_node(namespace, context, pod_name)
+
+    # Run pod termination in parallel
+    subprocess_utils.run_in_parallel(_terminate_pod_thread, list(pods.items()),
+                                     _NUM_THREADS)


 def get_cluster_info(
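# ---------------------------------------------------------------------------
# Sketch of the worker_only filter used above: head and worker pods are told
# apart purely by a node-kind label, so skipping the head is a label check
# rather than an ordering assumption. The label key here is illustrative.
pods = {
    'mycluster-head': {'ray-node-kind': 'head'},
    'mycluster-ab12cd-worker': {'ray-node-kind': 'worker'},
}
to_terminate = [name for name, labels in pods.items()
                if labels['ray-node-kind'] != 'head']
assert to_terminate == ['mycluster-ab12cd-worker']
# ---------------------------------------------------------------------------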
@@ -666,12 +990,15 @@ def get_cluster_info(
     provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
     del region  # unused
     assert provider_config is not None
-    namespace = _get_namespace(provider_config)
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)
     tag_filters = {
         TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
     }

-    running_pods = _filter_pods(namespace, tag_filters, ['Running'])
+    running_pods = kubernetes_utils.filter_pods(namespace, context, tag_filters,
+                                                ['Running'])
+
     pods: Dict[str, List[common.InstanceInfo]] = {}
     head_pod_name = None
@@ -680,11 +1007,11 @@ def get_cluster_info(
         port_forward_mode.value)
     network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
         network_mode_str)
-    external_ip = kubernetes_utils.get_external_ip(network_mode)
+    external_ip = kubernetes_utils.get_external_ip(network_mode, context)
     port = 22
     if not provider_config.get('use_internal_ips', False):
         port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
-                                                  namespace)
+                                                  namespace, context)

     head_pod_name = None
     cpu_request = None
@@ -700,7 +1027,7 @@ def get_cluster_info(
                 tags=pod.metadata.labels,
             )
         ]
-        if pod.metadata.labels[TAG_RAY_NODE_KIND] == 'head':
+        if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head':
             head_pod_name = pod_name
             head_spec = pod.spec
             assert head_spec is not None, pod
@@ -709,11 +1036,17 @@ def get_cluster_info(
     assert cpu_request is not None, 'cpu_request should not be None'

     ssh_user = 'sky'
-    get_k8s_ssh_user_cmd = ['/bin/sh', '-c', ('echo $(whoami)')]
+    get_k8s_ssh_user_cmd = 'echo $(whoami)'
     assert head_pod_name is not None
-    ssh_user = _run_command_on_pods(head_pod_name, namespace,
-                                    get_k8s_ssh_user_cmd)
-    ssh_user = ssh_user.strip()
+    runner = command_runner.KubernetesCommandRunner(
+        ((namespace, context), head_pod_name))
+    rc, stdout, stderr = runner.run(get_k8s_ssh_user_cmd,
+                                    require_outputs=True,
+                                    separate_stderr=True,
+                                    stream_logs=False)
+    _raise_command_running_error('get ssh user', get_k8s_ssh_user_cmd,
+                                 head_pod_name, rc, stdout + stderr)
+    ssh_user = stdout.strip()
     logger.debug(
         f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')

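# ---------------------------------------------------------------------------
# Sketch of the probe pattern the hunk above switches to: run a small shell
# command, keep stdout and stderr separate, and raise on a non-zero exit code
# instead of silently parsing noise. subprocess stands in for the
# KubernetesCommandRunner here.
import subprocess

proc = subprocess.run(['sh', '-c', 'echo $(whoami)'],
                      capture_output=True, text=True)
if proc.returncode != 0:
    raise RuntimeError(f'get ssh user failed: {proc.stderr.strip()}')
ssh_user = proc.stdout.strip()  # clean value, no stderr mixed in
# ---------------------------------------------------------------------------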
@@ -737,7 +1070,6 @@ def query_instances(
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True
 ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
-    del provider_config  # unused
     status_map = {
         'Pending': status_lib.ClusterStatus.INIT,
         'Running': status_lib.ClusterStatus.UP,
@@ -747,11 +1079,13 @@ def query_instances(
         'Terminating': None,
     }

-    namespace = kubernetes_utils.get_current_kube_config_context_namespace()
+    assert provider_config is not None
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)

     # Get all the pods with the label skypilot-cluster: <cluster_name>
     try:
-        pods = kubernetes.core_api().list_namespaced_pod(
+        pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
             label_selector=f'skypilot-cluster={cluster_name_on_cloud}',
             _request_timeout=kubernetes.API_TIMEOUT).items
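# ---------------------------------------------------------------------------
# The hunks in this file thread an optional kubeconfig context into every
# kubernetes.core_api(context) call. A minimal sketch of a context-aware
# client factory using the official kubernetes Python client (these are real
# kubernetes-client APIs; SkyPilot's adaptor may add caching and error
# handling on top).
from kubernetes import client, config

def core_api(context=None):
    # context=None falls back to the kubeconfig's current-context.
    config.load_kube_config(context=context)
    return client.CoreV1Api()
# ---------------------------------------------------------------------------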
@@ -776,3 +1110,24 @@ def query_instances(
             continue
         cluster_status[pod.metadata.name] = pod_status
     return cluster_status
+
+
+def get_command_runners(
+    cluster_info: common.ClusterInfo,
+    **credentials: Dict[str, Any],
+) -> List[command_runner.CommandRunner]:
+    """Get a command runner for the given cluster."""
+    assert cluster_info.provider_config is not None, cluster_info
+    instances = cluster_info.instances
+    namespace = kubernetes_utils.get_namespace_from_config(
+        cluster_info.provider_config)
+    context = kubernetes_utils.get_context_from_config(
+        cluster_info.provider_config)
+    node_list = []
+    if cluster_info.head_instance_id is not None:
+        node_list = [((namespace, context), cluster_info.head_instance_id)]
+    node_list.extend(((namespace, context), pod_name)
+                     for pod_name in instances.keys()
+                     if pod_name != cluster_info.head_instance_id)
+    return command_runner.KubernetesCommandRunner.make_runner_list(
+        node_list=node_list, **credentials)
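# ---------------------------------------------------------------------------
# Sketch of the head-first ordering get_command_runners builds above: the
# head pod is placed at index 0 and workers follow, so callers can rely on
# runners[0] addressing the head. Values are illustrative.
instances = {'c-head': [], 'c-ab12cd-worker': [], 'c-ef34gh-worker': []}
head_instance_id = 'c-head'
node_list = [(('ns', 'ctx'), head_instance_id)]
node_list.extend((('ns', 'ctx'), name)
                 for name in instances
                 if name != head_instance_id)
assert node_list[0][1] == head_instance_id and len(node_list) == 3
# ---------------------------------------------------------------------------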