skypilot-nightly 1.0.0.dev20240925__py3-none-any.whl → 1.0.0.dev20240927__py3-none-any.whl
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +10 -8
- sky/authentication.py +10 -6
- sky/backends/backend_utils.py +1 -0
- sky/backends/cloud_vm_ray_backend.py +22 -1
- sky/cli.py +10 -10
- sky/clouds/kubernetes.py +161 -45
- sky/clouds/oci.py +11 -8
- sky/clouds/service_catalog/kubernetes_catalog.py +15 -7
- sky/provision/kubernetes/instance.py +15 -46
- sky/provision/kubernetes/network.py +34 -14
- sky/provision/kubernetes/network_utils.py +7 -5
- sky/provision/kubernetes/utils.py +258 -49
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/provisioner.py +2 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/utils/command_runner.py +4 -0
- sky/utils/schemas.py +6 -0
- {skypilot_nightly-1.0.0.dev20240925.dist-info → skypilot_nightly-1.0.0.dev20240927.dist-info}/METADATA +17 -15
- {skypilot_nightly-1.0.0.dev20240925.dist-info → skypilot_nightly-1.0.0.dev20240927.dist-info}/RECORD +24 -24
- {skypilot_nightly-1.0.0.dev20240925.dist-info → skypilot_nightly-1.0.0.dev20240927.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20240925.dist-info → skypilot_nightly-1.0.0.dev20240927.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20240925.dist-info → skypilot_nightly-1.0.0.dev20240927.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20240925.dist-info → skypilot_nightly-1.0.0.dev20240927.dist-info}/top_level.txt +0 -0
@@ -28,42 +28,6 @@ TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
|
|
28
28
|
TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
|
29
29
|
TAG_POD_INITIALIZED = 'skypilot-initialized'
|
30
30
|
|
31
|
-
POD_STATUSES = {
|
32
|
-
'Pending', 'Running', 'Succeeded', 'Failed', 'Unknown', 'Terminating'
|
33
|
-
}
|
34
|
-
|
35
|
-
|
36
|
-
def to_label_selector(tags):
|
37
|
-
label_selector = ''
|
38
|
-
for k, v in tags.items():
|
39
|
-
if label_selector != '':
|
40
|
-
label_selector += ','
|
41
|
-
label_selector += '{}={}'.format(k, v)
|
42
|
-
return label_selector
|
43
|
-
|
44
|
-
|
45
|
-
def _filter_pods(namespace: str, context: str, tag_filters: Dict[str, str],
|
46
|
-
status_filters: Optional[List[str]]) -> Dict[str, Any]:
|
47
|
-
"""Filters pods by tags and status."""
|
48
|
-
non_included_pod_statuses = POD_STATUSES.copy()
|
49
|
-
|
50
|
-
field_selector = ''
|
51
|
-
if status_filters is not None:
|
52
|
-
non_included_pod_statuses -= set(status_filters)
|
53
|
-
field_selector = ','.join(
|
54
|
-
[f'status.phase!={status}' for status in non_included_pod_statuses])
|
55
|
-
|
56
|
-
label_selector = to_label_selector(tag_filters)
|
57
|
-
pod_list = kubernetes.core_api(context).list_namespaced_pod(
|
58
|
-
namespace, field_selector=field_selector, label_selector=label_selector)
|
59
|
-
|
60
|
-
# Don't return pods marked for deletion,
|
61
|
-
# i.e. pods with non-null metadata.DeletionTimestamp.
|
62
|
-
pods = [
|
63
|
-
pod for pod in pod_list.items if pod.metadata.deletion_timestamp is None
|
64
|
-
]
|
65
|
-
return {pod.metadata.name: pod for pod in pods}
|
66
|
-
|
67
31
|
|
68
32
|
def _get_head_pod_name(pods: Dict[str, Any]) -> Optional[str]:
|
69
33
|
head_pod_name = None
|
@@ -475,7 +439,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
475
439
|
pod_spec['metadata']['labels'].update(
|
476
440
|
{TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
|
477
441
|
|
478
|
-
terminating_pods =
|
442
|
+
terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
|
443
|
+
['Terminating'])
|
479
444
|
start_time = time.time()
|
480
445
|
while (len(terminating_pods) > 0 and
|
481
446
|
time.time() - start_time < _TIMEOUT_FOR_POD_TERMINATION):
|
@@ -483,8 +448,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
483
448
|
'terminating pods. Waiting them to finish: '
|
484
449
|
f'{list(terminating_pods.keys())}')
|
485
450
|
time.sleep(POLL_INTERVAL)
|
486
|
-
terminating_pods =
|
487
|
-
|
451
|
+
terminating_pods = kubernetes_utils.filter_pods(namespace, context,
|
452
|
+
tags, ['Terminating'])
|
488
453
|
|
489
454
|
if len(terminating_pods) > 0:
|
490
455
|
# If there are still terminating pods, we force delete them.
|
@@ -501,8 +466,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
501
466
|
_request_timeout=config_lib.DELETION_TIMEOUT,
|
502
467
|
grace_period_seconds=0)
|
503
468
|
|
504
|
-
running_pods =
|
505
|
-
|
469
|
+
running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
|
470
|
+
['Pending', 'Running'])
|
506
471
|
head_pod_name = _get_head_pod_name(running_pods)
|
507
472
|
logger.debug(f'Found {len(running_pods)} existing pods: '
|
508
473
|
f'{list(running_pods.keys())}')
|
@@ -583,7 +548,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
583
548
|
if head_pod_name is None:
|
584
549
|
head_pod_name = pod.metadata.name
|
585
550
|
|
586
|
-
wait_pods_dict =
|
551
|
+
wait_pods_dict = kubernetes_utils.filter_pods(namespace, context, tags,
|
552
|
+
['Pending'])
|
587
553
|
wait_pods = list(wait_pods_dict.values())
|
588
554
|
|
589
555
|
networking_mode = network_utils.get_networking_mode(
|
@@ -613,8 +579,9 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
613
579
|
logger.debug(f'run_instances: all pods are scheduled and running: '
|
614
580
|
f'{list(wait_pods_dict.keys())}')
|
615
581
|
|
616
|
-
running_pods =
|
617
|
-
|
582
|
+
running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
|
583
|
+
['Running'])
|
584
|
+
initialized_pods = kubernetes_utils.filter_pods(namespace, context, {
|
618
585
|
TAG_POD_INITIALIZED: 'true',
|
619
586
|
**tags
|
620
587
|
}, ['Running'])
|
@@ -722,7 +689,7 @@ def terminate_instances(
|
|
722
689
|
tag_filters = {
|
723
690
|
TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
|
724
691
|
}
|
725
|
-
pods =
|
692
|
+
pods = kubernetes_utils.filter_pods(namespace, context, tag_filters, None)
|
726
693
|
|
727
694
|
def _is_head(pod) -> bool:
|
728
695
|
return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'
|
@@ -746,7 +713,9 @@ def get_cluster_info(
|
|
746
713
|
TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
|
747
714
|
}
|
748
715
|
|
749
|
-
running_pods =
|
716
|
+
running_pods = kubernetes_utils.filter_pods(namespace, context, tag_filters,
|
717
|
+
['Running'])
|
718
|
+
|
750
719
|
pods: Dict[str, List[common.InstanceInfo]] = {}
|
751
720
|
head_pod_name = None
|
752
721
|
|
@@ -79,13 +79,14 @@ def _open_ports_using_ingress(
|
|
79
79
|
)
|
80
80
|
|
81
81
|
# Prepare service names, ports, for template rendering
|
82
|
-
service_details = [
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
82
|
+
service_details = [
|
83
|
+
(f'{cluster_name_on_cloud}--skypilot-svc--{port}', port,
|
84
|
+
_PATH_PREFIX.format(
|
85
|
+
cluster_name_on_cloud=cluster_name_on_cloud,
|
86
|
+
port=port,
|
87
|
+
namespace=kubernetes_utils.get_kube_config_context_namespace(
|
88
|
+
context)).rstrip('/').lstrip('/')) for port in ports
|
89
|
+
]
|
89
90
|
|
90
91
|
# Generate ingress and services specs
|
91
92
|
# We batch ingress rule creation because each rule triggers a hot reload of
|
@@ -171,7 +172,8 @@ def _cleanup_ports_for_ingress(
|
|
171
172
|
for port in ports:
|
172
173
|
service_name = f'{cluster_name_on_cloud}--skypilot-svc--{port}'
|
173
174
|
network_utils.delete_namespaced_service(
|
174
|
-
namespace=provider_config.get('namespace',
|
175
|
+
namespace=provider_config.get('namespace',
|
176
|
+
kubernetes_utils.DEFAULT_NAMESPACE),
|
175
177
|
service_name=service_name,
|
176
178
|
)
|
177
179
|
|
@@ -208,11 +210,13 @@ def query_ports(
|
|
208
210
|
return _query_ports_for_ingress(
|
209
211
|
cluster_name_on_cloud=cluster_name_on_cloud,
|
210
212
|
ports=ports,
|
213
|
+
provider_config=provider_config,
|
211
214
|
)
|
212
215
|
elif port_mode == kubernetes_enums.KubernetesPortMode.PODIP:
|
213
216
|
return _query_ports_for_podip(
|
214
217
|
cluster_name_on_cloud=cluster_name_on_cloud,
|
215
218
|
ports=ports,
|
219
|
+
provider_config=provider_config,
|
216
220
|
)
|
217
221
|
else:
|
218
222
|
return {}
|
@@ -231,8 +235,14 @@ def _query_ports_for_loadbalancer(
|
|
231
235
|
result: Dict[int, List[common.Endpoint]] = {}
|
232
236
|
service_name = _LOADBALANCER_SERVICE_NAME.format(
|
233
237
|
cluster_name_on_cloud=cluster_name_on_cloud)
|
238
|
+
context = provider_config.get(
|
239
|
+
'context', kubernetes_utils.get_current_kube_config_context_name())
|
240
|
+
namespace = provider_config.get(
|
241
|
+
'namespace',
|
242
|
+
kubernetes_utils.get_kube_config_context_namespace(context))
|
234
243
|
external_ip = network_utils.get_loadbalancer_ip(
|
235
|
-
|
244
|
+
context=context,
|
245
|
+
namespace=namespace,
|
236
246
|
service_name=service_name,
|
237
247
|
# Timeout is set so that we can retry the query when the
|
238
248
|
# cluster is firstly created and the load balancer is not ready yet.
|
@@ -251,19 +261,24 @@ def _query_ports_for_loadbalancer(
|
|
251
261
|
def _query_ports_for_ingress(
|
252
262
|
cluster_name_on_cloud: str,
|
253
263
|
ports: List[int],
|
264
|
+
provider_config: Dict[str, Any],
|
254
265
|
) -> Dict[int, List[common.Endpoint]]:
|
255
|
-
|
266
|
+
context = provider_config.get(
|
267
|
+
'context', kubernetes_utils.get_current_kube_config_context_name())
|
268
|
+
ingress_details = network_utils.get_ingress_external_ip_and_ports(context)
|
256
269
|
external_ip, external_ports = ingress_details
|
257
270
|
if external_ip is None:
|
258
271
|
return {}
|
259
272
|
|
273
|
+
namespace = provider_config.get(
|
274
|
+
'namespace',
|
275
|
+
kubernetes_utils.get_kube_config_context_namespace(context))
|
260
276
|
result: Dict[int, List[common.Endpoint]] = {}
|
261
277
|
for port in ports:
|
262
278
|
path_prefix = _PATH_PREFIX.format(
|
263
279
|
cluster_name_on_cloud=cluster_name_on_cloud,
|
264
280
|
port=port,
|
265
|
-
namespace=
|
266
|
-
get_current_kube_config_context_namespace())
|
281
|
+
namespace=namespace)
|
267
282
|
|
268
283
|
http_port, https_port = external_ports \
|
269
284
|
if external_ports is not None else (None, None)
|
@@ -282,10 +297,15 @@ def _query_ports_for_ingress(
|
|
282
297
|
def _query_ports_for_podip(
|
283
298
|
cluster_name_on_cloud: str,
|
284
299
|
ports: List[int],
|
300
|
+
provider_config: Dict[str, Any],
|
285
301
|
) -> Dict[int, List[common.Endpoint]]:
|
286
|
-
|
302
|
+
context = provider_config.get(
|
303
|
+
'context', kubernetes_utils.get_current_kube_config_context_name())
|
304
|
+
namespace = provider_config.get(
|
305
|
+
'namespace',
|
306
|
+
kubernetes_utils.get_kube_config_context_namespace(context))
|
287
307
|
pod_name = kubernetes_utils.get_head_pod_name(cluster_name_on_cloud)
|
288
|
-
pod_ip = network_utils.get_pod_ip(namespace, pod_name)
|
308
|
+
pod_ip = network_utils.get_pod_ip(context, namespace, pod_name)
|
289
309
|
|
290
310
|
result: Dict[int, List[common.Endpoint]] = {}
|
291
311
|
if pod_ip is None:
|
@@ -220,10 +220,11 @@ def ingress_controller_exists(context: str,
|
|
220
220
|
|
221
221
|
|
222
222
|
def get_ingress_external_ip_and_ports(
|
223
|
+
context: str,
|
223
224
|
namespace: str = 'ingress-nginx'
|
224
225
|
) -> Tuple[Optional[str], Optional[Tuple[int, int]]]:
|
225
226
|
"""Returns external ip and ports for the ingress controller."""
|
226
|
-
core_api = kubernetes.core_api()
|
227
|
+
core_api = kubernetes.core_api(context)
|
227
228
|
ingress_services = [
|
228
229
|
item for item in core_api.list_namespaced_service(
|
229
230
|
namespace, _request_timeout=kubernetes.API_TIMEOUT).items
|
@@ -257,11 +258,12 @@ def get_ingress_external_ip_and_ports(
|
|
257
258
|
return external_ip, None
|
258
259
|
|
259
260
|
|
260
|
-
def get_loadbalancer_ip(
|
261
|
+
def get_loadbalancer_ip(context: str,
|
262
|
+
namespace: str,
|
261
263
|
service_name: str,
|
262
264
|
timeout: int = 0) -> Optional[str]:
|
263
265
|
"""Returns the IP address of the load balancer."""
|
264
|
-
core_api = kubernetes.core_api()
|
266
|
+
core_api = kubernetes.core_api(context)
|
265
267
|
|
266
268
|
ip = None
|
267
269
|
|
@@ -282,9 +284,9 @@ def get_loadbalancer_ip(namespace: str,
|
|
282
284
|
return ip
|
283
285
|
|
284
286
|
|
285
|
-
def get_pod_ip(namespace: str, pod_name: str) -> Optional[str]:
|
287
|
+
def get_pod_ip(context: str, namespace: str, pod_name: str) -> Optional[str]:
|
286
288
|
"""Returns the IP address of the pod."""
|
287
|
-
core_api = kubernetes.core_api()
|
289
|
+
core_api = kubernetes.core_api(context)
|
288
290
|
pod = core_api.read_namespaced_pod(pod_name,
|
289
291
|
namespace,
|
290
292
|
_request_timeout=kubernetes.API_TIMEOUT)
|