skypilot-nightly 1.0.0.dev20240909__py3-none-any.whl → 1.0.0.dev20240911__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +33 -67
- sky/authentication.py +12 -7
- sky/backends/backend_utils.py +40 -33
- sky/backends/cloud_vm_ray_backend.py +1 -1
- sky/check.py +1 -1
- sky/clouds/aws.py +8 -6
- sky/clouds/azure.py +7 -5
- sky/clouds/cloud.py +43 -14
- sky/clouds/cudo.py +1 -1
- sky/clouds/fluidstack.py +2 -2
- sky/clouds/gcp.py +12 -7
- sky/clouds/kubernetes.py +28 -15
- sky/clouds/lambda_cloud.py +2 -2
- sky/clouds/oci.py +1 -1
- sky/clouds/paperspace.py +1 -1
- sky/clouds/runpod.py +1 -1
- sky/clouds/scp.py +2 -2
- sky/clouds/service_catalog/aws_catalog.py +1 -1
- sky/clouds/vsphere.py +1 -1
- sky/provision/kubernetes/config.py +52 -34
- sky/provision/kubernetes/instance.py +73 -61
- sky/provision/kubernetes/network.py +11 -5
- sky/provision/kubernetes/network_utils.py +10 -8
- sky/provision/kubernetes/utils.py +72 -45
- sky/skylet/log_lib.py +4 -1
- sky/skylet/subprocess_daemon.py +47 -15
- sky/templates/kubernetes-port-forward-proxy-command.sh +29 -4
- sky/templates/kubernetes-ray.yml.j2 +5 -0
- sky/templates/lambda-ray.yml.j2 +2 -2
- sky/utils/command_runner.py +12 -6
- sky/utils/command_runner.pyi +1 -1
- sky/utils/kubernetes/rsync_helper.sh +12 -3
- {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/RECORD +39 -39
- {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/top_level.txt +0 -0
@@ -42,13 +42,7 @@ def to_label_selector(tags):
|
|
42
42
|
return label_selector
|
43
43
|
|
44
44
|
|
45
|
-
def
|
46
|
-
return provider_config.get(
|
47
|
-
'namespace',
|
48
|
-
kubernetes_utils.get_current_kube_config_context_namespace())
|
49
|
-
|
50
|
-
|
51
|
-
def _filter_pods(namespace: str, tag_filters: Dict[str, str],
|
45
|
+
def _filter_pods(namespace: str, context: str, tag_filters: Dict[str, str],
|
52
46
|
status_filters: Optional[List[str]]) -> Dict[str, Any]:
|
53
47
|
"""Filters pods by tags and status."""
|
54
48
|
non_included_pod_statuses = POD_STATUSES.copy()
|
@@ -60,7 +54,7 @@ def _filter_pods(namespace: str, tag_filters: Dict[str, str],
|
|
60
54
|
[f'status.phase!={status}' for status in non_included_pod_statuses])
|
61
55
|
|
62
56
|
label_selector = to_label_selector(tag_filters)
|
63
|
-
pod_list = kubernetes.core_api().list_namespaced_pod(
|
57
|
+
pod_list = kubernetes.core_api(context).list_namespaced_pod(
|
64
58
|
namespace, field_selector=field_selector, label_selector=label_selector)
|
65
59
|
|
66
60
|
# Don't return pods marked for deletion,
|
@@ -85,7 +79,7 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]:
|
|
85
79
|
return {'component': f'{cluster_name}-head'}
|
86
80
|
|
87
81
|
|
88
|
-
def _raise_pod_scheduling_errors(namespace, new_nodes):
|
82
|
+
def _raise_pod_scheduling_errors(namespace, context, new_nodes):
|
89
83
|
"""Raise pod scheduling failure reason.
|
90
84
|
|
91
85
|
When a pod fails to schedule in Kubernetes, the reasons for the failure
|
@@ -139,8 +133,8 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
|
|
139
133
|
return msg
|
140
134
|
|
141
135
|
for new_node in new_nodes:
|
142
|
-
pod = kubernetes.core_api().read_namespaced_pod(
|
143
|
-
|
136
|
+
pod = kubernetes.core_api(context).read_namespaced_pod(
|
137
|
+
new_node.metadata.name, namespace)
|
144
138
|
pod_status = pod.status.phase
|
145
139
|
# When there are multiple pods involved while launching instance,
|
146
140
|
# there may be a single pod causing issue while others are
|
@@ -149,7 +143,7 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
|
|
149
143
|
if pod_status != 'Pending':
|
150
144
|
continue
|
151
145
|
pod_name = pod._metadata._name # pylint: disable=protected-access
|
152
|
-
events = kubernetes.core_api().list_namespaced_event(
|
146
|
+
events = kubernetes.core_api(context).list_namespaced_event(
|
153
147
|
namespace,
|
154
148
|
field_selector=(f'involvedObject.name={pod_name},'
|
155
149
|
'involvedObject.kind=Pod'))
|
@@ -223,7 +217,7 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
|
|
223
217
|
f'code {rc}: {command!r}\nOutput: {stdout}.')
|
224
218
|
|
225
219
|
|
226
|
-
def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
|
220
|
+
def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
227
221
|
"""Wait for all pods to be scheduled.
|
228
222
|
|
229
223
|
Wait for all pods including jump pod to be scheduled, and if it
|
@@ -245,7 +239,7 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
|
|
245
239
|
all_pods_scheduled = True
|
246
240
|
for node in new_nodes:
|
247
241
|
# Iterate over each pod to check their status
|
248
|
-
pod = kubernetes.core_api().read_namespaced_pod(
|
242
|
+
pod = kubernetes.core_api(context).read_namespaced_pod(
|
249
243
|
node.metadata.name, namespace)
|
250
244
|
if pod.status.phase == 'Pending':
|
251
245
|
# If container_statuses is None, then the pod hasn't
|
@@ -260,7 +254,7 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
|
|
260
254
|
|
261
255
|
# Handle pod scheduling errors
|
262
256
|
try:
|
263
|
-
_raise_pod_scheduling_errors(namespace, new_nodes)
|
257
|
+
_raise_pod_scheduling_errors(namespace, context, new_nodes)
|
264
258
|
except config_lib.KubernetesError:
|
265
259
|
raise
|
266
260
|
except Exception as e:
|
@@ -270,7 +264,7 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
|
|
270
264
|
f'Error: {common_utils.format_exception(e)}') from None
|
271
265
|
|
272
266
|
|
273
|
-
def _wait_for_pods_to_run(namespace, new_nodes):
|
267
|
+
def _wait_for_pods_to_run(namespace, context, new_nodes):
|
274
268
|
"""Wait for pods and their containers to be ready.
|
275
269
|
|
276
270
|
Pods may be pulling images or may be in the process of container
|
@@ -306,7 +300,7 @@ def _wait_for_pods_to_run(namespace, new_nodes):
|
|
306
300
|
all_pods_running = True
|
307
301
|
# Iterate over each pod to check their status
|
308
302
|
for node in new_nodes:
|
309
|
-
pod = kubernetes.core_api().read_namespaced_pod(
|
303
|
+
pod = kubernetes.core_api(context).read_namespaced_pod(
|
310
304
|
node.metadata.name, namespace)
|
311
305
|
|
312
306
|
# Continue if pod and all the containers within the
|
@@ -344,7 +338,7 @@ def _wait_for_pods_to_run(namespace, new_nodes):
|
|
344
338
|
time.sleep(1)
|
345
339
|
|
346
340
|
|
347
|
-
def _set_env_vars_in_pods(namespace: str, new_pods: List):
|
341
|
+
def _set_env_vars_in_pods(namespace: str, context: str, new_pods: List):
|
348
342
|
"""Setting environment variables in pods.
|
349
343
|
|
350
344
|
Once all containers are ready, we can exec into them and set env vars.
|
@@ -364,7 +358,7 @@ def _set_env_vars_in_pods(namespace: str, new_pods: List):
|
|
364
358
|
|
365
359
|
for new_pod in new_pods:
|
366
360
|
runner = command_runner.KubernetesCommandRunner(
|
367
|
-
(namespace, new_pod.metadata.name))
|
361
|
+
((namespace, context), new_pod.metadata.name))
|
368
362
|
rc, stdout, _ = runner.run(set_k8s_env_var_cmd,
|
369
363
|
require_outputs=True,
|
370
364
|
stream_logs=False)
|
@@ -372,7 +366,8 @@ def _set_env_vars_in_pods(namespace: str, new_pods: List):
|
|
372
366
|
new_pod.metadata.name, rc, stdout)
|
373
367
|
|
374
368
|
|
375
|
-
def _check_user_privilege(namespace: str,
|
369
|
+
def _check_user_privilege(namespace: str, context: str,
|
370
|
+
new_nodes: List) -> None:
|
376
371
|
# Checks if the default user has sufficient privilege to set up
|
377
372
|
# the kubernetes instance pod.
|
378
373
|
check_k8s_user_sudo_cmd = (
|
@@ -390,7 +385,7 @@ def _check_user_privilege(namespace: str, new_nodes: List) -> None:
|
|
390
385
|
|
391
386
|
for new_node in new_nodes:
|
392
387
|
runner = command_runner.KubernetesCommandRunner(
|
393
|
-
(namespace, new_node.metadata.name))
|
388
|
+
((namespace, context), new_node.metadata.name))
|
394
389
|
rc, stdout, stderr = runner.run(check_k8s_user_sudo_cmd,
|
395
390
|
require_outputs=True,
|
396
391
|
separate_stderr=True,
|
@@ -407,7 +402,7 @@ def _check_user_privilege(namespace: str, new_nodes: List) -> None:
|
|
407
402
|
'from the image.')
|
408
403
|
|
409
404
|
|
410
|
-
def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None:
|
405
|
+
def _setup_ssh_in_pods(namespace: str, context: str, new_nodes: List) -> None:
|
411
406
|
# Setting up ssh for the pod instance. This is already setup for
|
412
407
|
# the jump pod so it does not need to be run for it.
|
413
408
|
set_k8s_ssh_cmd = (
|
@@ -440,7 +435,8 @@ def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None:
|
|
440
435
|
# TODO(romilb): Parallelize the setup of SSH in pods for multi-node clusters
|
441
436
|
for new_node in new_nodes:
|
442
437
|
pod_name = new_node.metadata.name
|
443
|
-
runner = command_runner.KubernetesCommandRunner(
|
438
|
+
runner = command_runner.KubernetesCommandRunner(
|
439
|
+
((namespace, context), pod_name))
|
444
440
|
logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
|
445
441
|
rc, stdout, _ = runner.run(set_k8s_ssh_cmd,
|
446
442
|
require_outputs=True,
|
@@ -450,9 +446,10 @@ def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None:
|
|
450
446
|
logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
|
451
447
|
|
452
448
|
|
453
|
-
def _label_pod(namespace: str,
|
449
|
+
def _label_pod(namespace: str, context: str, pod_name: str,
|
450
|
+
label: Dict[str, str]) -> None:
|
454
451
|
"""Label a pod."""
|
455
|
-
kubernetes.core_api().patch_namespaced_pod(
|
452
|
+
kubernetes.core_api(context).patch_namespaced_pod(
|
456
453
|
pod_name,
|
457
454
|
namespace, {'metadata': {
|
458
455
|
'labels': label
|
@@ -464,7 +461,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
464
461
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
465
462
|
"""Create pods based on the config."""
|
466
463
|
provider_config = config.provider_config
|
467
|
-
namespace =
|
464
|
+
namespace = kubernetes_utils.get_namespace_from_config(provider_config)
|
465
|
+
context = kubernetes_utils.get_context_from_config(provider_config)
|
468
466
|
pod_spec = copy.deepcopy(config.node_config)
|
469
467
|
tags = {
|
470
468
|
TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
|
@@ -477,7 +475,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
477
475
|
pod_spec['metadata']['labels'].update(
|
478
476
|
{TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
|
479
477
|
|
480
|
-
terminating_pods = _filter_pods(namespace, tags, ['Terminating'])
|
478
|
+
terminating_pods = _filter_pods(namespace, context, tags, ['Terminating'])
|
481
479
|
start_time = time.time()
|
482
480
|
while (len(terminating_pods) > 0 and
|
483
481
|
time.time() - start_time < _TIMEOUT_FOR_POD_TERMINATION):
|
@@ -485,7 +483,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
485
483
|
'terminating pods. Waiting them to finish: '
|
486
484
|
f'{list(terminating_pods.keys())}')
|
487
485
|
time.sleep(POLL_INTERVAL)
|
488
|
-
terminating_pods = _filter_pods(namespace, tags,
|
486
|
+
terminating_pods = _filter_pods(namespace, context, tags,
|
487
|
+
['Terminating'])
|
489
488
|
|
490
489
|
if len(terminating_pods) > 0:
|
491
490
|
# If there are still terminating pods, we force delete them.
|
@@ -496,13 +495,14 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
496
495
|
for pod_name in terminating_pods.keys():
|
497
496
|
# grace_period_seconds=0 means force delete the pod.
|
498
497
|
# https://github.com/kubernetes-client/python/issues/508#issuecomment-1695759777
|
499
|
-
kubernetes.core_api().delete_namespaced_pod(
|
498
|
+
kubernetes.core_api(context).delete_namespaced_pod(
|
500
499
|
pod_name,
|
501
500
|
namespace,
|
502
501
|
_request_timeout=config_lib.DELETION_TIMEOUT,
|
503
502
|
grace_period_seconds=0)
|
504
503
|
|
505
|
-
running_pods = _filter_pods(namespace,
|
504
|
+
running_pods = _filter_pods(namespace, context, tags,
|
505
|
+
['Pending', 'Running'])
|
506
506
|
head_pod_name = _get_head_pod_name(running_pods)
|
507
507
|
logger.debug(f'Found {len(running_pods)} existing pods: '
|
508
508
|
f'{list(running_pods.keys())}')
|
@@ -520,7 +520,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
520
520
|
# Add nvidia runtime class if it exists
|
521
521
|
nvidia_runtime_exists = False
|
522
522
|
try:
|
523
|
-
nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
|
523
|
+
nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
|
524
|
+
context)
|
524
525
|
except kubernetes.kubernetes.client.ApiException as e:
|
525
526
|
logger.warning('run_instances: Error occurred while checking for '
|
526
527
|
f'nvidia RuntimeClass - '
|
@@ -530,7 +531,9 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
530
531
|
'override runtimeClassName in ~/.sky/config.yaml. '
|
531
532
|
'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html') # pylint: disable=line-too-long
|
532
533
|
|
533
|
-
|
534
|
+
needs_gpus = (pod_spec['spec']['containers'][0].get('resources', {}).get(
|
535
|
+
'limits', {}).get('nvidia.com/gpu', 0) > 0)
|
536
|
+
if nvidia_runtime_exists and needs_gpus:
|
534
537
|
pod_spec['spec']['runtimeClassName'] = 'nvidia'
|
535
538
|
|
536
539
|
created_pods = {}
|
@@ -574,12 +577,13 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
574
577
|
}
|
575
578
|
}
|
576
579
|
|
577
|
-
pod = kubernetes.core_api().create_namespaced_pod(
|
580
|
+
pod = kubernetes.core_api(context).create_namespaced_pod(
|
581
|
+
namespace, pod_spec)
|
578
582
|
created_pods[pod.metadata.name] = pod
|
579
583
|
if head_pod_name is None:
|
580
584
|
head_pod_name = pod.metadata.name
|
581
585
|
|
582
|
-
wait_pods_dict = _filter_pods(namespace, tags, ['Pending'])
|
586
|
+
wait_pods_dict = _filter_pods(namespace, context, tags, ['Pending'])
|
583
587
|
wait_pods = list(wait_pods_dict.values())
|
584
588
|
|
585
589
|
networking_mode = network_utils.get_networking_mode(
|
@@ -588,7 +592,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
588
592
|
# Adding the jump pod to the new_nodes list as well so it can be
|
589
593
|
# checked if it's scheduled and running along with other pods.
|
590
594
|
ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
|
591
|
-
jump_pod = kubernetes.core_api().read_namespaced_pod(
|
595
|
+
jump_pod = kubernetes.core_api(context).read_namespaced_pod(
|
592
596
|
ssh_jump_pod_name, namespace)
|
593
597
|
wait_pods.append(jump_pod)
|
594
598
|
provision_timeout = provider_config['timeout']
|
@@ -600,17 +604,17 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
600
604
|
|
601
605
|
# Wait until the pods are scheduled and surface cause for error
|
602
606
|
# if there is one
|
603
|
-
_wait_for_pods_to_schedule(namespace, wait_pods, provision_timeout)
|
607
|
+
_wait_for_pods_to_schedule(namespace, context, wait_pods, provision_timeout)
|
604
608
|
# Wait until the pods and their containers are up and running, and
|
605
609
|
# fail early if there is an error
|
606
610
|
logger.debug(f'run_instances: waiting for pods to be running (pulling '
|
607
611
|
f'images): {list(wait_pods_dict.keys())}')
|
608
|
-
_wait_for_pods_to_run(namespace, wait_pods)
|
612
|
+
_wait_for_pods_to_run(namespace, context, wait_pods)
|
609
613
|
logger.debug(f'run_instances: all pods are scheduled and running: '
|
610
614
|
f'{list(wait_pods_dict.keys())}')
|
611
615
|
|
612
|
-
running_pods = _filter_pods(namespace, tags, ['Running'])
|
613
|
-
initialized_pods = _filter_pods(namespace, {
|
616
|
+
running_pods = _filter_pods(namespace, context, tags, ['Running'])
|
617
|
+
initialized_pods = _filter_pods(namespace, context, {
|
614
618
|
TAG_POD_INITIALIZED: 'true',
|
615
619
|
**tags
|
616
620
|
}, ['Running'])
|
@@ -628,12 +632,13 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
628
632
|
# Make sure commands used in these methods are generic and work
|
629
633
|
# on most base images. E.g., do not use Python, since that may not
|
630
634
|
# be installed by default.
|
631
|
-
_check_user_privilege(namespace, uninitialized_pods_list)
|
632
|
-
_setup_ssh_in_pods(namespace, uninitialized_pods_list)
|
633
|
-
_set_env_vars_in_pods(namespace, uninitialized_pods_list)
|
635
|
+
_check_user_privilege(namespace, context, uninitialized_pods_list)
|
636
|
+
_setup_ssh_in_pods(namespace, context, uninitialized_pods_list)
|
637
|
+
_set_env_vars_in_pods(namespace, context, uninitialized_pods_list)
|
634
638
|
|
635
639
|
for pod in uninitialized_pods.values():
|
636
640
|
_label_pod(namespace,
|
641
|
+
context,
|
637
642
|
pod.metadata.name,
|
638
643
|
label={
|
639
644
|
TAG_POD_INITIALIZED: 'true',
|
@@ -675,18 +680,18 @@ def stop_instances(
|
|
675
680
|
raise NotImplementedError()
|
676
681
|
|
677
682
|
|
678
|
-
def _terminate_node(namespace: str, pod_name: str) -> None:
|
683
|
+
def _terminate_node(namespace: str, context: str, pod_name: str) -> None:
|
679
684
|
"""Terminate a pod."""
|
680
685
|
logger.debug('terminate_instances: calling delete_namespaced_pod')
|
681
686
|
try:
|
682
|
-
kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, pod_name)
|
687
|
+
kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context, pod_name)
|
683
688
|
except Exception as e: # pylint: disable=broad-except
|
684
689
|
logger.warning('terminate_instances: Error occurred when analyzing '
|
685
690
|
f'SSH Jump pod: {e}')
|
686
691
|
try:
|
687
|
-
kubernetes.core_api().delete_namespaced_service(
|
692
|
+
kubernetes.core_api(context).delete_namespaced_service(
|
688
693
|
pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
|
689
|
-
kubernetes.core_api().delete_namespaced_service(
|
694
|
+
kubernetes.core_api(context).delete_namespaced_service(
|
690
695
|
f'{pod_name}-ssh',
|
691
696
|
namespace,
|
692
697
|
_request_timeout=config_lib.DELETION_TIMEOUT)
|
@@ -696,7 +701,7 @@ def _terminate_node(namespace: str, pod_name: str) -> None:
|
|
696
701
|
# This is to ensure there are no leftover resources if this down is run
|
697
702
|
# from within the pod, e.g., for autodown.
|
698
703
|
try:
|
699
|
-
kubernetes.core_api().delete_namespaced_pod(
|
704
|
+
kubernetes.core_api(context).delete_namespaced_pod(
|
700
705
|
pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
|
701
706
|
except kubernetes.api_exception() as e:
|
702
707
|
if e.status == 404:
|
@@ -712,11 +717,12 @@ def terminate_instances(
|
|
712
717
|
worker_only: bool = False,
|
713
718
|
) -> None:
|
714
719
|
"""See sky/provision/__init__.py"""
|
715
|
-
namespace =
|
720
|
+
namespace = kubernetes_utils.get_namespace_from_config(provider_config)
|
721
|
+
context = kubernetes_utils.get_context_from_config(provider_config)
|
716
722
|
tag_filters = {
|
717
723
|
TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
|
718
724
|
}
|
719
|
-
pods = _filter_pods(namespace, tag_filters, None)
|
725
|
+
pods = _filter_pods(namespace, context, tag_filters, None)
|
720
726
|
|
721
727
|
def _is_head(pod) -> bool:
|
722
728
|
return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'
|
@@ -725,7 +731,7 @@ def terminate_instances(
|
|
725
731
|
logger.debug(f'Terminating instance {pod_name}: {pod}')
|
726
732
|
if _is_head(pod) and worker_only:
|
727
733
|
continue
|
728
|
-
_terminate_node(namespace, pod_name)
|
734
|
+
_terminate_node(namespace, context, pod_name)
|
729
735
|
|
730
736
|
|
731
737
|
def get_cluster_info(
|
@@ -734,12 +740,13 @@ def get_cluster_info(
|
|
734
740
|
provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
|
735
741
|
del region # unused
|
736
742
|
assert provider_config is not None
|
737
|
-
namespace =
|
743
|
+
namespace = kubernetes_utils.get_namespace_from_config(provider_config)
|
744
|
+
context = kubernetes_utils.get_context_from_config(provider_config)
|
738
745
|
tag_filters = {
|
739
746
|
TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
|
740
747
|
}
|
741
748
|
|
742
|
-
running_pods = _filter_pods(namespace, tag_filters, ['Running'])
|
749
|
+
running_pods = _filter_pods(namespace, context, tag_filters, ['Running'])
|
743
750
|
pods: Dict[str, List[common.InstanceInfo]] = {}
|
744
751
|
head_pod_name = None
|
745
752
|
|
@@ -748,11 +755,11 @@ def get_cluster_info(
|
|
748
755
|
port_forward_mode.value)
|
749
756
|
network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
|
750
757
|
network_mode_str)
|
751
|
-
external_ip = kubernetes_utils.get_external_ip(network_mode)
|
758
|
+
external_ip = kubernetes_utils.get_external_ip(network_mode, context)
|
752
759
|
port = 22
|
753
760
|
if not provider_config.get('use_internal_ips', False):
|
754
761
|
port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
|
755
|
-
namespace)
|
762
|
+
namespace, context)
|
756
763
|
|
757
764
|
head_pod_name = None
|
758
765
|
cpu_request = None
|
@@ -779,7 +786,8 @@ def get_cluster_info(
|
|
779
786
|
ssh_user = 'sky'
|
780
787
|
get_k8s_ssh_user_cmd = 'echo $(whoami)'
|
781
788
|
assert head_pod_name is not None
|
782
|
-
runner = command_runner.KubernetesCommandRunner(
|
789
|
+
runner = command_runner.KubernetesCommandRunner(
|
790
|
+
((namespace, context), head_pod_name))
|
783
791
|
rc, stdout, stderr = runner.run(get_k8s_ssh_user_cmd,
|
784
792
|
require_outputs=True,
|
785
793
|
separate_stderr=True,
|
@@ -810,7 +818,6 @@ def query_instances(
|
|
810
818
|
provider_config: Optional[Dict[str, Any]] = None,
|
811
819
|
non_terminated_only: bool = True
|
812
820
|
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
813
|
-
del provider_config # unused
|
814
821
|
status_map = {
|
815
822
|
'Pending': status_lib.ClusterStatus.INIT,
|
816
823
|
'Running': status_lib.ClusterStatus.UP,
|
@@ -820,11 +827,13 @@ def query_instances(
|
|
820
827
|
'Terminating': None,
|
821
828
|
}
|
822
829
|
|
823
|
-
|
830
|
+
assert provider_config is not None
|
831
|
+
namespace = kubernetes_utils.get_namespace_from_config(provider_config)
|
832
|
+
context = kubernetes_utils.get_context_from_config(provider_config)
|
824
833
|
|
825
834
|
# Get all the pods with the label skypilot-cluster: <cluster_name>
|
826
835
|
try:
|
827
|
-
pods = kubernetes.core_api().list_namespaced_pod(
|
836
|
+
pods = kubernetes.core_api(context).list_namespaced_pod(
|
828
837
|
namespace,
|
829
838
|
label_selector=f'skypilot-cluster={cluster_name_on_cloud}',
|
830
839
|
_request_timeout=kubernetes.API_TIMEOUT).items
|
@@ -858,11 +867,14 @@ def get_command_runners(
|
|
858
867
|
"""Get a command runner for the given cluster."""
|
859
868
|
assert cluster_info.provider_config is not None, cluster_info
|
860
869
|
instances = cluster_info.instances
|
861
|
-
namespace =
|
870
|
+
namespace = kubernetes_utils.get_namespace_from_config(
|
871
|
+
cluster_info.provider_config)
|
872
|
+
context = kubernetes_utils.get_context_from_config(
|
873
|
+
cluster_info.provider_config)
|
862
874
|
node_list = []
|
863
875
|
if cluster_info.head_instance_id is not None:
|
864
|
-
node_list = [(namespace, cluster_info.head_instance_id)]
|
865
|
-
node_list.extend((namespace, pod_name)
|
876
|
+
node_list = [((namespace, context), cluster_info.head_instance_id)]
|
877
|
+
node_list.extend(((namespace, context), pod_name)
|
866
878
|
for pod_name in instances.keys()
|
867
879
|
if pod_name != cluster_info.head_instance_id)
|
868
880
|
return command_runner.KubernetesCommandRunner.make_runner_list(
|
@@ -58,7 +58,8 @@ def _open_ports_using_loadbalancer(
|
|
58
58
|
kubernetes_utils.merge_custom_metadata(content['service_spec']['metadata'])
|
59
59
|
|
60
60
|
network_utils.create_or_replace_namespaced_service(
|
61
|
-
namespace=
|
61
|
+
namespace=kubernetes_utils.get_namespace_from_config(provider_config),
|
62
|
+
context=kubernetes_utils.get_context_from_config(provider_config),
|
62
63
|
service_name=service_name,
|
63
64
|
service_spec=content['service_spec'])
|
64
65
|
|
@@ -68,8 +69,9 @@ def _open_ports_using_ingress(
|
|
68
69
|
ports: List[int],
|
69
70
|
provider_config: Dict[str, Any],
|
70
71
|
) -> None:
|
72
|
+
context = kubernetes_utils.get_context_from_config(provider_config)
|
71
73
|
# Check if an ingress controller exists
|
72
|
-
if not network_utils.ingress_controller_exists():
|
74
|
+
if not network_utils.ingress_controller_exists(context):
|
73
75
|
raise Exception(
|
74
76
|
'Ingress controller not found. '
|
75
77
|
'Install Nginx ingress controller first: '
|
@@ -108,7 +110,9 @@ def _open_ports_using_ingress(
|
|
108
110
|
# Update metadata from config
|
109
111
|
kubernetes_utils.merge_custom_metadata(service_spec['metadata'])
|
110
112
|
network_utils.create_or_replace_namespaced_service(
|
111
|
-
namespace=
|
113
|
+
namespace=kubernetes_utils.get_namespace_from_config(
|
114
|
+
provider_config),
|
115
|
+
context=kubernetes_utils.get_context_from_config(provider_config),
|
112
116
|
service_name=service_name,
|
113
117
|
service_spec=service_spec,
|
114
118
|
)
|
@@ -116,7 +120,8 @@ def _open_ports_using_ingress(
|
|
116
120
|
kubernetes_utils.merge_custom_metadata(content['ingress_spec']['metadata'])
|
117
121
|
# Create or update the single ingress for all services
|
118
122
|
network_utils.create_or_replace_namespaced_ingress(
|
119
|
-
namespace=
|
123
|
+
namespace=kubernetes_utils.get_namespace_from_config(provider_config),
|
124
|
+
context=kubernetes_utils.get_context_from_config(provider_config),
|
120
125
|
ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress',
|
121
126
|
ingress_spec=content['ingress_spec'],
|
122
127
|
)
|
@@ -173,7 +178,8 @@ def _cleanup_ports_for_ingress(
|
|
173
178
|
# Delete the single ingress used for all ports
|
174
179
|
ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress'
|
175
180
|
network_utils.delete_namespaced_ingress(
|
176
|
-
namespace=
|
181
|
+
namespace=kubernetes_utils.get_namespace_from_config(provider_config),
|
182
|
+
context=kubernetes_utils.get_context_from_config(provider_config),
|
177
183
|
ingress_name=ingress_name,
|
178
184
|
)
|
179
185
|
|
@@ -132,10 +132,10 @@ def fill_ingress_template(namespace: str, service_details: List[Tuple[str, int,
|
|
132
132
|
|
133
133
|
|
134
134
|
def create_or_replace_namespaced_ingress(
|
135
|
-
namespace: str, ingress_name: str,
|
135
|
+
namespace: str, context: str, ingress_name: str,
|
136
136
|
ingress_spec: Dict[str, Union[str, int]]) -> None:
|
137
137
|
"""Creates an ingress resource for the specified service."""
|
138
|
-
networking_api = kubernetes.networking_api()
|
138
|
+
networking_api = kubernetes.networking_api(context)
|
139
139
|
|
140
140
|
try:
|
141
141
|
networking_api.read_namespaced_ingress(
|
@@ -156,9 +156,10 @@ def create_or_replace_namespaced_ingress(
|
|
156
156
|
_request_timeout=kubernetes.API_TIMEOUT)
|
157
157
|
|
158
158
|
|
159
|
-
def delete_namespaced_ingress(namespace: str,
|
159
|
+
def delete_namespaced_ingress(namespace: str, context: str,
|
160
|
+
ingress_name: str) -> None:
|
160
161
|
"""Deletes an ingress resource."""
|
161
|
-
networking_api = kubernetes.networking_api()
|
162
|
+
networking_api = kubernetes.networking_api(context)
|
162
163
|
try:
|
163
164
|
networking_api.delete_namespaced_ingress(
|
164
165
|
ingress_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
|
@@ -170,10 +171,10 @@ def delete_namespaced_ingress(namespace: str, ingress_name: str) -> None:
|
|
170
171
|
|
171
172
|
|
172
173
|
def create_or_replace_namespaced_service(
|
173
|
-
namespace: str, service_name: str,
|
174
|
+
namespace: str, context: str, service_name: str,
|
174
175
|
service_spec: Dict[str, Union[str, int]]) -> None:
|
175
176
|
"""Creates a service resource for the specified service."""
|
176
|
-
core_api = kubernetes.core_api()
|
177
|
+
core_api = kubernetes.core_api(context)
|
177
178
|
|
178
179
|
try:
|
179
180
|
core_api.read_namespaced_service(
|
@@ -207,9 +208,10 @@ def delete_namespaced_service(namespace: str, service_name: str) -> None:
|
|
207
208
|
raise e
|
208
209
|
|
209
210
|
|
210
|
-
def ingress_controller_exists(
|
211
|
+
def ingress_controller_exists(context: str,
|
212
|
+
ingress_class_name: str = 'nginx') -> bool:
|
211
213
|
"""Checks if an ingress controller exists in the cluster."""
|
212
|
-
networking_api = kubernetes.networking_api()
|
214
|
+
networking_api = kubernetes.networking_api(context)
|
213
215
|
ingress_classes = networking_api.list_ingress_class(
|
214
216
|
_request_timeout=kubernetes.API_TIMEOUT).items
|
215
217
|
return any(
|