skypilot-nightly 1.0.0.dev20240909__py3-none-any.whl → 1.0.0.dev20240911__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +33 -67
  3. sky/authentication.py +12 -7
  4. sky/backends/backend_utils.py +40 -33
  5. sky/backends/cloud_vm_ray_backend.py +1 -1
  6. sky/check.py +1 -1
  7. sky/clouds/aws.py +8 -6
  8. sky/clouds/azure.py +7 -5
  9. sky/clouds/cloud.py +43 -14
  10. sky/clouds/cudo.py +1 -1
  11. sky/clouds/fluidstack.py +2 -2
  12. sky/clouds/gcp.py +12 -7
  13. sky/clouds/kubernetes.py +28 -15
  14. sky/clouds/lambda_cloud.py +2 -2
  15. sky/clouds/oci.py +1 -1
  16. sky/clouds/paperspace.py +1 -1
  17. sky/clouds/runpod.py +1 -1
  18. sky/clouds/scp.py +2 -2
  19. sky/clouds/service_catalog/aws_catalog.py +1 -1
  20. sky/clouds/vsphere.py +1 -1
  21. sky/provision/kubernetes/config.py +52 -34
  22. sky/provision/kubernetes/instance.py +73 -61
  23. sky/provision/kubernetes/network.py +11 -5
  24. sky/provision/kubernetes/network_utils.py +10 -8
  25. sky/provision/kubernetes/utils.py +72 -45
  26. sky/skylet/log_lib.py +4 -1
  27. sky/skylet/subprocess_daemon.py +47 -15
  28. sky/templates/kubernetes-port-forward-proxy-command.sh +29 -4
  29. sky/templates/kubernetes-ray.yml.j2 +5 -0
  30. sky/templates/lambda-ray.yml.j2 +2 -2
  31. sky/utils/command_runner.py +12 -6
  32. sky/utils/command_runner.pyi +1 -1
  33. sky/utils/kubernetes/rsync_helper.sh +12 -3
  34. {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/METADATA +1 -1
  35. {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/RECORD +39 -39
  36. {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/LICENSE +0 -0
  37. {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/WHEEL +0 -0
  38. {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/entry_points.txt +0 -0
  39. {skypilot_nightly-1.0.0.dev20240909.dist-info → skypilot_nightly-1.0.0.dev20240911.dist-info}/top_level.txt +0 -0
@@ -42,13 +42,7 @@ def to_label_selector(tags):
42
42
  return label_selector
43
43
 
44
44
 
45
- def _get_namespace(provider_config: Dict[str, Any]) -> str:
46
- return provider_config.get(
47
- 'namespace',
48
- kubernetes_utils.get_current_kube_config_context_namespace())
49
-
50
-
51
- def _filter_pods(namespace: str, tag_filters: Dict[str, str],
45
+ def _filter_pods(namespace: str, context: str, tag_filters: Dict[str, str],
52
46
  status_filters: Optional[List[str]]) -> Dict[str, Any]:
53
47
  """Filters pods by tags and status."""
54
48
  non_included_pod_statuses = POD_STATUSES.copy()
@@ -60,7 +54,7 @@ def _filter_pods(namespace: str, tag_filters: Dict[str, str],
60
54
  [f'status.phase!={status}' for status in non_included_pod_statuses])
61
55
 
62
56
  label_selector = to_label_selector(tag_filters)
63
- pod_list = kubernetes.core_api().list_namespaced_pod(
57
+ pod_list = kubernetes.core_api(context).list_namespaced_pod(
64
58
  namespace, field_selector=field_selector, label_selector=label_selector)
65
59
 
66
60
  # Don't return pods marked for deletion,
@@ -85,7 +79,7 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]:
85
79
  return {'component': f'{cluster_name}-head'}
86
80
 
87
81
 
88
- def _raise_pod_scheduling_errors(namespace, new_nodes):
82
+ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
89
83
  """Raise pod scheduling failure reason.
90
84
 
91
85
  When a pod fails to schedule in Kubernetes, the reasons for the failure
@@ -139,8 +133,8 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
139
133
  return msg
140
134
 
141
135
  for new_node in new_nodes:
142
- pod = kubernetes.core_api().read_namespaced_pod(new_node.metadata.name,
143
- namespace)
136
+ pod = kubernetes.core_api(context).read_namespaced_pod(
137
+ new_node.metadata.name, namespace)
144
138
  pod_status = pod.status.phase
145
139
  # When there are multiple pods involved while launching instance,
146
140
  # there may be a single pod causing issue while others are
@@ -149,7 +143,7 @@ def _raise_pod_scheduling_errors(namespace, new_nodes):
149
143
  if pod_status != 'Pending':
150
144
  continue
151
145
  pod_name = pod._metadata._name # pylint: disable=protected-access
152
- events = kubernetes.core_api().list_namespaced_event(
146
+ events = kubernetes.core_api(context).list_namespaced_event(
153
147
  namespace,
154
148
  field_selector=(f'involvedObject.name={pod_name},'
155
149
  'involvedObject.kind=Pod'))
@@ -223,7 +217,7 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
223
217
  f'code {rc}: {command!r}\nOutput: {stdout}.')
224
218
 
225
219
 
226
- def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
220
+ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
227
221
  """Wait for all pods to be scheduled.
228
222
 
229
223
  Wait for all pods including jump pod to be scheduled, and if it
@@ -245,7 +239,7 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
245
239
  all_pods_scheduled = True
246
240
  for node in new_nodes:
247
241
  # Iterate over each pod to check their status
248
- pod = kubernetes.core_api().read_namespaced_pod(
242
+ pod = kubernetes.core_api(context).read_namespaced_pod(
249
243
  node.metadata.name, namespace)
250
244
  if pod.status.phase == 'Pending':
251
245
  # If container_statuses is None, then the pod hasn't
@@ -260,7 +254,7 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
260
254
 
261
255
  # Handle pod scheduling errors
262
256
  try:
263
- _raise_pod_scheduling_errors(namespace, new_nodes)
257
+ _raise_pod_scheduling_errors(namespace, context, new_nodes)
264
258
  except config_lib.KubernetesError:
265
259
  raise
266
260
  except Exception as e:
@@ -270,7 +264,7 @@ def _wait_for_pods_to_schedule(namespace, new_nodes, timeout: int):
270
264
  f'Error: {common_utils.format_exception(e)}') from None
271
265
 
272
266
 
273
- def _wait_for_pods_to_run(namespace, new_nodes):
267
+ def _wait_for_pods_to_run(namespace, context, new_nodes):
274
268
  """Wait for pods and their containers to be ready.
275
269
 
276
270
  Pods may be pulling images or may be in the process of container
@@ -306,7 +300,7 @@ def _wait_for_pods_to_run(namespace, new_nodes):
306
300
  all_pods_running = True
307
301
  # Iterate over each pod to check their status
308
302
  for node in new_nodes:
309
- pod = kubernetes.core_api().read_namespaced_pod(
303
+ pod = kubernetes.core_api(context).read_namespaced_pod(
310
304
  node.metadata.name, namespace)
311
305
 
312
306
  # Continue if pod and all the containers within the
@@ -344,7 +338,7 @@ def _wait_for_pods_to_run(namespace, new_nodes):
344
338
  time.sleep(1)
345
339
 
346
340
 
347
- def _set_env_vars_in_pods(namespace: str, new_pods: List):
341
+ def _set_env_vars_in_pods(namespace: str, context: str, new_pods: List):
348
342
  """Setting environment variables in pods.
349
343
 
350
344
  Once all containers are ready, we can exec into them and set env vars.
@@ -364,7 +358,7 @@ def _set_env_vars_in_pods(namespace: str, new_pods: List):
364
358
 
365
359
  for new_pod in new_pods:
366
360
  runner = command_runner.KubernetesCommandRunner(
367
- (namespace, new_pod.metadata.name))
361
+ ((namespace, context), new_pod.metadata.name))
368
362
  rc, stdout, _ = runner.run(set_k8s_env_var_cmd,
369
363
  require_outputs=True,
370
364
  stream_logs=False)
@@ -372,7 +366,8 @@ def _set_env_vars_in_pods(namespace: str, new_pods: List):
372
366
  new_pod.metadata.name, rc, stdout)
373
367
 
374
368
 
375
- def _check_user_privilege(namespace: str, new_nodes: List) -> None:
369
+ def _check_user_privilege(namespace: str, context: str,
370
+ new_nodes: List) -> None:
376
371
  # Checks if the default user has sufficient privilege to set up
377
372
  # the kubernetes instance pod.
378
373
  check_k8s_user_sudo_cmd = (
@@ -390,7 +385,7 @@ def _check_user_privilege(namespace: str, new_nodes: List) -> None:
390
385
 
391
386
  for new_node in new_nodes:
392
387
  runner = command_runner.KubernetesCommandRunner(
393
- (namespace, new_node.metadata.name))
388
+ ((namespace, context), new_node.metadata.name))
394
389
  rc, stdout, stderr = runner.run(check_k8s_user_sudo_cmd,
395
390
  require_outputs=True,
396
391
  separate_stderr=True,
@@ -407,7 +402,7 @@ def _check_user_privilege(namespace: str, new_nodes: List) -> None:
407
402
  'from the image.')
408
403
 
409
404
 
410
- def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None:
405
+ def _setup_ssh_in_pods(namespace: str, context: str, new_nodes: List) -> None:
411
406
  # Setting up ssh for the pod instance. This is already setup for
412
407
  # the jump pod so it does not need to be run for it.
413
408
  set_k8s_ssh_cmd = (
@@ -440,7 +435,8 @@ def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None:
440
435
  # TODO(romilb): Parallelize the setup of SSH in pods for multi-node clusters
441
436
  for new_node in new_nodes:
442
437
  pod_name = new_node.metadata.name
443
- runner = command_runner.KubernetesCommandRunner((namespace, pod_name))
438
+ runner = command_runner.KubernetesCommandRunner(
439
+ ((namespace, context), pod_name))
444
440
  logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
445
441
  rc, stdout, _ = runner.run(set_k8s_ssh_cmd,
446
442
  require_outputs=True,
@@ -450,9 +446,10 @@ def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None:
450
446
  logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
451
447
 
452
448
 
453
- def _label_pod(namespace: str, pod_name: str, label: Dict[str, str]) -> None:
449
+ def _label_pod(namespace: str, context: str, pod_name: str,
450
+ label: Dict[str, str]) -> None:
454
451
  """Label a pod."""
455
- kubernetes.core_api().patch_namespaced_pod(
452
+ kubernetes.core_api(context).patch_namespaced_pod(
456
453
  pod_name,
457
454
  namespace, {'metadata': {
458
455
  'labels': label
@@ -464,7 +461,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
464
461
  config: common.ProvisionConfig) -> common.ProvisionRecord:
465
462
  """Create pods based on the config."""
466
463
  provider_config = config.provider_config
467
- namespace = _get_namespace(provider_config)
464
+ namespace = kubernetes_utils.get_namespace_from_config(provider_config)
465
+ context = kubernetes_utils.get_context_from_config(provider_config)
468
466
  pod_spec = copy.deepcopy(config.node_config)
469
467
  tags = {
470
468
  TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
@@ -477,7 +475,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
477
475
  pod_spec['metadata']['labels'].update(
478
476
  {TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
479
477
 
480
- terminating_pods = _filter_pods(namespace, tags, ['Terminating'])
478
+ terminating_pods = _filter_pods(namespace, context, tags, ['Terminating'])
481
479
  start_time = time.time()
482
480
  while (len(terminating_pods) > 0 and
483
481
  time.time() - start_time < _TIMEOUT_FOR_POD_TERMINATION):
@@ -485,7 +483,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
485
483
  'terminating pods. Waiting them to finish: '
486
484
  f'{list(terminating_pods.keys())}')
487
485
  time.sleep(POLL_INTERVAL)
488
- terminating_pods = _filter_pods(namespace, tags, ['Terminating'])
486
+ terminating_pods = _filter_pods(namespace, context, tags,
487
+ ['Terminating'])
489
488
 
490
489
  if len(terminating_pods) > 0:
491
490
  # If there are still terminating pods, we force delete them.
@@ -496,13 +495,14 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
496
495
  for pod_name in terminating_pods.keys():
497
496
  # grace_period_seconds=0 means force delete the pod.
498
497
  # https://github.com/kubernetes-client/python/issues/508#issuecomment-1695759777
499
- kubernetes.core_api().delete_namespaced_pod(
498
+ kubernetes.core_api(context).delete_namespaced_pod(
500
499
  pod_name,
501
500
  namespace,
502
501
  _request_timeout=config_lib.DELETION_TIMEOUT,
503
502
  grace_period_seconds=0)
504
503
 
505
- running_pods = _filter_pods(namespace, tags, ['Pending', 'Running'])
504
+ running_pods = _filter_pods(namespace, context, tags,
505
+ ['Pending', 'Running'])
506
506
  head_pod_name = _get_head_pod_name(running_pods)
507
507
  logger.debug(f'Found {len(running_pods)} existing pods: '
508
508
  f'{list(running_pods.keys())}')
@@ -520,7 +520,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
520
520
  # Add nvidia runtime class if it exists
521
521
  nvidia_runtime_exists = False
522
522
  try:
523
- nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class()
523
+ nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
524
+ context)
524
525
  except kubernetes.kubernetes.client.ApiException as e:
525
526
  logger.warning('run_instances: Error occurred while checking for '
526
527
  f'nvidia RuntimeClass - '
@@ -530,7 +531,9 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
530
531
  'override runtimeClassName in ~/.sky/config.yaml. '
531
532
  'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html') # pylint: disable=line-too-long
532
533
 
533
- if nvidia_runtime_exists:
534
+ needs_gpus = (pod_spec['spec']['containers'][0].get('resources', {}).get(
535
+ 'limits', {}).get('nvidia.com/gpu', 0) > 0)
536
+ if nvidia_runtime_exists and needs_gpus:
534
537
  pod_spec['spec']['runtimeClassName'] = 'nvidia'
535
538
 
536
539
  created_pods = {}
@@ -574,12 +577,13 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
574
577
  }
575
578
  }
576
579
 
577
- pod = kubernetes.core_api().create_namespaced_pod(namespace, pod_spec)
580
+ pod = kubernetes.core_api(context).create_namespaced_pod(
581
+ namespace, pod_spec)
578
582
  created_pods[pod.metadata.name] = pod
579
583
  if head_pod_name is None:
580
584
  head_pod_name = pod.metadata.name
581
585
 
582
- wait_pods_dict = _filter_pods(namespace, tags, ['Pending'])
586
+ wait_pods_dict = _filter_pods(namespace, context, tags, ['Pending'])
583
587
  wait_pods = list(wait_pods_dict.values())
584
588
 
585
589
  networking_mode = network_utils.get_networking_mode(
@@ -588,7 +592,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
588
592
  # Adding the jump pod to the new_nodes list as well so it can be
589
593
  # checked if it's scheduled and running along with other pods.
590
594
  ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
591
- jump_pod = kubernetes.core_api().read_namespaced_pod(
595
+ jump_pod = kubernetes.core_api(context).read_namespaced_pod(
592
596
  ssh_jump_pod_name, namespace)
593
597
  wait_pods.append(jump_pod)
594
598
  provision_timeout = provider_config['timeout']
@@ -600,17 +604,17 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
600
604
 
601
605
  # Wait until the pods are scheduled and surface cause for error
602
606
  # if there is one
603
- _wait_for_pods_to_schedule(namespace, wait_pods, provision_timeout)
607
+ _wait_for_pods_to_schedule(namespace, context, wait_pods, provision_timeout)
604
608
  # Wait until the pods and their containers are up and running, and
605
609
  # fail early if there is an error
606
610
  logger.debug(f'run_instances: waiting for pods to be running (pulling '
607
611
  f'images): {list(wait_pods_dict.keys())}')
608
- _wait_for_pods_to_run(namespace, wait_pods)
612
+ _wait_for_pods_to_run(namespace, context, wait_pods)
609
613
  logger.debug(f'run_instances: all pods are scheduled and running: '
610
614
  f'{list(wait_pods_dict.keys())}')
611
615
 
612
- running_pods = _filter_pods(namespace, tags, ['Running'])
613
- initialized_pods = _filter_pods(namespace, {
616
+ running_pods = _filter_pods(namespace, context, tags, ['Running'])
617
+ initialized_pods = _filter_pods(namespace, context, {
614
618
  TAG_POD_INITIALIZED: 'true',
615
619
  **tags
616
620
  }, ['Running'])
@@ -628,12 +632,13 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
628
632
  # Make sure commands used in these methods are generic and work
629
633
  # on most base images. E.g., do not use Python, since that may not
630
634
  # be installed by default.
631
- _check_user_privilege(namespace, uninitialized_pods_list)
632
- _setup_ssh_in_pods(namespace, uninitialized_pods_list)
633
- _set_env_vars_in_pods(namespace, uninitialized_pods_list)
635
+ _check_user_privilege(namespace, context, uninitialized_pods_list)
636
+ _setup_ssh_in_pods(namespace, context, uninitialized_pods_list)
637
+ _set_env_vars_in_pods(namespace, context, uninitialized_pods_list)
634
638
 
635
639
  for pod in uninitialized_pods.values():
636
640
  _label_pod(namespace,
641
+ context,
637
642
  pod.metadata.name,
638
643
  label={
639
644
  TAG_POD_INITIALIZED: 'true',
@@ -675,18 +680,18 @@ def stop_instances(
675
680
  raise NotImplementedError()
676
681
 
677
682
 
678
- def _terminate_node(namespace: str, pod_name: str) -> None:
683
+ def _terminate_node(namespace: str, context: str, pod_name: str) -> None:
679
684
  """Terminate a pod."""
680
685
  logger.debug('terminate_instances: calling delete_namespaced_pod')
681
686
  try:
682
- kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, pod_name)
687
+ kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context, pod_name)
683
688
  except Exception as e: # pylint: disable=broad-except
684
689
  logger.warning('terminate_instances: Error occurred when analyzing '
685
690
  f'SSH Jump pod: {e}')
686
691
  try:
687
- kubernetes.core_api().delete_namespaced_service(
692
+ kubernetes.core_api(context).delete_namespaced_service(
688
693
  pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
689
- kubernetes.core_api().delete_namespaced_service(
694
+ kubernetes.core_api(context).delete_namespaced_service(
690
695
  f'{pod_name}-ssh',
691
696
  namespace,
692
697
  _request_timeout=config_lib.DELETION_TIMEOUT)
@@ -696,7 +701,7 @@ def _terminate_node(namespace: str, pod_name: str) -> None:
696
701
  # This is to ensure there are no leftover resources if this down is run
697
702
  # from within the pod, e.g., for autodown.
698
703
  try:
699
- kubernetes.core_api().delete_namespaced_pod(
704
+ kubernetes.core_api(context).delete_namespaced_pod(
700
705
  pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
701
706
  except kubernetes.api_exception() as e:
702
707
  if e.status == 404:
@@ -712,11 +717,12 @@ def terminate_instances(
712
717
  worker_only: bool = False,
713
718
  ) -> None:
714
719
  """See sky/provision/__init__.py"""
715
- namespace = _get_namespace(provider_config)
720
+ namespace = kubernetes_utils.get_namespace_from_config(provider_config)
721
+ context = kubernetes_utils.get_context_from_config(provider_config)
716
722
  tag_filters = {
717
723
  TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
718
724
  }
719
- pods = _filter_pods(namespace, tag_filters, None)
725
+ pods = _filter_pods(namespace, context, tag_filters, None)
720
726
 
721
727
  def _is_head(pod) -> bool:
722
728
  return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'
@@ -725,7 +731,7 @@ def terminate_instances(
725
731
  logger.debug(f'Terminating instance {pod_name}: {pod}')
726
732
  if _is_head(pod) and worker_only:
727
733
  continue
728
- _terminate_node(namespace, pod_name)
734
+ _terminate_node(namespace, context, pod_name)
729
735
 
730
736
 
731
737
  def get_cluster_info(
@@ -734,12 +740,13 @@ def get_cluster_info(
734
740
  provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
735
741
  del region # unused
736
742
  assert provider_config is not None
737
- namespace = _get_namespace(provider_config)
743
+ namespace = kubernetes_utils.get_namespace_from_config(provider_config)
744
+ context = kubernetes_utils.get_context_from_config(provider_config)
738
745
  tag_filters = {
739
746
  TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
740
747
  }
741
748
 
742
- running_pods = _filter_pods(namespace, tag_filters, ['Running'])
749
+ running_pods = _filter_pods(namespace, context, tag_filters, ['Running'])
743
750
  pods: Dict[str, List[common.InstanceInfo]] = {}
744
751
  head_pod_name = None
745
752
 
@@ -748,11 +755,11 @@ def get_cluster_info(
748
755
  port_forward_mode.value)
749
756
  network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
750
757
  network_mode_str)
751
- external_ip = kubernetes_utils.get_external_ip(network_mode)
758
+ external_ip = kubernetes_utils.get_external_ip(network_mode, context)
752
759
  port = 22
753
760
  if not provider_config.get('use_internal_ips', False):
754
761
  port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
755
- namespace)
762
+ namespace, context)
756
763
 
757
764
  head_pod_name = None
758
765
  cpu_request = None
@@ -779,7 +786,8 @@ def get_cluster_info(
779
786
  ssh_user = 'sky'
780
787
  get_k8s_ssh_user_cmd = 'echo $(whoami)'
781
788
  assert head_pod_name is not None
782
- runner = command_runner.KubernetesCommandRunner((namespace, head_pod_name))
789
+ runner = command_runner.KubernetesCommandRunner(
790
+ ((namespace, context), head_pod_name))
783
791
  rc, stdout, stderr = runner.run(get_k8s_ssh_user_cmd,
784
792
  require_outputs=True,
785
793
  separate_stderr=True,
@@ -810,7 +818,6 @@ def query_instances(
810
818
  provider_config: Optional[Dict[str, Any]] = None,
811
819
  non_terminated_only: bool = True
812
820
  ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
813
- del provider_config # unused
814
821
  status_map = {
815
822
  'Pending': status_lib.ClusterStatus.INIT,
816
823
  'Running': status_lib.ClusterStatus.UP,
@@ -820,11 +827,13 @@ def query_instances(
820
827
  'Terminating': None,
821
828
  }
822
829
 
823
- namespace = kubernetes_utils.get_current_kube_config_context_namespace()
830
+ assert provider_config is not None
831
+ namespace = kubernetes_utils.get_namespace_from_config(provider_config)
832
+ context = kubernetes_utils.get_context_from_config(provider_config)
824
833
 
825
834
  # Get all the pods with the label skypilot-cluster: <cluster_name>
826
835
  try:
827
- pods = kubernetes.core_api().list_namespaced_pod(
836
+ pods = kubernetes.core_api(context).list_namespaced_pod(
828
837
  namespace,
829
838
  label_selector=f'skypilot-cluster={cluster_name_on_cloud}',
830
839
  _request_timeout=kubernetes.API_TIMEOUT).items
@@ -858,11 +867,14 @@ def get_command_runners(
858
867
  """Get a command runner for the given cluster."""
859
868
  assert cluster_info.provider_config is not None, cluster_info
860
869
  instances = cluster_info.instances
861
- namespace = _get_namespace(cluster_info.provider_config)
870
+ namespace = kubernetes_utils.get_namespace_from_config(
871
+ cluster_info.provider_config)
872
+ context = kubernetes_utils.get_context_from_config(
873
+ cluster_info.provider_config)
862
874
  node_list = []
863
875
  if cluster_info.head_instance_id is not None:
864
- node_list = [(namespace, cluster_info.head_instance_id)]
865
- node_list.extend((namespace, pod_name)
876
+ node_list = [((namespace, context), cluster_info.head_instance_id)]
877
+ node_list.extend(((namespace, context), pod_name)
866
878
  for pod_name in instances.keys()
867
879
  if pod_name != cluster_info.head_instance_id)
868
880
  return command_runner.KubernetesCommandRunner.make_runner_list(
@@ -58,7 +58,8 @@ def _open_ports_using_loadbalancer(
58
58
  kubernetes_utils.merge_custom_metadata(content['service_spec']['metadata'])
59
59
 
60
60
  network_utils.create_or_replace_namespaced_service(
61
- namespace=provider_config.get('namespace', 'default'),
61
+ namespace=kubernetes_utils.get_namespace_from_config(provider_config),
62
+ context=kubernetes_utils.get_context_from_config(provider_config),
62
63
  service_name=service_name,
63
64
  service_spec=content['service_spec'])
64
65
 
@@ -68,8 +69,9 @@ def _open_ports_using_ingress(
68
69
  ports: List[int],
69
70
  provider_config: Dict[str, Any],
70
71
  ) -> None:
72
+ context = kubernetes_utils.get_context_from_config(provider_config)
71
73
  # Check if an ingress controller exists
72
- if not network_utils.ingress_controller_exists():
74
+ if not network_utils.ingress_controller_exists(context):
73
75
  raise Exception(
74
76
  'Ingress controller not found. '
75
77
  'Install Nginx ingress controller first: '
@@ -108,7 +110,9 @@ def _open_ports_using_ingress(
108
110
  # Update metadata from config
109
111
  kubernetes_utils.merge_custom_metadata(service_spec['metadata'])
110
112
  network_utils.create_or_replace_namespaced_service(
111
- namespace=provider_config.get('namespace', 'default'),
113
+ namespace=kubernetes_utils.get_namespace_from_config(
114
+ provider_config),
115
+ context=kubernetes_utils.get_context_from_config(provider_config),
112
116
  service_name=service_name,
113
117
  service_spec=service_spec,
114
118
  )
@@ -116,7 +120,8 @@ def _open_ports_using_ingress(
116
120
  kubernetes_utils.merge_custom_metadata(content['ingress_spec']['metadata'])
117
121
  # Create or update the single ingress for all services
118
122
  network_utils.create_or_replace_namespaced_ingress(
119
- namespace=provider_config.get('namespace', 'default'),
123
+ namespace=kubernetes_utils.get_namespace_from_config(provider_config),
124
+ context=kubernetes_utils.get_context_from_config(provider_config),
120
125
  ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress',
121
126
  ingress_spec=content['ingress_spec'],
122
127
  )
@@ -173,7 +178,8 @@ def _cleanup_ports_for_ingress(
173
178
  # Delete the single ingress used for all ports
174
179
  ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress'
175
180
  network_utils.delete_namespaced_ingress(
176
- namespace=provider_config.get('namespace', 'default'),
181
+ namespace=kubernetes_utils.get_namespace_from_config(provider_config),
182
+ context=kubernetes_utils.get_context_from_config(provider_config),
177
183
  ingress_name=ingress_name,
178
184
  )
179
185
 
@@ -132,10 +132,10 @@ def fill_ingress_template(namespace: str, service_details: List[Tuple[str, int,
132
132
 
133
133
 
134
134
  def create_or_replace_namespaced_ingress(
135
- namespace: str, ingress_name: str,
135
+ namespace: str, context: str, ingress_name: str,
136
136
  ingress_spec: Dict[str, Union[str, int]]) -> None:
137
137
  """Creates an ingress resource for the specified service."""
138
- networking_api = kubernetes.networking_api()
138
+ networking_api = kubernetes.networking_api(context)
139
139
 
140
140
  try:
141
141
  networking_api.read_namespaced_ingress(
@@ -156,9 +156,10 @@ def create_or_replace_namespaced_ingress(
156
156
  _request_timeout=kubernetes.API_TIMEOUT)
157
157
 
158
158
 
159
- def delete_namespaced_ingress(namespace: str, ingress_name: str) -> None:
159
+ def delete_namespaced_ingress(namespace: str, context: str,
160
+ ingress_name: str) -> None:
160
161
  """Deletes an ingress resource."""
161
- networking_api = kubernetes.networking_api()
162
+ networking_api = kubernetes.networking_api(context)
162
163
  try:
163
164
  networking_api.delete_namespaced_ingress(
164
165
  ingress_name, namespace, _request_timeout=kubernetes.API_TIMEOUT)
@@ -170,10 +171,10 @@ def delete_namespaced_ingress(namespace: str, ingress_name: str) -> None:
170
171
 
171
172
 
172
173
  def create_or_replace_namespaced_service(
173
- namespace: str, service_name: str,
174
+ namespace: str, context: str, service_name: str,
174
175
  service_spec: Dict[str, Union[str, int]]) -> None:
175
176
  """Creates a service resource for the specified service."""
176
- core_api = kubernetes.core_api()
177
+ core_api = kubernetes.core_api(context)
177
178
 
178
179
  try:
179
180
  core_api.read_namespaced_service(
@@ -207,9 +208,10 @@ def delete_namespaced_service(namespace: str, service_name: str) -> None:
207
208
  raise e
208
209
 
209
210
 
210
- def ingress_controller_exists(ingress_class_name: str = 'nginx') -> bool:
211
+ def ingress_controller_exists(context: str,
212
+ ingress_class_name: str = 'nginx') -> bool:
211
213
  """Checks if an ingress controller exists in the cluster."""
212
- networking_api = kubernetes.networking_api()
214
+ networking_api = kubernetes.networking_api(context)
213
215
  ingress_classes = networking_api.list_ingress_class(
214
216
  _request_timeout=kubernetes.API_TIMEOUT).items
215
217
  return any(