skypilot-nightly 1.0.0.dev20240926__py3-none-any.whl → 1.0.0.dev20240927__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request

  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = 'e95332b9eb8de4cdcac464ff704bf64f3285e776'
+ _SKYPILOT_COMMIT_SHA = 'e6b8d2c086544ab5cfdb877ad414eafddaa49cb4'


  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20240926'
+ __version__ = '1.0.0.dev20240927'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))


@@ -75,15 +75,17 @@ def _load_config(context: Optional[str] = None):
  suffix += f' Error: {str(e)}'
  # Check if exception was due to no current-context
  if 'Expected key current-context' in str(e):
- err_str = ('Failed to load Kubernetes configuration. '
- 'Kubeconfig does not contain any valid context(s).'
- f'{suffix}\n'
- ' If you were running a local Kubernetes '
- 'cluster, run `sky local up` to start the cluster.')
+ err_str = (
+ f'Failed to load Kubernetes configuration for {context!r}. '
+ 'Kubeconfig does not contain any valid context(s).'
+ f'{suffix}\n'
+ ' If you were running a local Kubernetes '
+ 'cluster, run `sky local up` to start the cluster.')
  else:
- err_str = ('Failed to load Kubernetes configuration. '
- 'Please check if your kubeconfig file exists at '
- f'~/.kube/config and is valid.{suffix}')
+ err_str = (
+ f'Failed to load Kubernetes configuration for {context!r}. '
+ 'Please check if your kubeconfig file exists at '
+ f'~/.kube/config and is valid.{suffix}')
  err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
  with ux_utils.print_exception_no_traceback():
  raise ValueError(err_str) from None
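
The hunk above threads the requested kubeconfig context name into the load-failure messages. For reference, the underlying failure mode can be reproduced with the official `kubernetes` Python client alone (outside SkyPilot); the sketch below is illustrative only, and the context name `my-context` is a placeholder:

from kubernetes import config
from kubernetes.config.config_exception import ConfigException

def load_context_or_explain(context: str) -> None:
    """Load a kubeconfig context, re-raising with a context-specific message."""
    try:
        # Reads ~/.kube/config and activates the named context.
        config.load_kube_config(context=context)
    except ConfigException as e:
        # Same spirit as the new error string: name the context that failed.
        raise ValueError(
            f'Failed to load Kubernetes configuration for {context!r}: {e}'
        ) from None

# load_context_or_explain('my-context')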
sky/authentication.py CHANGED
@@ -378,11 +378,11 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
  public_key_path = os.path.expanduser(PUBLIC_SSH_KEY_PATH)
  secret_name = clouds.Kubernetes.SKY_SSH_KEY_SECRET_NAME
  secret_field_name = clouds.Kubernetes().ssh_key_secret_field_name
- namespace = config['provider'].get(
- 'namespace',
- kubernetes_utils.get_current_kube_config_context_namespace())
  context = config['provider'].get(
  'context', kubernetes_utils.get_current_kube_config_context_name())
+ namespace = config['provider'].get(
+ 'namespace',
+ kubernetes_utils.get_kube_config_context_namespace(context))
  k8s = kubernetes.kubernetes
  with open(public_key_path, 'r', encoding='utf-8') as f:
  public_key = f.read()
@@ -425,8 +425,8 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
  ssh_jump_name,
  nodeport_mode,
  private_key_path=private_key_path,
- namespace=namespace,
- context=context)
+ context=context,
+ namespace=namespace)
  elif network_mode == port_forward_mode:
  # Using `kubectl port-forward` creates a direct tunnel to the pod and
  # does not require a ssh jump pod.
@@ -441,7 +441,11 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
  # on GKE.
  ssh_target = config['cluster_name'] + '-head'
  ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
- ssh_target, port_forward_mode, private_key_path=private_key_path)
+ ssh_target,
+ port_forward_mode,
+ private_key_path=private_key_path,
+ context=context,
+ namespace=namespace)
  else:
  # This should never happen because we check for this in from_str above.
  raise ValueError(f'Unsupported networking mode: {network_mode_str}')
@@ -428,6 +428,7 @@ class SSHConfigHelper(object):
  HostName {ip}
  User {username}
  IdentityFile {ssh_key_path}
+ AddKeysToAgent yes
  IdentitiesOnly yes
  ForwardAgent yes
  StrictHostKeyChecking no
@@ -2082,7 +2082,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  """
  # Bump if any fields get added/removed/changed, and add backward
  # compaitibility logic in __setstate__.
- _VERSION = 8
+ _VERSION = 9

  def __init__(
  self,
@@ -2516,6 +2516,19 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  if version < 8:
  self.cached_cluster_info = None

+ if version < 9:
+ # For backward compatibility, we should update the region of a
+ # SkyPilot cluster on Kubernetes to the actual context it is using.
+ # pylint: disable=import-outside-toplevel
+ launched_resources = state['launched_resources']
+ if isinstance(launched_resources.cloud, clouds.Kubernetes):
+ yaml_config = common_utils.read_yaml(
+ os.path.expanduser(state['_cluster_yaml']))
+ context = kubernetes_utils.get_context_from_config(
+ yaml_config['provider'])
+ state['launched_resources'] = launched_resources.copy(
+ region=context)
+
  self.__dict__.update(state)

  # Because the update_cluster_ips and update_ssh_ports
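
The `_VERSION` bump and the new `version < 9` branch follow the handle's usual pickle-compatibility pattern: raise the class version whenever persisted fields change meaning, then migrate old state dictionaries inside `__setstate__` before restoring attributes. A stripped-down sketch of that pattern, with generic names rather than the actual SkyPilot class, and a hypothetical migration:

class VersionedHandle:
    # Bump if any fields get added/removed/changed, and add backward
    # compatibility logic in __setstate__.
    _VERSION = 9

    def __setstate__(self, state: dict) -> None:
        version = state.get('_version', 0)
        if version < 9:
            # Hypothetical migration: older pickles stored a legacy
            # placeholder region; rewrite it before restoring attributes.
            if state.get('region') == 'kubernetes':
                state['region'] = state.get('context', 'kubernetes')
        self.__dict__.update(state)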
sky/cli.py CHANGED
@@ -3026,14 +3026,11 @@ def show_gpus(
  kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
  sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())

- if cloud_is_kubernetes and region is not None:
- raise click.UsageError(
- 'The --region flag cannot be set with --cloud kubernetes.')
-
  def _list_to_str(lst):
  return ', '.join([str(e) for e in lst])

  def _get_kubernetes_realtime_gpu_table(
+ context: Optional[str] = None,
  name_filter: Optional[str] = None,
  quantity_filter: Optional[int] = None):
  if quantity_filter:
@@ -3048,7 +3045,7 @@ def show_gpus(
  gpus_only=True,
  clouds='kubernetes',
  name_filter=name_filter,
- region_filter=region,
+ region_filter=context,
  quantity_filter=quantity_filter,
  case_sensitive=False)
  assert (set(counts.keys()) == set(capacity.keys()) == set(
@@ -3078,11 +3075,11 @@ def show_gpus(
  ])
  return realtime_gpu_table

- def _get_kubernetes_node_info_table():
+ def _get_kubernetes_node_info_table(context: Optional[str]):
  node_table = log_utils.create_table(
  ['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])

- node_info_dict = kubernetes_utils.get_kubernetes_node_info()
+ node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
  for node_name, node_info in node_info_dict.items():
  node_table.add_row([
  node_name, node_info.gpu_type,
@@ -3116,11 +3113,13 @@ def show_gpus(
  print_section_titles = False
  # If cloud is kubernetes, we want to show real-time capacity
  if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes):
+ context = region
  try:
  # If --cloud kubernetes is not specified, we want to catch
  # the case where no GPUs are available on the cluster and
  # print the warning at the end.
- k8s_realtime_table = _get_kubernetes_realtime_gpu_table()
+ k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
+ context)
  except ValueError as e:
  if not cloud_is_kubernetes:
  # Make it a note if cloud is not kubernetes
@@ -3129,9 +3128,10 @@ def show_gpus(
  else:
  print_section_titles = True
  yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
- f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
+ f'Kubernetes GPUs (Context: {context})'
+ f'{colorama.Style.RESET_ALL}\n')
  yield from k8s_realtime_table.get_string()
- k8s_node_table = _get_kubernetes_node_info_table()
+ k8s_node_table = _get_kubernetes_node_info_table(context)
  yield '\n\n'
  yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
  f'Kubernetes per node GPU availability'
sky/clouds/kubernetes.py CHANGED
@@ -1,4 +1,5 @@
  """Kubernetes."""
+ import functools
  import json
  import os
  import re
@@ -52,8 +53,7 @@ class Kubernetes(clouds.Cloud):
  _DEFAULT_MEMORY_CPU_RATIO = 1
  _DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4 # Allocate more memory for GPU tasks
  _REPR = 'Kubernetes'
- _SINGLETON_REGION = 'kubernetes'
- _regions: List[clouds.Region] = [clouds.Region(_SINGLETON_REGION)]
+ _LEGACY_SINGLETON_REGION = 'kubernetes'
  _CLOUD_UNSUPPORTED_FEATURES = {
  # TODO(romilb): Stopping might be possible to implement with
  # container checkpointing introduced in Kubernetes v1.25. See:
@@ -88,8 +88,12 @@ class Kubernetes(clouds.Cloud):
  cls, resources: 'resources_lib.Resources'
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
  unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES.copy()
+ context = resources.region
+ if context is None:
+ context = kubernetes_utils.get_current_kube_config_context_name()
  # Features to be disabled for exec auth
- is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth()
+ is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth(
+ context)
  if is_exec_auth:
  assert isinstance(message, str), message
  # Controllers cannot spin up new pods with exec auth.
@@ -99,7 +103,7 @@ class Kubernetes(clouds.Cloud):
  unsupported_features[
  clouds.CloudImplementationFeatures.AUTO_TERMINATE] = message
  # Allow spot instances if supported by the cluster
- spot_label_key, _ = kubernetes_utils.get_spot_label()
+ spot_label_key, _ = kubernetes_utils.get_spot_label(context)
  if spot_label_key is not None:
  unsupported_features.pop(
  clouds.CloudImplementationFeatures.SPOT_INSTANCE, None)
@@ -110,16 +114,87 @@ class Kubernetes(clouds.Cloud):
  return cls._MAX_CLUSTER_NAME_LEN_LIMIT

  @classmethod
- def regions(cls) -> List[clouds.Region]:
- return cls._regions
+ @functools.lru_cache(maxsize=1)
+ def _log_skipped_contexts_once(cls, skipped_contexts: Tuple[str,
+ ...]) -> None:
+ """Log skipped contexts for only once.
+
+ We don't directly cache the result of _filter_existing_allowed_contexts
+ as the admin policy may update the allowed contexts.
+ """
+ if skipped_contexts:
+ logger.warning(
+ f'Kubernetes contexts {set(skipped_contexts)!r} specified in '
+ '"allowed_contexts" not found in kubeconfig. '
+ 'Ignoring these contexts.')
+
+ @classmethod
+ def _existing_allowed_contexts(cls) -> List[str]:
+ """Get existing allowed contexts."""
+ all_contexts = kubernetes_utils.get_all_kube_config_context_names()
+ if all_contexts is None:
+ return []
+ all_contexts = set(all_contexts)
+
+ allowed_contexts = skypilot_config.get_nested(
+ ('kubernetes', 'allowed_contexts'), None)
+
+ if allowed_contexts is None:
+ current_context = (
+ kubernetes_utils.get_current_kube_config_context_name())
+ allowed_contexts = []
+ if current_context is not None:
+ allowed_contexts = [current_context]
+
+ existing_contexts = []
+ skipped_contexts = []
+ for context in allowed_contexts:
+ if context in all_contexts:
+ existing_contexts.append(context)
+ else:
+ skipped_contexts.append(context)
+ cls._log_skipped_contexts_once(tuple(skipped_contexts))
+ return existing_contexts

  @classmethod
  def regions_with_offering(cls, instance_type: Optional[str],
  accelerators: Optional[Dict[str, int]],
  use_spot: bool, region: Optional[str],
  zone: Optional[str]) -> List[clouds.Region]:
- # No notion of regions in Kubernetes - return a single region.
- return cls.regions()
+ del accelerators, zone, use_spot # unused
+ existing_contexts = cls._existing_allowed_contexts()
+
+ regions = [clouds.Region(context) for context in existing_contexts]
+
+ if region is not None:
+ regions = [r for r in regions if r.name == region]
+
+ # Check if requested instance type will fit in the cluster.
+ # TODO(zhwu,romilb): autoscaler type needs to be regional (per
+ # kubernetes cluster/context).
+ regions_to_return = []
+ autoscaler_type = kubernetes_utils.get_autoscaler_type()
+ if autoscaler_type is None and instance_type is not None:
+ # If autoscaler is not set, check if the instance type fits in the
+ # cluster. Else, rely on the autoscaler to provision the right
+ # instance type without running checks. Worst case, if autoscaling
+ # fails, the pod will be stuck in pending state until
+ # provision_timeout, after which failover will be triggered.
+ for r in regions:
+ context = r.name
+ fits, reason = kubernetes_utils.check_instance_fits(
+ context, instance_type)
+ if fits:
+ regions_to_return.append(r)
+ else:
+ logger.debug(
+ f'Instance type {instance_type} does '
+ 'not fit in the Kubernetes cluster with context: '
+ f'{context}. Reason: {reason}')
+ else:
+ regions_to_return = regions
+
+ return regions_to_return

  def instance_type_to_hourly_cost(self,
  instance_type: str,
@@ -201,9 +276,9 @@ class Kubernetes(clouds.Cloud):
  accelerators: Optional[Dict[str, int]] = None,
  use_spot: bool = False,
  ) -> Iterator[Optional[List[clouds.Zone]]]:
- del num_nodes, region, instance_type, accelerators, use_spot # Unused.
- for r in cls.regions():
- yield r.zones
+ # Always yield None for zones, since Kubernetes does not have zones, and
+ # we should allow any region get to this point.
+ yield None

  @classmethod
  def get_zone_shell_cmd(cls) -> Optional[str]:
@@ -225,7 +300,10 @@ class Kubernetes(clouds.Cloud):
  dryrun: bool = False) -> Dict[str, Optional[str]]:
  del cluster_name, zones, dryrun # Unused.
  if region is None:
- region = self._regions[0]
+ context = kubernetes_utils.get_current_kube_config_context_name()
+ else:
+ context = region.name
+ assert context is not None, 'No context found in kubeconfig'

  r = resources
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
@@ -244,9 +322,14 @@ class Kubernetes(clouds.Cloud):
  acc_count = k.accelerator_count if k.accelerator_count else 0
  acc_type = k.accelerator_type if k.accelerator_type else None

- if resources.image_id is not None:
+ image_id_dict = resources.image_id
+ if image_id_dict is not None:
  # Use custom image specified in resources
- image_id = resources.image_id['kubernetes']
+ if None in image_id_dict:
+ image_id = image_id_dict[None]
+ else:
+ assert resources.region in image_id_dict, image_id_dict
+ image_id = image_id_dict[resources.region]
  if image_id.startswith('docker:'):
  image_id = image_id[len('docker:'):]
  else:
@@ -265,7 +348,7 @@ class Kubernetes(clouds.Cloud):
  # If GPUs are requested, set node label to match the GPU type.
  if acc_count > 0 and acc_type is not None:
  k8s_acc_label_key, k8s_acc_label_value = \
- kubernetes_utils.get_gpu_label_key_value(acc_type)
+ kubernetes_utils.get_gpu_label_key_value(context, acc_type)

  port_mode = network_utils.get_port_mode(None)

@@ -309,13 +392,10 @@ class Kubernetes(clouds.Cloud):
  deploy_vars = {
  'instance_type': resources.instance_type,
  'custom_resources': custom_resources,
- 'region': region.name,
  'cpus': str(cpus),
  'memory': str(mem),
  'accelerator_count': str(acc_count),
  'timeout': str(timeout),
- 'k8s_namespace':
- kubernetes_utils.get_current_kube_config_context_namespace(),
  'k8s_port_mode': port_mode.value,
  'k8s_networking_mode': network_utils.get_networking_mode().value,
  'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
@@ -335,18 +415,30 @@ class Kubernetes(clouds.Cloud):

  # Add kubecontext if it is set. It may be None if SkyPilot is running
  # inside a pod with in-cluster auth.
- curr_context = kubernetes_utils.get_current_kube_config_context_name()
- if curr_context is not None:
- deploy_vars['k8s_context'] = curr_context
+ if context is not None:
+ deploy_vars['k8s_context'] = context
+
+ namespace = kubernetes_utils.get_kube_config_context_namespace(context)
+ deploy_vars['k8s_namespace'] = namespace

  return deploy_vars

  def _get_feasible_launchable_resources(
  self, resources: 'resources_lib.Resources'
  ) -> 'resources_utils.FeasibleResources':
+ # TODO(zhwu): This needs to be updated to return the correct region
+ # (context) that has enough resources.
  fuzzy_candidate_list: List[str] = []
  if resources.instance_type is not None:
  assert resources.is_launchable(), resources
+ regions = self.regions_with_offering(
+ resources.instance_type,
+ accelerators=resources.accelerators,
+ use_spot=resources.use_spot,
+ region=resources.region,
+ zone=resources.zone)
+ if not regions:
+ return resources_utils.FeasibleResources([], [], None)
  resources = resources.copy(accelerators=None)
  return resources_utils.FeasibleResources([resources],
  fuzzy_candidate_list, None)
@@ -391,34 +483,48 @@ class Kubernetes(clouds.Cloud):
  kubernetes_utils.KubernetesInstanceType.from_resources(
  gpu_task_cpus, gpu_task_memory, acc_count, acc_type).name)

- # Check if requested instance type will fit in the cluster.
- autoscaler_type = kubernetes_utils.get_autoscaler_type()
- if autoscaler_type is None:
- # If autoscaler is not set, check if the instance type fits in the
- # cluster. Else, rely on the autoscaler to provision the right
- # instance type without running checks. Worst case, if autoscaling
- # fails, the pod will be stuck in pending state until
- # provision_timeout, after which failover will be triggered.
- fits, reason = kubernetes_utils.check_instance_fits(
- chosen_instance_type)
- if not fits:
- logger.debug(f'Instance type {chosen_instance_type} does '
- 'not fit in the Kubernetes cluster. '
- f'Reason: {reason}')
- return resources_utils.FeasibleResources([], [], reason)
-
+ # Check the availability of the specified instance type in all contexts.
+ available_regions = self.regions_with_offering(
+ chosen_instance_type,
+ accelerators=None,
+ use_spot=resources.use_spot,
+ region=resources.region,
+ zone=resources.zone)
+ if not available_regions:
+ return resources_utils.FeasibleResources([], [], None)
  # No fuzzy lists for Kubernetes
+ # We don't set the resources returned with regions, because the
+ # optimizer will further find the valid region (context) for the
+ # resources.
  return resources_utils.FeasibleResources(_make([chosen_instance_type]),
  [], None)

  @classmethod
  def check_credentials(cls) -> Tuple[bool, Optional[str]]:
  # Test using python API
- try:
- return kubernetes_utils.check_credentials()
- except Exception as e: # pylint: disable=broad-except
- return (False, 'Credential check failed: '
- f'{common_utils.format_exception(e)}')
+ existing_allowed_contexts = cls._existing_allowed_contexts()
+ if not existing_allowed_contexts:
+ if skypilot_config.loaded_config_path() is None:
+ check_skypilot_config_msg = ''
+ else:
+ check_skypilot_config_msg = (
+ ' and check "allowed_contexts" in your '
+ f'{skypilot_config.loaded_config_path()} file.')
+ return (False, 'No available context found in kubeconfig. '
+ 'Check if you have a valid kubeconfig file' +
+ check_skypilot_config_msg)
+ reasons = []
+ for context in existing_allowed_contexts:
+ try:
+ check_result = kubernetes_utils.check_credentials(context)
+ if check_result[0]:
+ return check_result
+ reasons.append(f'{context}: {check_result[1]}')
+ except Exception as e: # pylint: disable=broad-except
+ return (False, f'Credential check failed for {context}: '
+ f'{common_utils.format_exception(e)}')
+ return (False, 'Failed to find available context with working '
+ 'credentials. Details:\n' + '\n'.join(reasons))

  def get_credential_file_mounts(self) -> Dict[str, str]:
  if os.path.exists(os.path.expanduser(CREDENTIAL_PATH)):
@@ -433,10 +539,20 @@ class Kubernetes(clouds.Cloud):
  instance_type)

  def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
- if region != self._SINGLETON_REGION:
+ if region == self._LEGACY_SINGLETON_REGION:
+ # For backward compatibility, we allow the region to be set to the
+ # legacy singletonton region.
+ # TODO: Remove this after 0.9.0.
+ return region, zone
+
+ all_contexts = kubernetes_utils.get_all_kube_config_context_names()
+ if all_contexts is None:
+ all_contexts = []
+ if region not in all_contexts:
  raise ValueError(
- 'Kubernetes support does not support setting region.'
- ' Cluster used is determined by the kubeconfig.')
+ f'Context {region} not found in kubeconfig. Kubernetes only '
+ 'supports context names as regions. Available '
+ f'contexts: {all_contexts}')
  if zone is not None:
  raise ValueError('Kubernetes support does not support setting zone.'
  ' Cluster used is determined by the kubeconfig.')
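
Taken together, these hunks let a kubeconfig context act as the Kubernetes "region": `regions_with_offering` enumerates the contexts allowed by the `('kubernetes', 'allowed_contexts')` config entry (falling back to the current context), and `validate_region_zone` accepts any context name present in the kubeconfig. As a rough illustration of how a caller could target a specific context through the Python API, where the context name `my-gke-context` is a placeholder, not something from this diff:

import sky

# Pass a kubeconfig context name as the region; validation now checks it
# against the contexts in the kubeconfig rather than requiring the legacy
# 'kubernetes' singleton region.
task = sky.Task(run='nvidia-smi')
task.set_resources(
    sky.Resources(cloud=sky.Kubernetes(),
                  region='my-gke-context',
                  accelerators='T4:1'))
# sky.launch(task, cluster_name='ctx-demo')  # uncomment to actually provision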
sky/clouds/oci.py CHANGED
@@ -431,14 +431,17 @@ class OCI(clouds.Cloud):

  def get_credential_file_mounts(self) -> Dict[str, str]:
  """Returns a dict of credential file paths to mount paths."""
- oci_cfg_file = oci_adaptor.get_config_file()
- # Pass-in a profile parameter so that multiple profile in oci
- # config file is supported (2023/06/09).
- oci_cfg = oci_adaptor.get_oci_config(
- profile=oci_utils.oci_config.get_profile())
- api_key_file = oci_cfg[
- 'key_file'] if 'key_file' in oci_cfg else 'BadConf'
- sky_cfg_file = oci_utils.oci_config.get_sky_user_config_file()
+ try:
+ oci_cfg_file = oci_adaptor.get_config_file()
+ # Pass-in a profile parameter so that multiple profile in oci
+ # config file is supported (2023/06/09).
+ oci_cfg = oci_adaptor.get_oci_config(
+ profile=oci_utils.oci_config.get_profile())
+ api_key_file = oci_cfg[
+ 'key_file'] if 'key_file' in oci_cfg else 'BadConf'
+ sky_cfg_file = oci_utils.oci_config.get_sky_user_config_file()
+ except ImportError:
+ return {}

  # OCI config and API key file are mandatory
  credential_files = [oci_cfg_file, api_key_file]
@@ -68,26 +68,35 @@ def list_accelerators_realtime(
  # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
  # function from kubernetes_utils.
  del all_regions, require_price # Unused.
+ # TODO(zhwu): this should return all accelerators in multiple kubernetes
+ # clusters defined by allowed_contexts.
+ if region_filter is None:
+ context = kubernetes_utils.get_current_kube_config_context_name()
+ else:
+ context = region_filter
+ if context is None:
+ return {}, {}, {}
+
  k8s_cloud = Kubernetes()
  if not any(
  map(k8s_cloud.is_same_cloud,
  sky_check.get_cached_enabled_clouds_or_refresh())
- ) or not kubernetes_utils.check_credentials()[0]:
+ ) or not kubernetes_utils.check_credentials(context)[0]:
  return {}, {}, {}

- has_gpu = kubernetes_utils.detect_gpu_resource()
+ has_gpu = kubernetes_utils.detect_gpu_resource(context)
  if not has_gpu:
  return {}, {}, {}

- label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter()
+ label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter(context)
  if not label_formatter:
  return {}, {}, {}

  accelerators_qtys: Set[Tuple[str, int]] = set()
  key = label_formatter.get_label_key()
- nodes = kubernetes_utils.get_kubernetes_nodes()
+ nodes = kubernetes_utils.get_kubernetes_nodes(context)
  # Get the pods to get the real-time GPU usage
- pods = kubernetes_utils.get_kubernetes_pods()
+ pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
  # Total number of GPUs in the cluster
  total_accelerators_capacity: Dict[str, int] = {}
  # Total number of GPUs currently available in the cluster
@@ -160,7 +169,7 @@ def list_accelerators_realtime(
  memory=None,
  price=0.0,
  spot_price=0.0,
- region='kubernetes'))
+ region=context))

  df = pd.DataFrame(result,
  columns=[
@@ -175,7 +184,6 @@ def list_accelerators_realtime(
  qtys_map = common.list_accelerators_impl('Kubernetes', df, gpus_only,
  name_filter, region_filter,
  quantity_filter, case_sensitive)
-
  return qtys_map, total_accelerators_capacity, total_accelerators_available

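
Since `region_filter` is now interpreted as a kubeconfig context name, the valid filter values are exactly the contexts defined in the local kubeconfig. A small sketch using the official `kubernetes` Python client (not SkyPilot's own wrappers) to list them:

from kubernetes import config

# Returns (all_contexts, active_context); each entry is a dict with a
# 'name' key. Raises ConfigException if no kubeconfig can be found.
contexts, active_context = config.list_kube_config_contexts()
print('Available contexts:', [c['name'] for c in contexts])
print('Current context:', active_context['name'])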