skypilot-nightly 1.0.0.dev20240925__py3-none-any.whl → 1.0.0.dev20240927__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +10 -8
- sky/authentication.py +10 -6
- sky/backends/backend_utils.py +1 -0
- sky/backends/cloud_vm_ray_backend.py +22 -1
- sky/cli.py +10 -10
- sky/clouds/kubernetes.py +161 -45
- sky/clouds/oci.py +11 -8
- sky/clouds/service_catalog/kubernetes_catalog.py +15 -7
- sky/provision/kubernetes/instance.py +15 -46
- sky/provision/kubernetes/network.py +34 -14
- sky/provision/kubernetes/network_utils.py +7 -5
- sky/provision/kubernetes/utils.py +258 -49
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/provisioner.py +2 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/utils/command_runner.py +4 -0
- sky/utils/schemas.py +6 -0
- {skypilot_nightly-1.0.0.dev20240925.dist-info → skypilot_nightly-1.0.0.dev20240927.dist-info}/METADATA +17 -15
- {skypilot_nightly-1.0.0.dev20240925.dist-info → skypilot_nightly-1.0.0.dev20240927.dist-info}/RECORD +24 -24
- {skypilot_nightly-1.0.0.dev20240925.dist-info → skypilot_nightly-1.0.0.dev20240927.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20240925.dist-info → skypilot_nightly-1.0.0.dev20240927.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20240925.dist-info → skypilot_nightly-1.0.0.dev20240927.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20240925.dist-info → skypilot_nightly-1.0.0.dev20240927.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'e6b8d2c086544ab5cfdb877ad414eafddaa49cb4'
 
 
 def _get_git_commit():

@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20240927'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/adaptors/kubernetes.py
CHANGED
@@ -75,15 +75,17 @@ def _load_config(context: Optional[str] = None):
             suffix += f' Error: {str(e)}'
         # Check if exception was due to no current-context
         if 'Expected key current-context' in str(e):
-            err_str = (
-
-
-
-
+            err_str = (
+                f'Failed to load Kubernetes configuration for {context!r}. '
+                'Kubeconfig does not contain any valid context(s).'
+                f'{suffix}\n'
+                '    If you were running a local Kubernetes '
+                'cluster, run `sky local up` to start the cluster.')
         else:
-            err_str = (
-
-
+            err_str = (
+                f'Failed to load Kubernetes configuration for {context!r}. '
+                'Please check if your kubeconfig file exists at '
+                f'~/.kube/config and is valid.{suffix}')
         err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
         with ux_utils.print_exception_no_traceback():
             raise ValueError(err_str) from None
sky/authentication.py
CHANGED
@@ -378,11 +378,11 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
     public_key_path = os.path.expanduser(PUBLIC_SSH_KEY_PATH)
     secret_name = clouds.Kubernetes.SKY_SSH_KEY_SECRET_NAME
     secret_field_name = clouds.Kubernetes().ssh_key_secret_field_name
-    namespace = config['provider'].get(
-        'namespace',
-        kubernetes_utils.get_current_kube_config_context_namespace())
     context = config['provider'].get(
         'context', kubernetes_utils.get_current_kube_config_context_name())
+    namespace = config['provider'].get(
+        'namespace',
+        kubernetes_utils.get_kube_config_context_namespace(context))
     k8s = kubernetes.kubernetes
     with open(public_key_path, 'r', encoding='utf-8') as f:
         public_key = f.read()

@@ -425,8 +425,8 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
             ssh_jump_name,
             nodeport_mode,
             private_key_path=private_key_path,
-
-
+            context=context,
+            namespace=namespace)
     elif network_mode == port_forward_mode:
         # Using `kubectl port-forward` creates a direct tunnel to the pod and
         # does not require a ssh jump pod.

@@ -441,7 +441,11 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
         # on GKE.
         ssh_target = config['cluster_name'] + '-head'
         ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
-            ssh_target,
+            ssh_target,
+            port_forward_mode,
+            private_key_path=private_key_path,
+            context=context,
+            namespace=namespace)
     else:
         # This should never happen because we check for this in from_str above.
         raise ValueError(f'Unsupported networking mode: {network_mode_str}')
sky/backends/backend_utils.py
CHANGED
@@ -48,6 +48,7 @@ from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
 from sky.provision import provisioner
+from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -2081,7 +2082,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     """
     # Bump if any fields get added/removed/changed, and add backward
     # compaitibility logic in __setstate__.
-    _VERSION =
+    _VERSION = 9
 
     def __init__(
         self,

@@ -2515,6 +2516,19 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         if version < 8:
             self.cached_cluster_info = None
 
+        if version < 9:
+            # For backward compatibility, we should update the region of a
+            # SkyPilot cluster on Kubernetes to the actual context it is using.
+            # pylint: disable=import-outside-toplevel
+            launched_resources = state['launched_resources']
+            if isinstance(launched_resources.cloud, clouds.Kubernetes):
+                yaml_config = common_utils.read_yaml(
+                    os.path.expanduser(state['_cluster_yaml']))
+                context = kubernetes_utils.get_context_from_config(
+                    yaml_config['provider'])
+                state['launched_resources'] = launched_resources.copy(
+                    region=context)
+
         self.__dict__.update(state)
 
         # Because the update_cluster_ips and update_ssh_ports

@@ -4180,6 +4194,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         global_user_state.set_cluster_autostop_value(
             handle.cluster_name, idle_minutes_to_autostop, down)
 
+        # Add/Remove autodown annotations to/from Kubernetes pods.
+        if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
+            kubernetes_utils.set_autodown_annotations(
+                handle=handle,
+                idle_minutes_to_autostop=idle_minutes_to_autostop,
+                down=down)
+
     def is_definitely_autostopping(self,
                                    handle: CloudVmRayResourceHandle,
                                    stream_logs: bool = True) -> bool:
sky/cli.py
CHANGED
@@ -3026,14 +3026,11 @@ def show_gpus(
     kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
         sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())
 
-    if cloud_is_kubernetes and region is not None:
-        raise click.UsageError(
-            'The --region flag cannot be set with --cloud kubernetes.')
-
     def _list_to_str(lst):
         return ', '.join([str(e) for e in lst])
 
     def _get_kubernetes_realtime_gpu_table(
+            context: Optional[str] = None,
             name_filter: Optional[str] = None,
             quantity_filter: Optional[int] = None):
         if quantity_filter:

@@ -3048,7 +3045,7 @@ def show_gpus(
             gpus_only=True,
             clouds='kubernetes',
             name_filter=name_filter,
-            region_filter=
+            region_filter=context,
             quantity_filter=quantity_filter,
             case_sensitive=False)
         assert (set(counts.keys()) == set(capacity.keys()) == set(

@@ -3078,11 +3075,11 @@ def show_gpus(
         ])
         return realtime_gpu_table
 
-    def _get_kubernetes_node_info_table():
+    def _get_kubernetes_node_info_table(context: Optional[str]):
         node_table = log_utils.create_table(
             ['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])
 
-        node_info_dict = kubernetes_utils.get_kubernetes_node_info()
+        node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
         for node_name, node_info in node_info_dict.items():
             node_table.add_row([
                 node_name, node_info.gpu_type,

@@ -3116,11 +3113,13 @@ def show_gpus(
     print_section_titles = False
     # If cloud is kubernetes, we want to show real-time capacity
     if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes):
+        context = region
         try:
             # If --cloud kubernetes is not specified, we want to catch
             # the case where no GPUs are available on the cluster and
             # print the warning at the end.
-            k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
+            k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
+                context)
         except ValueError as e:
             if not cloud_is_kubernetes:
                 # Make it a note if cloud is not kubernetes

@@ -3129,9 +3128,10 @@ def show_gpus(
         else:
             print_section_titles = True
             yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                   f'Kubernetes GPUs{
+                   f'Kubernetes GPUs (Context: {context})'
+                   f'{colorama.Style.RESET_ALL}\n')
             yield from k8s_realtime_table.get_string()
-            k8s_node_table = _get_kubernetes_node_info_table()
+            k8s_node_table = _get_kubernetes_node_info_table(context)
             yield '\n\n'
             yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                    f'Kubernetes per node GPU availability'
sky/clouds/kubernetes.py
CHANGED
@@ -1,4 +1,5 @@
 """Kubernetes."""
+import functools
 import json
 import os
 import re

@@ -52,8 +53,7 @@ class Kubernetes(clouds.Cloud):
     _DEFAULT_MEMORY_CPU_RATIO = 1
     _DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4  # Allocate more memory for GPU tasks
     _REPR = 'Kubernetes'
-
-    _regions: List[clouds.Region] = [clouds.Region(_SINGLETON_REGION)]
+    _LEGACY_SINGLETON_REGION = 'kubernetes'
     _CLOUD_UNSUPPORTED_FEATURES = {
         # TODO(romilb): Stopping might be possible to implement with
         # container checkpointing introduced in Kubernetes v1.25. See:

@@ -88,8 +88,12 @@ class Kubernetes(clouds.Cloud):
         cls, resources: 'resources_lib.Resources'
     ) -> Dict[clouds.CloudImplementationFeatures, str]:
         unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES.copy()
+        context = resources.region
+        if context is None:
+            context = kubernetes_utils.get_current_kube_config_context_name()
         # Features to be disabled for exec auth
-        is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth(
+        is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth(
+            context)
         if is_exec_auth:
             assert isinstance(message, str), message
             # Controllers cannot spin up new pods with exec auth.

@@ -99,7 +103,7 @@ class Kubernetes(clouds.Cloud):
             unsupported_features[
                 clouds.CloudImplementationFeatures.AUTO_TERMINATE] = message
         # Allow spot instances if supported by the cluster
-        spot_label_key, _ = kubernetes_utils.get_spot_label()
+        spot_label_key, _ = kubernetes_utils.get_spot_label(context)
         if spot_label_key is not None:
             unsupported_features.pop(
                 clouds.CloudImplementationFeatures.SPOT_INSTANCE, None)

@@ -110,16 +114,87 @@ class Kubernetes(clouds.Cloud):
         return cls._MAX_CLUSTER_NAME_LEN_LIMIT
 
     @classmethod
-
-
+    @functools.lru_cache(maxsize=1)
+    def _log_skipped_contexts_once(cls, skipped_contexts: Tuple[str,
+                                                                ...]) -> None:
+        """Log skipped contexts for only once.
+
+        We don't directly cache the result of _filter_existing_allowed_contexts
+        as the admin policy may update the allowed contexts.
+        """
+        if skipped_contexts:
+            logger.warning(
+                f'Kubernetes contexts {set(skipped_contexts)!r} specified in '
+                '"allowed_contexts" not found in kubeconfig. '
+                'Ignoring these contexts.')
+
+    @classmethod
+    def _existing_allowed_contexts(cls) -> List[str]:
+        """Get existing allowed contexts."""
+        all_contexts = kubernetes_utils.get_all_kube_config_context_names()
+        if all_contexts is None:
+            return []
+        all_contexts = set(all_contexts)
+
+        allowed_contexts = skypilot_config.get_nested(
+            ('kubernetes', 'allowed_contexts'), None)
+
+        if allowed_contexts is None:
+            current_context = (
+                kubernetes_utils.get_current_kube_config_context_name())
+            allowed_contexts = []
+            if current_context is not None:
+                allowed_contexts = [current_context]
+
+        existing_contexts = []
+        skipped_contexts = []
+        for context in allowed_contexts:
+            if context in all_contexts:
+                existing_contexts.append(context)
+            else:
+                skipped_contexts.append(context)
+        cls._log_skipped_contexts_once(tuple(skipped_contexts))
+        return existing_contexts
 
     @classmethod
     def regions_with_offering(cls, instance_type: Optional[str],
                               accelerators: Optional[Dict[str, int]],
                               use_spot: bool, region: Optional[str],
                               zone: Optional[str]) -> List[clouds.Region]:
-
-
+        del accelerators, zone, use_spot  # unused
+        existing_contexts = cls._existing_allowed_contexts()
+
+        regions = [clouds.Region(context) for context in existing_contexts]
+
+        if region is not None:
+            regions = [r for r in regions if r.name == region]
+
+        # Check if requested instance type will fit in the cluster.
+        # TODO(zhwu,romilb): autoscaler type needs to be regional (per
+        # kubernetes cluster/context).
+        regions_to_return = []
+        autoscaler_type = kubernetes_utils.get_autoscaler_type()
+        if autoscaler_type is None and instance_type is not None:
+            # If autoscaler is not set, check if the instance type fits in the
+            # cluster. Else, rely on the autoscaler to provision the right
+            # instance type without running checks. Worst case, if autoscaling
+            # fails, the pod will be stuck in pending state until
+            # provision_timeout, after which failover will be triggered.
+            for r in regions:
+                context = r.name
+                fits, reason = kubernetes_utils.check_instance_fits(
+                    context, instance_type)
+                if fits:
+                    regions_to_return.append(r)
+                else:
+                    logger.debug(
+                        f'Instance type {instance_type} does '
+                        'not fit in the Kubernetes cluster with context: '
+                        f'{context}. Reason: {reason}')
+        else:
+            regions_to_return = regions
+
+        return regions_to_return
 
     def instance_type_to_hourly_cost(self,
                                      instance_type: str,
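For context on the hunk above: the allowed kubeconfig contexts are intersected with what actually exists in the kubeconfig, entries that do not exist are warned about only once (via functools.lru_cache on the logging helper), and each surviving context becomes a Kubernetes "region". The snippet below is a self-contained sketch of the same idea using only the official kubernetes Python client; the function names and warning text are illustrative, not SkyPilot's.

import functools
from typing import List, Optional, Tuple

from kubernetes import config


@functools.lru_cache(maxsize=1)
def _warn_skipped_once(skipped: Tuple[str, ...]) -> None:
    # lru_cache caches per distinct argument tuple, so the warning is only
    # printed once for a given set of skipped contexts.
    if skipped:
        print(f'Ignoring contexts not found in kubeconfig: {set(skipped)}')


def existing_allowed_contexts(
        allowed: Optional[List[str]] = None) -> List[str]:
    # (all contexts, active context) from ~/.kube/config.
    contexts, active = config.list_kube_config_contexts()
    all_names = {c['name'] for c in contexts}
    if allowed is None:
        allowed = [active['name']] if active else []
    existing = [name for name in allowed if name in all_names]
    _warn_skipped_once(
        tuple(name for name in allowed if name not in all_names))
    return existing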
@@ -201,9 +276,9 @@ class Kubernetes(clouds.Cloud):
         accelerators: Optional[Dict[str, int]] = None,
         use_spot: bool = False,
     ) -> Iterator[Optional[List[clouds.Zone]]]:
-
-
-
+        # Always yield None for zones, since Kubernetes does not have zones, and
+        # we should allow any region get to this point.
+        yield None
 
     @classmethod
     def get_zone_shell_cmd(cls) -> Optional[str]:

@@ -225,7 +300,10 @@ class Kubernetes(clouds.Cloud):
                        dryrun: bool = False) -> Dict[str, Optional[str]]:
         del cluster_name, zones, dryrun  # Unused.
         if region is None:
-
+            context = kubernetes_utils.get_current_kube_config_context_name()
+        else:
+            context = region.name
+        assert context is not None, 'No context found in kubeconfig'
 
         r = resources
         acc_dict = self.get_accelerators_from_instance_type(r.instance_type)

@@ -244,9 +322,14 @@ class Kubernetes(clouds.Cloud):
         acc_count = k.accelerator_count if k.accelerator_count else 0
         acc_type = k.accelerator_type if k.accelerator_type else None
 
-
+        image_id_dict = resources.image_id
+        if image_id_dict is not None:
             # Use custom image specified in resources
-
+            if None in image_id_dict:
+                image_id = image_id_dict[None]
+            else:
+                assert resources.region in image_id_dict, image_id_dict
+                image_id = image_id_dict[resources.region]
             if image_id.startswith('docker:'):
                 image_id = image_id[len('docker:'):]
         else:

@@ -265,7 +348,7 @@ class Kubernetes(clouds.Cloud):
         # If GPUs are requested, set node label to match the GPU type.
         if acc_count > 0 and acc_type is not None:
             k8s_acc_label_key, k8s_acc_label_value = \
-                kubernetes_utils.get_gpu_label_key_value(acc_type)
+                kubernetes_utils.get_gpu_label_key_value(context, acc_type)
 
         port_mode = network_utils.get_port_mode(None)
 

@@ -309,13 +392,10 @@ class Kubernetes(clouds.Cloud):
         deploy_vars = {
             'instance_type': resources.instance_type,
             'custom_resources': custom_resources,
-            'region': region.name,
             'cpus': str(cpus),
             'memory': str(mem),
             'accelerator_count': str(acc_count),
             'timeout': str(timeout),
-            'k8s_namespace':
-                kubernetes_utils.get_current_kube_config_context_namespace(),
             'k8s_port_mode': port_mode.value,
             'k8s_networking_mode': network_utils.get_networking_mode().value,
             'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,

@@ -335,18 +415,30 @@ class Kubernetes(clouds.Cloud):
 
         # Add kubecontext if it is set. It may be None if SkyPilot is running
         # inside a pod with in-cluster auth.
-
-
-
+        if context is not None:
+            deploy_vars['k8s_context'] = context
+
+        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
+        deploy_vars['k8s_namespace'] = namespace
 
         return deploy_vars
 
     def _get_feasible_launchable_resources(
         self, resources: 'resources_lib.Resources'
     ) -> 'resources_utils.FeasibleResources':
+        # TODO(zhwu): This needs to be updated to return the correct region
+        # (context) that has enough resources.
         fuzzy_candidate_list: List[str] = []
         if resources.instance_type is not None:
             assert resources.is_launchable(), resources
+            regions = self.regions_with_offering(
+                resources.instance_type,
+                accelerators=resources.accelerators,
+                use_spot=resources.use_spot,
+                region=resources.region,
+                zone=resources.zone)
+            if not regions:
+                return resources_utils.FeasibleResources([], [], None)
             resources = resources.copy(accelerators=None)
             return resources_utils.FeasibleResources([resources],
                                                      fuzzy_candidate_list, None)

@@ -391,34 +483,48 @@ class Kubernetes(clouds.Cloud):
                 kubernetes_utils.KubernetesInstanceType.from_resources(
                     gpu_task_cpus, gpu_task_memory, acc_count, acc_type).name)
 
-        # Check
-
-
-
-
-
-
-
-
-            chosen_instance_type)
-        if not fits:
-            logger.debug(f'Instance type {chosen_instance_type} does '
-                         'not fit in the Kubernetes cluster. '
-                         f'Reason: {reason}')
-            return resources_utils.FeasibleResources([], [], reason)
-
+        # Check the availability of the specified instance type in all contexts.
+        available_regions = self.regions_with_offering(
+            chosen_instance_type,
+            accelerators=None,
+            use_spot=resources.use_spot,
+            region=resources.region,
+            zone=resources.zone)
+        if not available_regions:
+            return resources_utils.FeasibleResources([], [], None)
         # No fuzzy lists for Kubernetes
+        # We don't set the resources returned with regions, because the
+        # optimizer will further find the valid region (context) for the
+        # resources.
         return resources_utils.FeasibleResources(_make([chosen_instance_type]),
                                                  [], None)
 
     @classmethod
     def check_credentials(cls) -> Tuple[bool, Optional[str]]:
         # Test using python API
-
-
-
-
-
+        existing_allowed_contexts = cls._existing_allowed_contexts()
+        if not existing_allowed_contexts:
+            if skypilot_config.loaded_config_path() is None:
+                check_skypilot_config_msg = ''
+            else:
+                check_skypilot_config_msg = (
+                    ' and check "allowed_contexts" in your '
+                    f'{skypilot_config.loaded_config_path()} file.')
+            return (False, 'No available context found in kubeconfig. '
+                    'Check if you have a valid kubeconfig file' +
+                    check_skypilot_config_msg)
+        reasons = []
+        for context in existing_allowed_contexts:
+            try:
+                check_result = kubernetes_utils.check_credentials(context)
+                if check_result[0]:
+                    return check_result
+                reasons.append(f'{context}: {check_result[1]}')
+            except Exception as e:  # pylint: disable=broad-except
+                return (False, f'Credential check failed for {context}: '
+                        f'{common_utils.format_exception(e)}')
+        return (False, 'Failed to find available context with working '
+                'credentials. Details:\n' + '\n'.join(reasons))
 
     def get_credential_file_mounts(self) -> Dict[str, str]:
         if os.path.exists(os.path.expanduser(CREDENTIAL_PATH)):

@@ -433,10 +539,20 @@ class Kubernetes(clouds.Cloud):
                                                          instance_type)
 
     def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
-        if region
+        if region == self._LEGACY_SINGLETON_REGION:
+            # For backward compatibility, we allow the region to be set to the
+            # legacy singleton region.
+            # TODO: Remove this after 0.9.0.
+            return region, zone
+
+        all_contexts = kubernetes_utils.get_all_kube_config_context_names()
+        if all_contexts is None:
+            all_contexts = []
+        if region not in all_contexts:
             raise ValueError(
-                '
-                '
+                f'Context {region} not found in kubeconfig. Kubernetes only '
+                'supports context names as regions. Available '
+                f'contexts: {all_contexts}')
         if zone is not None:
             raise ValueError('Kubernetes support does not support setting zone.'
                              ' Cluster used is determined by the kubeconfig.')
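Since kubeconfig contexts are now modeled as Kubernetes regions (validate_region_zone above accepts any existing context name, plus the legacy 'kubernetes' value), a particular cluster can be targeted by passing its context name as the region. A hedged usage sketch with the Python API follows; 'my-eks-context' is a hypothetical context name from the local kubeconfig, and exact behavior may vary by release.

import sky

task = sky.Task(run='nvidia-smi')
# Pin the task to one of the allowed kubeconfig contexts by using the
# context name as the Kubernetes "region".
task.set_resources(
    sky.Resources(cloud=sky.Kubernetes(),
                  region='my-eks-context',
                  accelerators='T4:1'))
sky.launch(task, cluster_name='ctx-demo')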
sky/clouds/oci.py
CHANGED
@@ -431,14 +431,17 @@ class OCI(clouds.Cloud):
 
     def get_credential_file_mounts(self) -> Dict[str, str]:
         """Returns a dict of credential file paths to mount paths."""
-
-
-
-
-
-
-
-
+        try:
+            oci_cfg_file = oci_adaptor.get_config_file()
+            # Pass-in a profile parameter so that multiple profile in oci
+            # config file is supported (2023/06/09).
+            oci_cfg = oci_adaptor.get_oci_config(
+                profile=oci_utils.oci_config.get_profile())
+            api_key_file = oci_cfg[
+                'key_file'] if 'key_file' in oci_cfg else 'BadConf'
+            sky_cfg_file = oci_utils.oci_config.get_sky_user_config_file()
+        except ImportError:
+            return {}
 
         # OCI config and API key file are mandatory
         credential_files = [oci_cfg_file, api_key_file]
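The change above makes OCI credential discovery tolerate a missing optional SDK by returning no mounts instead of raising. A minimal sketch of that guard pattern (module and path names here are illustrative, not SkyPilot's):

from typing import Dict


def credential_file_mounts() -> Dict[str, str]:
    try:
        import oci  # optional dependency; may be absent  # noqa: F401
        cfg_path = '~/.oci/config'  # illustrative path
    except ImportError:
        # Without the SDK there is nothing meaningful to mount.
        return {}
    return {cfg_path: cfg_path}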
sky/clouds/service_catalog/kubernetes_catalog.py
CHANGED
@@ -68,26 +68,35 @@ def list_accelerators_realtime(
     # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
     # function from kubernetes_utils.
     del all_regions, require_price  # Unused.
+    # TODO(zhwu): this should return all accelerators in multiple kubernetes
+    # clusters defined by allowed_contexts.
+    if region_filter is None:
+        context = kubernetes_utils.get_current_kube_config_context_name()
+    else:
+        context = region_filter
+    if context is None:
+        return {}, {}, {}
+
     k8s_cloud = Kubernetes()
     if not any(
             map(k8s_cloud.is_same_cloud,
                 sky_check.get_cached_enabled_clouds_or_refresh())
-    ) or not kubernetes_utils.check_credentials()[0]:
+    ) or not kubernetes_utils.check_credentials(context)[0]:
         return {}, {}, {}
 
-    has_gpu = kubernetes_utils.detect_gpu_resource()
+    has_gpu = kubernetes_utils.detect_gpu_resource(context)
     if not has_gpu:
         return {}, {}, {}
 
-    label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter()
+    label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter(context)
     if not label_formatter:
         return {}, {}, {}
 
     accelerators_qtys: Set[Tuple[str, int]] = set()
     key = label_formatter.get_label_key()
-    nodes = kubernetes_utils.get_kubernetes_nodes()
+    nodes = kubernetes_utils.get_kubernetes_nodes(context)
     # Get the pods to get the real-time GPU usage
-    pods = kubernetes_utils.
+    pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
     # Total number of GPUs in the cluster
     total_accelerators_capacity: Dict[str, int] = {}
     # Total number of GPUs currently available in the cluster

@@ -160,7 +169,7 @@ def list_accelerators_realtime(
                     memory=None,
                     price=0.0,
                     spot_price=0.0,
-                    region=
+                    region=context))
 
     df = pd.DataFrame(result,
                       columns=[

@@ -175,7 +184,6 @@ def list_accelerators_realtime(
     qtys_map = common.list_accelerators_impl('Kubernetes', df, gpus_only,
                                              name_filter, region_filter,
                                              quantity_filter, case_sensitive)
-
     return qtys_map, total_accelerators_capacity, total_accelerators_available
 
 
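The realtime GPU catalog now resolves region_filter to a kubeconfig context and inspects only that cluster. The snippet below is a sketch, not SkyPilot's implementation, of reading per-node GPU capacity for one context with the kubernetes Python client; the 'nvidia.com/gpu' resource name assumes the NVIDIA device plugin is installed, and the context name is hypothetical.

from typing import Dict

from kubernetes import client, config


def gpu_capacity_by_node(context: str) -> Dict[str, int]:
    # Load credentials for the requested kubeconfig context only.
    config.load_kube_config(context=context)
    v1 = client.CoreV1Api()
    capacity: Dict[str, int] = {}
    for node in v1.list_node().items:
        allocatable = node.status.allocatable or {}
        gpus = int(allocatable.get('nvidia.com/gpu', 0))
        if gpus:
            capacity[node.metadata.name] = gpus
    return capacity


print(gpu_capacity_by_node('my-eks-context'))  # hypothetical context name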