skypilot-nightly 1.0.0.dev20240926__py3-none-any.whl → 1.0.0.dev20240928__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +10 -8
- sky/authentication.py +10 -6
- sky/backends/backend_utils.py +1 -0
- sky/backends/cloud_vm_ray_backend.py +14 -1
- sky/cli.py +129 -19
- sky/clouds/kubernetes.py +161 -45
- sky/clouds/oci.py +11 -8
- sky/clouds/service_catalog/kubernetes_catalog.py +15 -7
- sky/provision/kubernetes/network.py +34 -14
- sky/provision/kubernetes/network_utils.py +7 -5
- sky/provision/kubernetes/utils.py +125 -59
- sky/provision/provisioner.py +2 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/utils/command_runner.py +4 -0
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/log_utils.py +88 -10
- sky/utils/schemas.py +6 -0
- {skypilot_nightly-1.0.0.dev20240926.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/METADATA +16 -15
- {skypilot_nightly-1.0.0.dev20240926.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/RECORD +24 -23
- {skypilot_nightly-1.0.0.dev20240926.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20240926.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20240926.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20240926.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = 'dacf27348ae1446c3c93d0ee2fc57702c5366eac'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20240928'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/adaptors/kubernetes.py
CHANGED
@@ -75,15 +75,17 @@ def _load_config(context: Optional[str] = None):
|
|
75
75
|
suffix += f' Error: {str(e)}'
|
76
76
|
# Check if exception was due to no current-context
|
77
77
|
if 'Expected key current-context' in str(e):
|
78
|
-
err_str = (
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
78
|
+
err_str = (
|
79
|
+
f'Failed to load Kubernetes configuration for {context!r}. '
|
80
|
+
'Kubeconfig does not contain any valid context(s).'
|
81
|
+
f'{suffix}\n'
|
82
|
+
' If you were running a local Kubernetes '
|
83
|
+
'cluster, run `sky local up` to start the cluster.')
|
83
84
|
else:
|
84
|
-
err_str = (
|
85
|
-
|
86
|
-
|
85
|
+
err_str = (
|
86
|
+
f'Failed to load Kubernetes configuration for {context!r}. '
|
87
|
+
'Please check if your kubeconfig file exists at '
|
88
|
+
f'~/.kube/config and is valid.{suffix}')
|
87
89
|
err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
|
88
90
|
with ux_utils.print_exception_no_traceback():
|
89
91
|
raise ValueError(err_str) from None
|
sky/authentication.py
CHANGED
@@ -378,11 +378,11 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
|
378
378
|
public_key_path = os.path.expanduser(PUBLIC_SSH_KEY_PATH)
|
379
379
|
secret_name = clouds.Kubernetes.SKY_SSH_KEY_SECRET_NAME
|
380
380
|
secret_field_name = clouds.Kubernetes().ssh_key_secret_field_name
|
381
|
-
namespace = config['provider'].get(
|
382
|
-
'namespace',
|
383
|
-
kubernetes_utils.get_current_kube_config_context_namespace())
|
384
381
|
context = config['provider'].get(
|
385
382
|
'context', kubernetes_utils.get_current_kube_config_context_name())
|
383
|
+
namespace = config['provider'].get(
|
384
|
+
'namespace',
|
385
|
+
kubernetes_utils.get_kube_config_context_namespace(context))
|
386
386
|
k8s = kubernetes.kubernetes
|
387
387
|
with open(public_key_path, 'r', encoding='utf-8') as f:
|
388
388
|
public_key = f.read()
|
@@ -425,8 +425,8 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
|
425
425
|
ssh_jump_name,
|
426
426
|
nodeport_mode,
|
427
427
|
private_key_path=private_key_path,
|
428
|
-
|
429
|
-
|
428
|
+
context=context,
|
429
|
+
namespace=namespace)
|
430
430
|
elif network_mode == port_forward_mode:
|
431
431
|
# Using `kubectl port-forward` creates a direct tunnel to the pod and
|
432
432
|
# does not require a ssh jump pod.
|
@@ -441,7 +441,11 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
|
441
441
|
# on GKE.
|
442
442
|
ssh_target = config['cluster_name'] + '-head'
|
443
443
|
ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
|
444
|
-
ssh_target,
|
444
|
+
ssh_target,
|
445
|
+
port_forward_mode,
|
446
|
+
private_key_path=private_key_path,
|
447
|
+
context=context,
|
448
|
+
namespace=namespace)
|
445
449
|
else:
|
446
450
|
# This should never happen because we check for this in from_str above.
|
447
451
|
raise ValueError(f'Unsupported networking mode: {network_mode_str}')
|
sky/backends/backend_utils.py
CHANGED
@@ -2082,7 +2082,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
|
|
2082
2082
|
"""
|
2083
2083
|
# Bump if any fields get added/removed/changed, and add backward
|
2084
2084
|
# compatibility logic in __setstate__.
|
2085
|
-
_VERSION =
|
2085
|
+
_VERSION = 9
|
2086
2086
|
|
2087
2087
|
def __init__(
|
2088
2088
|
self,
|
@@ -2516,6 +2516,19 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
|
|
2516
2516
|
if version < 8:
|
2517
2517
|
self.cached_cluster_info = None
|
2518
2518
|
|
2519
|
+
if version < 9:
|
2520
|
+
# For backward compatibility, we should update the region of a
|
2521
|
+
# SkyPilot cluster on Kubernetes to the actual context it is using.
|
2522
|
+
# pylint: disable=import-outside-toplevel
|
2523
|
+
launched_resources = state['launched_resources']
|
2524
|
+
if isinstance(launched_resources.cloud, clouds.Kubernetes):
|
2525
|
+
yaml_config = common_utils.read_yaml(
|
2526
|
+
os.path.expanduser(state['_cluster_yaml']))
|
2527
|
+
context = kubernetes_utils.get_context_from_config(
|
2528
|
+
yaml_config['provider'])
|
2529
|
+
state['launched_resources'] = launched_resources.copy(
|
2530
|
+
region=context)
|
2531
|
+
|
2519
2532
|
self.__dict__.update(state)
|
2520
2533
|
|
2521
2534
|
# Because the update_cluster_ips and update_ssh_ports
|
sky/cli.py
CHANGED
@@ -3026,14 +3026,11 @@ def show_gpus(
|
|
3026
3026
|
kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
|
3027
3027
|
sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())
|
3028
3028
|
|
3029
|
-
if cloud_is_kubernetes and region is not None:
|
3030
|
-
raise click.UsageError(
|
3031
|
-
'The --region flag cannot be set with --cloud kubernetes.')
|
3032
|
-
|
3033
3029
|
def _list_to_str(lst):
|
3034
3030
|
return ', '.join([str(e) for e in lst])
|
3035
3031
|
|
3036
3032
|
def _get_kubernetes_realtime_gpu_table(
|
3033
|
+
context: Optional[str] = None,
|
3037
3034
|
name_filter: Optional[str] = None,
|
3038
3035
|
quantity_filter: Optional[int] = None):
|
3039
3036
|
if quantity_filter:
|
@@ -3048,7 +3045,7 @@ def show_gpus(
|
|
3048
3045
|
gpus_only=True,
|
3049
3046
|
clouds='kubernetes',
|
3050
3047
|
name_filter=name_filter,
|
3051
|
-
region_filter=
|
3048
|
+
region_filter=context,
|
3052
3049
|
quantity_filter=quantity_filter,
|
3053
3050
|
case_sensitive=False)
|
3054
3051
|
assert (set(counts.keys()) == set(capacity.keys()) == set(
|
@@ -3078,11 +3075,11 @@ def show_gpus(
|
|
3078
3075
|
])
|
3079
3076
|
return realtime_gpu_table
|
3080
3077
|
|
3081
|
-
def _get_kubernetes_node_info_table():
|
3078
|
+
def _get_kubernetes_node_info_table(context: Optional[str]):
|
3082
3079
|
node_table = log_utils.create_table(
|
3083
3080
|
['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])
|
3084
3081
|
|
3085
|
-
node_info_dict = kubernetes_utils.get_kubernetes_node_info()
|
3082
|
+
node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
|
3086
3083
|
for node_name, node_info in node_info_dict.items():
|
3087
3084
|
node_table.add_row([
|
3088
3085
|
node_name, node_info.gpu_type,
|
@@ -3116,11 +3113,13 @@ def show_gpus(
|
|
3116
3113
|
print_section_titles = False
|
3117
3114
|
# If cloud is kubernetes, we want to show real-time capacity
|
3118
3115
|
if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes):
|
3116
|
+
context = region
|
3119
3117
|
try:
|
3120
3118
|
# If --cloud kubernetes is not specified, we want to catch
|
3121
3119
|
# the case where no GPUs are available on the cluster and
|
3122
3120
|
# print the warning at the end.
|
3123
|
-
k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
|
3121
|
+
k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
|
3122
|
+
context)
|
3124
3123
|
except ValueError as e:
|
3125
3124
|
if not cloud_is_kubernetes:
|
3126
3125
|
# Make it a note if cloud is not kubernetes
|
@@ -3129,9 +3128,10 @@ def show_gpus(
|
|
3129
3128
|
else:
|
3130
3129
|
print_section_titles = True
|
3131
3130
|
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3132
|
-
f'Kubernetes GPUs{
|
3131
|
+
f'Kubernetes GPUs (Context: {context})'
|
3132
|
+
f'{colorama.Style.RESET_ALL}\n')
|
3133
3133
|
yield from k8s_realtime_table.get_string()
|
3134
|
-
k8s_node_table = _get_kubernetes_node_info_table()
|
3134
|
+
k8s_node_table = _get_kubernetes_node_info_table(context)
|
3135
3135
|
yield '\n\n'
|
3136
3136
|
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3137
3137
|
f'Kubernetes per node GPU availability'
|
@@ -5072,15 +5072,7 @@ def local():
|
|
5072
5072
|
pass
|
5073
5073
|
|
5074
5074
|
|
5075
|
-
|
5076
|
-
default=True,
|
5077
|
-
is_flag=True,
|
5078
|
-
help='Launch cluster without GPU support even '
|
5079
|
-
'if GPUs are detected on the host.')
|
5080
|
-
@local.command('up', cls=_DocumentedCodeCommand)
|
5081
|
-
@usage_lib.entrypoint
|
5082
|
-
def local_up(gpus: bool):
|
5083
|
-
"""Creates a local cluster."""
|
5075
|
+
def _deploy_local_cluster(gpus: bool):
|
5084
5076
|
cluster_created = False
|
5085
5077
|
|
5086
5078
|
# Check if GPUs are available on the host
|
@@ -5206,6 +5198,124 @@ def local_up(gpus: bool):
|
|
5206
5198
|
f'{gpu_hint}')
|
5207
5199
|
|
5208
5200
|
|
5201
|
+
def _deploy_remote_cluster(ip_file: str, ssh_user: str, ssh_key_path: str,
|
5202
|
+
cleanup: bool):
|
5203
|
+
success = False
|
5204
|
+
path_to_package = os.path.dirname(os.path.dirname(__file__))
|
5205
|
+
up_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
|
5206
|
+
'deploy_remote_cluster.sh')
|
5207
|
+
# Get directory of script and run it from there
|
5208
|
+
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
5209
|
+
|
5210
|
+
deploy_command = f'{up_script_path} {ip_file} {ssh_user} {ssh_key_path}'
|
5211
|
+
if cleanup:
|
5212
|
+
deploy_command += ' --cleanup'
|
5213
|
+
|
5214
|
+
# Convert the command to a format suitable for subprocess
|
5215
|
+
deploy_command = shlex.split(deploy_command)
|
5216
|
+
|
5217
|
+
# Setup logging paths
|
5218
|
+
run_timestamp = backend_utils.get_run_timestamp()
|
5219
|
+
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
5220
|
+
'local_up.log')
|
5221
|
+
tail_cmd = 'tail -n100 -f ' + log_path
|
5222
|
+
|
5223
|
+
# Check if ~/.kube/config exists:
|
5224
|
+
if os.path.exists(os.path.expanduser('~/.kube/config')):
|
5225
|
+
click.echo('Found existing kube config. '
|
5226
|
+
'It will be backed up to ~/.kube/config.bak.')
|
5227
|
+
style = colorama.Style
|
5228
|
+
click.echo('To view detailed progress: '
|
5229
|
+
f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
|
5230
|
+
if cleanup:
|
5231
|
+
msg_str = 'Cleaning up remote cluster...'
|
5232
|
+
else:
|
5233
|
+
msg_str = 'Deploying remote cluster...'
|
5234
|
+
with rich_utils.safe_status(f'[bold cyan]{msg_str}'):
|
5235
|
+
returncode, _, stderr = log_lib.run_with_log(
|
5236
|
+
cmd=deploy_command,
|
5237
|
+
log_path=log_path,
|
5238
|
+
require_outputs=True,
|
5239
|
+
stream_logs=False,
|
5240
|
+
line_processor=log_utils.SkyRemoteUpLineProcessor(),
|
5241
|
+
cwd=cwd)
|
5242
|
+
if returncode == 0:
|
5243
|
+
success = True
|
5244
|
+
else:
|
5245
|
+
with ux_utils.print_exception_no_traceback():
|
5246
|
+
raise RuntimeError(
|
5247
|
+
'Failed to deploy remote cluster. '
|
5248
|
+
f'Full log: {log_path}'
|
5249
|
+
f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
|
5250
|
+
|
5251
|
+
if success:
|
5252
|
+
if cleanup:
|
5253
|
+
click.echo(f'{colorama.Fore.GREEN}'
|
5254
|
+
'🎉 Remote cluster cleaned up successfully.'
|
5255
|
+
f'{style.RESET_ALL}')
|
5256
|
+
else:
|
5257
|
+
click.echo('Cluster deployment done. You can now run tasks on '
|
5258
|
+
'this cluster.\nE.g., run a task with: '
|
5259
|
+
'sky launch --cloud kubernetes -- echo hello world.'
|
5260
|
+
f'\n{colorama.Fore.GREEN}🎉 Remote cluster deployed '
|
5261
|
+
f'successfully. {style.RESET_ALL}')
|
5262
|
+
|
5263
|
+
|
5264
|
+
@click.option('--gpus/--no-gpus',
|
5265
|
+
default=True,
|
5266
|
+
is_flag=True,
|
5267
|
+
help='Launch cluster without GPU support even '
|
5268
|
+
'if GPUs are detected on the host.')
|
5269
|
+
@click.option(
|
5270
|
+
'--ips',
|
5271
|
+
type=str,
|
5272
|
+
required=False,
|
5273
|
+
help='Path to the file containing IP addresses of remote machines.')
|
5274
|
+
@click.option('--ssh-user',
|
5275
|
+
type=str,
|
5276
|
+
required=False,
|
5277
|
+
help='SSH username for accessing remote machines.')
|
5278
|
+
@click.option('--ssh-key-path',
|
5279
|
+
type=str,
|
5280
|
+
required=False,
|
5281
|
+
help='Path to the SSH private key.')
|
5282
|
+
@click.option('--cleanup',
|
5283
|
+
is_flag=True,
|
5284
|
+
help='Clean up the remote cluster instead of deploying it.')
|
5285
|
+
@local.command('up', cls=_DocumentedCodeCommand)
|
5286
|
+
@usage_lib.entrypoint
|
5287
|
+
def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
|
5288
|
+
cleanup: bool):
|
5289
|
+
"""Creates a local or remote cluster."""
|
5290
|
+
|
5291
|
+
def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
|
5292
|
+
# If any of --ips, --ssh-user, or --ssh-key-path is specified,
|
5293
|
+
# all must be specified
|
5294
|
+
if bool(ips) or bool(ssh_user) or bool(ssh_key_path):
|
5295
|
+
if not (ips and ssh_user and ssh_key_path):
|
5296
|
+
raise click.BadParameter(
|
5297
|
+
'All --ips, --ssh-user, and --ssh-key-path '
|
5298
|
+
'must be specified together.')
|
5299
|
+
|
5300
|
+
# --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
|
5301
|
+
# are all provided
|
5302
|
+
if cleanup and not (ips and ssh_user and ssh_key_path):
|
5303
|
+
raise click.BadParameter('--cleanup can only be used with '
|
5304
|
+
'--ips, --ssh-user and --ssh-key-path.')
|
5305
|
+
|
5306
|
+
_validate_args(ips, ssh_user, ssh_key_path, cleanup)
|
5307
|
+
|
5308
|
+
# If remote deployment arguments are specified, run remote up script
|
5309
|
+
if ips and ssh_user and ssh_key_path:
|
5310
|
+
# Convert ips and ssh_key_path to absolute paths
|
5311
|
+
ips = os.path.abspath(ips)
|
5312
|
+
ssh_key_path = os.path.abspath(ssh_key_path)
|
5313
|
+
_deploy_remote_cluster(ips, ssh_user, ssh_key_path, cleanup)
|
5314
|
+
else:
|
5315
|
+
# Run local deployment (kind) if no remote args are specified
|
5316
|
+
_deploy_local_cluster(gpus)
|
5317
|
+
|
5318
|
+
|
5209
5319
|
@local.command('down', cls=_DocumentedCodeCommand)
|
5210
5320
|
@usage_lib.entrypoint
|
5211
5321
|
def local_down():
|
sky/clouds/kubernetes.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
"""Kubernetes."""
|
2
|
+
import functools
|
2
3
|
import json
|
3
4
|
import os
|
4
5
|
import re
|
@@ -52,8 +53,7 @@ class Kubernetes(clouds.Cloud):
|
|
52
53
|
_DEFAULT_MEMORY_CPU_RATIO = 1
|
53
54
|
_DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4 # Allocate more memory for GPU tasks
|
54
55
|
_REPR = 'Kubernetes'
|
55
|
-
|
56
|
-
_regions: List[clouds.Region] = [clouds.Region(_SINGLETON_REGION)]
|
56
|
+
_LEGACY_SINGLETON_REGION = 'kubernetes'
|
57
57
|
_CLOUD_UNSUPPORTED_FEATURES = {
|
58
58
|
# TODO(romilb): Stopping might be possible to implement with
|
59
59
|
# container checkpointing introduced in Kubernetes v1.25. See:
|
@@ -88,8 +88,12 @@ class Kubernetes(clouds.Cloud):
|
|
88
88
|
cls, resources: 'resources_lib.Resources'
|
89
89
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
90
90
|
unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES.copy()
|
91
|
+
context = resources.region
|
92
|
+
if context is None:
|
93
|
+
context = kubernetes_utils.get_current_kube_config_context_name()
|
91
94
|
# Features to be disabled for exec auth
|
92
|
-
is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth(
|
95
|
+
is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth(
|
96
|
+
context)
|
93
97
|
if is_exec_auth:
|
94
98
|
assert isinstance(message, str), message
|
95
99
|
# Controllers cannot spin up new pods with exec auth.
|
@@ -99,7 +103,7 @@ class Kubernetes(clouds.Cloud):
|
|
99
103
|
unsupported_features[
|
100
104
|
clouds.CloudImplementationFeatures.AUTO_TERMINATE] = message
|
101
105
|
# Allow spot instances if supported by the cluster
|
102
|
-
spot_label_key, _ = kubernetes_utils.get_spot_label()
|
106
|
+
spot_label_key, _ = kubernetes_utils.get_spot_label(context)
|
103
107
|
if spot_label_key is not None:
|
104
108
|
unsupported_features.pop(
|
105
109
|
clouds.CloudImplementationFeatures.SPOT_INSTANCE, None)
|
@@ -110,16 +114,87 @@ class Kubernetes(clouds.Cloud):
|
|
110
114
|
return cls._MAX_CLUSTER_NAME_LEN_LIMIT
|
111
115
|
|
112
116
|
@classmethod
|
113
|
-
|
114
|
-
|
117
|
+
@functools.lru_cache(maxsize=1)
|
118
|
+
def _log_skipped_contexts_once(cls, skipped_contexts: Tuple[str,
|
119
|
+
...]) -> None:
|
120
|
+
"""Log skipped contexts only once.
|
121
|
+
|
122
|
+
We don't directly cache the result of _existing_allowed_contexts
|
123
|
+
as the admin policy may update the allowed contexts.
|
124
|
+
"""
|
125
|
+
if skipped_contexts:
|
126
|
+
logger.warning(
|
127
|
+
f'Kubernetes contexts {set(skipped_contexts)!r} specified in '
|
128
|
+
'"allowed_contexts" not found in kubeconfig. '
|
129
|
+
'Ignoring these contexts.')
|
130
|
+
|
131
|
+
@classmethod
|
132
|
+
def _existing_allowed_contexts(cls) -> List[str]:
|
133
|
+
"""Get existing allowed contexts."""
|
134
|
+
all_contexts = kubernetes_utils.get_all_kube_config_context_names()
|
135
|
+
if all_contexts is None:
|
136
|
+
return []
|
137
|
+
all_contexts = set(all_contexts)
|
138
|
+
|
139
|
+
allowed_contexts = skypilot_config.get_nested(
|
140
|
+
('kubernetes', 'allowed_contexts'), None)
|
141
|
+
|
142
|
+
if allowed_contexts is None:
|
143
|
+
current_context = (
|
144
|
+
kubernetes_utils.get_current_kube_config_context_name())
|
145
|
+
allowed_contexts = []
|
146
|
+
if current_context is not None:
|
147
|
+
allowed_contexts = [current_context]
|
148
|
+
|
149
|
+
existing_contexts = []
|
150
|
+
skipped_contexts = []
|
151
|
+
for context in allowed_contexts:
|
152
|
+
if context in all_contexts:
|
153
|
+
existing_contexts.append(context)
|
154
|
+
else:
|
155
|
+
skipped_contexts.append(context)
|
156
|
+
cls._log_skipped_contexts_once(tuple(skipped_contexts))
|
157
|
+
return existing_contexts
|
115
158
|
|
116
159
|
@classmethod
|
117
160
|
def regions_with_offering(cls, instance_type: Optional[str],
|
118
161
|
accelerators: Optional[Dict[str, int]],
|
119
162
|
use_spot: bool, region: Optional[str],
|
120
163
|
zone: Optional[str]) -> List[clouds.Region]:
|
121
|
-
|
122
|
-
|
164
|
+
del accelerators, zone, use_spot # unused
|
165
|
+
existing_contexts = cls._existing_allowed_contexts()
|
166
|
+
|
167
|
+
regions = [clouds.Region(context) for context in existing_contexts]
|
168
|
+
|
169
|
+
if region is not None:
|
170
|
+
regions = [r for r in regions if r.name == region]
|
171
|
+
|
172
|
+
# Check if requested instance type will fit in the cluster.
|
173
|
+
# TODO(zhwu,romilb): autoscaler type needs to be regional (per
|
174
|
+
# kubernetes cluster/context).
|
175
|
+
regions_to_return = []
|
176
|
+
autoscaler_type = kubernetes_utils.get_autoscaler_type()
|
177
|
+
if autoscaler_type is None and instance_type is not None:
|
178
|
+
# If autoscaler is not set, check if the instance type fits in the
|
179
|
+
# cluster. Else, rely on the autoscaler to provision the right
|
180
|
+
# instance type without running checks. Worst case, if autoscaling
|
181
|
+
# fails, the pod will be stuck in pending state until
|
182
|
+
# provision_timeout, after which failover will be triggered.
|
183
|
+
for r in regions:
|
184
|
+
context = r.name
|
185
|
+
fits, reason = kubernetes_utils.check_instance_fits(
|
186
|
+
context, instance_type)
|
187
|
+
if fits:
|
188
|
+
regions_to_return.append(r)
|
189
|
+
else:
|
190
|
+
logger.debug(
|
191
|
+
f'Instance type {instance_type} does '
|
192
|
+
'not fit in the Kubernetes cluster with context: '
|
193
|
+
f'{context}. Reason: {reason}')
|
194
|
+
else:
|
195
|
+
regions_to_return = regions
|
196
|
+
|
197
|
+
return regions_to_return
|
123
198
|
|
124
199
|
def instance_type_to_hourly_cost(self,
|
125
200
|
instance_type: str,
|
@@ -201,9 +276,9 @@ class Kubernetes(clouds.Cloud):
|
|
201
276
|
accelerators: Optional[Dict[str, int]] = None,
|
202
277
|
use_spot: bool = False,
|
203
278
|
) -> Iterator[Optional[List[clouds.Zone]]]:
|
204
|
-
|
205
|
-
|
206
|
-
|
279
|
+
# Always yield None for zones, since Kubernetes does not have zones, and
|
280
|
+
# we should allow any region to get to this point.
|
281
|
+
yield None
|
207
282
|
|
208
283
|
@classmethod
|
209
284
|
def get_zone_shell_cmd(cls) -> Optional[str]:
|
@@ -225,7 +300,10 @@ class Kubernetes(clouds.Cloud):
|
|
225
300
|
dryrun: bool = False) -> Dict[str, Optional[str]]:
|
226
301
|
del cluster_name, zones, dryrun # Unused.
|
227
302
|
if region is None:
|
228
|
-
|
303
|
+
context = kubernetes_utils.get_current_kube_config_context_name()
|
304
|
+
else:
|
305
|
+
context = region.name
|
306
|
+
assert context is not None, 'No context found in kubeconfig'
|
229
307
|
|
230
308
|
r = resources
|
231
309
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
@@ -244,9 +322,14 @@ class Kubernetes(clouds.Cloud):
|
|
244
322
|
acc_count = k.accelerator_count if k.accelerator_count else 0
|
245
323
|
acc_type = k.accelerator_type if k.accelerator_type else None
|
246
324
|
|
247
|
-
|
325
|
+
image_id_dict = resources.image_id
|
326
|
+
if image_id_dict is not None:
|
248
327
|
# Use custom image specified in resources
|
249
|
-
|
328
|
+
if None in image_id_dict:
|
329
|
+
image_id = image_id_dict[None]
|
330
|
+
else:
|
331
|
+
assert resources.region in image_id_dict, image_id_dict
|
332
|
+
image_id = image_id_dict[resources.region]
|
250
333
|
if image_id.startswith('docker:'):
|
251
334
|
image_id = image_id[len('docker:'):]
|
252
335
|
else:
|
@@ -265,7 +348,7 @@ class Kubernetes(clouds.Cloud):
|
|
265
348
|
# If GPUs are requested, set node label to match the GPU type.
|
266
349
|
if acc_count > 0 and acc_type is not None:
|
267
350
|
k8s_acc_label_key, k8s_acc_label_value = \
|
268
|
-
kubernetes_utils.get_gpu_label_key_value(acc_type)
|
351
|
+
kubernetes_utils.get_gpu_label_key_value(context, acc_type)
|
269
352
|
|
270
353
|
port_mode = network_utils.get_port_mode(None)
|
271
354
|
|
@@ -309,13 +392,10 @@ class Kubernetes(clouds.Cloud):
|
|
309
392
|
deploy_vars = {
|
310
393
|
'instance_type': resources.instance_type,
|
311
394
|
'custom_resources': custom_resources,
|
312
|
-
'region': region.name,
|
313
395
|
'cpus': str(cpus),
|
314
396
|
'memory': str(mem),
|
315
397
|
'accelerator_count': str(acc_count),
|
316
398
|
'timeout': str(timeout),
|
317
|
-
'k8s_namespace':
|
318
|
-
kubernetes_utils.get_current_kube_config_context_namespace(),
|
319
399
|
'k8s_port_mode': port_mode.value,
|
320
400
|
'k8s_networking_mode': network_utils.get_networking_mode().value,
|
321
401
|
'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
|
@@ -335,18 +415,30 @@ class Kubernetes(clouds.Cloud):
|
|
335
415
|
|
336
416
|
# Add kubecontext if it is set. It may be None if SkyPilot is running
|
337
417
|
# inside a pod with in-cluster auth.
|
338
|
-
|
339
|
-
|
340
|
-
|
418
|
+
if context is not None:
|
419
|
+
deploy_vars['k8s_context'] = context
|
420
|
+
|
421
|
+
namespace = kubernetes_utils.get_kube_config_context_namespace(context)
|
422
|
+
deploy_vars['k8s_namespace'] = namespace
|
341
423
|
|
342
424
|
return deploy_vars
|
343
425
|
|
344
426
|
def _get_feasible_launchable_resources(
|
345
427
|
self, resources: 'resources_lib.Resources'
|
346
428
|
) -> 'resources_utils.FeasibleResources':
|
429
|
+
# TODO(zhwu): This needs to be updated to return the correct region
|
430
|
+
# (context) that has enough resources.
|
347
431
|
fuzzy_candidate_list: List[str] = []
|
348
432
|
if resources.instance_type is not None:
|
349
433
|
assert resources.is_launchable(), resources
|
434
|
+
regions = self.regions_with_offering(
|
435
|
+
resources.instance_type,
|
436
|
+
accelerators=resources.accelerators,
|
437
|
+
use_spot=resources.use_spot,
|
438
|
+
region=resources.region,
|
439
|
+
zone=resources.zone)
|
440
|
+
if not regions:
|
441
|
+
return resources_utils.FeasibleResources([], [], None)
|
350
442
|
resources = resources.copy(accelerators=None)
|
351
443
|
return resources_utils.FeasibleResources([resources],
|
352
444
|
fuzzy_candidate_list, None)
|
@@ -391,34 +483,48 @@ class Kubernetes(clouds.Cloud):
|
|
391
483
|
kubernetes_utils.KubernetesInstanceType.from_resources(
|
392
484
|
gpu_task_cpus, gpu_task_memory, acc_count, acc_type).name)
|
393
485
|
|
394
|
-
# Check
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
chosen_instance_type)
|
404
|
-
if not fits:
|
405
|
-
logger.debug(f'Instance type {chosen_instance_type} does '
|
406
|
-
'not fit in the Kubernetes cluster. '
|
407
|
-
f'Reason: {reason}')
|
408
|
-
return resources_utils.FeasibleResources([], [], reason)
|
409
|
-
|
486
|
+
# Check the availability of the specified instance type in all contexts.
|
487
|
+
available_regions = self.regions_with_offering(
|
488
|
+
chosen_instance_type,
|
489
|
+
accelerators=None,
|
490
|
+
use_spot=resources.use_spot,
|
491
|
+
region=resources.region,
|
492
|
+
zone=resources.zone)
|
493
|
+
if not available_regions:
|
494
|
+
return resources_utils.FeasibleResources([], [], None)
|
410
495
|
# No fuzzy lists for Kubernetes
|
496
|
+
# We don't set the resources returned with regions, because the
|
497
|
+
# optimizer will further find the valid region (context) for the
|
498
|
+
# resources.
|
411
499
|
return resources_utils.FeasibleResources(_make([chosen_instance_type]),
|
412
500
|
[], None)
|
413
501
|
|
414
502
|
@classmethod
|
415
503
|
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
416
504
|
# Test using python API
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
505
|
+
existing_allowed_contexts = cls._existing_allowed_contexts()
|
506
|
+
if not existing_allowed_contexts:
|
507
|
+
if skypilot_config.loaded_config_path() is None:
|
508
|
+
check_skypilot_config_msg = ''
|
509
|
+
else:
|
510
|
+
check_skypilot_config_msg = (
|
511
|
+
' and check "allowed_contexts" in your '
|
512
|
+
f'{skypilot_config.loaded_config_path()} file.')
|
513
|
+
return (False, 'No available context found in kubeconfig. '
|
514
|
+
'Check if you have a valid kubeconfig file' +
|
515
|
+
check_skypilot_config_msg)
|
516
|
+
reasons = []
|
517
|
+
for context in existing_allowed_contexts:
|
518
|
+
try:
|
519
|
+
check_result = kubernetes_utils.check_credentials(context)
|
520
|
+
if check_result[0]:
|
521
|
+
return check_result
|
522
|
+
reasons.append(f'{context}: {check_result[1]}')
|
523
|
+
except Exception as e: # pylint: disable=broad-except
|
524
|
+
return (False, f'Credential check failed for {context}: '
|
525
|
+
f'{common_utils.format_exception(e)}')
|
526
|
+
return (False, 'Failed to find available context with working '
|
527
|
+
'credentials. Details:\n' + '\n'.join(reasons))
|
422
528
|
|
423
529
|
def get_credential_file_mounts(self) -> Dict[str, str]:
|
424
530
|
if os.path.exists(os.path.expanduser(CREDENTIAL_PATH)):
|
@@ -433,10 +539,20 @@ class Kubernetes(clouds.Cloud):
|
|
433
539
|
instance_type)
|
434
540
|
|
435
541
|
def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
|
436
|
-
if region
|
542
|
+
if region == self._LEGACY_SINGLETON_REGION:
|
543
|
+
# For backward compatibility, we allow the region to be set to the
|
544
|
+
# legacy singleton region.
|
545
|
+
# TODO: Remove this after 0.9.0.
|
546
|
+
return region, zone
|
547
|
+
|
548
|
+
all_contexts = kubernetes_utils.get_all_kube_config_context_names()
|
549
|
+
if all_contexts is None:
|
550
|
+
all_contexts = []
|
551
|
+
if region not in all_contexts:
|
437
552
|
raise ValueError(
|
438
|
-
'
|
439
|
-
'
|
553
|
+
f'Context {region} not found in kubeconfig. Kubernetes only '
|
554
|
+
'supports context names as regions. Available '
|
555
|
+
f'contexts: {all_contexts}')
|
440
556
|
if zone is not None:
|
441
557
|
raise ValueError('Kubernetes support does not support setting zone.'
|
442
558
|
' Cluster used is determined by the kubeconfig.')
|