skypilot-nightly 1.0.0.dev20250427__py3-none-any.whl → 1.0.0.dev20250429__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +28 -40
- sky/backends/backend_utils.py +19 -2
- sky/backends/cloud_vm_ray_backend.py +33 -8
- sky/backends/local_docker_backend.py +1 -2
- sky/cli.py +91 -38
- sky/client/cli.py +91 -38
- sky/client/sdk.py +3 -2
- sky/clouds/aws.py +12 -6
- sky/clouds/azure.py +3 -0
- sky/clouds/cloud.py +8 -2
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +3 -0
- sky/clouds/fluidstack.py +3 -0
- sky/clouds/gcp.py +7 -0
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +42 -19
- sky/clouds/lambda_cloud.py +1 -0
- sky/clouds/nebius.py +18 -10
- sky/clouds/oci.py +6 -3
- sky/clouds/paperspace.py +2 -0
- sky/clouds/runpod.py +2 -0
- sky/clouds/scp.py +2 -0
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
- sky/clouds/vast.py +2 -0
- sky/clouds/vsphere.py +2 -0
- sky/core.py +58 -29
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/favicon.ico +0 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/exceptions.py +6 -0
- sky/execution.py +19 -4
- sky/global_user_state.py +1 -0
- sky/optimizer.py +35 -11
- sky/provision/common.py +2 -5
- sky/provision/docker_utils.py +22 -16
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +276 -93
- sky/provision/kubernetes/network.py +1 -1
- sky/provision/kubernetes/utils.py +36 -24
- sky/provision/provisioner.py +6 -0
- sky/serve/replica_managers.py +51 -5
- sky/serve/serve_state.py +41 -0
- sky/serve/service.py +108 -63
- sky/server/common.py +6 -3
- sky/server/config.py +184 -0
- sky/server/requests/executor.py +17 -156
- sky/server/server.py +4 -4
- sky/setup_files/dependencies.py +0 -1
- sky/skylet/constants.py +7 -0
- sky/skypilot_config.py +27 -6
- sky/task.py +1 -1
- sky/templates/kubernetes-ray.yml.j2 +145 -15
- sky/templates/nebius-ray.yml.j2 +63 -0
- sky/utils/command_runner.py +17 -3
- sky/utils/command_runner.pyi +2 -0
- sky/utils/controller_utils.py +24 -0
- sky/utils/kubernetes/rsync_helper.sh +20 -4
- sky/utils/schemas.py +13 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/RECORD +73 -72
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/WHEEL +1 -1
- /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/top_level.txt +0 -0
sky/client/cli.py
CHANGED
@@ -23,6 +23,7 @@ NOTE: the order of command definitions in this file corresponds to how they are
|
|
23
23
|
listed in "sky --help". Take care to put logically connected commands close to
|
24
24
|
each other.
|
25
25
|
"""
|
26
|
+
import collections
|
26
27
|
import copy
|
27
28
|
import datetime
|
28
29
|
import functools
|
@@ -162,7 +163,7 @@ def _get_cluster_records_and_set_ssh_config(
|
|
162
163
|
'-o StrictHostKeyChecking=no '
|
163
164
|
'-o UserKnownHostsFile=/dev/null '
|
164
165
|
'-o IdentitiesOnly=yes '
|
165
|
-
'-W %h:%p '
|
166
|
+
'-W \'[%h]:%p\' '
|
166
167
|
f'{handle.ssh_user}@127.0.0.1 '
|
167
168
|
'-o ProxyCommand='
|
168
169
|
# TODO(zhwu): write the template to a temp file, don't use
|
@@ -3413,7 +3414,7 @@ def show_gpus(
|
|
3413
3414
|
|
3414
3415
|
# TODO(zhwu,romilb): We should move most of these kubernetes related
|
3415
3416
|
# queries into the backend, especially behind the server.
|
3416
|
-
def
|
3417
|
+
def _get_kubernetes_realtime_gpu_tables(
|
3417
3418
|
context: Optional[str] = None,
|
3418
3419
|
name_filter: Optional[str] = None,
|
3419
3420
|
quantity_filter: Optional[int] = None):
|
@@ -3423,15 +3424,14 @@ def show_gpus(
|
|
3423
3424
|
else:
|
3424
3425
|
qty_header = 'REQUESTABLE_QTY_PER_NODE'
|
3425
3426
|
free_header = 'TOTAL_FREE_GPUS'
|
3426
|
-
|
3427
|
-
|
3428
|
-
realtime_gpu_availability_list = sdk.stream_and_get(
|
3427
|
+
|
3428
|
+
realtime_gpu_availability_lists = sdk.stream_and_get(
|
3429
3429
|
sdk.realtime_kubernetes_gpu_availability(
|
3430
3430
|
context=context,
|
3431
3431
|
name_filter=name_filter,
|
3432
3432
|
quantity_filter=quantity_filter))
|
3433
|
-
if not
|
3434
|
-
err_msg = 'No GPUs found in Kubernetes cluster. '
|
3433
|
+
if not realtime_gpu_availability_lists:
|
3434
|
+
err_msg = 'No GPUs found in any allowed Kubernetes cluster. '
|
3435
3435
|
debug_msg = 'To further debug, run: sky check '
|
3436
3436
|
if name_filter is not None:
|
3437
3437
|
gpu_info_msg = f' {name_filter!r}'
|
@@ -3439,26 +3439,52 @@ def show_gpus(
|
|
3439
3439
|
gpu_info_msg += (' with requested quantity'
|
3440
3440
|
f' {quantity_filter}')
|
3441
3441
|
err_msg = (f'Resources{gpu_info_msg} not found '
|
3442
|
-
'in Kubernetes cluster. ')
|
3442
|
+
'in any allowed Kubernetes cluster. ')
|
3443
3443
|
debug_msg = ('To show available accelerators on kubernetes,'
|
3444
3444
|
' run: sky show-gpus --cloud kubernetes ')
|
3445
3445
|
full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
|
3446
3446
|
debug_msg)
|
3447
3447
|
raise ValueError(full_err_msg)
|
3448
3448
|
no_permissions_str = '<no permissions>'
|
3449
|
-
|
3450
|
-
|
3451
|
-
|
3452
|
-
|
3453
|
-
|
3454
|
-
|
3455
|
-
|
3456
|
-
|
3457
|
-
|
3458
|
-
|
3459
|
-
available_qty
|
3460
|
-
|
3461
|
-
|
3449
|
+
realtime_gpu_infos = []
|
3450
|
+
total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
|
3451
|
+
lambda: [0, 0])
|
3452
|
+
|
3453
|
+
for (ctx, availability_list) in realtime_gpu_availability_lists:
|
3454
|
+
realtime_gpu_table = log_utils.create_table(
|
3455
|
+
['GPU', qty_header, 'TOTAL_GPUS', free_header])
|
3456
|
+
for realtime_gpu_availability in sorted(availability_list):
|
3457
|
+
gpu_availability = models.RealtimeGpuAvailability(
|
3458
|
+
*realtime_gpu_availability)
|
3459
|
+
available_qty = (gpu_availability.available
|
3460
|
+
if gpu_availability.available != -1 else
|
3461
|
+
no_permissions_str)
|
3462
|
+
realtime_gpu_table.add_row([
|
3463
|
+
gpu_availability.gpu,
|
3464
|
+
_list_to_str(gpu_availability.counts),
|
3465
|
+
gpu_availability.capacity,
|
3466
|
+
available_qty,
|
3467
|
+
])
|
3468
|
+
gpu = gpu_availability.gpu
|
3469
|
+
capacity = gpu_availability.capacity
|
3470
|
+
# we want total, so skip permission denied.
|
3471
|
+
available = max(gpu_availability.available, 0)
|
3472
|
+
if capacity > 0:
|
3473
|
+
total_gpu_info[gpu][0] += capacity
|
3474
|
+
total_gpu_info[gpu][1] += available
|
3475
|
+
realtime_gpu_infos.append((ctx, realtime_gpu_table))
|
3476
|
+
|
3477
|
+
# display an aggregated table for all contexts
|
3478
|
+
# if there are more than one contexts with GPUs
|
3479
|
+
if len(realtime_gpu_infos) > 1:
|
3480
|
+
total_realtime_gpu_table = log_utils.create_table(
|
3481
|
+
['GPU', 'TOTAL_GPUS', free_header])
|
3482
|
+
for gpu, stats in total_gpu_info.items():
|
3483
|
+
total_realtime_gpu_table.add_row([gpu, stats[0], stats[1]])
|
3484
|
+
else:
|
3485
|
+
total_realtime_gpu_table = None
|
3486
|
+
|
3487
|
+
return realtime_gpu_infos, total_realtime_gpu_table
|
3462
3488
|
|
3463
3489
|
def _format_kubernetes_node_info(context: Optional[str]):
|
3464
3490
|
node_table = log_utils.create_table(
|
@@ -3479,7 +3505,7 @@ def show_gpus(
|
|
3479
3505
|
'Kubernetes per node accelerator availability ')
|
3480
3506
|
if nodes_info.hint:
|
3481
3507
|
k8s_per_node_acc_message += nodes_info.hint
|
3482
|
-
return (f'{colorama.Fore.
|
3508
|
+
return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
|
3483
3509
|
f'{k8s_per_node_acc_message}'
|
3484
3510
|
f'{colorama.Style.RESET_ALL}\n'
|
3485
3511
|
f'{node_table.get_string()}')
|
@@ -3516,8 +3542,7 @@ def show_gpus(
|
|
3516
3542
|
# If --cloud kubernetes is not specified, we want to catch
|
3517
3543
|
# the case where no GPUs are available on the cluster and
|
3518
3544
|
# print the warning at the end.
|
3519
|
-
|
3520
|
-
context)
|
3545
|
+
k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(context) # pylint: disable=line-too-long
|
3521
3546
|
except ValueError as e:
|
3522
3547
|
if not cloud_is_kubernetes:
|
3523
3548
|
# Make it a note if cloud is not kubernetes
|
@@ -3525,13 +3550,24 @@ def show_gpus(
|
|
3525
3550
|
k8s_messages += str(e)
|
3526
3551
|
else:
|
3527
3552
|
print_section_titles = True
|
3528
|
-
|
3529
|
-
|
3530
|
-
|
3531
|
-
|
3532
|
-
|
3533
|
-
|
3534
|
-
|
3553
|
+
|
3554
|
+
# print total table
|
3555
|
+
if total_table is not None:
|
3556
|
+
yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
|
3557
|
+
'Total Kubernetes GPUs'
|
3558
|
+
f'{colorama.Style.RESET_ALL}\n')
|
3559
|
+
yield from total_table.get_string()
|
3560
|
+
yield '\n-----\n\n'
|
3561
|
+
|
3562
|
+
# print individual infos.
|
3563
|
+
for (ctx, k8s_realtime_table) in k8s_realtime_infos:
|
3564
|
+
context_str = f'(Context: {ctx})' if ctx else ''
|
3565
|
+
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3566
|
+
f'Kubernetes GPUs {context_str}'
|
3567
|
+
f'{colorama.Style.RESET_ALL}\n')
|
3568
|
+
yield from k8s_realtime_table.get_string()
|
3569
|
+
yield '\n\n'
|
3570
|
+
yield _format_kubernetes_node_info(ctx) + '\n-----\n\n'
|
3535
3571
|
if kubernetes_autoscaling:
|
3536
3572
|
k8s_messages += (
|
3537
3573
|
'\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
|
@@ -3620,13 +3656,29 @@ def show_gpus(
|
|
3620
3656
|
# Print section title if not showing all and instead a specific
|
3621
3657
|
# accelerator is requested
|
3622
3658
|
print_section_titles = True
|
3623
|
-
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3624
|
-
f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
|
3625
3659
|
# TODO(romilb): Show filtered per node GPU availability here as well
|
3626
3660
|
try:
|
3627
|
-
|
3628
|
-
|
3629
|
-
|
3661
|
+
k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables( # pylint: disable=line-too-long
|
3662
|
+
context=region,
|
3663
|
+
name_filter=name,
|
3664
|
+
quantity_filter=quantity)
|
3665
|
+
|
3666
|
+
# print total table
|
3667
|
+
if total_table is not None:
|
3668
|
+
yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
|
3669
|
+
'Total Kubernetes GPUs'
|
3670
|
+
f'{colorama.Style.RESET_ALL}\n')
|
3671
|
+
yield from total_table.get_string()
|
3672
|
+
yield '\n-----\n\n'
|
3673
|
+
|
3674
|
+
# print individual tables
|
3675
|
+
for (ctx, k8s_realtime_table) in k8s_realtime_infos:
|
3676
|
+
context_str = f'(Context: {ctx})' if ctx else ''
|
3677
|
+
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3678
|
+
f'Kubernetes GPUs {context_str}'
|
3679
|
+
f'{colorama.Style.RESET_ALL}\n')
|
3680
|
+
yield from k8s_realtime_table.get_string()
|
3681
|
+
yield '\n\n'
|
3630
3682
|
except ValueError as e:
|
3631
3683
|
# In the case of a specific accelerator, show the error message
|
3632
3684
|
# immediately (e.g., "Resources H100 not found ...")
|
@@ -5911,11 +5963,12 @@ def api_info():
|
|
5911
5963
|
user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
|
5912
5964
|
user_hash = common_utils.get_user_hash()
|
5913
5965
|
dashboard_url = server_common.get_dashboard_url(url)
|
5914
|
-
click.echo(f'Using SkyPilot API server: {url}
|
5966
|
+
click.echo(f'Using SkyPilot API server: {url}\n'
|
5915
5967
|
f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
|
5916
5968
|
f'commit: {api_server_info["commit"]}, '
|
5917
5969
|
f'version: {api_server_info["version"]}\n'
|
5918
|
-
f'{ux_utils.
|
5970
|
+
f'{ux_utils.INDENT_SYMBOL}User: {user_name} ({user_hash})\n'
|
5971
|
+
f'{ux_utils.INDENT_LAST_SYMBOL}Dashboard: {dashboard_url}')
|
5919
5972
|
|
5920
5973
|
|
5921
5974
|
def main():
|
sky/client/sdk.py
CHANGED
@@ -1840,6 +1840,7 @@ def api_login(endpoint: Optional[str] = None) -> None:
|
|
1840
1840
|
dashboard_url = server_common.get_dashboard_url(endpoint)
|
1841
1841
|
dashboard_msg = f'Dashboard: {dashboard_url}'
|
1842
1842
|
click.secho(
|
1843
|
-
f'Logged
|
1844
|
-
f'
|
1843
|
+
f'Logged into SkyPilot API server at: {endpoint}'
|
1844
|
+
f'\n{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
|
1845
|
+
f'{dashboard_msg}',
|
1845
1846
|
fg='green')
|
sky/clouds/aws.py
CHANGED
@@ -161,13 +161,19 @@ class AWS(clouds.Cloud):
|
|
161
161
|
def _unsupported_features_for_resources(
|
162
162
|
cls, resources: 'resources_lib.Resources'
|
163
163
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
164
|
+
unsupported_features = {}
|
164
165
|
if resources.use_spot:
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
166
|
+
unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
|
167
|
+
f'Stopping spot instances is currently not supported on {cls._REPR}.'
|
168
|
+
)
|
169
|
+
|
170
|
+
unsupported_features[
|
171
|
+
clouds.CloudImplementationFeatures.
|
172
|
+
HIGH_AVAILABILITY_CONTROLLERS] = (
|
173
|
+
f'High availability controllers are not supported on {cls._REPR}.'
|
174
|
+
)
|
175
|
+
|
176
|
+
return unsupported_features
|
171
177
|
|
172
178
|
@classmethod
|
173
179
|
def max_cluster_name_length(cls) -> Optional[int]:
|
sky/clouds/azure.py
CHANGED
@@ -90,6 +90,9 @@ class Azure(clouds.Cloud):
|
|
90
90
|
features = {
|
91
91
|
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
|
92
92
|
(f'Migrating disk is currently not supported on {cls._REPR}.'),
|
93
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: (
|
94
|
+
f'High availability controllers are not supported on {cls._REPR}.'
|
95
|
+
),
|
93
96
|
}
|
94
97
|
if resources.use_spot:
|
95
98
|
features[clouds.CloudImplementationFeatures.STOP] = (
|
sky/clouds/cloud.py
CHANGED
@@ -47,6 +47,9 @@ class CloudImplementationFeatures(enum.Enum):
|
|
47
47
|
OPEN_PORTS = 'open_ports'
|
48
48
|
STORAGE_MOUNTING = 'storage_mounting'
|
49
49
|
HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers
|
50
|
+
HIGH_AVAILABILITY_CONTROLLERS = ('high_availability_controllers'
|
51
|
+
) # Controller can auto-restart
|
52
|
+
AUTO_TERMINATE = 'auto_terminate' # Pod/VM can stop or down itself
|
50
53
|
AUTOSTOP = 'autostop' # Pod/VM can stop itself
|
51
54
|
AUTODOWN = 'autodown' # Pod/VM can down itself
|
52
55
|
|
@@ -415,13 +418,16 @@ class Cloud:
|
|
415
418
|
try:
|
416
419
|
self.check_features_are_supported(resources,
|
417
420
|
resources_required_features)
|
418
|
-
except exceptions.NotSupportedError:
|
421
|
+
except exceptions.NotSupportedError as e:
|
419
422
|
# TODO(zhwu): The resources are now silently filtered out. We
|
420
423
|
# should have some logging telling the user why the resources
|
421
424
|
# are not considered.
|
425
|
+
# UPDATE(kyuds): passing in NotSupportedError reason string
|
426
|
+
# to hint for issue #5344. Did not remove above comment as
|
427
|
+
# reason is not displayed when other resources are valid.
|
422
428
|
return resources_utils.FeasibleResources(resources_list=[],
|
423
429
|
fuzzy_candidate_list=[],
|
424
|
-
hint=
|
430
|
+
hint=str(e))
|
425
431
|
return self._get_feasible_launchable_resources(resources)
|
426
432
|
|
427
433
|
def _get_feasible_launchable_resources(
|
sky/clouds/cudo.py
CHANGED
@@ -68,6 +68,8 @@ class Cudo(clouds.Cloud):
|
|
68
68
|
'Cudo Compute cannot host a controller as it does not '
|
69
69
|
'autostopping, which will leave the controller to run indefinitely.'
|
70
70
|
),
|
71
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
72
|
+
('High availability controllers are not supported on Cudo.'),
|
71
73
|
}
|
72
74
|
_MAX_CLUSTER_NAME_LEN_LIMIT = 60
|
73
75
|
|
sky/clouds/do.py
CHANGED
@@ -33,6 +33,9 @@ class DO(clouds.Cloud):
|
|
33
33
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
34
34
|
'Custom disk tiers'
|
35
35
|
f' is not supported in {_REPR}.',
|
36
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
37
|
+
('High availability controllers are not supported in '
|
38
|
+
f'{_REPR}.'),
|
36
39
|
}
|
37
40
|
# DO maximum node name length defined as <= 255
|
38
41
|
# https://docs.digitalocean.com/reference/api/api-reference/#operation/droplets_create
|
sky/clouds/fluidstack.py
CHANGED
@@ -56,6 +56,9 @@ class Fluidstack(clouds.Cloud):
|
|
56
56
|
clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
|
57
57
|
'Host controllers'
|
58
58
|
f' are not supported in {_REPR}.',
|
59
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
60
|
+
('High availability controllers are not supported in '
|
61
|
+
f'{_REPR}.'),
|
59
62
|
}
|
60
63
|
# Using the latest SkyPilot provisioner API to provision and check status.
|
61
64
|
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
sky/clouds/gcp.py
CHANGED
@@ -232,6 +232,13 @@ class GCP(clouds.Cloud):
|
|
232
232
|
unsupported[clouds.CloudImplementationFeatures.SPOT_INSTANCE] = (
|
233
233
|
'Managed Instance Group with DWS does not support '
|
234
234
|
'spot instances.')
|
235
|
+
|
236
|
+
unsupported[
|
237
|
+
clouds.CloudImplementationFeatures.
|
238
|
+
HIGH_AVAILABILITY_CONTROLLERS] = (
|
239
|
+
f'High availability controllers are not supported on {cls._REPR}.'
|
240
|
+
)
|
241
|
+
|
235
242
|
return unsupported
|
236
243
|
|
237
244
|
@classmethod
|
sky/clouds/ibm.py
CHANGED
@@ -50,6 +50,8 @@ class IBM(clouds.Cloud):
|
|
50
50
|
),
|
51
51
|
clouds.CloudImplementationFeatures.OPEN_PORTS:
|
52
52
|
(f'Opening ports is currently not supported on {cls._REPR}.'),
|
53
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
54
|
+
('High availability controllers are not supported on IBM.'),
|
53
55
|
}
|
54
56
|
if resources.use_spot:
|
55
57
|
features[clouds.CloudImplementationFeatures.STOP] = (
|
sky/clouds/kubernetes.py
CHANGED
@@ -429,28 +429,32 @@ class Kubernetes(clouds.Cloud):
|
|
429
429
|
acc_count = k.accelerator_count if k.accelerator_count else 0
|
430
430
|
acc_type = k.accelerator_type if k.accelerator_type else None
|
431
431
|
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
432
|
+
def _get_image_id(resources: 'resources_lib.Resources') -> str:
|
433
|
+
image_id_dict = resources.image_id
|
434
|
+
if image_id_dict is not None:
|
435
|
+
# Use custom image specified in resources
|
436
|
+
if None in image_id_dict:
|
437
|
+
image_id = image_id_dict[None]
|
438
|
+
else:
|
439
|
+
assert resources.region in image_id_dict, image_id_dict
|
440
|
+
image_id = image_id_dict[resources.region]
|
441
|
+
if image_id.startswith('docker:'):
|
442
|
+
image_id = image_id[len('docker:'):]
|
437
443
|
else:
|
438
|
-
|
439
|
-
image_id =
|
440
|
-
|
441
|
-
image_id =
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
image_id = service_catalog.get_image_id_from_tag(
|
447
|
-
image_id, clouds='kubernetes')
|
444
|
+
# Select image based on whether we are using GPUs or not.
|
445
|
+
image_id = self.IMAGE_GPU if acc_count > 0 else self.IMAGE_CPU
|
446
|
+
# Get the container image ID from the service catalog.
|
447
|
+
image_id = service_catalog.get_image_id_from_tag(
|
448
|
+
image_id, clouds='kubernetes')
|
449
|
+
return image_id
|
450
|
+
|
451
|
+
image_id = _get_image_id(resources)
|
448
452
|
# TODO(romilb): Create a lightweight image for SSH jump host
|
449
453
|
ssh_jump_image = service_catalog.get_image_id_from_tag(
|
450
454
|
self.IMAGE_CPU, clouds='kubernetes')
|
451
455
|
|
452
456
|
k8s_acc_label_key = None
|
453
|
-
|
457
|
+
k8s_acc_label_values = None
|
454
458
|
k8s_topology_label_key = None
|
455
459
|
k8s_topology_label_value = None
|
456
460
|
k8s_resource_key = None
|
@@ -458,9 +462,9 @@ class Kubernetes(clouds.Cloud):
|
|
458
462
|
|
459
463
|
# If GPU/TPUs are requested, set node label to match the GPU/TPU type.
|
460
464
|
if acc_count > 0 and acc_type is not None:
|
461
|
-
(k8s_acc_label_key,
|
465
|
+
(k8s_acc_label_key, k8s_acc_label_values, k8s_topology_label_key,
|
462
466
|
k8s_topology_label_value) = (
|
463
|
-
kubernetes_utils.
|
467
|
+
kubernetes_utils.get_accelerator_label_key_values(
|
464
468
|
context, acc_type, acc_count))
|
465
469
|
if (k8s_acc_label_key ==
|
466
470
|
kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY):
|
@@ -540,6 +544,13 @@ class Kubernetes(clouds.Cloud):
|
|
540
544
|
# cpus is <1.
|
541
545
|
'num-cpus': str(max(int(cpus), 1)),
|
542
546
|
}
|
547
|
+
|
548
|
+
# Get the storage class name for high availability controller's PVC
|
549
|
+
k8s_ha_storage_class_name = skypilot_config.get_nested(
|
550
|
+
('kubernetes', 'high_availability', 'storage_class_name'),
|
551
|
+
None,
|
552
|
+
override_configs=resources.cluster_config_overrides)
|
553
|
+
|
543
554
|
deploy_vars = {
|
544
555
|
'instance_type': resources.instance_type,
|
545
556
|
'custom_resources': custom_resources,
|
@@ -551,7 +562,7 @@ class Kubernetes(clouds.Cloud):
|
|
551
562
|
'k8s_networking_mode': network_utils.get_networking_mode().value,
|
552
563
|
'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
|
553
564
|
'k8s_acc_label_key': k8s_acc_label_key,
|
554
|
-
'
|
565
|
+
'k8s_acc_label_values': k8s_acc_label_values,
|
555
566
|
'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME,
|
556
567
|
'k8s_ssh_jump_image': ssh_jump_image,
|
557
568
|
'k8s_service_account_name': k8s_service_account_name,
|
@@ -574,6 +585,18 @@ class Kubernetes(clouds.Cloud):
|
|
574
585
|
'skypilot_ray_port': constants.SKY_REMOTE_RAY_PORT,
|
575
586
|
'ray_worker_start_command': instance_setup.ray_worker_start_command(
|
576
587
|
custom_resources, custom_ray_options, no_restart=False),
|
588
|
+
'k8s_high_availability_deployment_volume_mount_name':
|
589
|
+
(kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME
|
590
|
+
),
|
591
|
+
'k8s_high_availability_deployment_volume_mount_path':
|
592
|
+
(kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH
|
593
|
+
),
|
594
|
+
'k8s_high_availability_deployment_setup_script_path':
|
595
|
+
(constants.PERSISTENT_SETUP_SCRIPT_PATH),
|
596
|
+
'k8s_high_availability_deployment_run_script_dir':
|
597
|
+
(constants.PERSISTENT_RUN_SCRIPT_DIR),
|
598
|
+
'k8s_high_availability_storage_class_name':
|
599
|
+
(k8s_ha_storage_class_name),
|
577
600
|
}
|
578
601
|
|
579
602
|
# Add kubecontext if it is set. It may be None if SkyPilot is running
|
sky/clouds/lambda_cloud.py
CHANGED
@@ -44,6 +44,7 @@ class Lambda(clouds.Cloud):
|
|
44
44
|
clouds.CloudImplementationFeatures.IMAGE_ID: f'Specifying image ID is not supported in {_REPR}.',
|
45
45
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
|
46
46
|
clouds.CloudImplementationFeatures.HOST_CONTROLLERS: f'Host controllers are not supported in {_REPR}.',
|
47
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: f'High availability controllers are not supported on {_REPR}.',
|
47
48
|
}
|
48
49
|
|
49
50
|
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
sky/clouds/nebius.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
""" Nebius Cloud. """
|
2
|
-
import logging
|
3
2
|
import os
|
4
3
|
import typing
|
5
4
|
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
@@ -7,6 +6,7 @@ from typing import Dict, Iterator, List, Optional, Tuple, Union
|
|
7
6
|
from sky import clouds
|
8
7
|
from sky.adaptors import nebius
|
9
8
|
from sky.clouds import service_catalog
|
9
|
+
from sky.utils import annotations
|
10
10
|
from sky.utils import registry
|
11
11
|
from sky.utils import resources_utils
|
12
12
|
|
@@ -59,12 +59,10 @@ class Nebius(clouds.Cloud):
|
|
59
59
|
('Spot is not supported, as Nebius API does not implement spot.'),
|
60
60
|
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
|
61
61
|
(f'Migrating disk is currently not supported on {_REPR}.'),
|
62
|
-
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
|
63
|
-
(f'Docker image is currently not supported on {_REPR}. '
|
64
|
-
'You can try running docker command inside the '
|
65
|
-
'`run` section in task.yaml.'),
|
66
62
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
67
63
|
(f'Custom disk tier is currently not supported on {_REPR}.'),
|
64
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
65
|
+
('High availability controllers are not supported on Nebius.'),
|
68
66
|
}
|
69
67
|
# Nebius maximum instance name length defined as <= 63 as a hostname length
|
70
68
|
# 63 - 8 - 5 = 50 characters since
|
@@ -211,7 +209,8 @@ class Nebius(clouds.Cloud):
|
|
211
209
|
else:
|
212
210
|
raise RuntimeError('Unsupported instance type for Nebius cloud:'
|
213
211
|
f' {resources.instance_type}')
|
214
|
-
|
212
|
+
|
213
|
+
resources_vars = {
|
215
214
|
'instance_type': resources.instance_type,
|
216
215
|
'custom_resources': custom_resources,
|
217
216
|
'region': region.name,
|
@@ -220,6 +219,14 @@ class Nebius(clouds.Cloud):
|
|
220
219
|
'zones': None,
|
221
220
|
}
|
222
221
|
|
222
|
+
if acc_dict is not None:
|
223
|
+
# Nebius cloud's docker runtime information does not contain
|
224
|
+
# 'nvidia-container-runtime', causing no GPU option to be added to
|
225
|
+
# the docker run command. We patch this by adding it here.
|
226
|
+
resources_vars['docker_run_options'] = ['--gpus all']
|
227
|
+
|
228
|
+
return resources_vars
|
229
|
+
|
223
230
|
def _get_feasible_launchable_resources(
|
224
231
|
self, resources: 'resources_lib.Resources'
|
225
232
|
) -> 'resources_utils.FeasibleResources':
|
@@ -275,16 +282,16 @@ class Nebius(clouds.Cloud):
|
|
275
282
|
fuzzy_candidate_list, None)
|
276
283
|
|
277
284
|
@classmethod
|
285
|
+
@annotations.lru_cache(scope='request')
|
278
286
|
def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
|
279
287
|
"""Checks if the user has access credentials to
|
280
288
|
Nebius's compute service."""
|
281
|
-
logging.debug('Nebius cloud check credentials')
|
282
289
|
token_cred_msg = (
|
283
290
|
f'{_INDENT_PREFIX}Credentials can be set up by running: \n'
|
284
291
|
f'{_INDENT_PREFIX} $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n' # pylint: disable=line-too-long
|
285
|
-
f'{_INDENT_PREFIX} or generate ~/.nebius/credentials.json')
|
292
|
+
f'{_INDENT_PREFIX} or generate ~/.nebius/credentials.json \n')
|
286
293
|
|
287
|
-
tenant_msg = (f'{_INDENT_PREFIX}Copy your tenat ID from the web console and save it to file \n' # pylint: disable=line-too-long
|
294
|
+
tenant_msg = (f'{_INDENT_PREFIX} Copy your tenat ID from the web console and save it to file \n' # pylint: disable=line-too-long
|
288
295
|
f'{_INDENT_PREFIX} $ echo $NEBIUS_TENANT_ID_PATH > {nebius.NEBIUS_TENANT_ID_PATH} \n' # pylint: disable=line-too-long
|
289
296
|
f'{_INDENT_PREFIX} Or if you have 1 tenant you can run:\n' # pylint: disable=line-too-long
|
290
297
|
f'{_INDENT_PREFIX} $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n') # pylint: disable=line-too-long
|
@@ -301,11 +308,12 @@ class Nebius(clouds.Cloud):
|
|
301
308
|
except nebius.request_error() as e:
|
302
309
|
return False, (
|
303
310
|
f'{e.status} \n' # First line is indented by 4 spaces
|
304
|
-
f'{token_cred_msg}'
|
311
|
+
f'{token_cred_msg} \n'
|
305
312
|
f'{tenant_msg}')
|
306
313
|
return True, None
|
307
314
|
|
308
315
|
@classmethod
|
316
|
+
@annotations.lru_cache(scope='request')
|
309
317
|
def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
|
310
318
|
"""Checks if the user has access credentials to Nebius Object Storage.
|
311
319
|
|
sky/clouds/oci.py
CHANGED
@@ -69,19 +69,22 @@ class OCI(clouds.Cloud):
|
|
69
69
|
def _unsupported_features_for_resources(
|
70
70
|
cls, resources: 'resources_lib.Resources'
|
71
71
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
72
|
-
|
72
|
+
unsupported_features = {
|
73
73
|
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
|
74
74
|
(f'Migrating disk is currently not supported on {cls._REPR}.'),
|
75
75
|
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
|
76
76
|
(f'Docker image is currently not supported on {cls._REPR}. '
|
77
77
|
'You can try running docker command inside the '
|
78
78
|
'`run` section in task.yaml.'),
|
79
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
80
|
+
('High availability controllers are not supported on '
|
81
|
+
f'{cls._REPR}.'),
|
79
82
|
}
|
80
83
|
if resources.use_spot:
|
81
|
-
|
84
|
+
unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
|
82
85
|
f'Stopping spot instances is currently not supported on '
|
83
86
|
f'{cls._REPR}.')
|
84
|
-
return
|
87
|
+
return unsupported_features
|
85
88
|
|
86
89
|
@classmethod
|
87
90
|
def max_cluster_name_length(cls) -> Optional[int]:
|
sky/clouds/paperspace.py
CHANGED
@@ -41,6 +41,8 @@ class Paperspace(clouds.Cloud):
|
|
41
41
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
42
42
|
'Custom disk tiers'
|
43
43
|
f' is not supported in {_REPR}.',
|
44
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
45
|
+
(f'High availability controllers are not supported in {_REPR}.'),
|
44
46
|
}
|
45
47
|
_MAX_CLUSTER_NAME_LEN_LIMIT = 120
|
46
48
|
_regions: List[clouds.Region] = []
|
sky/clouds/runpod.py
CHANGED
@@ -34,6 +34,8 @@ class RunPod(clouds.Cloud):
|
|
34
34
|
('Mounting object stores is not supported on RunPod. To read data '
|
35
35
|
'from object stores on RunPod, use `mode: COPY` to copy the data '
|
36
36
|
'to local disk.'),
|
37
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
38
|
+
('High availability controllers are not supported on RunPod.'),
|
37
39
|
}
|
38
40
|
_MAX_CLUSTER_NAME_LEN_LIMIT = 120
|
39
41
|
_regions: List[clouds.Region] = []
|
sky/clouds/scp.py
CHANGED
@@ -58,6 +58,8 @@ class SCP(clouds.Cloud):
|
|
58
58
|
(f'Custom disk tiers are not supported in {_REPR}.'),
|
59
59
|
clouds.CloudImplementationFeatures.OPEN_PORTS:
|
60
60
|
(f'Opening ports is currently not supported on {_REPR}.'),
|
61
|
+
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
62
|
+
(f'High availability controllers are not supported on {_REPR}.'),
|
61
63
|
}
|
62
64
|
|
63
65
|
_INDENT_PREFIX = ' '
|
@@ -1,6 +1,6 @@
|
|
1
1
|
"""Constants used for service catalog."""
|
2
2
|
HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
|
3
|
-
CATALOG_SCHEMA_VERSION = '
|
3
|
+
CATALOG_SCHEMA_VERSION = 'v7'
|
4
4
|
CATALOG_DIR = '~/.sky/catalogs'
|
5
5
|
ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
|
6
6
|
'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
|
@@ -261,16 +261,16 @@ def _list_accelerators(
|
|
261
261
|
|
262
262
|
accelerators_available = accelerator_count - allocated_qty
|
263
263
|
|
264
|
-
# Initialize the entry if it doesn't exist yet
|
265
|
-
if accelerator_name not in total_accelerators_available:
|
266
|
-
total_accelerators_available[accelerator_name] = 0
|
267
|
-
|
268
264
|
if accelerators_available >= min_quantity_filter:
|
269
265
|
quantized_availability = min_quantity_filter * (
|
270
266
|
accelerators_available // min_quantity_filter)
|
271
|
-
|
272
|
-
|
273
|
-
|
267
|
+
if quantized_availability > 0:
|
268
|
+
# only increment when quantized availability is positive
|
269
|
+
# to avoid assertion errors checking keyset sizes in
|
270
|
+
# core.py _realtime_kubernetes_gpu_availability_single
|
271
|
+
total_accelerators_available[accelerator_name] = (
|
272
|
+
total_accelerators_available.get(
|
273
|
+
accelerator_name, 0) + quantized_availability)
|
274
274
|
|
275
275
|
result = []
|
276
276
|
|