skypilot-nightly 1.0.0.dev20250413__py3-none-any.whl → 1.0.0.dev20250417__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +7 -0
- sky/authentication.py +2 -2
- sky/backends/backend_utils.py +3 -3
- sky/backends/cloud_vm_ray_backend.py +22 -29
- sky/check.py +1 -1
- sky/cli.py +161 -55
- sky/client/cli.py +161 -55
- sky/client/sdk.py +5 -5
- sky/clouds/aws.py +2 -2
- sky/clouds/kubernetes.py +0 -8
- sky/clouds/oci.py +1 -1
- sky/core.py +17 -11
- sky/exceptions.py +5 -0
- sky/jobs/constants.py +8 -1
- sky/jobs/server/core.py +12 -8
- sky/models.py +28 -0
- sky/provision/kubernetes/config.py +1 -1
- sky/provision/kubernetes/instance.py +16 -14
- sky/provision/kubernetes/network_utils.py +1 -1
- sky/provision/kubernetes/utils.py +50 -22
- sky/resources.py +47 -2
- sky/serve/constants.py +6 -0
- sky/serve/load_balancing_policies.py +0 -4
- sky/serve/serve_state.py +0 -6
- sky/serve/server/core.py +5 -2
- sky/server/common.py +133 -46
- sky/server/constants.py +1 -1
- sky/server/requests/serializers/decoders.py +2 -5
- sky/server/requests/serializers/encoders.py +2 -5
- sky/server/server.py +1 -1
- sky/setup_files/dependencies.py +1 -0
- sky/sky_logging.py +2 -2
- sky/skylet/constants.py +5 -7
- sky/skylet/job_lib.py +3 -3
- sky/skypilot_config.py +194 -73
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/utils/cli_utils/status_utils.py +12 -5
- sky/utils/config_utils.py +39 -14
- sky/utils/controller_utils.py +44 -6
- sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
- sky/utils/kubernetes/gpu_labeler.py +99 -16
- sky/utils/schemas.py +24 -0
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/RECORD +49 -49
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/top_level.txt +0 -0
sky/core.py
CHANGED
@@ -353,19 +353,25 @@ def _start(
                 f'Starting cluster {cluster_name!r} with backend {backend.NAME} '
                 'is not supported.')
 
-
-
-
-
-
-
+    controller = controller_utils.Controllers.from_name(cluster_name)
+    if controller is not None:
+        if down or idle_minutes_to_autostop:
+            arguments = []
+            if down:
+                arguments.append('`down`')
+            if idle_minutes_to_autostop is not None:
+                arguments.append('`idle_minutes_to_autostop`')
+            arguments_str = ' and '.join(arguments) + ' argument'
+            if len(arguments) > 1:
+                arguments_str += 's'
             raise ValueError(
-                'Passing
+                'Passing per-request autostop/down settings is currently not '
                 'supported when starting SkyPilot controllers. To '
-                'fix: omit the
-                f'default autostop settings
-        idle_minutes_to_autostop = (
-
+                f'fix: omit the {arguments_str} to use the '
+                f'default autostop settings from config.')
+        idle_minutes_to_autostop, down = (
+            controller_utils.get_controller_autostop_config(
+                controller=controller))
 
     usage_lib.record_cluster_name_for_current_operation(cluster_name)
 
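
The new code path delegates the defaults to controller_utils.get_controller_autostop_config(), whose own diff (sky/utils/controller_utils.py, +44 -6 in the file list) is not shown in this view. A rough sketch of what such a helper could look like, inferred only from the call sites and the CONTROLLER_AUTOSTOP defaults added in sky/jobs/constants.py and sky/serve/constants.py below; the parameter, the lookup keys, and the fallback handling here are assumptions, not the actual implementation:

    from typing import Optional, Tuple

    from sky import skypilot_config

    # Assumed defaults, mirroring the CONTROLLER_AUTOSTOP dicts added below.
    _ASSUMED_AUTOSTOP_DEFAULTS = {'idle_minutes': 10, 'down': False}

    def get_controller_autostop_config(
            config_section: str = 'jobs') -> Tuple[Optional[int], bool]:
        """Sketch: resolve (idle_minutes, down) for a controller from config.

        The real helper takes a controller_utils.Controllers value; the string
        section name here ('jobs' or 'serve') is only for illustration.
        """
        autostop = skypilot_config.get_nested(
            (config_section, 'controller', 'autostop'),
            _ASSUMED_AUTOSTOP_DEFAULTS)
        return autostop.get('idle_minutes'), autostop.get('down', False)
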
sky/exceptions.py
CHANGED
@@ -477,6 +477,11 @@ class ApiServerConnectionError(RuntimeError):
             f'Try: curl {server_url}/api/health')
 
 
+class APIVersionMismatchError(RuntimeError):
+    """Raised when the API version mismatch."""
+    pass
+
+
 class JobExitCode(enum.IntEnum):
     """Job exit code enum.
 
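
A minimal, hypothetical usage sketch for the new exception; the real client/server version handshake lives in sky/server/common.py (also changed in this release) and is not reproduced here:

    from sky import exceptions

    def ensure_api_versions_match(client_version: int, server_version: int) -> None:
        # Hypothetical helper for illustration only; names are not from the source.
        if client_version != server_version:
            raise exceptions.APIVersionMismatchError(
                f'Client API version {client_version} does not match server API '
                f'version {server_version}. Upgrade the client or the API server.')
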
sky/jobs/constants.py
CHANGED
@@ -1,5 +1,5 @@
 """Constants used for Managed Jobs."""
-from typing import Dict, Union
+from typing import Any, Dict, Union
 
 from sky.skylet import constants as skylet_constants
 
@@ -23,6 +23,13 @@ CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
     'disk_size': 50
 }
 
+# Autostop config for the jobs controller. These are the default values for
+# jobs.controller.autostop in ~/.sky/config.yaml.
+CONTROLLER_AUTOSTOP: Dict[str, Any] = {
+    'idle_minutes': 10,
+    'down': False,
+}
+
 # TODO(zhwu): This is no longer accurate, after #4592, which increases the
 # length of user hash appended to the cluster name from 4 to 8 chars. This makes
 # the cluster name on GCP being wrapped twice. However, we cannot directly
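
Per the comment above, these values are only defaults; a user can override them under jobs.controller.autostop in ~/.sky/config.yaml. The parsed-dict shape of such an override would look like the following sketch (the values shown are illustrative overrides, not the defaults):

    # Parsed form of a hypothetical ~/.sky/config.yaml override; the keys
    # mirror CONTROLLER_AUTOSTOP above.
    user_config = {
        'jobs': {
            'controller': {
                'autostop': {
                    'idle_minutes': 30,  # keep the jobs controller up longer
                    'down': True,        # tear the controller down instead of stopping it
                },
            },
        },
    }
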
sky/jobs/server/core.py
CHANGED
@@ -144,6 +144,9 @@ def launch(
     controller_resources = controller_utils.get_controller_resources(
         controller=controller_utils.Controllers.JOBS_CONTROLLER,
         task_resources=sum([list(t.resources) for t in dag.tasks], []))
+    controller_idle_minutes_to_autostop, controller_down = (
+        controller_utils.get_controller_autostop_config(
+            controller=controller_utils.Controllers.JOBS_CONTROLLER))
 
     vars_to_fill = {
         'remote_user_yaml_path': remote_user_yaml_path,
@@ -185,14 +188,15 @@ def launch(
     # Launch with the api server's user hash, so that sky status does not
     # show the owner of the controller as whatever user launched it first.
     with common.with_server_user_hash():
-        return execution.launch(
-
-
-
-
-
-
-
+        return execution.launch(
+            task=controller_task,
+            cluster_name=controller_name,
+            stream_logs=stream_logs,
+            idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
+            down=controller_down,
+            retry_until_up=True,
+            fast=True,
+            _disable_controller_check=True)
 
 
 def queue_from_kubernetes_pod(
sky/models.py
CHANGED
@@ -28,3 +28,31 @@ class KubernetesNodeInfo:
     # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
     total: Dict[str, int]
     free: Dict[str, int]
+
+
+@dataclasses.dataclass
+class KubernetesNodesInfo:
+    """Dataclass to store Kubernetes node info map."""
+    # The nodes in the cluster, keyed by node name.
+    node_info_dict: Dict[str, KubernetesNodeInfo]
+    # Additional hint for the node info.
+    hint: str
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'node_info_dict': {
+                node_name: dataclasses.asdict(node_info)
+                for node_name, node_info in self.node_info_dict.items()
+            },
+            'hint': self.hint,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'KubernetesNodesInfo':
+        return cls(
+            node_info_dict={
+                node_name: KubernetesNodeInfo(**node_info)
+                for node_name, node_info in data['node_info_dict'].items()
+            },
+            hint=data['hint'],
+        )
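
A minimal round-trip sketch for the new model: to_dict() gives it a plain-dict wire form and from_dict() rebuilds it, which is plausibly what the serializer/decoder changes listed above rely on.

    from sky import models

    # Empty node map for brevity; real payloads carry one KubernetesNodeInfo per node.
    payload = models.KubernetesNodesInfo(node_info_dict={}, hint='').to_dict()
    restored = models.KubernetesNodesInfo.from_dict(payload)
    assert restored.node_info_dict == {} and restored.hint == ''
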
sky/provision/kubernetes/config.py
CHANGED
@@ -43,7 +43,7 @@ def bootstrap_instances(
     if (requested_service_account ==
             kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
         # If the user has requested a different service account (via pod_config
-        # in ~/.sky/
+        # in ~/.sky/config.yaml), we assume they have already set up the
         # necessary roles and role bindings.
         # If not, set up the roles and bindings for skypilot-service-account
         # here.
sky/provision/kubernetes/instance.py
CHANGED
@@ -720,7 +720,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 f'{common_utils.format_exception(e)}'
                 'Continuing without using nvidia RuntimeClass.\n'
                 'If you are on a K3s cluster, manually '
-                'override runtimeClassName in ~/.sky/
+                'override runtimeClassName in ~/.sky/config.yaml. '
                 'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html')  # pylint: disable=line-too-long
 
     needs_gpus = False
@@ -879,8 +879,8 @@
     raise NotImplementedError()
 
 
-def _terminate_node(namespace: str, context: Optional[str],
-
+def _terminate_node(namespace: str, context: Optional[str], pod_name: str,
+                    is_head: bool) -> None:
     """Terminate a pod."""
     logger.debug('terminate_instances: calling delete_namespaced_pod')
 
@@ -918,16 +918,18 @@ def _terminate_node(namespace: str, context: Optional[str],
         else:
             raise
 
-
-
-
-
-
-
-
-
-
-
+    if is_head:
+        # Delete services for the head pod
+        # services are specified in sky/templates/kubernetes-ray.yml.j2
+        for service_name in [pod_name, f'{pod_name}-ssh']:
+            _delete_k8s_resource_with_retry(
+                delete_func=lambda name=service_name: kubernetes.core_api(
+                    context).delete_namespaced_service(
+                        name=name,
+                        namespace=namespace,
+                        _request_timeout=config_lib.DELETION_TIMEOUT),
+                resource_type='service',
+                resource_name=service_name)
 
     # Note - delete pod after all other resources are deleted.
     # This is to ensure there are no leftover resources if this down is run
@@ -974,7 +976,7 @@
         if _is_head(pod) and worker_only:
             return
         logger.debug(f'Terminating instance {pod_name}: {pod}')
-        _terminate_node(namespace, context, pod_name)
+        _terminate_node(namespace, context, pod_name, _is_head(pod))
 
     # Run pod termination in parallel
     subprocess_utils.run_in_parallel(_terminate_pod_thread, list(pods.items()),
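
One detail in the service-deletion loop above: the lambda binds name=service_name as a default argument. A self-contained illustration of why that matters (closures over a loop variable are late-bound in Python):

    # Without the default argument, every closure reads the loop variable at
    # call time and sees only its final value.
    late = [lambda: n for n in ('head-svc', 'head-svc-ssh')]
    bound = [lambda n=n: n for n in ('head-svc', 'head-svc-ssh')]
    assert [f() for f in late] == ['head-svc-ssh', 'head-svc-ssh']
    assert [f() for f in bound] == ['head-svc', 'head-svc-ssh']
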
sky/provision/kubernetes/network_utils.py
CHANGED
@@ -66,7 +66,7 @@ def get_networking_mode(
     except ValueError as e:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(str(e) +
-                             ' Please check: ~/.sky/
+                             ' Please check: ~/.sky/config.yaml.') from None
     return networking_mode
 
 
sky/provision/kubernetes/utils.py
CHANGED
@@ -1336,13 +1336,19 @@ def check_credentials(context: Optional[str],
         return False, ('An error occurred: '
                        f'{common_utils.format_exception(e, use_bracket=True)}')
 
+    # Check if $KUBECONFIG envvar consists of multiple paths. We run this before
+    # optional checks.
+    try:
+        _ = _get_kubeconfig_path()
+    except ValueError as e:
+        return False, f'{common_utils.format_exception(e, use_bracket=True)}'
+
     # If we reach here, the credentials are valid and Kubernetes cluster is up.
     if not run_optional_checks:
         return True, None
 
     # We now do softer checks to check if exec based auth is used and to
     # see if the cluster is GPU-enabled.
-
     _, exec_msg = is_kubeconfig_exec_auth(context)
 
     # We now check if GPUs are available and labels are set correctly on the
@@ -1454,14 +1460,14 @@ def is_kubeconfig_exec_auth(
 
 
     Using exec-based authentication is problematic when used in conjunction
-    with kubernetes.remote_identity = LOCAL_CREDENTIAL in ~/.sky/
+    with kubernetes.remote_identity = LOCAL_CREDENTIAL in ~/.sky/config.yaml.
     This is because the exec-based authentication may not have the relevant
     dependencies installed on the remote cluster or may have hardcoded paths
     that are not available on the remote cluster.
 
     Returns:
         bool: True if exec-based authentication is used and LOCAL_CREDENTIAL
-            mode is used for remote_identity in ~/.sky/
+            mode is used for remote_identity in ~/.sky/config.yaml.
         str: Error message if exec-based authentication is used, None otherwise
     """
     k8s = kubernetes.kubernetes
@@ -1489,9 +1495,8 @@ def is_kubeconfig_exec_auth(
     # K8s api does not provide a mechanism to get the user details from the
     # context. We need to load the kubeconfig file and parse it to get the
     # user details.
-    kubeconfig_path =
-
-        k8s.config.kube_config.KUBE_CONFIG_DEFAULT_LOCATION))
+    kubeconfig_path = _get_kubeconfig_path()
+
     # Load the kubeconfig file as a dictionary
     with open(kubeconfig_path, 'r', encoding='utf-8') as f:
         kubeconfig = yaml.safe_load(f)
@@ -1514,7 +1519,7 @@ def is_kubeconfig_exec_auth(
             'Managed Jobs or SkyServe controller on Kubernetes. '
             'To fix, configure SkyPilot to create a service account '
             'for running pods by setting the following in '
-            '~/.sky/
+            '~/.sky/config.yaml:\n'
             ' kubernetes:\n'
             ' remote_identity: SERVICE_ACCOUNT\n'
             ' More: https://docs.skypilot.co/en/latest/'
@@ -2252,7 +2257,7 @@ def combine_pod_config_fields(
     cluster_config_overrides: Dict[str, Any],
 ) -> None:
     """Adds or updates fields in the YAML with fields from the
-    ~/.sky/
+    ~/.sky/config.yaml's kubernetes.pod_spec dict.
     This can be used to add fields to the YAML that are not supported by
     SkyPilot yet, or require simple configuration (e.g., adding an
     imagePullSecrets field).
@@ -2312,7 +2317,7 @@ def combine_pod_config_fields(
 
 def combine_metadata_fields(cluster_yaml_path: str) -> None:
     """Updates the metadata for all Kubernetes objects created by SkyPilot with
-    fields from the ~/.sky/
+    fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.
 
     Obeys the same add or update semantics as combine_pod_config_fields().
     """
@@ -2538,9 +2543,15 @@ def get_unlabeled_accelerator_nodes(context: Optional[str] = None) -> List[Any]:
 
 
 def get_kubernetes_node_info(
-        context: Optional[str] = None) ->
+        context: Optional[str] = None) -> models.KubernetesNodesInfo:
     """Gets the resource information for all the nodes in the cluster.
 
+    This function returns a model with node info map as a nested field. This
+    allows future extensions while keeping the client-server compatibility,
+    e.g. when adding a new field to the model, the legacy clients will not be
+    affected and new clients can opt-in new behavior if the new field is
+    presented.
+
     Currently only GPU resources are supported. The function returns the total
     number of GPUs available on the node and the number of free GPUs on the
     node.
@@ -2549,8 +2560,8 @@ def get_kubernetes_node_info(
     namespaces, the function will return free GPUs as -1.
 
     Returns:
-
-
+        KubernetesNodesInfo: A model that contains the node info map and other
+            information.
     """
     nodes = get_kubernetes_nodes(context=context)
     # Get the pods to get the real-time resource usage
@@ -2569,6 +2580,7 @@
     label_keys = lf.get_label_keys()
 
     node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
+    has_multi_host_tpu = False
 
     for node in nodes:
         accelerator_name = None
@@ -2605,6 +2617,7 @@
             # TODO(Doyoung): Remove the logic when adding support for
             # multi-host TPUs.
             if is_multi_host_tpu(node.metadata.labels):
+                has_multi_host_tpu = True
                 continue
 
         node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
@@ -2612,8 +2625,15 @@
             accelerator_type=accelerator_name,
             total={'accelerator_count': int(accelerator_count)},
             free={'accelerators_available': int(accelerators_available)})
+    hint = ''
+    if has_multi_host_tpu:
+        hint = ('(Note: Multi-host TPUs are detected and excluded from the '
+                'display as multi-host TPUs are not supported.)')
 
-    return
+    return models.KubernetesNodesInfo(
+        node_info_dict=node_info_dict,
+        hint=hint,
+    )
 
 
 def to_label_selector(tags):
@@ -2860,15 +2880,6 @@ def is_multi_host_tpu(node_metadata_labels: dict) -> bool:
     return False
 
 
-def multi_host_tpu_exists_in_cluster(context: Optional[str] = None) -> bool:
-    """Checks if there exists a multi-host TPU within the cluster."""
-    nodes = get_kubernetes_nodes(context=context)
-    for node in nodes:
-        if is_multi_host_tpu(node.metadata.labels):
-            return True
-    return False
-
-
 @dataclasses.dataclass
 class KubernetesSkyPilotClusterInfo:
     cluster_name_on_cloud: str
@@ -3017,3 +3028,20 @@ def get_gpu_resource_key():
     # Else use default.
     # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc.
     return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)
+
+
+def _get_kubeconfig_path() -> str:
+    """Get the path to the kubeconfig file.
+    Parses `KUBECONFIG` env var if present, else uses the default path.
+    Currently, specifying multiple KUBECONFIG paths in the envvar is not
+    allowed, hence will raise a ValueError.
+    """
+    kubeconfig_path = os.path.expanduser(
+        os.getenv(
+            'KUBECONFIG', kubernetes.kubernetes.config.kube_config.
+            KUBE_CONFIG_DEFAULT_LOCATION))
+    if len(kubeconfig_path.split(os.pathsep)) > 1:
+        raise ValueError('SkyPilot currently only supports one '
+                         'config file path with $KUBECONFIG. Current '
+                         f'path(s) are {kubeconfig_path}.')
+    return kubeconfig_path
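
Because get_kubernetes_node_info() now returns a model rather than a bare dict, callers (for example sky/utils/cli_utils/status_utils.py in the file list) read node_info_dict and hint from it. A rough consumer sketch, assuming working Kubernetes credentials; context=None means the current kubeconfig context:

    from sky.provision.kubernetes import utils as kubernetes_utils

    nodes_info = kubernetes_utils.get_kubernetes_node_info(context=None)
    for node_name, node in nodes_info.node_info_dict.items():
        # Each entry is a models.KubernetesNodeInfo with accelerator totals/free counts.
        print(node_name, node.accelerator_type, node.total, node.free)
    if nodes_info.hint:
        print(nodes_info.hint)  # e.g. the multi-host TPU exclusion note
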
sky/resources.py
CHANGED
@@ -18,6 +18,7 @@ from sky.skylet import constants
 from sky.utils import accelerator_registry
 from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils import config_utils
 from sky.utils import log_utils
 from sky.utils import registry
 from sky.utils import resources_utils
@@ -28,6 +29,10 @@ logger = sky_logging.init_logger(__name__)
 
 _DEFAULT_DISK_SIZE_GB = 256
 
+RESOURCE_CONFIG_ALIASES = {
+    'gpus': 'accelerators',
+}
+
 
 class Resources:
     """Resources: compute requirements of Tasks.
@@ -1290,6 +1295,22 @@
     def copy(self, **override) -> 'Resources':
         """Returns a copy of the given Resources."""
         use_spot = self.use_spot if self._use_spot_specified else None
+
+        current_override_configs = self._cluster_config_overrides
+        if self._cluster_config_overrides is None:
+            current_override_configs = {}
+        new_override_configs = override.pop('_cluster_config_overrides', {})
+        overlaid_configs = skypilot_config.overlay_skypilot_config(
+            original_config=config_utils.Config(current_override_configs),
+            override_configs=new_override_configs,
+        )
+        override_configs = config_utils.Config()
+        for key in constants.OVERRIDEABLE_CONFIG_KEYS_IN_TASK:
+            elem = overlaid_configs.get_nested(key, None)
+            if elem is not None:
+                override_configs.set_nested(key, elem)
+
+        override_configs = dict(override_configs) if override_configs else None
         resources = Resources(
             cloud=override.pop('cloud', self.cloud),
             instance_type=override.pop('instance_type', self.instance_type),
@@ -1315,8 +1336,7 @@
             _is_image_managed=override.pop('_is_image_managed',
                                            self._is_image_managed),
             _requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
-            _cluster_config_overrides=
-                '_cluster_config_overrides', self._cluster_config_overrides),
+            _cluster_config_overrides=override_configs,
         )
         assert not override
         return resources
@@ -1349,12 +1369,37 @@
             features.add(clouds.CloudImplementationFeatures.OPEN_PORTS)
         return features
 
+    @staticmethod
+    def apply_resource_config_aliases(config: Optional[Dict[str, Any]]) -> None:
+        """Mutatively applies overriding aliases to the passed in config.
+
+        Note: Nested aliases are not supported.
+        The preferred way to support nested aliases would be to cast
+        the parsed resource config dictionary to a config_utils.Config object
+        and use the get_, set_, and pop_ nested methods accordingly.
+        However, this approach comes at a significant memory cost as get_
+        and pop_nested create deep copies of the config.
+        """
+        if not config:
+            return
+
+        for alias, canonical in RESOURCE_CONFIG_ALIASES.items():
+            if alias in config:
+                if canonical in config:
+                    raise exceptions.InvalidSkyPilotConfigError(
+                        f'Cannot specify both {alias} '
+                        f'and {canonical} in config.')
+                config[canonical] = config[alias]
+                del config[alias]
+
     @classmethod
     def from_yaml_config(
         cls, config: Optional[Dict[str, Any]]
     ) -> Union[Set['Resources'], List['Resources']]:
         if config is None:
             return {Resources()}
+
+        Resources.apply_resource_config_aliases(config)
         common_utils.validate_schema(config, schemas.get_resources_schema(),
                                      'Invalid resources YAML: ')
 
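
The effect of the alias handling, as a minimal sketch: apply_resource_config_aliases() rewrites the 'gpus' key to 'accelerators' in place before from_yaml_config() runs schema validation, and specifying both keys raises exceptions.InvalidSkyPilotConfigError.

    from sky.resources import Resources

    config = {'gpus': 'A100:1'}
    Resources.apply_resource_config_aliases(config)
    assert config == {'accelerators': 'A100:1'}
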
sky/serve/constants.py
CHANGED
@@ -66,6 +66,12 @@ AUTOSCALER_DEFAULT_DOWNSCALE_DELAY_SECONDS = 1200
 # disk space. Maybe we could use a larger disk size, migrate to cloud storage or
 # do some log rotation.
 CONTROLLER_RESOURCES = {'cpus': '4+', 'disk_size': 200}
+# Autostop config for the jobs controller. These are the default values for
+# serve.controller.autostop in ~/.sky/config.yaml.
+CONTROLLER_AUTOSTOP = {
+    'idle_minutes': 10,
+    'down': False,
+}
 
 # Due to the CPU/memory usage of the controller process launched with a job on
 # controller VM (use ray job under the hood), we need to reserve some CPU/memory
sky/serve/load_balancing_policies.py
CHANGED
@@ -15,10 +15,6 @@ logger = sky_logging.init_logger(__name__)
 # Define a registry for load balancing policies
 LB_POLICIES = {}
 DEFAULT_LB_POLICY = None
-# Prior to #4439, the default policy was round_robin. We store the legacy
-# default policy here to maintain backwards compatibility. Remove this after
-# 2 minor release, i.e., 0.9.0.
-LEGACY_DEFAULT_POLICY = 'round_robin'
 
 
 def _request_repr(request: 'fastapi.Request') -> str:
sky/serve/serve_state.py
CHANGED
@@ -11,7 +11,6 @@ from typing import Any, Dict, List, Optional, Tuple
 import colorama
 
 from sky.serve import constants
-from sky.serve import load_balancing_policies as lb_policies
 from sky.utils import db_utils
 
 if typing.TYPE_CHECKING:
@@ -335,11 +334,6 @@ def _get_service_from_row(row) -> Dict[str, Any]:
     (current_version, name, controller_job_id, controller_port,
      load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
      _, active_versions, load_balancing_policy, tls_encrypted) = row[:15]
-    if load_balancing_policy is None:
-        # This entry in database was added in #4439, and it will always be set
-        # to a str value. If it is None, it means it is an legacy entry and is
-        # using the legacy default policy.
-        load_balancing_policy = lb_policies.LEGACY_DEFAULT_POLICY
     return {
         'name': name,
         'controller_job_id': controller_job_id,
sky/serve/server/core.py
CHANGED
@@ -179,14 +179,17 @@ def up(
         # whether the service is already running. If the id is the same
         # with the current job id, we know the service is up and running
         # for the first time; otherwise it is a name conflict.
-
+        controller_idle_minutes_to_autostop, controller_down = (
+            controller_utils.get_controller_autostop_config(
+                controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER))
         # Since the controller may be shared among multiple users, launch the
         # controller with the API server's user hash.
         with common.with_server_user_hash():
             controller_job_id, controller_handle = execution.launch(
                 task=controller_task,
                 cluster_name=controller_name,
-                idle_minutes_to_autostop=
+                idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
+                down=controller_down,
                 retry_until_up=True,
                 _disable_controller_check=True,
             )